From db7e9e68411de074dee78c92657e983da4b89500 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Wed, 21 Feb 2024 20:59:42 -0800 Subject: [PATCH 001/546] [TypeProf][InstrPGO] Introduce raw and instr profile format change for type profiling. (#81691) * Raw profile format - Header: records the byte size of compressed vtable names, and the number of profiled vtable entries (call it `VTableProfData`). Header also records padded bytes of each section. - Payload: adds a section for compressed vtable names, and a section to store `VTableProfData`. Both sections are padded so the size is a multiple of 8. * Indexed profile format - Header: records the byte offset of compressed vtable names. - Payload: adds a section to store compressed vtable names. This section is used by `llvm-profdata` to show the list of vtables profiled for an instrumented site. [The originally reviewed patch](https://github.com/llvm/llvm-project/pull/66825) will have profile reader/write change and llvm-profdata change. - To ensure this PR has all the necessary profile format change along with profile version bump, created a copy of the originally reviewed patch in https://github.com/llvm/llvm-project/pull/80761. The copy doesn't have profile format change, but it has the set of tests which covers type profile generation, profile read and profile merge. Tests pass there. 
rfc in https://discourse.llvm.org/t/rfc-dynamic-type-profiling-and-optimizations-in-llvm/74600 --------- Co-authored-by: modiking --- compiler-rt/include/profile/InstrProfData.inc | 50 ++++++++- compiler-rt/lib/profile/InstrProfiling.h | 35 +++++-- .../lib/profile/InstrProfilingBuffer.c | 96 +++++++++++++++--- .../lib/profile/InstrProfilingInternal.h | 8 +- compiler-rt/lib/profile/InstrProfilingMerge.c | 23 ++++- .../lib/profile/InstrProfilingPlatformLinux.c | 20 ++++ .../lib/profile/InstrProfilingWriter.c | 37 +++++-- .../profile/instrprof-write-buffer-internal.c | 6 +- llvm/include/llvm/ProfileData/InstrProf.h | 17 +++- .../llvm/ProfileData/InstrProfData.inc | 50 ++++++++- .../llvm/ProfileData/InstrProfReader.h | 13 +++ llvm/lib/ProfileData/InstrProf.cpp | 11 +- llvm/lib/ProfileData/InstrProfReader.cpp | 44 +++++++- llvm/lib/ProfileData/InstrProfWriter.cpp | 42 ++++++-- .../InstrProfiling/coverage.ll | 8 +- .../thinlto_indirect_call_promotion.profraw | Bin 528 -> 544 bytes .../Transforms/PGOProfile/comdat_internal.ll | 4 +- .../llvm-profdata/Inputs/c-general.profraw | Bin 2016 -> 2032 bytes .../llvm-profdata/Inputs/compressed.profraw | Bin 1968 -> 1984 bytes .../thinlto_indirect_call_promotion.profraw | Bin 0 -> 528 bytes .../llvm-profdata/binary-ids-padding.test | 6 +- .../llvm-profdata/large-binary-id-size.test | 4 +- ...alformed-not-space-for-another-header.test | 6 +- .../malformed-num-counters-zero.test | 6 +- .../malformed-ptr-to-counter-array.test | 6 +- .../misaligned-binary-ids-size.test | 4 +- .../mismatched-raw-profile-header.test | 2 + .../tools/llvm-profdata/raw-32-bits-be.test | 11 +- .../tools/llvm-profdata/raw-32-bits-le.test | 10 +- .../tools/llvm-profdata/raw-64-bits-be.test | 10 +- .../tools/llvm-profdata/raw-64-bits-le.test | 10 +- .../tools/llvm-profdata/raw-two-profiles.test | 8 +- 32 files changed, 458 insertions(+), 89 deletions(-) create mode 100644 llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw diff --git 
a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index c907a9736f316..1f77853bb8baa 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -96,6 +96,25 @@ INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \ #undef INSTR_PROF_DATA /* INSTR_PROF_DATA end. */ +/* For a virtual table object, record the name hash to associate profiled + * addresses with global variables, and record {starting address, size in bytes} + * to map the profiled virtual table (which usually have an offset from the + * starting address) back to a virtual table object. */ +#ifndef INSTR_PROF_VTABLE_DATA +#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) +#else +#define INSTR_PROF_VTABLE_DATA_DEFINED +#endif +INSTR_PROF_VTABLE_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), \ + VTableNameHash, ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \ + IndexedInstrProf::ComputeHash(PGOVTableName))) +INSTR_PROF_VTABLE_DATA(const IntPtrT, llvm::PointerType::getUnqual(Ctx), \ + VTablePointer, VTableAddr) +INSTR_PROF_VTABLE_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), VTableSize, \ + ConstantInt::get(llvm::Type::getInt32Ty(Ctx), \ + VTableSizeVal)) +#undef INSTR_PROF_VTABLE_DATA +/* INSTR_PROF_VTABLE_DATA end. */ /* This is an internal data structure used by value profiler. 
It * is defined here to allow serialization code sharing by LLVM @@ -147,6 +166,8 @@ INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, INSTR_PROF_RAW_HEADER(uint64_t, BitmapDelta, (uintptr_t)BitmapBegin - (uintptr_t)DataBegin) INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) +INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) #undef INSTR_PROF_RAW_HEADER /* INSTR_PROF_RAW_HEADER end */ @@ -188,13 +209,26 @@ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx)) VALUE_PROF_KIND(IPVK_IndirectCallTarget, 0, "indirect call target") /* For memory intrinsic functions size profiling. */ VALUE_PROF_KIND(IPVK_MemOPSize, 1, "memory intrinsic functions size") +/* For virtual table address profiling, the address point of the virtual table + * (i.e., the address contained in objects pointing to a virtual table) are + * profiled. Note this may not be the address of the per C++ class virtual table + * object (e.g., there might be an offset). + * + * The profiled addresses are stored in raw profile, together with the following + * two types of information. + * 1. The (starting and ending) addresses of per C++ class virtual table objects. + * 2. The (compressed) virtual table object names. + * RawInstrProfReader converts profiled virtual table addresses to virtual table + * objects' MD5 hash. + */ +VALUE_PROF_KIND(IPVK_VTableTarget, 2, "The profiled address point of the vtable") /* These two kinds must be the last to be * declared. This is to make sure the string * array created with the template can be * indexed with the kind value. 
*/ VALUE_PROF_KIND(IPVK_First, IPVK_IndirectCallTarget, "first") -VALUE_PROF_KIND(IPVK_Last, IPVK_MemOPSize, "last") +VALUE_PROF_KIND(IPVK_Last, IPVK_VTableTarget, "last") #undef VALUE_PROF_KIND /* VALUE_PROF_KIND end */ @@ -284,12 +318,18 @@ INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \ INSTR_PROF_SECT_ENTRY(IPSK_name, \ INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON), \ INSTR_PROF_NAME_COFF, "__DATA,") +INSTR_PROF_SECT_ENTRY(IPSK_vname, \ + INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON), \ + INSTR_PROF_VNAME_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vals, \ INSTR_PROF_QUOTE(INSTR_PROF_VALS_COMMON), \ INSTR_PROF_VALS_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vnodes, \ INSTR_PROF_QUOTE(INSTR_PROF_VNODES_COMMON), \ INSTR_PROF_VNODES_COFF, "__DATA,") +INSTR_PROF_SECT_ENTRY(IPSK_vtab, \ + INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON), \ + INSTR_PROF_VTAB_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_covmap, \ INSTR_PROF_QUOTE(INSTR_PROF_COVMAP_COMMON), \ INSTR_PROF_COVMAP_COFF, "__LLVM_COV,") @@ -668,9 +708,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129 /* Raw profile format version (start from 1). */ -#define INSTR_PROF_RAW_VERSION 9 +#define INSTR_PROF_RAW_VERSION 10 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 11 +#define INSTR_PROF_INDEX_VERSION 12 /* Coverage mapping format version (start from 0). 
*/ #define INSTR_PROF_COVMAP_VERSION 6 @@ -708,10 +748,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, than WIN32 */ #define INSTR_PROF_DATA_COMMON __llvm_prf_data #define INSTR_PROF_NAME_COMMON __llvm_prf_names +#define INSTR_PROF_VNAME_COMMON __llvm_prf_vtabnames #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts #define INSTR_PROF_BITS_COMMON __llvm_prf_bits #define INSTR_PROF_VALS_COMMON __llvm_prf_vals #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds +#define INSTR_PROF_VTAB_COMMON __llvm_prf_vtab #define INSTR_PROF_COVMAP_COMMON __llvm_covmap #define INSTR_PROF_COVFUN_COMMON __llvm_covfun #define INSTR_PROF_COVDATA_COMMON __llvm_covdata @@ -722,10 +764,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, */ #define INSTR_PROF_DATA_COFF ".lprfd$M" #define INSTR_PROF_NAME_COFF ".lprfn$M" +#define INSTR_PROF_VNAME_COFF ".lprfvn$M" #define INSTR_PROF_CNTS_COFF ".lprfc$M" #define INSTR_PROF_BITS_COFF ".lprfb$M" #define INSTR_PROF_VALS_COFF ".lprfv$M" #define INSTR_PROF_VNODES_COFF ".lprfnd$M" +#define INSTR_PROF_VTAB_COFF ".lprfvt$M" #define INSTR_PROF_COVMAP_COFF ".lcovmap$M" #define INSTR_PROF_COVFUN_COFF ".lcovfun$M" /* Since cov data and cov names sections are not allocated, we don't need to diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index 0123908336918..be694a8d3330b 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -49,6 +49,12 @@ typedef struct ValueProfNode { #include "profile/InstrProfData.inc" } ValueProfNode; +typedef void *IntPtrT; +typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) VTableProfData { +#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) Type Name; +#include "profile/InstrProfData.inc" +} VTableProfData; + /*! * \brief Return 1 if profile counters are continuously synced to the raw * profile via an mmap(). 
This is in contrast to the default mode, in which @@ -103,12 +109,16 @@ const __llvm_profile_data *__llvm_profile_begin_data(void); const __llvm_profile_data *__llvm_profile_end_data(void); const char *__llvm_profile_begin_names(void); const char *__llvm_profile_end_names(void); +const char *__llvm_profile_begin_vtabnames(void); +const char *__llvm_profile_end_vtabnames(void); char *__llvm_profile_begin_counters(void); char *__llvm_profile_end_counters(void); char *__llvm_profile_begin_bitmap(void); char *__llvm_profile_end_bitmap(void); ValueProfNode *__llvm_profile_begin_vnodes(); ValueProfNode *__llvm_profile_end_vnodes(); +VTableProfData *__llvm_profile_begin_vtables(); +VTableProfData *__llvm_profile_end_vtables(); uint32_t *__llvm_profile_begin_orderfile(); /*! @@ -252,20 +262,31 @@ uint64_t __llvm_profile_get_num_bitmap_bytes(const char *Begin, /*! \brief Get the size of the profile name section in bytes. */ uint64_t __llvm_profile_get_name_size(const char *Begin, const char *End); -/* ! \brief Given the sizes of the data and counter information, return the - * number of padding bytes before and after the counters, and after the names, - * in the raw profile. +/*! \brief Get the number of virtual table profile data entries */ +uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin, + const VTableProfData *End); + +/*! \brief Get the size of virtual table profile data in bytes. */ +uint64_t __llvm_profile_get_vtable_section_size(const VTableProfData *Begin, + const VTableProfData *End); + +/* ! \brief Given the sizes of the data and counter information, computes the + * number of padding bytes before and after the counter section, as well as the + * number of padding bytes after other sections in the raw profile. + * Returns -1 upon errors and 0 upon success. Output parameters should be used + * iff return value is 0. * * Note: When mmap() mode is disabled, no padding bytes before/after counters * are needed.
However, in mmap() mode, the counter section in the raw profile * must be page-aligned: this API computes the number of padding bytes * needed to achieve that. */ -void __llvm_profile_get_padding_sizes_for_counters( +int __llvm_profile_get_padding_sizes_for_counters( uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes, - uint64_t NamesSize, uint64_t *PaddingBytesBeforeCounters, - uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmap, - uint64_t *PaddingBytesAfterNames); + uint64_t NamesSize, uint64_t VTableSize, uint64_t VNameSize, + uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters, + uint64_t *PaddingBytesAfterBitmap, uint64_t *PaddingBytesAfterNames, + uint64_t *PaddingBytesAfterVTable, uint64_t *PaddingBytesAfterVNames); /*! * \brief Set the flag that profile data has been dumped to the file. diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c index af52804b2b532..7c5c26f4d113b 100644 --- a/compiler-rt/lib/profile/InstrProfilingBuffer.c +++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c @@ -51,16 +51,29 @@ uint64_t __llvm_profile_get_size_for_buffer(void) { const char *BitmapEnd = __llvm_profile_end_bitmap(); const char *NamesBegin = __llvm_profile_begin_names(); const char *NamesEnd = __llvm_profile_end_names(); + const VTableProfData *VTableBegin = __llvm_profile_begin_vtables(); + const VTableProfData *VTableEnd = __llvm_profile_end_vtables(); + const char *VNamesBegin = __llvm_profile_begin_vtabnames(); + const char *VNamesEnd = __llvm_profile_end_vtabnames(); return __llvm_profile_get_size_for_buffer_internal( DataBegin, DataEnd, CountersBegin, CountersEnd, BitmapBegin, BitmapEnd, - NamesBegin, NamesEnd); + NamesBegin, NamesEnd, VTableBegin, VTableEnd, VNamesBegin, VNamesEnd); } COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_num_data(const __llvm_profile_data *Begin, const __llvm_profile_data *End) { intptr_t BeginI = (intptr_t)Begin, EndI 
= (intptr_t)End; + // `sizeof(__llvm_profile_data) - 1` is required in the numerator when + // [Begin, End] represents an inclusive range. + // For ELF, [Begin, End) represents the address of linker-inserted + // symbols `__start__` and `__stop_`. + // Thereby, `End` is one byte past the inclusive range, and + // `sizeof(__llvm_profile_data) - 1` is not necessary in the numerator to get + // the correct number of profile data. + // FIXME: Consider removing `sizeof(__llvm_profile_data) - 1` if this is true + // across platforms. return ((EndI + sizeof(__llvm_profile_data) - 1) - BeginI) / sizeof(__llvm_profile_data); } @@ -71,6 +84,26 @@ uint64_t __llvm_profile_get_data_size(const __llvm_profile_data *Begin, return __llvm_profile_get_num_data(Begin, End) * sizeof(__llvm_profile_data); } +// Counts the number of `VTableProfData` elements within the range of [Begin, +// End). Caller should guarantee that End points to one byte past the inclusive +// range. +// FIXME: Add a compiler-rt test to make sure the number of vtables in the +// raw profile is the same as the number of vtable elements in the instrumented +// binary. +COMPILER_RT_VISIBILITY +uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin, + const VTableProfData *End) { + // Convert pointers to intptr_t to use integer arithmetic. 
+ intptr_t EndI = (intptr_t)End, BeginI = (intptr_t)Begin; + return (EndI - BeginI) / sizeof(VTableProfData); +} + +COMPILER_RT_VISIBILITY +uint64_t __llvm_profile_get_vtable_section_size(const VTableProfData *Begin, + const VTableProfData *End) { + return (intptr_t)(End) - (intptr_t)(Begin); +} + COMPILER_RT_VISIBILITY size_t __llvm_profile_counter_entry_size(void) { if (__llvm_profile_get_version() & VARIANT_MASK_BYTE_COVERAGE) return sizeof(uint8_t); @@ -119,11 +152,13 @@ static int needsCounterPadding(void) { } COMPILER_RT_VISIBILITY -void __llvm_profile_get_padding_sizes_for_counters( +int __llvm_profile_get_padding_sizes_for_counters( uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes, - uint64_t NamesSize, uint64_t *PaddingBytesBeforeCounters, - uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmapBytes, - uint64_t *PaddingBytesAfterNames) { + uint64_t NamesSize, uint64_t VTableSize, uint64_t VNameSize, + uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters, + uint64_t *PaddingBytesAfterBitmapBytes, uint64_t *PaddingBytesAfterNames, + uint64_t *PaddingBytesAfterVTable, uint64_t *PaddingBytesAfterVName) { + // Counter padding is needed only if continuous mode is enabled. if (!needsCounterPadding()) { *PaddingBytesBeforeCounters = 0; *PaddingBytesAfterCounters = @@ -131,9 +166,19 @@ void __llvm_profile_get_padding_sizes_for_counters( *PaddingBytesAfterBitmapBytes = __llvm_profile_get_num_padding_bytes(NumBitmapBytes); *PaddingBytesAfterNames = __llvm_profile_get_num_padding_bytes(NamesSize); - return; + if (PaddingBytesAfterVTable != NULL) + *PaddingBytesAfterVTable = + __llvm_profile_get_num_padding_bytes(VTableSize); + if (PaddingBytesAfterVName != NULL) + *PaddingBytesAfterVName = __llvm_profile_get_num_padding_bytes(VNameSize); + return 0; } + // Value profiling not supported in continuous mode at profile-write time. + // Return -1 to alert the incompatibility. 
+ if (VTableSize != 0 || VNameSize != 0) + return -1; + // In continuous mode, the file offsets for headers and for the start of // counter sections need to be page-aligned. *PaddingBytesBeforeCounters = @@ -142,13 +187,22 @@ void __llvm_profile_get_padding_sizes_for_counters( *PaddingBytesAfterBitmapBytes = calculateBytesNeededToPageAlign(NumBitmapBytes); *PaddingBytesAfterNames = calculateBytesNeededToPageAlign(NamesSize); + // Set these two variables to zero to avoid uninitialized variables + // even if VTableSize and VNameSize are known to be zero. + if (PaddingBytesAfterVTable != NULL) + *PaddingBytesAfterVTable = 0; + if (PaddingBytesAfterVName != NULL) + *PaddingBytesAfterVName = 0; + return 0; } COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_size_for_buffer_internal( const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, - const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd) { + const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd, + const VTableProfData *VTableBegin, const VTableProfData *VTableEnd, + const char *VNamesBegin, const char *VNamesEnd) { /* Match logic in __llvm_profile_write_buffer(). */ const uint64_t NamesSize = (NamesEnd - NamesBegin) * sizeof(char); uint64_t DataSize = __llvm_profile_get_data_size(DataBegin, DataEnd); @@ -156,20 +210,29 @@ uint64_t __llvm_profile_get_size_for_buffer_internal( __llvm_profile_get_counters_size(CountersBegin, CountersEnd); const uint64_t NumBitmapBytes = __llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd); + const uint64_t VTableSize = + __llvm_profile_get_vtable_section_size(VTableBegin, VTableEnd); + const uint64_t VNameSize = + __llvm_profile_get_name_size(VNamesBegin, VNamesEnd); /* Determine how much padding is needed before/after the counters and after * the names. 
*/ uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters, - PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes; + PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes, + PaddingBytesAfterVTable, PaddingBytesAfterVNames; __llvm_profile_get_padding_sizes_for_counters( - DataSize, CountersSize, NumBitmapBytes, NamesSize, - &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, - &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames); + DataSize, CountersSize, NumBitmapBytes, NamesSize, 0 /* VTableSize */, + 0 /* VNameSize */, &PaddingBytesBeforeCounters, + &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes, + &PaddingBytesAfterNames, &PaddingBytesAfterVTable, + &PaddingBytesAfterVNames); return sizeof(__llvm_profile_header) + __llvm_write_binary_ids(NULL) + DataSize + PaddingBytesBeforeCounters + CountersSize + PaddingBytesAfterCounters + NumBitmapBytes + - PaddingBytesAfterBitmapBytes + NamesSize + PaddingBytesAfterNames; + PaddingBytesAfterBitmapBytes + NamesSize + PaddingBytesAfterNames + + VTableSize + PaddingBytesAfterVTable + VNameSize + + PaddingBytesAfterVNames; } COMPILER_RT_VISIBILITY @@ -191,7 +254,10 @@ COMPILER_RT_VISIBILITY int __llvm_profile_write_buffer_internal( const char *NamesBegin, const char *NamesEnd) { ProfDataWriter BufferWriter; initBufferWriter(&BufferWriter, Buffer); - return lprofWriteDataImpl(&BufferWriter, DataBegin, DataEnd, CountersBegin, - CountersEnd, BitmapBegin, BitmapEnd, 0, NamesBegin, - NamesEnd, 0); + // Set virtual table arguments to NULL since they are not supported yet. 
+ return lprofWriteDataImpl( + &BufferWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, + BitmapBegin, BitmapEnd, /*VPDataReader=*/0, NamesBegin, NamesEnd, + /*VTableBegin=*/NULL, /*VTableEnd=*/NULL, /*VNamesBegin=*/NULL, + /*VNamesEnd=*/NULL, /*SkipNameDataWrite=*/0); } diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.h b/compiler-rt/lib/profile/InstrProfilingInternal.h index 03ed67fcfa766..d5bd0e41fb129 100644 --- a/compiler-rt/lib/profile/InstrProfilingInternal.h +++ b/compiler-rt/lib/profile/InstrProfilingInternal.h @@ -22,7 +22,9 @@ uint64_t __llvm_profile_get_size_for_buffer_internal( const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, - const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd); + const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd, + const VTableProfData *VTableBegin, const VTableProfData *VTableEnd, + const char *VNamesBegin, const char *VNamesEnd); /*! * \brief Write instrumentation data to the given buffer, given explicit @@ -156,7 +158,9 @@ int lprofWriteDataImpl(ProfDataWriter *Writer, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, VPDataReaderType *VPDataReader, const char *NamesBegin, - const char *NamesEnd, int SkipNameDataWrite); + const char *NamesEnd, const VTableProfData *VTableBegin, + const VTableProfData *VTableEnd, const char *VNamesBegin, + const char *VNamesEnd, int SkipNameDataWrite); /* Merge value profile data pointed to by SrcValueProfData into * in-memory profile counters pointed by to DstData. 
*/ diff --git a/compiler-rt/lib/profile/InstrProfilingMerge.c b/compiler-rt/lib/profile/InstrProfilingMerge.c index b5850e99ee37d..c0706b73e1668 100644 --- a/compiler-rt/lib/profile/InstrProfilingMerge.c +++ b/compiler-rt/lib/profile/InstrProfilingMerge.c @@ -107,6 +107,26 @@ static uintptr_t signextIfWin64(void *V) { #endif } +// Skip names section, vtable profile data section and vtable names section +// for runtime profile merge. To merge runtime addresses from multiple +// profiles collected from the same instrumented binary, the binary should be +// loaded at fixed base address (e.g., build with -no-pie, or run with ASLR +// disabled). In this set-up these three sections remain unchanged. +static uint64_t +getDistanceFromCounterToValueProf(const __llvm_profile_header *const Header) { + const uint64_t VTableSectionSize = + Header->NumVTables * sizeof(VTableProfData); + const uint64_t PaddingBytesAfterVTableSection = + __llvm_profile_get_num_padding_bytes(VTableSectionSize); + const uint64_t VNamesSize = Header->VNamesSize; + const uint64_t PaddingBytesAfterVNamesSize = + __llvm_profile_get_num_padding_bytes(VNamesSize); + return Header->NamesSize + + __llvm_profile_get_num_padding_bytes(Header->NamesSize) + + VTableSectionSize + PaddingBytesAfterVTableSection + VNamesSize + + PaddingBytesAfterVNamesSize; +} + COMPILER_RT_VISIBILITY int __llvm_profile_merge_from_buffer(const char *ProfileData, uint64_t ProfileSize) { @@ -137,8 +157,7 @@ int __llvm_profile_merge_from_buffer(const char *ProfileData, SrcBitmapStart = SrcCountersEnd; SrcNameStart = SrcBitmapStart + Header->NumBitmapBytes; SrcValueProfDataStart = - SrcNameStart + Header->NamesSize + - __llvm_profile_get_num_padding_bytes(Header->NamesSize); + SrcNameStart + getDistanceFromCounterToValueProf(Header); if (SrcNameStart < SrcCountersStart || SrcNameStart < SrcBitmapStart) return 1; diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c 
index 19266ab6c6fb8..d2554a2702aaf 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c @@ -24,8 +24,12 @@ #define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON) #define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON) #define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON) +#define PROF_VNAME_START INSTR_PROF_SECT_START(INSTR_PROF_VNAME_COMMON) +#define PROF_VNAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VNAME_COMMON) #define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_COMMON) #define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON) +#define PROF_VTABLE_START INSTR_PROF_SECT_START(INSTR_PROF_VTAB_COMMON) +#define PROF_VTABLE_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VTAB_COMMON) #define PROF_BITS_START INSTR_PROF_SECT_START(INSTR_PROF_BITS_COMMON) #define PROF_BITS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_BITS_COMMON) #define PROF_ORDERFILE_START INSTR_PROF_SECT_START(INSTR_PROF_ORDERFILE_COMMON) @@ -41,6 +45,10 @@ extern __llvm_profile_data PROF_DATA_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern VTableProfData PROF_VTABLE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern VTableProfData PROF_VTABLE_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern char PROF_VNAME_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern char PROF_VNAME_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_BITS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_BITS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern uint32_t PROF_ORDERFILE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; @@ -63,6 +71,18 @@ COMPILER_RT_VISIBILITY const char *__llvm_profile_begin_names(void) { COMPILER_RT_VISIBILITY const char *__llvm_profile_end_names(void) { return &PROF_NAME_STOP; } +COMPILER_RT_VISIBILITY 
const char *__llvm_profile_begin_vtabnames(void) { + return &PROF_VNAME_START; +} +COMPILER_RT_VISIBILITY const char *__llvm_profile_end_vtabnames(void) { + return &PROF_VNAME_STOP; +} +COMPILER_RT_VISIBILITY VTableProfData *__llvm_profile_begin_vtables(void) { + return &PROF_VTABLE_START; +} +COMPILER_RT_VISIBILITY VTableProfData *__llvm_profile_end_vtables(void) { + return &PROF_VTABLE_STOP; +} COMPILER_RT_VISIBILITY char *__llvm_profile_begin_counters(void) { return &PROF_CNTS_START; } diff --git a/compiler-rt/lib/profile/InstrProfilingWriter.c b/compiler-rt/lib/profile/InstrProfilingWriter.c index 4d767d1385148..8816a71155511 100644 --- a/compiler-rt/lib/profile/InstrProfilingWriter.c +++ b/compiler-rt/lib/profile/InstrProfilingWriter.c @@ -250,9 +250,14 @@ COMPILER_RT_VISIBILITY int lprofWriteData(ProfDataWriter *Writer, const char *BitmapEnd = __llvm_profile_end_bitmap(); const char *NamesBegin = __llvm_profile_begin_names(); const char *NamesEnd = __llvm_profile_end_names(); + const VTableProfData *VTableBegin = __llvm_profile_begin_vtables(); + const VTableProfData *VTableEnd = __llvm_profile_end_vtables(); + const char *VNamesBegin = __llvm_profile_begin_vtabnames(); + const char *VNamesEnd = __llvm_profile_end_vtabnames(); return lprofWriteDataImpl(Writer, DataBegin, DataEnd, CountersBegin, CountersEnd, BitmapBegin, BitmapEnd, VPDataReader, - NamesBegin, NamesEnd, SkipNameDataWrite); + NamesBegin, NamesEnd, VTableBegin, VTableEnd, + VNamesBegin, VNamesEnd, SkipNameDataWrite); } COMPILER_RT_VISIBILITY int @@ -261,7 +266,9 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, VPDataReaderType *VPDataReader, const char *NamesBegin, - const char *NamesEnd, int SkipNameDataWrite) { + const char *NamesEnd, const VTableProfData *VTableBegin, + const VTableProfData *VTableEnd, const char *VNamesBegin, + const char *VNamesEnd, int 
SkipNameDataWrite) { /* Calculate size of sections. */ const uint64_t DataSectionSize = __llvm_profile_get_data_size(DataBegin, DataEnd); @@ -273,6 +280,12 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, const uint64_t NumBitmapBytes = __llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd); const uint64_t NamesSize = __llvm_profile_get_name_size(NamesBegin, NamesEnd); + const uint64_t NumVTables = + __llvm_profile_get_num_vtable(VTableBegin, VTableEnd); + const uint64_t VTableSectionSize = + __llvm_profile_get_vtable_section_size(VTableBegin, VTableEnd); + const uint64_t VNamesSize = + __llvm_profile_get_name_size(VNamesBegin, VNamesEnd); /* Create the header. */ __llvm_profile_header Header; @@ -280,11 +293,15 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, /* Determine how much padding is needed before/after the counters and after * the names. */ uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters, - PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes; - __llvm_profile_get_padding_sizes_for_counters( - DataSectionSize, CountersSectionSize, NumBitmapBytes, NamesSize, - &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, - &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames); + PaddingBytesAfterBitmapBytes, PaddingBytesAfterNames, + PaddingBytesAfterVTable, PaddingBytesAfterVNames; + if (__llvm_profile_get_padding_sizes_for_counters( + DataSectionSize, CountersSectionSize, NumBitmapBytes, NamesSize, + VTableSectionSize, VNamesSize, &PaddingBytesBeforeCounters, + &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes, + &PaddingBytesAfterNames, &PaddingBytesAfterVTable, + &PaddingBytesAfterVNames) == -1) + return -1; { /* Initialize header structure. 
*/ @@ -323,7 +340,11 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, {BitmapBegin, sizeof(uint8_t), NumBitmapBytes, 0}, {NULL, sizeof(uint8_t), PaddingBytesAfterBitmapBytes, 1}, {SkipNameDataWrite ? NULL : NamesBegin, sizeof(uint8_t), NamesSize, 0}, - {NULL, sizeof(uint8_t), PaddingBytesAfterNames, 1}}; + {NULL, sizeof(uint8_t), PaddingBytesAfterNames, 1}, + {VTableBegin, sizeof(uint8_t), VTableSectionSize, 0}, + {NULL, sizeof(uint8_t), PaddingBytesAfterVTable, 1}, + {SkipNameDataWrite ? NULL : VNamesBegin, sizeof(uint8_t), VNamesSize, 0}, + {NULL, sizeof(uint8_t), PaddingBytesAfterVNames, 1}}; if (Writer->Write(Writer, IOVecData, sizeof(IOVecData) / sizeof(*IOVecData))) return -1; diff --git a/compiler-rt/test/profile/instrprof-write-buffer-internal.c b/compiler-rt/test/profile/instrprof-write-buffer-internal.c index d9670f739ca98..2c1c29ac0c588 100644 --- a/compiler-rt/test/profile/instrprof-write-buffer-internal.c +++ b/compiler-rt/test/profile/instrprof-write-buffer-internal.c @@ -31,7 +31,8 @@ char *__llvm_profile_end_bitmap(void); uint64_t __llvm_profile_get_size_for_buffer_internal( const void *DataBegin, const void *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, - const char *NamesBegin, const char *NamesEnd); + const char *NamesBegin, const char *NamesEnd, const void *VTableBegin, + const void *VTableEnd, const char *VNamesBegin, const char *VNamesEnd); int __llvm_profile_write_buffer_internal( char *Buffer, const void *DataBegin, const void *DataEnd, @@ -45,7 +46,8 @@ int main(int argc, const char *argv[]) { __llvm_profile_begin_data(), __llvm_profile_end_data(), __llvm_profile_begin_counters(), __llvm_profile_end_counters(), __llvm_profile_begin_bitmap(), __llvm_profile_end_bitmap(), - __llvm_profile_begin_names(), __llvm_profile_end_names()); + __llvm_profile_begin_names(), __llvm_profile_end_names(), NULL, NULL, + NULL, NULL); char *buf = malloc(bufsize); 
int ret = __llvm_profile_write_buffer_internal( diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index a928ba6961f36..25ec06a739202 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -831,6 +831,7 @@ struct InstrProfRecord { struct ValueProfData { std::vector IndirectCallSites; std::vector MemOPSizes; + std::vector VTableTargets; }; std::unique_ptr ValueData; @@ -853,6 +854,8 @@ struct InstrProfRecord { return ValueData->IndirectCallSites; case IPVK_MemOPSize: return ValueData->MemOPSizes; + case IPVK_VTableTarget: + return ValueData->VTableTargets; default: llvm_unreachable("Unknown value kind!"); } @@ -1036,7 +1039,9 @@ enum ProfVersion { Version10 = 10, // An additional field is used for bitmap bytes. Version11 = 11, - // The current version is 11. + // VTable profiling, + Version12 = 12, + // The current version is 12. CurrentVersion = INSTR_PROF_INDEX_VERSION }; const uint64_t Version = ProfVersion::CurrentVersion; @@ -1057,6 +1062,7 @@ struct Header { uint64_t MemProfOffset; uint64_t BinaryIdOffset; uint64_t TemporalProfTracesOffset; + uint64_t VTableNamesOffset; // New fields should only be added at the end to ensure that the size // computation is correct. The methods below need to be updated to ensure that // the new field is read correctly. @@ -1193,8 +1199,13 @@ template <> inline uint64_t getMagic() { // It should also match the synthesized type in // Transforms/Instrumentation/InstrProfiling.cpp:getOrCreateRegionCounters. 
template struct alignas(8) ProfileData { - #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Type Name; - #include "llvm/ProfileData/InstrProfData.inc" +#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Type Name; +#include "llvm/ProfileData/InstrProfData.inc" +}; + +template struct alignas(8) VTableProfileData { +#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Init) Type Name; +#include "llvm/ProfileData/InstrProfData.inc" }; // File header structure of the LLVM profile data in raw format. diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index c907a9736f316..1f77853bb8baa 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -96,6 +96,25 @@ INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \ #undef INSTR_PROF_DATA /* INSTR_PROF_DATA end. */ +/* For a virtual table object, record the name hash to associate profiled + * addresses with global variables, and record {starting address, size in bytes} + * to map the profiled virtual table (which usually have an offset from the + * starting address) back to a virtual table object. */ +#ifndef INSTR_PROF_VTABLE_DATA +#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) +#else +#define INSTR_PROF_VTABLE_DATA_DEFINED +#endif +INSTR_PROF_VTABLE_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), \ + VTableNameHash, ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \ + IndexedInstrProf::ComputeHash(PGOVTableName))) +INSTR_PROF_VTABLE_DATA(const IntPtrT, llvm::PointerType::getUnqual(Ctx), \ + VTablePointer, VTableAddr) +INSTR_PROF_VTABLE_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), VTableSize, \ + ConstantInt::get(llvm::Type::getInt32Ty(Ctx), \ + VTableSizeVal)) +#undef INSTR_PROF_VTABLE_DATA +/* INSTR_PROF_VTABLE_DATA end. */ /* This is an internal data structure used by value profiler. 
It * is defined here to allow serialization code sharing by LLVM @@ -147,6 +166,8 @@ INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, INSTR_PROF_RAW_HEADER(uint64_t, BitmapDelta, (uintptr_t)BitmapBegin - (uintptr_t)DataBegin) INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) +INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) #undef INSTR_PROF_RAW_HEADER /* INSTR_PROF_RAW_HEADER end */ @@ -188,13 +209,26 @@ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx)) VALUE_PROF_KIND(IPVK_IndirectCallTarget, 0, "indirect call target") /* For memory intrinsic functions size profiling. */ VALUE_PROF_KIND(IPVK_MemOPSize, 1, "memory intrinsic functions size") +/* For virtual table address profiling, the address point of the virtual table + * (i.e., the address contained in objects pointing to a virtual table) are + * profiled. Note this may not be the address of the per C++ class virtual table + * object (e.g., there might be an offset). + * + * The profiled addresses are stored in raw profile, together with the following + * two types of information. + * 1. The (starting and ending) addresses of per C++ class virtual table objects. + * 2. The (compressed) virtual table object names. + * RawInstrProfReader converts profiled virtual table addresses to virtual table + * objects' MD5 hash. + */ +VALUE_PROF_KIND(IPVK_VTableTarget, 2, "The profiled address point of the vtable") /* These two kinds must be the last to be * declared. This is to make sure the string * array created with the template can be * indexed with the kind value. 
*/ VALUE_PROF_KIND(IPVK_First, IPVK_IndirectCallTarget, "first") -VALUE_PROF_KIND(IPVK_Last, IPVK_MemOPSize, "last") +VALUE_PROF_KIND(IPVK_Last, IPVK_VTableTarget, "last") #undef VALUE_PROF_KIND /* VALUE_PROF_KIND end */ @@ -284,12 +318,18 @@ INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \ INSTR_PROF_SECT_ENTRY(IPSK_name, \ INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON), \ INSTR_PROF_NAME_COFF, "__DATA,") +INSTR_PROF_SECT_ENTRY(IPSK_vname, \ + INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON), \ + INSTR_PROF_VNAME_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vals, \ INSTR_PROF_QUOTE(INSTR_PROF_VALS_COMMON), \ INSTR_PROF_VALS_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vnodes, \ INSTR_PROF_QUOTE(INSTR_PROF_VNODES_COMMON), \ INSTR_PROF_VNODES_COFF, "__DATA,") +INSTR_PROF_SECT_ENTRY(IPSK_vtab, \ + INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON), \ + INSTR_PROF_VTAB_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_covmap, \ INSTR_PROF_QUOTE(INSTR_PROF_COVMAP_COMMON), \ INSTR_PROF_COVMAP_COFF, "__LLVM_COV,") @@ -668,9 +708,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129 /* Raw profile format version (start from 1). */ -#define INSTR_PROF_RAW_VERSION 9 +#define INSTR_PROF_RAW_VERSION 10 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 11 +#define INSTR_PROF_INDEX_VERSION 12 /* Coverage mapping format version (start from 0). 
*/ #define INSTR_PROF_COVMAP_VERSION 6 @@ -708,10 +748,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, than WIN32 */ #define INSTR_PROF_DATA_COMMON __llvm_prf_data #define INSTR_PROF_NAME_COMMON __llvm_prf_names +#define INSTR_PROF_VNAME_COMMON __llvm_prf_vtabnames #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts #define INSTR_PROF_BITS_COMMON __llvm_prf_bits #define INSTR_PROF_VALS_COMMON __llvm_prf_vals #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds +#define INSTR_PROF_VTAB_COMMON __llvm_prf_vtab #define INSTR_PROF_COVMAP_COMMON __llvm_covmap #define INSTR_PROF_COVFUN_COMMON __llvm_covfun #define INSTR_PROF_COVDATA_COMMON __llvm_covdata @@ -722,10 +764,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, */ #define INSTR_PROF_DATA_COFF ".lprfd$M" #define INSTR_PROF_NAME_COFF ".lprfn$M" +#define INSTR_PROF_VNAME_COFF ".lprfvn$M" #define INSTR_PROF_CNTS_COFF ".lprfc$M" #define INSTR_PROF_BITS_COFF ".lprfb$M" #define INSTR_PROF_VALS_COFF ".lprfv$M" #define INSTR_PROF_VNODES_COFF ".lprfnd$M" +#define INSTR_PROF_VTAB_COFF ".lprfvt$M" #define INSTR_PROF_COVMAP_COFF ".lcovmap$M" #define INSTR_PROF_COVFUN_COFF ".lcovfun$M" /* Since cov data and cov names sections are not allocated, we don't need to diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h index 87f15639a2c3c..cfde5d3fc77d6 100644 --- a/llvm/include/llvm/ProfileData/InstrProfReader.h +++ b/llvm/include/llvm/ProfileData/InstrProfReader.h @@ -326,12 +326,16 @@ class RawInstrProfReader : public InstrProfReader { uint64_t NamesDelta; const RawInstrProf::ProfileData *Data; const RawInstrProf::ProfileData *DataEnd; + const RawInstrProf::VTableProfileData *VTableBegin = nullptr; + const RawInstrProf::VTableProfileData *VTableEnd = nullptr; const char *CountersStart; const char *CountersEnd; const char *BitmapStart; const char *BitmapEnd; const char *NamesStart; const char *NamesEnd; + const char *VNamesStart = nullptr; + const 
char *VNamesEnd = nullptr; // After value profile is all read, this pointer points to // the header of next profile data (if exists) const uint8_t *ValueDataStart; @@ -656,6 +660,15 @@ class IndexedInstrProfReader : public InstrProfReader { std::unique_ptr MemProfRecordTable; /// MemProf frame profile data on-disk indexed via frame id. std::unique_ptr MemProfFrameTable; + /// VTableNamePtr points to the beginning of compressed vtable names. + /// When a symtab is constructed from profiles by llvm-profdata, the list of + /// names could be decompressed based on `VTableNamePtr` and + /// `CompressedVTableNamesLen`. + /// A compiler that reads indexed profiles could construct symtab from module + /// IR so it doesn't need the decompressed names. + const char *VTableNamePtr = nullptr; + /// The length of compressed vtable names. + uint64_t CompressedVTableNamesLen = 0; /// Total size of binary ids. uint64_t BinaryIdsSize{0}; /// Start address of binary id length and data pairs. diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 2eeeff987399d..b9afee413853e 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -1533,9 +1533,12 @@ Expected
Header::readFromBuffer(const unsigned char *Buffer) { // When a new field is added in the header add a case statement here to // populate it. static_assert( - IndexedInstrProf::ProfVersion::CurrentVersion == Version11, + IndexedInstrProf::ProfVersion::CurrentVersion == Version12, "Please update the reading code below if a new field has been added, " "if not add a case statement to fall through to the latest version."); + case 12ull: + H.VTableNamesOffset = read(Buffer, offsetOf(&Header::VTableNamesOffset)); + [[fallthrough]]; case 11ull: [[fallthrough]]; case 10ull: @@ -1561,10 +1564,14 @@ size_t Header::size() const { // When a new field is added to the header add a case statement here to // compute the size as offset of the new field + size of the new field. This // relies on the field being added to the end of the list. - static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version11, + static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version12, "Please update the size computation below if a new field has " "been added to the header, if not add a case statement to " "fall through to the latest version."); + case 12ull: + return offsetOf(&Header::VTableNamesOffset) + + sizeof(Header::VTableNamesOffset); + [[fallthrough]]; case 11ull: [[fallthrough]]; case 10ull: diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 0d8d43daae960..31b742bca14d6 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -366,6 +366,11 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) { return E; Value = IndexedInstrProf::ComputeHash(VD.first); } + } else if (ValueKind == IPVK_VTableTarget) { + if (InstrProfSymtab::isExternalSymbol(VD.first)) + Value = 0; + else + Value = IndexedInstrProf::ComputeHash(VD.first); } else { READ_NUM(VD.first, Value); } @@ -582,10 +587,17 @@ Error RawInstrProfReader::readHeader( auto NumBitmapBytes = 
swap(Header.NumBitmapBytes); auto PaddingBytesAfterBitmapBytes = swap(Header.PaddingBytesAfterBitmapBytes); auto NamesSize = swap(Header.NamesSize); + auto VTableNameSize = swap(Header.VNamesSize); + auto NumVTables = swap(Header.NumVTables); ValueKindLast = swap(Header.ValueKindLast); auto DataSize = NumData * sizeof(RawInstrProf::ProfileData); - auto PaddingSize = getNumPaddingBytes(NamesSize); + auto PaddingBytesAfterNames = getNumPaddingBytes(NamesSize); + auto PaddingBytesAfterVTableNames = getNumPaddingBytes(VTableNameSize); + + auto VTableSectionSize = + NumVTables * sizeof(RawInstrProf::VTableProfileData); + auto PaddingBytesAfterVTableProfData = getNumPaddingBytes(VTableSectionSize); // Profile data starts after profile header and binary ids if exist. ptrdiff_t DataOffset = sizeof(RawInstrProf::Header) + BinaryIdSize; @@ -594,7 +606,12 @@ Error RawInstrProfReader::readHeader( CountersOffset + CountersSize + PaddingBytesAfterCounters; ptrdiff_t NamesOffset = BitmapOffset + NumBitmapBytes + PaddingBytesAfterBitmapBytes; - ptrdiff_t ValueDataOffset = NamesOffset + NamesSize + PaddingSize; + ptrdiff_t VTableProfDataOffset = + NamesOffset + NamesSize + PaddingBytesAfterNames; + ptrdiff_t VTableNameOffset = VTableProfDataOffset + VTableSectionSize + + PaddingBytesAfterVTableProfData; + ptrdiff_t ValueDataOffset = + VTableNameOffset + VTableNameSize + PaddingBytesAfterVTableNames; auto *Start = reinterpret_cast(&Header); if (Start + ValueDataOffset > DataBuffer->getBufferEnd()) @@ -614,8 +631,14 @@ Error RawInstrProfReader::readHeader( Data = reinterpret_cast *>( Start + DataOffset); DataEnd = Data + NumData; + VTableBegin = + reinterpret_cast *>( + Start + VTableProfDataOffset); + VTableEnd = VTableBegin + NumVTables; NamesStart = Start + NamesOffset; NamesEnd = NamesStart + NamesSize; + VNamesStart = Start + VTableNameOffset; + VNamesEnd = VNamesStart + VTableNameSize; } CountersStart = Start + CountersOffset; @@ -1260,6 +1283,23 @@ Error 
IndexedInstrProfReader::readHeader() { "corrupted binary ids"); } + if (GET_VERSION(Header->formatVersion()) >= 12) { + uint64_t VTableNamesOffset = + endian::byte_swap( + Header->VTableNamesOffset); + const unsigned char *Ptr = Start + VTableNamesOffset; + + CompressedVTableNamesLen = + support::endian::readNext(Ptr); + + // Writer first writes the length of compressed string, and then the actual + // content. + VTableNamePtr = (const char *)Ptr; + if (VTableNamePtr > (const char *)DataBuffer->getBufferEnd()) + return make_error(instrprof_error::truncated); + } + if (GET_VERSION(Header->formatVersion()) >= 10 && Header->formatVersion() & VARIANT_MASK_TEMPORAL_PROF) { uint64_t TemporalProfTracesOffset = diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index d65f8fe50313d..e5163ebe8ae37 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -455,12 +455,12 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { Header.MemProfOffset = 0; Header.BinaryIdOffset = 0; Header.TemporalProfTracesOffset = 0; + Header.VTableNamesOffset = 0; int N = sizeof(IndexedInstrProf::Header) / sizeof(uint64_t); - // Only write out all the fields except 'HashOffset', 'MemProfOffset', - // 'BinaryIdOffset' and `TemporalProfTracesOffset`. We need to remember the - // offset of these fields to allow back patching later. - for (int I = 0; I < N - 4; I++) + // Only write out the first four fields. We need to remember the offset of the + // remaining fields to allow back patching later. + for (int I = 0; I < 4; I++) OS.write(reinterpret_cast(&Header)[I]); // Save the location of Header.HashOffset field in \c OS. @@ -484,6 +484,9 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { uint64_t TemporalProfTracesOffset = OS.tell(); OS.write(0); + uint64_t VTableNamesOffset = OS.tell(); + OS.write(0); + // Reserve space to write profile summary data. 
uint32_t NumEntries = ProfileSummaryBuilder::DefaultCutoffs.size(); uint32_t SummarySize = Summary::getSize(Summary::NumKinds, NumEntries); @@ -604,6 +607,31 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { OS.writeByte(0); } + uint64_t VTableNamesSectionStart = OS.tell(); + + // Use a dummy (and uncompressed) string as compressed vtable names and get + // the necessary profile format change in place for version 12. + // TODO: Store the list of vtable names in InstrProfWriter and use the + // real compressed name. + std::string CompressedVTableNames = "VTableNames"; + + uint64_t CompressedStringLen = CompressedVTableNames.length(); + + // Record the length of compressed string. + OS.write(CompressedStringLen); + + // Write the chars in compressed strings. + for (auto &c : CompressedVTableNames) + OS.writeByte(static_cast(c)); + + // Pad up to a multiple of 8. + // InstrProfReader would read bytes according to 'CompressedStringLen'. + uint64_t PaddedLength = alignTo(CompressedStringLen, 8); + + for (uint64_t K = CompressedStringLen; K < PaddedLength; K++) { + OS.writeByte(0); + } + uint64_t TemporalProfTracesSectionStart = 0; if (static_cast(ProfileKind & InstrProfKind::TemporalProfile)) { TemporalProfTracesSectionStart = OS.tell(); @@ -647,6 +675,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { // Patch the Header.TemporalProfTracesOffset (=0 for profiles without // traces). {TemporalProfTracesOffset, &TemporalProfTracesSectionStart, 1}, + {VTableNamesOffset, &VTableNamesSectionStart, 1}, // Patch the summary data. 
{SummaryOffset, reinterpret_cast(TheSummary.get()), (int)(SummarySize / sizeof(uint64_t))}, @@ -699,7 +728,8 @@ Error InstrProfWriter::validateRecord(const InstrProfRecord &Func) { std::unique_ptr VD = Func.getValueForSite(VK, S); DenseSet SeenValues; for (uint32_t I = 0; I < ND; I++) - if ((VK != IPVK_IndirectCallTarget) && !SeenValues.insert(VD[I].Value).second) + if ((VK != IPVK_IndirectCallTarget && VK != IPVK_VTableTarget) && + !SeenValues.insert(VD[I].Value).second) return make_error(instrprof_error::invalid_prof); } } @@ -747,7 +777,7 @@ void InstrProfWriter::writeRecordInText(StringRef Name, uint64_t Hash, OS << ND << "\n"; std::unique_ptr VD = Func.getValueForSite(VK, S); for (uint32_t I = 0; I < ND; I++) { - if (VK == IPVK_IndirectCallTarget) + if (VK == IPVK_IndirectCallTarget || VK == IPVK_VTableTarget) OS << Symtab.getFuncOrVarNameIfDefined(VD[I].Value) << ":" << VD[I].Count << "\n"; else diff --git a/llvm/test/Instrumentation/InstrProfiling/coverage.ll b/llvm/test/Instrumentation/InstrProfiling/coverage.ll index bbf895ea4b34e..08cbcaa962b76 100644 --- a/llvm/test/Instrumentation/InstrProfiling/coverage.ll +++ b/llvm/test/Instrumentation/InstrProfiling/coverage.ll @@ -5,12 +5,12 @@ target triple = "aarch64-unknown-linux-gnu" @__profn_foo = private constant [3 x i8] c"foo" ; CHECK: @__profc_foo = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1 -; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64) -; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64), +; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64) +; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { 
i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64), @__profn_bar = private constant [3 x i8] c"bar" ; CHECK: @__profc_bar = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1 -; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64) -; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64), +; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64) +; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64), ; CHECK: @__llvm_prf_nm = {{.*}} section "__llvm_prf_names" ; BINARY: @__llvm_prf_nm ={{.*}} section "__llvm_covnames" diff --git a/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw b/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw index 5efda10bb98a941c04b6846db05d3691bc36aac0..3daa98f937b691880ffff203c9426bfacddf749d 100644 GIT binary patch delta 133 zcmbQhvVeuNu_!ISs37M**F;W##f(djpGdFz|9^9xwDglu1`NP7F;ks2U=>hu;#6za s1Tf>OHE#ik0aQLiPe%I5WLZXI)&n4s$)Sw16~Kysa*R;Jz`Bw604`-Eq5uE@ delta 117 zcmZ3$GJ%D&u_!ISs37M*=R{6_L67IVA1SZ;|9^9yv+SKv1_s87mFlblGl86mORZTI rz>KHXyapf!P9f diff --git a/llvm/test/Transforms/PGOProfile/comdat_internal.ll b/llvm/test/Transforms/PGOProfile/comdat_internal.ll index 8c6942c0f527b..1bad0db1b4762 100644 --- a/llvm/test/Transforms/PGOProfile/comdat_internal.ll +++ b/llvm/test/Transforms/PGOProfile/comdat_internal.ll @@ -13,9 +13,9 @@ $foo = comdat any ; CHECK: @__llvm_profile_raw_version = hidden constant i64 {{[0-9]+}}, comdat ; CHECK-NOT: __profn__stdin__foo ; CHECK: @__profc__stdin__foo.[[#FOO_HASH]] = private global [1 x i64] 
zeroinitializer, section "__llvm_prf_cnts", comdat, align 8 -; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null +; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null ; CHECK-NOT: @foo -; CHECK-SAME: , ptr null, i32 1, [2 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8 +; CHECK-SAME: , ptr null, i32 1, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8 ; CHECK: @__llvm_prf_nm ; CHECK: @llvm.compiler.used diff --git a/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw b/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw index 9cd225587c92511e99f3497ce1d5f47c6fc5f0af..a3e884343942ebc70ba95ab4ee006630b6816d80 100644 GIT binary patch delta 40 ycmV+@0N4NE5AY8OfpTVVa&T<_3Xus<4&W)M0UE0R|DByz{^eDZP6HaTaBv4~Q4!$) delta 39 vcmeys|A3#fu_!ISs37M*=R{6_K?|$bHJ=*(|Lz`(e%(w!XuMI#TR diff --git a/llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw b/llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw new file mode 100644 index 0000000000000000000000000000000000000000..84707ba2070a92b8683010d9daaef747df35f9ac GIT binary patch literal 528 zcmZoHO3N=Q$obF700xW@ih+Rz#(>i3d^BkWXQ;q~{}8~jee0hktN#DrJkOIkI+TF{ zX0YI^%?f`vOg;fr_5L!KFBeQb%shva5cM!VOdpINJ<~YH=c-N(O#cd~eK7d|0{XA2 zYFH&6%DWHJCbaDydjXpM1gQQWo?dWwGr?0yS0{Tm3_5AzQ$ z+Q7KtR(HRVzu%dYp1!6!$!AXbT=MqY*4O{3u}gA_;W2kfsb$ZfsH;9ZvV7_@)#;23 
r{WSu+S$HaLo%TI*hM9pynsFJ}wH81UW(Uaqj8G0Nd|-00@P_dLvBrhT literal 0 HcmV?d00001 diff --git a/llvm/test/tools/llvm-profdata/binary-ids-padding.test b/llvm/test/tools/llvm-profdata/binary-ids-padding.test index eda63203a304a..61881b69cfd5c 100644 --- a/llvm/test/tools/llvm-profdata/binary-ids-padding.test +++ b/llvm/test/tools/llvm-profdata/binary-ids-padding.test @@ -10,10 +10,12 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) +// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw // There will be 2 20-byte binary IDs, so the total Binary IDs size will be 64 bytes. // 2 * 8 binary ID sizes // + 2 * 20 binary IDs (of size 20) @@ -32,6 +34,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Binary IDs - There are only two in this case that are 20 bytes. 
RUN: printf '\24\0\0\0\0\0\0\0' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/large-binary-id-size.test b/llvm/test/tools/llvm-profdata/large-binary-id-size.test index 38b838e0d100a..316a9a4c9df4c 100644 --- a/llvm/test/tools/llvm-profdata/large-binary-id-size.test +++ b/llvm/test/tools/llvm-profdata/large-binary-id-size.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\40\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -12,6 +12,8 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Check for a corrupted size being too large past the end of the file. RUN: printf '\7\7\7\7\7\7\7\7' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test index c967e850dbe35..8b686d5c50cb7 100644 --- a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test +++ b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test @@ -10,10 +10,12 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) +// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> 
%t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -26,6 +28,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Data Section // diff --git a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test index 2e747f81a6bfa..089afad420622 100644 --- a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test +++ b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test @@ -10,10 +10,12 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) +// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -26,6 +28,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Data Section // diff --git a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test index 3c23bc7dd0f7f..e404ba4210cc1 100644 --- a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test +++ 
b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test @@ -10,10 +10,12 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) +// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -26,6 +28,8 @@ RUN: printf '\0\0\6\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\6\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Data Section // diff --git a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test index 4a5c42843ff4d..ee54bfb978567 100644 --- a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test +++ b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw // We should fail on this because the binary IDs is not a multiple of 8 bytes. 
RUN: printf '\77\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -10,6 +10,8 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Binary IDs - There are only two in this case that are 20 bytes. RUN: printf '\24\0\0\0\0\0\0\0' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test index 2a92575ee3407..dfa163f1f3439 100644 --- a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test +++ b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test @@ -15,6 +15,8 @@ RUN: printf '\0\0\0\0\0\0\0\20' >> %t RUN: printf '\0\0\0\1\0\4\0\0' >> %t RUN: printf '\0\0\0\2\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: not llvm-profdata show %t -o /dev/null 2>&1 | FileCheck %s diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test index 8220361df6cfa..63782c8b94d4a 100644 --- a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test +++ b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test @@ -1,5 +1,6 @@ +// Header RUN: printf '\377lprofR\201' > %t -RUN: printf '\0\0\0\0\0\0\0\11' >> %t +RUN: printf '\0\0\0\0\0\0\0\12' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,6 +13,8 @@ RUN: printf '\0\0\0\0\1\0\0\0' >> %t RUN: printf '\0\0\0\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\2\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\134\370\302\114\333\030\275\254' >> %t RUN: printf '\0\0\0\0\0\0\0\1' >> %t @@ -20,9 +23,8 @@ 
RUN: printf '\3\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\3' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\344\023\165\112\031\035\265\067' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t @@ -31,9 +33,8 @@ RUN: printf '\2\xff\xff\xd3' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\2' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\023' >> %t RUN: printf '\0\0\0\0\0\0\0\067' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test index 9352ae132380d..e9569bec1178b 100644 --- a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test +++ b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test @@ -1,5 +1,5 @@ RUN: printf '\201Rforpl\377' > %t -RUN: printf '\11\0\0\0\0\0\0\0' >> %t +RUN: printf '\12\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\2\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,6 +12,8 @@ RUN: printf '\0\0\0\1\0\0\0\0' >> %t RUN: printf '\0\0\0\3\0\0\0\0' >> %t RUN: printf '\0\0\0\2\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\254\275\030\333\114\302\370\134' >> %t RUN: printf '\1\0\0\0\0\0\0\0' >> %t @@ -20,9 +22,8 @@ RUN: printf '\0\0\0\3' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\3\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\067\265\035\031\112\165\023\344' >> %t RUN: printf '\02\0\0\0\0\0\0\0' >> %t @@ -31,9 +32,8 @@ RUN: printf '\xd3\xff\xff\2' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf 
'\2\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\023\0\0\0\0\0\0\0' >> %t RUN: printf '\067\0\0\0\0\0\0\0' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test index c3e995add6ff2..0bc579eec58ab 100644 --- a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test +++ b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test @@ -1,5 +1,5 @@ RUN: printf '\377lprofr\201' > %t -RUN: printf '\0\0\0\0\0\0\0\11' >> %t +RUN: printf '\0\0\0\0\0\0\0\12' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,6 +12,8 @@ RUN: printf '\0\0\0\1\0\4\0\0' >> %t RUN: printf '\0\0\0\3\0\4\0\0' >> %t RUN: printf '\0\0\0\2\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\134\370\302\114\333\030\275\254' >> %t RUN: printf '\0\0\0\0\0\0\0\1' >> %t @@ -20,9 +22,8 @@ RUN: printf '\0\0\0\3\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\3' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\344\023\165\112\031\035\265\067' >> %t RUN: printf '\0\0\0\0\0\0\0\02' >> %t @@ -31,9 +32,8 @@ RUN: printf '\0\0\0\3\0\3\xff\xc3' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\02' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\023' >> %t RUN: printf '\0\0\0\0\0\0\0\067' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test index 0b3ef2a89abe5..ca9ea54c3f014 100644 --- 
a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test +++ b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t -RUN: printf '\11\0\0\0\0\0\0\0' >> %t +RUN: printf '\12\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\2\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,6 +12,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t RUN: printf '\0\0\4\0\3\0\0\0' >> %t RUN: printf '\0\0\4\0\2\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\254\275\030\333\114\302\370\134' >> %t RUN: printf '\1\0\0\0\0\0\0\0' >> %t @@ -20,9 +22,8 @@ RUN: printf '\0\0\4\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\3\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\067\265\035\031\112\165\023\344' >> %t RUN: printf '\02\0\0\0\0\0\0\0' >> %t @@ -31,9 +32,8 @@ RUN: printf '\xc3\xff\3\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\02\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\023\0\0\0\0\0\0\0' >> %t RUN: printf '\067\0\0\0\0\0\0\0' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-two-profiles.test b/llvm/test/tools/llvm-profdata/raw-two-profiles.test index f4a9aa8e1bbc3..70a4210dea9f8 100644 --- a/llvm/test/tools/llvm-profdata/raw-two-profiles.test +++ b/llvm/test/tools/llvm-profdata/raw-two-profiles.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t-foo.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t-foo.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> 
%t-foo.profraw @@ -12,6 +12,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\254\275\030\333\114\302\370\134' >> %t-foo.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw @@ -26,7 +28,7 @@ RUN: printf '\023\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\3\0foo\0\0\0' >> %t-foo.profraw RUN: printf '\201rforpl\377' > %t-bar.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t-bar.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw @@ -39,6 +41,8 @@ RUN: printf '\0\0\6\0\1\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\6\0\2\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\067\265\035\031\112\165\023\344' >> %t-bar.profraw RUN: printf '\02\0\0\0\0\0\0\0' >> %t-bar.profraw From 4d73cbe863886add6742a8ebd00d19c1cab11095 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Wed, 21 Feb 2024 21:10:47 -0800 Subject: [PATCH 002/546] [nfc]remove unused variable after pr/81691 (#82578) * `N` became unused after [pull request 81691](https://github.com/llvm/llvm-project/pull/81691) * This should fix the build bot failure of `unused variable` https://lab.llvm.org/buildbot/#/builders/77/builds/34840 --- llvm/lib/ProfileData/InstrProfWriter.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index e5163ebe8ae37..3e0a0e0d70116 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ 
b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -456,7 +456,6 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { Header.BinaryIdOffset = 0; Header.TemporalProfTracesOffset = 0; Header.VTableNamesOffset = 0; - int N = sizeof(IndexedInstrProf::Header) / sizeof(uint64_t); // Only write out the first four fields. We need to remember the offset of the // remaining fields to allow back patching later. From 0e8d1877cd145719b7acb707539287b7b877a555 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Wed, 21 Feb 2024 21:41:33 -0800 Subject: [PATCH 003/546] Revert type profiling change as compiler-rt test break on Windows. (#82583) Examples https://lab.llvm.org/buildbot/#/builders/127/builds/62532/steps/8/logs/stdio --- compiler-rt/include/profile/InstrProfData.inc | 50 +-------- compiler-rt/lib/profile/InstrProfiling.h | 35 ++----- .../lib/profile/InstrProfilingBuffer.c | 96 +++--------------- .../lib/profile/InstrProfilingInternal.h | 8 +- compiler-rt/lib/profile/InstrProfilingMerge.c | 23 +---- .../lib/profile/InstrProfilingPlatformLinux.c | 20 ---- .../lib/profile/InstrProfilingWriter.c | 37 ++----- .../profile/instrprof-write-buffer-internal.c | 6 +- llvm/include/llvm/ProfileData/InstrProf.h | 17 +--- .../llvm/ProfileData/InstrProfData.inc | 50 +-------- .../llvm/ProfileData/InstrProfReader.h | 13 --- llvm/lib/ProfileData/InstrProf.cpp | 11 +- llvm/lib/ProfileData/InstrProfReader.cpp | 44 +------- llvm/lib/ProfileData/InstrProfWriter.cpp | 43 ++------ .../InstrProfiling/coverage.ll | 8 +- .../thinlto_indirect_call_promotion.profraw | Bin 544 -> 528 bytes .../Transforms/PGOProfile/comdat_internal.ll | 4 +- .../llvm-profdata/Inputs/c-general.profraw | Bin 2032 -> 2016 bytes .../llvm-profdata/Inputs/compressed.profraw | Bin 1984 -> 1968 bytes .../thinlto_indirect_call_promotion.profraw | Bin 528 -> 0 bytes .../llvm-profdata/binary-ids-padding.test | 6 +- .../llvm-profdata/large-binary-id-size.test | 4 +- ...alformed-not-space-for-another-header.test | 6 +- 
.../malformed-num-counters-zero.test | 6 +- .../malformed-ptr-to-counter-array.test | 6 +- .../misaligned-binary-ids-size.test | 4 +- .../mismatched-raw-profile-header.test | 2 - .../tools/llvm-profdata/raw-32-bits-be.test | 11 +- .../tools/llvm-profdata/raw-32-bits-le.test | 10 +- .../tools/llvm-profdata/raw-64-bits-be.test | 10 +- .../tools/llvm-profdata/raw-64-bits-le.test | 10 +- .../tools/llvm-profdata/raw-two-profiles.test | 8 +- 32 files changed, 90 insertions(+), 458 deletions(-) delete mode 100644 llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index 1f77853bb8baa..c907a9736f316 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -96,25 +96,6 @@ INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \ #undef INSTR_PROF_DATA /* INSTR_PROF_DATA end. */ -/* For a virtual table object, record the name hash to associate profiled - * addresses with global variables, and record {starting address, size in bytes} - * to map the profiled virtual table (which usually have an offset from the - * starting address) back to a virtual table object. */ -#ifndef INSTR_PROF_VTABLE_DATA -#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) -#else -#define INSTR_PROF_VTABLE_DATA_DEFINED -#endif -INSTR_PROF_VTABLE_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), \ - VTableNameHash, ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \ - IndexedInstrProf::ComputeHash(PGOVTableName))) -INSTR_PROF_VTABLE_DATA(const IntPtrT, llvm::PointerType::getUnqual(Ctx), \ - VTablePointer, VTableAddr) -INSTR_PROF_VTABLE_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), VTableSize, \ - ConstantInt::get(llvm::Type::getInt32Ty(Ctx), \ - VTableSizeVal)) -#undef INSTR_PROF_VTABLE_DATA -/* INSTR_PROF_VTABLE_DATA end. 
*/ /* This is an internal data structure used by value profiler. It * is defined here to allow serialization code sharing by LLVM @@ -166,8 +147,6 @@ INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, INSTR_PROF_RAW_HEADER(uint64_t, BitmapDelta, (uintptr_t)BitmapBegin - (uintptr_t)DataBegin) INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) -INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) -INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) #undef INSTR_PROF_RAW_HEADER /* INSTR_PROF_RAW_HEADER end */ @@ -209,26 +188,13 @@ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx)) VALUE_PROF_KIND(IPVK_IndirectCallTarget, 0, "indirect call target") /* For memory intrinsic functions size profiling. */ VALUE_PROF_KIND(IPVK_MemOPSize, 1, "memory intrinsic functions size") -/* For virtual table address profiling, the address point of the virtual table - * (i.e., the address contained in objects pointing to a virtual table) are - * profiled. Note this may not be the address of the per C++ class virtual table - * object (e.g., there might be an offset). - * - * The profiled addresses are stored in raw profile, together with the following - * two types of information. - * 1. The (starting and ending) addresses of per C++ class virtual table objects. - * 2. The (compressed) virtual table object names. - * RawInstrProfReader converts profiled virtual table addresses to virtual table - * objects' MD5 hash. - */ -VALUE_PROF_KIND(IPVK_VTableTarget, 2, "The profiled address point of the vtable") /* These two kinds must be the last to be * declared. This is to make sure the string * array created with the template can be * indexed with the kind value. 
*/ VALUE_PROF_KIND(IPVK_First, IPVK_IndirectCallTarget, "first") -VALUE_PROF_KIND(IPVK_Last, IPVK_VTableTarget, "last") +VALUE_PROF_KIND(IPVK_Last, IPVK_MemOPSize, "last") #undef VALUE_PROF_KIND /* VALUE_PROF_KIND end */ @@ -318,18 +284,12 @@ INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \ INSTR_PROF_SECT_ENTRY(IPSK_name, \ INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON), \ INSTR_PROF_NAME_COFF, "__DATA,") -INSTR_PROF_SECT_ENTRY(IPSK_vname, \ - INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON), \ - INSTR_PROF_VNAME_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vals, \ INSTR_PROF_QUOTE(INSTR_PROF_VALS_COMMON), \ INSTR_PROF_VALS_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vnodes, \ INSTR_PROF_QUOTE(INSTR_PROF_VNODES_COMMON), \ INSTR_PROF_VNODES_COFF, "__DATA,") -INSTR_PROF_SECT_ENTRY(IPSK_vtab, \ - INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON), \ - INSTR_PROF_VTAB_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_covmap, \ INSTR_PROF_QUOTE(INSTR_PROF_COVMAP_COMMON), \ INSTR_PROF_COVMAP_COFF, "__LLVM_COV,") @@ -708,9 +668,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129 /* Raw profile format version (start from 1). */ -#define INSTR_PROF_RAW_VERSION 10 +#define INSTR_PROF_RAW_VERSION 9 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 12 +#define INSTR_PROF_INDEX_VERSION 11 /* Coverage mapping format version (start from 0). 
*/ #define INSTR_PROF_COVMAP_VERSION 6 @@ -748,12 +708,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, than WIN32 */ #define INSTR_PROF_DATA_COMMON __llvm_prf_data #define INSTR_PROF_NAME_COMMON __llvm_prf_names -#define INSTR_PROF_VNAME_COMMON __llvm_prf_vtabnames #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts #define INSTR_PROF_BITS_COMMON __llvm_prf_bits #define INSTR_PROF_VALS_COMMON __llvm_prf_vals #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds -#define INSTR_PROF_VTAB_COMMON __llvm_prf_vtab #define INSTR_PROF_COVMAP_COMMON __llvm_covmap #define INSTR_PROF_COVFUN_COMMON __llvm_covfun #define INSTR_PROF_COVDATA_COMMON __llvm_covdata @@ -764,12 +722,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, */ #define INSTR_PROF_DATA_COFF ".lprfd$M" #define INSTR_PROF_NAME_COFF ".lprfn$M" -#define INSTR_PROF_VNAME_COFF ".lprfvn$M" #define INSTR_PROF_CNTS_COFF ".lprfc$M" #define INSTR_PROF_BITS_COFF ".lprfb$M" #define INSTR_PROF_VALS_COFF ".lprfv$M" #define INSTR_PROF_VNODES_COFF ".lprfnd$M" -#define INSTR_PROF_VTAB_COFF ".lprfvt$M" #define INSTR_PROF_COVMAP_COFF ".lcovmap$M" #define INSTR_PROF_COVFUN_COFF ".lcovfun$M" /* Since cov data and cov names sections are not allocated, we don't need to diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index be694a8d3330b..0123908336918 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -49,12 +49,6 @@ typedef struct ValueProfNode { #include "profile/InstrProfData.inc" } ValueProfNode; -typedef void *IntPtrT; -typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) VTableProfData { -#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) Type Name; -#include "profile/InstrProfData.inc" -} VTableProfData; - /*! * \brief Return 1 if profile counters are continuously synced to the raw * profile via an mmap(). 
This is in contrast to the default mode, in which @@ -109,16 +103,12 @@ const __llvm_profile_data *__llvm_profile_begin_data(void); const __llvm_profile_data *__llvm_profile_end_data(void); const char *__llvm_profile_begin_names(void); const char *__llvm_profile_end_names(void); -const char *__llvm_profile_begin_vtabnames(void); -const char *__llvm_profile_end_vtabnames(void); char *__llvm_profile_begin_counters(void); char *__llvm_profile_end_counters(void); char *__llvm_profile_begin_bitmap(void); char *__llvm_profile_end_bitmap(void); ValueProfNode *__llvm_profile_begin_vnodes(); ValueProfNode *__llvm_profile_end_vnodes(); -VTableProfData *__llvm_profile_begin_vtables(); -VTableProfData *__llvm_profile_end_vtables(); uint32_t *__llvm_profile_begin_orderfile(); /*! @@ -262,31 +252,20 @@ uint64_t __llvm_profile_get_num_bitmap_bytes(const char *Begin, /*! \brief Get the size of the profile name section in bytes. */ uint64_t __llvm_profile_get_name_size(const char *Begin, const char *End); -/*! \brief Get the number of virtual table profile data entries */ -uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin, - const VTableProfData *End); - -/*! \brief Get the size of virtual table profile data in bytes. */ -uint64_t __llvm_profile_get_vtable_section_size(const VTableProfData *Begin, - const VTableProfData *End); - -/* ! \brief Given the sizes of the data and counter information, computes the - * number of padding bytes before and after the counter section, as well as the - * number of padding bytes after other setions in the raw profile. - * Returns -1 upon errors and 0 upon success. Output parameters should be used - * iff return value is 0. +/* ! \brief Given the sizes of the data and counter information, return the + * number of padding bytes before and after the counters, and after the names, + * in the raw profile. * * Note: When mmap() mode is disabled, no padding bytes before/after counters * are needed. 
However, in mmap() mode, the counter section in the raw profile * must be page-aligned: this API computes the number of padding bytes * needed to achieve that. */ -int __llvm_profile_get_padding_sizes_for_counters( +void __llvm_profile_get_padding_sizes_for_counters( uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes, - uint64_t NamesSize, uint64_t VTableSize, uint64_t VNameSize, - uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters, - uint64_t *PaddingBytesAfterBitmap, uint64_t *PaddingBytesAfterNames, - uint64_t *PaddingBytesAfterVTable, uint64_t *PaddingBytesAfterVNames); + uint64_t NamesSize, uint64_t *PaddingBytesBeforeCounters, + uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmap, + uint64_t *PaddingBytesAfterNames); /*! * \brief Set the flag that profile data has been dumped to the file. diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c index 7c5c26f4d113b..af52804b2b532 100644 --- a/compiler-rt/lib/profile/InstrProfilingBuffer.c +++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c @@ -51,29 +51,16 @@ uint64_t __llvm_profile_get_size_for_buffer(void) { const char *BitmapEnd = __llvm_profile_end_bitmap(); const char *NamesBegin = __llvm_profile_begin_names(); const char *NamesEnd = __llvm_profile_end_names(); - const VTableProfData *VTableBegin = __llvm_profile_begin_vtables(); - const VTableProfData *VTableEnd = __llvm_profile_end_vtables(); - const char *VNamesBegin = __llvm_profile_begin_vtabnames(); - const char *VNamesEnd = __llvm_profile_end_vtabnames(); return __llvm_profile_get_size_for_buffer_internal( DataBegin, DataEnd, CountersBegin, CountersEnd, BitmapBegin, BitmapEnd, - NamesBegin, NamesEnd, VTableBegin, VTableEnd, VNamesBegin, VNamesEnd); + NamesBegin, NamesEnd); } COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_num_data(const __llvm_profile_data *Begin, const __llvm_profile_data *End) { intptr_t BeginI = (intptr_t)Begin, EndI 
= (intptr_t)End; - // `sizeof(__llvm_profile_data) - 1` is required in the numerator when - // [Begin, End] represents an inclusive range. - // For ELF, [Begin, End) represents the address of linker-inserted - // symbols `__start__` and `__stop_`. - // Thereby, `End` is one byte past the inclusive range, and - // `sizeof(__llvm_profile_data) - 1` is not necessary in the numerator to get - // the correct number of profile data. - // FIXME: Consider removing `sizeof(__llvm_profile_data) - 1` if this is true - // across platforms. return ((EndI + sizeof(__llvm_profile_data) - 1) - BeginI) / sizeof(__llvm_profile_data); } @@ -84,26 +71,6 @@ uint64_t __llvm_profile_get_data_size(const __llvm_profile_data *Begin, return __llvm_profile_get_num_data(Begin, End) * sizeof(__llvm_profile_data); } -// Counts the number of `VTableProfData` elements within the range of [Begin, -// End). Caller should guarantee that End points to one byte past the inclusive -// range. -// FIXME: Add a compiler-rt test to make sure the number of vtables in the -// raw profile is the same as the number of vtable elements in the instrumented -// binary. -COMPILER_RT_VISIBILITY -uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin, - const VTableProfData *End) { - // Convert pointers to intptr_t to use integer arithmetic. 
- intptr_t EndI = (intptr_t)End, BeginI = (intptr_t)Begin; - return (EndI - BeginI) / sizeof(VTableProfData); -} - -COMPILER_RT_VISIBILITY -uint64_t __llvm_profile_get_vtable_section_size(const VTableProfData *Begin, - const VTableProfData *End) { - return (intptr_t)(End) - (intptr_t)(Begin); -} - COMPILER_RT_VISIBILITY size_t __llvm_profile_counter_entry_size(void) { if (__llvm_profile_get_version() & VARIANT_MASK_BYTE_COVERAGE) return sizeof(uint8_t); @@ -152,13 +119,11 @@ static int needsCounterPadding(void) { } COMPILER_RT_VISIBILITY -int __llvm_profile_get_padding_sizes_for_counters( +void __llvm_profile_get_padding_sizes_for_counters( uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes, - uint64_t NamesSize, uint64_t VTableSize, uint64_t VNameSize, - uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters, - uint64_t *PaddingBytesAfterBitmapBytes, uint64_t *PaddingBytesAfterNames, - uint64_t *PaddingBytesAfterVTable, uint64_t *PaddingBytesAfterVName) { - // Counter padding is needed only if continuous mode is enabled. + uint64_t NamesSize, uint64_t *PaddingBytesBeforeCounters, + uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmapBytes, + uint64_t *PaddingBytesAfterNames) { if (!needsCounterPadding()) { *PaddingBytesBeforeCounters = 0; *PaddingBytesAfterCounters = @@ -166,19 +131,9 @@ int __llvm_profile_get_padding_sizes_for_counters( *PaddingBytesAfterBitmapBytes = __llvm_profile_get_num_padding_bytes(NumBitmapBytes); *PaddingBytesAfterNames = __llvm_profile_get_num_padding_bytes(NamesSize); - if (PaddingBytesAfterVTable != NULL) - *PaddingBytesAfterVTable = - __llvm_profile_get_num_padding_bytes(VTableSize); - if (PaddingBytesAfterVName != NULL) - *PaddingBytesAfterVName = __llvm_profile_get_num_padding_bytes(VNameSize); - return 0; + return; } - // Value profiling not supported in continuous mode at profile-write time. - // Return -1 to alert the incompatibility. 
- if (VTableSize != 0 || VNameSize != 0) - return -1; - // In continuous mode, the file offsets for headers and for the start of // counter sections need to be page-aligned. *PaddingBytesBeforeCounters = @@ -187,22 +142,13 @@ int __llvm_profile_get_padding_sizes_for_counters( *PaddingBytesAfterBitmapBytes = calculateBytesNeededToPageAlign(NumBitmapBytes); *PaddingBytesAfterNames = calculateBytesNeededToPageAlign(NamesSize); - // Set these two variables to zero to avoid uninitialized variables - // even if VTableSize and VNameSize are known to be zero. - if (PaddingBytesAfterVTable != NULL) - *PaddingBytesAfterVTable = 0; - if (PaddingBytesAfterVName != NULL) - *PaddingBytesAfterVName = 0; - return 0; } COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_size_for_buffer_internal( const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, - const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd, - const VTableProfData *VTableBegin, const VTableProfData *VTableEnd, - const char *VNamesBegin, const char *VNamesEnd) { + const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd) { /* Match logic in __llvm_profile_write_buffer(). */ const uint64_t NamesSize = (NamesEnd - NamesBegin) * sizeof(char); uint64_t DataSize = __llvm_profile_get_data_size(DataBegin, DataEnd); @@ -210,29 +156,20 @@ uint64_t __llvm_profile_get_size_for_buffer_internal( __llvm_profile_get_counters_size(CountersBegin, CountersEnd); const uint64_t NumBitmapBytes = __llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd); - const uint64_t VTableSize = - __llvm_profile_get_vtable_section_size(VTableBegin, VTableEnd); - const uint64_t VNameSize = - __llvm_profile_get_name_size(VNamesBegin, VNamesEnd); /* Determine how much padding is needed before/after the counters and after * the names. 
*/ uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters, - PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes, - PaddingBytesAfterVTable, PaddingBytesAfterVNames; + PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes; __llvm_profile_get_padding_sizes_for_counters( - DataSize, CountersSize, NumBitmapBytes, NamesSize, 0 /* VTableSize */, - 0 /* VNameSize */, &PaddingBytesBeforeCounters, - &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes, - &PaddingBytesAfterNames, &PaddingBytesAfterVTable, - &PaddingBytesAfterVNames); + DataSize, CountersSize, NumBitmapBytes, NamesSize, + &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, + &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames); return sizeof(__llvm_profile_header) + __llvm_write_binary_ids(NULL) + DataSize + PaddingBytesBeforeCounters + CountersSize + PaddingBytesAfterCounters + NumBitmapBytes + - PaddingBytesAfterBitmapBytes + NamesSize + PaddingBytesAfterNames + - VTableSize + PaddingBytesAfterVTable + VNameSize + - PaddingBytesAfterVNames; + PaddingBytesAfterBitmapBytes + NamesSize + PaddingBytesAfterNames; } COMPILER_RT_VISIBILITY @@ -254,10 +191,7 @@ COMPILER_RT_VISIBILITY int __llvm_profile_write_buffer_internal( const char *NamesBegin, const char *NamesEnd) { ProfDataWriter BufferWriter; initBufferWriter(&BufferWriter, Buffer); - // Set virtual table arguments to NULL since they are not supported yet. 
- return lprofWriteDataImpl( - &BufferWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, - BitmapBegin, BitmapEnd, /*VPDataReader=*/0, NamesBegin, NamesEnd, - /*VTableBegin=*/NULL, /*VTableEnd=*/NULL, /*VNamesBegin=*/NULL, - /*VNamesEnd=*/NULL, /*SkipNameDataWrite=*/0); + return lprofWriteDataImpl(&BufferWriter, DataBegin, DataEnd, CountersBegin, + CountersEnd, BitmapBegin, BitmapEnd, 0, NamesBegin, + NamesEnd, 0); } diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.h b/compiler-rt/lib/profile/InstrProfilingInternal.h index d5bd0e41fb129..03ed67fcfa766 100644 --- a/compiler-rt/lib/profile/InstrProfilingInternal.h +++ b/compiler-rt/lib/profile/InstrProfilingInternal.h @@ -22,9 +22,7 @@ uint64_t __llvm_profile_get_size_for_buffer_internal( const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, - const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd, - const VTableProfData *VTableBegin, const VTableProfData *VTableEnd, - const char *VNamesBegin, const char *VNamesEnd); + const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd); /*! * \brief Write instrumentation data to the given buffer, given explicit @@ -158,9 +156,7 @@ int lprofWriteDataImpl(ProfDataWriter *Writer, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, VPDataReaderType *VPDataReader, const char *NamesBegin, - const char *NamesEnd, const VTableProfData *VTableBegin, - const VTableProfData *VTableEnd, const char *VNamesBegin, - const char *VNamesEnd, int SkipNameDataWrite); + const char *NamesEnd, int SkipNameDataWrite); /* Merge value profile data pointed to by SrcValueProfData into * in-memory profile counters pointed by to DstData. 
*/ diff --git a/compiler-rt/lib/profile/InstrProfilingMerge.c b/compiler-rt/lib/profile/InstrProfilingMerge.c index c0706b73e1668..b5850e99ee37d 100644 --- a/compiler-rt/lib/profile/InstrProfilingMerge.c +++ b/compiler-rt/lib/profile/InstrProfilingMerge.c @@ -107,26 +107,6 @@ static uintptr_t signextIfWin64(void *V) { #endif } -// Skip names section, vtable profile data section and vtable names section -// for runtime profile merge. To merge runtime addresses from multiple -// profiles collected from the same instrumented binary, the binary should be -// loaded at fixed base address (e.g., build with -no-pie, or run with ASLR -// disabled). In this set-up these three sections remain unchanged. -static uint64_t -getDistanceFromCounterToValueProf(const __llvm_profile_header *const Header) { - const uint64_t VTableSectionSize = - Header->NumVTables * sizeof(VTableProfData); - const uint64_t PaddingBytesAfterVTableSection = - __llvm_profile_get_num_padding_bytes(VTableSectionSize); - const uint64_t VNamesSize = Header->VNamesSize; - const uint64_t PaddingBytesAfterVNamesSize = - __llvm_profile_get_num_padding_bytes(VNamesSize); - return Header->NamesSize + - __llvm_profile_get_num_padding_bytes(Header->NamesSize) + - VTableSectionSize + PaddingBytesAfterVTableSection + VNamesSize + - PaddingBytesAfterVNamesSize; -} - COMPILER_RT_VISIBILITY int __llvm_profile_merge_from_buffer(const char *ProfileData, uint64_t ProfileSize) { @@ -157,7 +137,8 @@ int __llvm_profile_merge_from_buffer(const char *ProfileData, SrcBitmapStart = SrcCountersEnd; SrcNameStart = SrcBitmapStart + Header->NumBitmapBytes; SrcValueProfDataStart = - SrcNameStart + getDistanceFromCounterToValueProf(Header); + SrcNameStart + Header->NamesSize + + __llvm_profile_get_num_padding_bytes(Header->NamesSize); if (SrcNameStart < SrcCountersStart || SrcNameStart < SrcBitmapStart) return 1; diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c 
index d2554a2702aaf..19266ab6c6fb8 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c @@ -24,12 +24,8 @@ #define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON) #define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON) #define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON) -#define PROF_VNAME_START INSTR_PROF_SECT_START(INSTR_PROF_VNAME_COMMON) -#define PROF_VNAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VNAME_COMMON) #define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_COMMON) #define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON) -#define PROF_VTABLE_START INSTR_PROF_SECT_START(INSTR_PROF_VTAB_COMMON) -#define PROF_VTABLE_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VTAB_COMMON) #define PROF_BITS_START INSTR_PROF_SECT_START(INSTR_PROF_BITS_COMMON) #define PROF_BITS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_BITS_COMMON) #define PROF_ORDERFILE_START INSTR_PROF_SECT_START(INSTR_PROF_ORDERFILE_COMMON) @@ -45,10 +41,6 @@ extern __llvm_profile_data PROF_DATA_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; -extern VTableProfData PROF_VTABLE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; -extern VTableProfData PROF_VTABLE_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; -extern char PROF_VNAME_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; -extern char PROF_VNAME_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_BITS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_BITS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern uint32_t PROF_ORDERFILE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; @@ -71,18 +63,6 @@ COMPILER_RT_VISIBILITY const char *__llvm_profile_begin_names(void) { COMPILER_RT_VISIBILITY const char *__llvm_profile_end_names(void) { return &PROF_NAME_STOP; } -COMPILER_RT_VISIBILITY 
const char *__llvm_profile_begin_vtabnames(void) { - return &PROF_VNAME_START; -} -COMPILER_RT_VISIBILITY const char *__llvm_profile_end_vtabnames(void) { - return &PROF_VNAME_STOP; -} -COMPILER_RT_VISIBILITY VTableProfData *__llvm_profile_begin_vtables(void) { - return &PROF_VTABLE_START; -} -COMPILER_RT_VISIBILITY VTableProfData *__llvm_profile_end_vtables(void) { - return &PROF_VTABLE_STOP; -} COMPILER_RT_VISIBILITY char *__llvm_profile_begin_counters(void) { return &PROF_CNTS_START; } diff --git a/compiler-rt/lib/profile/InstrProfilingWriter.c b/compiler-rt/lib/profile/InstrProfilingWriter.c index 8816a71155511..4d767d1385148 100644 --- a/compiler-rt/lib/profile/InstrProfilingWriter.c +++ b/compiler-rt/lib/profile/InstrProfilingWriter.c @@ -250,14 +250,9 @@ COMPILER_RT_VISIBILITY int lprofWriteData(ProfDataWriter *Writer, const char *BitmapEnd = __llvm_profile_end_bitmap(); const char *NamesBegin = __llvm_profile_begin_names(); const char *NamesEnd = __llvm_profile_end_names(); - const VTableProfData *VTableBegin = __llvm_profile_begin_vtables(); - const VTableProfData *VTableEnd = __llvm_profile_end_vtables(); - const char *VNamesBegin = __llvm_profile_begin_vtabnames(); - const char *VNamesEnd = __llvm_profile_end_vtabnames(); return lprofWriteDataImpl(Writer, DataBegin, DataEnd, CountersBegin, CountersEnd, BitmapBegin, BitmapEnd, VPDataReader, - NamesBegin, NamesEnd, VTableBegin, VTableEnd, - VNamesBegin, VNamesEnd, SkipNameDataWrite); + NamesBegin, NamesEnd, SkipNameDataWrite); } COMPILER_RT_VISIBILITY int @@ -266,9 +261,7 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, VPDataReaderType *VPDataReader, const char *NamesBegin, - const char *NamesEnd, const VTableProfData *VTableBegin, - const VTableProfData *VTableEnd, const char *VNamesBegin, - const char *VNamesEnd, int SkipNameDataWrite) { + const char *NamesEnd, int 
SkipNameDataWrite) { /* Calculate size of sections. */ const uint64_t DataSectionSize = __llvm_profile_get_data_size(DataBegin, DataEnd); @@ -280,12 +273,6 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, const uint64_t NumBitmapBytes = __llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd); const uint64_t NamesSize = __llvm_profile_get_name_size(NamesBegin, NamesEnd); - const uint64_t NumVTables = - __llvm_profile_get_num_vtable(VTableBegin, VTableEnd); - const uint64_t VTableSectionSize = - __llvm_profile_get_vtable_section_size(VTableBegin, VTableEnd); - const uint64_t VNamesSize = - __llvm_profile_get_name_size(VNamesBegin, VNamesEnd); /* Create the header. */ __llvm_profile_header Header; @@ -293,15 +280,11 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, /* Determine how much padding is needed before/after the counters and after * the names. */ uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters, - PaddingBytesAfterBitmapBytes, PaddingBytesAfterNames, - PaddingBytesAfterVTable, PaddingBytesAfterVNames; - if (__llvm_profile_get_padding_sizes_for_counters( - DataSectionSize, CountersSectionSize, NumBitmapBytes, NamesSize, - VTableSectionSize, VNamesSize, &PaddingBytesBeforeCounters, - &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes, - &PaddingBytesAfterNames, &PaddingBytesAfterVTable, - &PaddingBytesAfterVNames) == -1) - return -1; + PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes; + __llvm_profile_get_padding_sizes_for_counters( + DataSectionSize, CountersSectionSize, NumBitmapBytes, NamesSize, + &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, + &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames); { /* Initialize header structure. 
*/ @@ -340,11 +323,7 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, {BitmapBegin, sizeof(uint8_t), NumBitmapBytes, 0}, {NULL, sizeof(uint8_t), PaddingBytesAfterBitmapBytes, 1}, {SkipNameDataWrite ? NULL : NamesBegin, sizeof(uint8_t), NamesSize, 0}, - {NULL, sizeof(uint8_t), PaddingBytesAfterNames, 1}, - {VTableBegin, sizeof(uint8_t), VTableSectionSize, 0}, - {NULL, sizeof(uint8_t), PaddingBytesAfterVTable, 1}, - {SkipNameDataWrite ? NULL : VNamesBegin, sizeof(uint8_t), VNamesSize, 0}, - {NULL, sizeof(uint8_t), PaddingBytesAfterVNames, 1}}; + {NULL, sizeof(uint8_t), PaddingBytesAfterNames, 1}}; if (Writer->Write(Writer, IOVecData, sizeof(IOVecData) / sizeof(*IOVecData))) return -1; diff --git a/compiler-rt/test/profile/instrprof-write-buffer-internal.c b/compiler-rt/test/profile/instrprof-write-buffer-internal.c index 2c1c29ac0c588..d9670f739ca98 100644 --- a/compiler-rt/test/profile/instrprof-write-buffer-internal.c +++ b/compiler-rt/test/profile/instrprof-write-buffer-internal.c @@ -31,8 +31,7 @@ char *__llvm_profile_end_bitmap(void); uint64_t __llvm_profile_get_size_for_buffer_internal( const void *DataBegin, const void *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, - const char *NamesBegin, const char *NamesEnd, const void *VTableBegin, - const void *VTableEnd, const char *VNamesBegin, const char *VNamesEnd); + const char *NamesBegin, const char *NamesEnd); int __llvm_profile_write_buffer_internal( char *Buffer, const void *DataBegin, const void *DataEnd, @@ -46,8 +45,7 @@ int main(int argc, const char *argv[]) { __llvm_profile_begin_data(), __llvm_profile_end_data(), __llvm_profile_begin_counters(), __llvm_profile_end_counters(), __llvm_profile_begin_bitmap(), __llvm_profile_end_bitmap(), - __llvm_profile_begin_names(), __llvm_profile_end_names(), NULL, NULL, - NULL, NULL); + __llvm_profile_begin_names(), __llvm_profile_end_names()); char *buf = malloc(bufsize); 
int ret = __llvm_profile_write_buffer_internal( diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index 25ec06a739202..a928ba6961f36 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -831,7 +831,6 @@ struct InstrProfRecord { struct ValueProfData { std::vector IndirectCallSites; std::vector MemOPSizes; - std::vector VTableTargets; }; std::unique_ptr ValueData; @@ -854,8 +853,6 @@ struct InstrProfRecord { return ValueData->IndirectCallSites; case IPVK_MemOPSize: return ValueData->MemOPSizes; - case IPVK_VTableTarget: - return ValueData->VTableTargets; default: llvm_unreachable("Unknown value kind!"); } @@ -1039,9 +1036,7 @@ enum ProfVersion { Version10 = 10, // An additional field is used for bitmap bytes. Version11 = 11, - // VTable profiling, - Version12 = 12, - // The current version is 12. + // The current version is 11. CurrentVersion = INSTR_PROF_INDEX_VERSION }; const uint64_t Version = ProfVersion::CurrentVersion; @@ -1062,7 +1057,6 @@ struct Header { uint64_t MemProfOffset; uint64_t BinaryIdOffset; uint64_t TemporalProfTracesOffset; - uint64_t VTableNamesOffset; // New fields should only be added at the end to ensure that the size // computation is correct. The methods below need to be updated to ensure that // the new field is read correctly. @@ -1199,13 +1193,8 @@ template <> inline uint64_t getMagic() { // It should also match the synthesized type in // Transforms/Instrumentation/InstrProfiling.cpp:getOrCreateRegionCounters. 
template struct alignas(8) ProfileData { -#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Type Name; -#include "llvm/ProfileData/InstrProfData.inc" -}; - -template struct alignas(8) VTableProfileData { -#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Init) Type Name; -#include "llvm/ProfileData/InstrProfData.inc" + #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Type Name; + #include "llvm/ProfileData/InstrProfData.inc" }; // File header structure of the LLVM profile data in raw format. diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index 1f77853bb8baa..c907a9736f316 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -96,25 +96,6 @@ INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \ #undef INSTR_PROF_DATA /* INSTR_PROF_DATA end. */ -/* For a virtual table object, record the name hash to associate profiled - * addresses with global variables, and record {starting address, size in bytes} - * to map the profiled virtual table (which usually have an offset from the - * starting address) back to a virtual table object. */ -#ifndef INSTR_PROF_VTABLE_DATA -#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) -#else -#define INSTR_PROF_VTABLE_DATA_DEFINED -#endif -INSTR_PROF_VTABLE_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), \ - VTableNameHash, ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \ - IndexedInstrProf::ComputeHash(PGOVTableName))) -INSTR_PROF_VTABLE_DATA(const IntPtrT, llvm::PointerType::getUnqual(Ctx), \ - VTablePointer, VTableAddr) -INSTR_PROF_VTABLE_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), VTableSize, \ - ConstantInt::get(llvm::Type::getInt32Ty(Ctx), \ - VTableSizeVal)) -#undef INSTR_PROF_VTABLE_DATA -/* INSTR_PROF_VTABLE_DATA end. */ /* This is an internal data structure used by value profiler. 
It * is defined here to allow serialization code sharing by LLVM @@ -166,8 +147,6 @@ INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, INSTR_PROF_RAW_HEADER(uint64_t, BitmapDelta, (uintptr_t)BitmapBegin - (uintptr_t)DataBegin) INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) -INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) -INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) #undef INSTR_PROF_RAW_HEADER /* INSTR_PROF_RAW_HEADER end */ @@ -209,26 +188,13 @@ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx)) VALUE_PROF_KIND(IPVK_IndirectCallTarget, 0, "indirect call target") /* For memory intrinsic functions size profiling. */ VALUE_PROF_KIND(IPVK_MemOPSize, 1, "memory intrinsic functions size") -/* For virtual table address profiling, the address point of the virtual table - * (i.e., the address contained in objects pointing to a virtual table) are - * profiled. Note this may not be the address of the per C++ class virtual table - * object (e.g., there might be an offset). - * - * The profiled addresses are stored in raw profile, together with the following - * two types of information. - * 1. The (starting and ending) addresses of per C++ class virtual table objects. - * 2. The (compressed) virtual table object names. - * RawInstrProfReader converts profiled virtual table addresses to virtual table - * objects' MD5 hash. - */ -VALUE_PROF_KIND(IPVK_VTableTarget, 2, "The profiled address point of the vtable") /* These two kinds must be the last to be * declared. This is to make sure the string * array created with the template can be * indexed with the kind value. 
*/ VALUE_PROF_KIND(IPVK_First, IPVK_IndirectCallTarget, "first") -VALUE_PROF_KIND(IPVK_Last, IPVK_VTableTarget, "last") +VALUE_PROF_KIND(IPVK_Last, IPVK_MemOPSize, "last") #undef VALUE_PROF_KIND /* VALUE_PROF_KIND end */ @@ -318,18 +284,12 @@ INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \ INSTR_PROF_SECT_ENTRY(IPSK_name, \ INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON), \ INSTR_PROF_NAME_COFF, "__DATA,") -INSTR_PROF_SECT_ENTRY(IPSK_vname, \ - INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON), \ - INSTR_PROF_VNAME_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vals, \ INSTR_PROF_QUOTE(INSTR_PROF_VALS_COMMON), \ INSTR_PROF_VALS_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vnodes, \ INSTR_PROF_QUOTE(INSTR_PROF_VNODES_COMMON), \ INSTR_PROF_VNODES_COFF, "__DATA,") -INSTR_PROF_SECT_ENTRY(IPSK_vtab, \ - INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON), \ - INSTR_PROF_VTAB_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_covmap, \ INSTR_PROF_QUOTE(INSTR_PROF_COVMAP_COMMON), \ INSTR_PROF_COVMAP_COFF, "__LLVM_COV,") @@ -708,9 +668,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129 /* Raw profile format version (start from 1). */ -#define INSTR_PROF_RAW_VERSION 10 +#define INSTR_PROF_RAW_VERSION 9 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 12 +#define INSTR_PROF_INDEX_VERSION 11 /* Coverage mapping format version (start from 0). 
*/ #define INSTR_PROF_COVMAP_VERSION 6 @@ -748,12 +708,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, than WIN32 */ #define INSTR_PROF_DATA_COMMON __llvm_prf_data #define INSTR_PROF_NAME_COMMON __llvm_prf_names -#define INSTR_PROF_VNAME_COMMON __llvm_prf_vtabnames #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts #define INSTR_PROF_BITS_COMMON __llvm_prf_bits #define INSTR_PROF_VALS_COMMON __llvm_prf_vals #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds -#define INSTR_PROF_VTAB_COMMON __llvm_prf_vtab #define INSTR_PROF_COVMAP_COMMON __llvm_covmap #define INSTR_PROF_COVFUN_COMMON __llvm_covfun #define INSTR_PROF_COVDATA_COMMON __llvm_covdata @@ -764,12 +722,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, */ #define INSTR_PROF_DATA_COFF ".lprfd$M" #define INSTR_PROF_NAME_COFF ".lprfn$M" -#define INSTR_PROF_VNAME_COFF ".lprfvn$M" #define INSTR_PROF_CNTS_COFF ".lprfc$M" #define INSTR_PROF_BITS_COFF ".lprfb$M" #define INSTR_PROF_VALS_COFF ".lprfv$M" #define INSTR_PROF_VNODES_COFF ".lprfnd$M" -#define INSTR_PROF_VTAB_COFF ".lprfvt$M" #define INSTR_PROF_COVMAP_COFF ".lcovmap$M" #define INSTR_PROF_COVFUN_COFF ".lcovfun$M" /* Since cov data and cov names sections are not allocated, we don't need to diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h index cfde5d3fc77d6..87f15639a2c3c 100644 --- a/llvm/include/llvm/ProfileData/InstrProfReader.h +++ b/llvm/include/llvm/ProfileData/InstrProfReader.h @@ -326,16 +326,12 @@ class RawInstrProfReader : public InstrProfReader { uint64_t NamesDelta; const RawInstrProf::ProfileData *Data; const RawInstrProf::ProfileData *DataEnd; - const RawInstrProf::VTableProfileData *VTableBegin = nullptr; - const RawInstrProf::VTableProfileData *VTableEnd = nullptr; const char *CountersStart; const char *CountersEnd; const char *BitmapStart; const char *BitmapEnd; const char *NamesStart; const char *NamesEnd; - const char *VNamesStart = nullptr; - const 
char *VNamesEnd = nullptr; // After value profile is all read, this pointer points to // the header of next profile data (if exists) const uint8_t *ValueDataStart; @@ -660,15 +656,6 @@ class IndexedInstrProfReader : public InstrProfReader { std::unique_ptr MemProfRecordTable; /// MemProf frame profile data on-disk indexed via frame id. std::unique_ptr MemProfFrameTable; - /// VTableNamePtr points to the beginning of compressed vtable names. - /// When a symtab is constructed from profiles by llvm-profdata, the list of - /// names could be decompressed based on `VTableNamePtr` and - /// `CompressedVTableNamesLen`. - /// A compiler that reads indexed profiles could construct symtab from module - /// IR so it doesn't need the decompressed names. - const char *VTableNamePtr = nullptr; - /// The length of compressed vtable names. - uint64_t CompressedVTableNamesLen = 0; /// Total size of binary ids. uint64_t BinaryIdsSize{0}; /// Start address of binary id length and data pairs. diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index b9afee413853e..2eeeff987399d 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -1533,12 +1533,9 @@ Expected
Header::readFromBuffer(const unsigned char *Buffer) { // When a new field is added in the header add a case statement here to // populate it. static_assert( - IndexedInstrProf::ProfVersion::CurrentVersion == Version12, + IndexedInstrProf::ProfVersion::CurrentVersion == Version11, "Please update the reading code below if a new field has been added, " "if not add a case statement to fall through to the latest version."); - case 12ull: - H.VTableNamesOffset = read(Buffer, offsetOf(&Header::VTableNamesOffset)); - [[fallthrough]]; case 11ull: [[fallthrough]]; case 10ull: @@ -1564,14 +1561,10 @@ size_t Header::size() const { // When a new field is added to the header add a case statement here to // compute the size as offset of the new field + size of the new field. This // relies on the field being added to the end of the list. - static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version12, + static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version11, "Please update the size computation below if a new field has " "been added to the header, if not add a case statement to " "fall through to the latest version."); - case 12ull: - return offsetOf(&Header::VTableNamesOffset) + - sizeof(Header::VTableNamesOffset); - [[fallthrough]]; case 11ull: [[fallthrough]]; case 10ull: diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 31b742bca14d6..0d8d43daae960 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -366,11 +366,6 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) { return E; Value = IndexedInstrProf::ComputeHash(VD.first); } - } else if (ValueKind == IPVK_VTableTarget) { - if (InstrProfSymtab::isExternalSymbol(VD.first)) - Value = 0; - else - Value = IndexedInstrProf::ComputeHash(VD.first); } else { READ_NUM(VD.first, Value); } @@ -587,17 +582,10 @@ Error RawInstrProfReader::readHeader( auto NumBitmapBytes = 
swap(Header.NumBitmapBytes); auto PaddingBytesAfterBitmapBytes = swap(Header.PaddingBytesAfterBitmapBytes); auto NamesSize = swap(Header.NamesSize); - auto VTableNameSize = swap(Header.VNamesSize); - auto NumVTables = swap(Header.NumVTables); ValueKindLast = swap(Header.ValueKindLast); auto DataSize = NumData * sizeof(RawInstrProf::ProfileData); - auto PaddingBytesAfterNames = getNumPaddingBytes(NamesSize); - auto PaddingBytesAfterVTableNames = getNumPaddingBytes(VTableNameSize); - - auto VTableSectionSize = - NumVTables * sizeof(RawInstrProf::VTableProfileData); - auto PaddingBytesAfterVTableProfData = getNumPaddingBytes(VTableSectionSize); + auto PaddingSize = getNumPaddingBytes(NamesSize); // Profile data starts after profile header and binary ids if exist. ptrdiff_t DataOffset = sizeof(RawInstrProf::Header) + BinaryIdSize; @@ -606,12 +594,7 @@ Error RawInstrProfReader::readHeader( CountersOffset + CountersSize + PaddingBytesAfterCounters; ptrdiff_t NamesOffset = BitmapOffset + NumBitmapBytes + PaddingBytesAfterBitmapBytes; - ptrdiff_t VTableProfDataOffset = - NamesOffset + NamesSize + PaddingBytesAfterNames; - ptrdiff_t VTableNameOffset = VTableProfDataOffset + VTableSectionSize + - PaddingBytesAfterVTableProfData; - ptrdiff_t ValueDataOffset = - VTableNameOffset + VTableNameSize + PaddingBytesAfterVTableNames; + ptrdiff_t ValueDataOffset = NamesOffset + NamesSize + PaddingSize; auto *Start = reinterpret_cast(&Header); if (Start + ValueDataOffset > DataBuffer->getBufferEnd()) @@ -631,14 +614,8 @@ Error RawInstrProfReader::readHeader( Data = reinterpret_cast *>( Start + DataOffset); DataEnd = Data + NumData; - VTableBegin = - reinterpret_cast *>( - Start + VTableProfDataOffset); - VTableEnd = VTableBegin + NumVTables; NamesStart = Start + NamesOffset; NamesEnd = NamesStart + NamesSize; - VNamesStart = Start + VTableNameOffset; - VNamesEnd = VNamesStart + VTableNameSize; } CountersStart = Start + CountersOffset; @@ -1283,23 +1260,6 @@ Error 
IndexedInstrProfReader::readHeader() { "corrupted binary ids"); } - if (GET_VERSION(Header->formatVersion()) >= 12) { - uint64_t VTableNamesOffset = - endian::byte_swap( - Header->VTableNamesOffset); - const unsigned char *Ptr = Start + VTableNamesOffset; - - CompressedVTableNamesLen = - support::endian::readNext(Ptr); - - // Writer first writes the length of compressed string, and then the actual - // content. - VTableNamePtr = (const char *)Ptr; - if (VTableNamePtr > (const char *)DataBuffer->getBufferEnd()) - return make_error(instrprof_error::truncated); - } - if (GET_VERSION(Header->formatVersion()) >= 10 && Header->formatVersion() & VARIANT_MASK_TEMPORAL_PROF) { uint64_t TemporalProfTracesOffset = diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 3e0a0e0d70116..d65f8fe50313d 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -455,11 +455,12 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { Header.MemProfOffset = 0; Header.BinaryIdOffset = 0; Header.TemporalProfTracesOffset = 0; - Header.VTableNamesOffset = 0; + int N = sizeof(IndexedInstrProf::Header) / sizeof(uint64_t); - // Only write out the first four fields. We need to remember the offset of the - // remaining fields to allow back patching later. - for (int I = 0; I < 4; I++) + // Only write out all the fields except 'HashOffset', 'MemProfOffset', + // 'BinaryIdOffset' and `TemporalProfTracesOffset`. We need to remember the + // offset of these fields to allow back patching later. + for (int I = 0; I < N - 4; I++) OS.write(reinterpret_cast(&Header)[I]); // Save the location of Header.HashOffset field in \c OS. @@ -483,9 +484,6 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { uint64_t TemporalProfTracesOffset = OS.tell(); OS.write(0); - uint64_t VTableNamesOffset = OS.tell(); - OS.write(0); - // Reserve space to write profile summary data. 
uint32_t NumEntries = ProfileSummaryBuilder::DefaultCutoffs.size(); uint32_t SummarySize = Summary::getSize(Summary::NumKinds, NumEntries); @@ -606,31 +604,6 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { OS.writeByte(0); } - uint64_t VTableNamesSectionStart = OS.tell(); - - // Use a dummy (and uncompressed) string as compressed vtable names and get - // the necessary profile format change in place for version 12. - // TODO: Store the list of vtable names in InstrProfWriter and use the - // real compressed name. - std::string CompressedVTableNames = "VTableNames"; - - uint64_t CompressedStringLen = CompressedVTableNames.length(); - - // Record the length of compressed string. - OS.write(CompressedStringLen); - - // Write the chars in compressed strings. - for (auto &c : CompressedVTableNames) - OS.writeByte(static_cast(c)); - - // Pad up to a multiple of 8. - // InstrProfReader would read bytes according to 'CompressedStringLen'. - uint64_t PaddedLength = alignTo(CompressedStringLen, 8); - - for (uint64_t K = CompressedStringLen; K < PaddedLength; K++) { - OS.writeByte(0); - } - uint64_t TemporalProfTracesSectionStart = 0; if (static_cast(ProfileKind & InstrProfKind::TemporalProfile)) { TemporalProfTracesSectionStart = OS.tell(); @@ -674,7 +647,6 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { // Patch the Header.TemporalProfTracesOffset (=0 for profiles without // traces). {TemporalProfTracesOffset, &TemporalProfTracesSectionStart, 1}, - {VTableNamesOffset, &VTableNamesSectionStart, 1}, // Patch the summary data. 
{SummaryOffset, reinterpret_cast(TheSummary.get()), (int)(SummarySize / sizeof(uint64_t))}, @@ -727,8 +699,7 @@ Error InstrProfWriter::validateRecord(const InstrProfRecord &Func) { std::unique_ptr VD = Func.getValueForSite(VK, S); DenseSet SeenValues; for (uint32_t I = 0; I < ND; I++) - if ((VK != IPVK_IndirectCallTarget && VK != IPVK_VTableTarget) && - !SeenValues.insert(VD[I].Value).second) + if ((VK != IPVK_IndirectCallTarget) && !SeenValues.insert(VD[I].Value).second) return make_error(instrprof_error::invalid_prof); } } @@ -776,7 +747,7 @@ void InstrProfWriter::writeRecordInText(StringRef Name, uint64_t Hash, OS << ND << "\n"; std::unique_ptr VD = Func.getValueForSite(VK, S); for (uint32_t I = 0; I < ND; I++) { - if (VK == IPVK_IndirectCallTarget || VK == IPVK_VTableTarget) + if (VK == IPVK_IndirectCallTarget) OS << Symtab.getFuncOrVarNameIfDefined(VD[I].Value) << ":" << VD[I].Count << "\n"; else diff --git a/llvm/test/Instrumentation/InstrProfiling/coverage.ll b/llvm/test/Instrumentation/InstrProfiling/coverage.ll index 08cbcaa962b76..bbf895ea4b34e 100644 --- a/llvm/test/Instrumentation/InstrProfiling/coverage.ll +++ b/llvm/test/Instrumentation/InstrProfiling/coverage.ll @@ -5,12 +5,12 @@ target triple = "aarch64-unknown-linux-gnu" @__profn_foo = private constant [3 x i8] c"foo" ; CHECK: @__profc_foo = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1 -; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64) -; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64), +; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64) +; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { 
i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64), @__profn_bar = private constant [3 x i8] c"bar" ; CHECK: @__profc_bar = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1 -; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64) -; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64), +; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64) +; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64), ; CHECK: @__llvm_prf_nm = {{.*}} section "__llvm_prf_names" ; BINARY: @__llvm_prf_nm ={{.*}} section "__llvm_covnames" diff --git a/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw b/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw index 3daa98f937b691880ffff203c9426bfacddf749d..5efda10bb98a941c04b6846db05d3691bc36aac0 100644 GIT binary patch delta 117 zcmZ3$GJ%D&u_!ISs37M*=R{6_L67IVA1SZ;|9^9yv+SKv1_s87mFlblGl86mORZTI rz>KHXyapf!P9f delta 133 zcmbQhvVeuNu_!ISs37M**F;W##f(djpGdFz|9^9xwDglu1`NP7F;ks2U=>hu;#6za s1Tf>OHE#ik0aQLiPe%I5WLZXI)&n4s$)Sw16~Kysa*R;Jz`Bw604`-Eq5uE@ diff --git a/llvm/test/Transforms/PGOProfile/comdat_internal.ll b/llvm/test/Transforms/PGOProfile/comdat_internal.ll index 1bad0db1b4762..8c6942c0f527b 100644 --- a/llvm/test/Transforms/PGOProfile/comdat_internal.ll +++ b/llvm/test/Transforms/PGOProfile/comdat_internal.ll @@ -13,9 +13,9 @@ $foo = comdat any ; CHECK: @__llvm_profile_raw_version = hidden constant i64 {{[0-9]+}}, comdat ; CHECK-NOT: __profn__stdin__foo ; CHECK: @__profc__stdin__foo.[[#FOO_HASH]] = private global [1 x i64] 
zeroinitializer, section "__llvm_prf_cnts", comdat, align 8 -; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null +; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null ; CHECK-NOT: @foo -; CHECK-SAME: , ptr null, i32 1, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8 +; CHECK-SAME: , ptr null, i32 1, [2 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8 ; CHECK: @__llvm_prf_nm ; CHECK: @llvm.compiler.used diff --git a/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw b/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw index a3e884343942ebc70ba95ab4ee006630b6816d80..9cd225587c92511e99f3497ce1d5f47c6fc5f0af 100644 GIT binary patch delta 39 vcmeys|A3#fu_!ISs37M*=R{6_K?|$bHJ=*(|Lz`(e%(w!XuMI#TR delta 40 wcmdnMe}JE}u_!ISs37M**F;W#z6X_m+ZL?)|9`G8Q)PVUWItx9jRg+u0BX?@Q2+n{ diff --git a/llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw b/llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw deleted file mode 100644 index 84707ba2070a92b8683010d9daaef747df35f9ac..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 528 zcmZoHO3N=Q$obF700xW@ih+Rz#(>i3d^BkWXQ;q~{}8~jee0hktN#DrJkOIkI+TF{ zX0YI^%?f`vOg;fr_5L!KFBeQb%shva5cM!VOdpINJ<~YH=c-N(O#cd~eK7d|0{XA2 zYFH&6%DWHJCbaDydjXpM1gQQWo?dWwGr?0yS0{Tm3_5AzQ$ 
z+Q7KtR(HRVzu%dYp1!6!$!AXbT=MqY*4O{3u}gA_;W2kfsb$ZfsH;9ZvV7_@)#;23 r{WSu+S$HaLo%TI*hM9pynsFJ}wH81UW(Uaqj8G0Nd|-00@P_dLvBrhT diff --git a/llvm/test/tools/llvm-profdata/binary-ids-padding.test b/llvm/test/tools/llvm-profdata/binary-ids-padding.test index 61881b69cfd5c..eda63203a304a 100644 --- a/llvm/test/tools/llvm-profdata/binary-ids-padding.test +++ b/llvm/test/tools/llvm-profdata/binary-ids-padding.test @@ -10,12 +10,10 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) -// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) -// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw // There will be 2 20-byte binary IDs, so the total Binary IDs size will be 64 bytes. // 2 * 8 binary ID sizes // + 2 * 20 binary IDs (of size 20) @@ -34,8 +32,6 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Binary IDs - There are only two in this case that are 20 bytes. 
RUN: printf '\24\0\0\0\0\0\0\0' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/large-binary-id-size.test b/llvm/test/tools/llvm-profdata/large-binary-id-size.test index 316a9a4c9df4c..38b838e0d100a 100644 --- a/llvm/test/tools/llvm-profdata/large-binary-id-size.test +++ b/llvm/test/tools/llvm-profdata/large-binary-id-size.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\40\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -12,8 +12,6 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Check for a corrupted size being too large past the end of the file. RUN: printf '\7\7\7\7\7\7\7\7' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test index 8b686d5c50cb7..c967e850dbe35 100644 --- a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test +++ b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test @@ -10,12 +10,10 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) -// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) -// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> 
%t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -28,8 +26,6 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Data Section // diff --git a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test index 089afad420622..2e747f81a6bfa 100644 --- a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test +++ b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test @@ -10,12 +10,10 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) -// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) -// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -28,8 +26,6 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Data Section // diff --git a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test index e404ba4210cc1..3c23bc7dd0f7f 100644 --- a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test +++ 
b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test @@ -10,12 +10,10 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) -// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) -// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -28,8 +26,6 @@ RUN: printf '\0\0\6\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\6\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Data Section // diff --git a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test index ee54bfb978567..4a5c42843ff4d 100644 --- a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test +++ b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw // We should fail on this because the binary IDs is not a multiple of 8 bytes. 
RUN: printf '\77\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -10,8 +10,6 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Binary IDs - There are only two in this case that are 20 bytes. RUN: printf '\24\0\0\0\0\0\0\0' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test index dfa163f1f3439..2a92575ee3407 100644 --- a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test +++ b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test @@ -15,8 +15,6 @@ RUN: printf '\0\0\0\0\0\0\0\20' >> %t RUN: printf '\0\0\0\1\0\4\0\0' >> %t RUN: printf '\0\0\0\2\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: not llvm-profdata show %t -o /dev/null 2>&1 | FileCheck %s diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test index 63782c8b94d4a..8220361df6cfa 100644 --- a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test +++ b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test @@ -1,6 +1,5 @@ -// Header RUN: printf '\377lprofR\201' > %t -RUN: printf '\0\0\0\0\0\0\0\12' >> %t +RUN: printf '\0\0\0\0\0\0\0\11' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -13,8 +12,6 @@ RUN: printf '\0\0\0\0\1\0\0\0' >> %t RUN: printf '\0\0\0\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\2\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\134\370\302\114\333\030\275\254' >> %t RUN: printf '\0\0\0\0\0\0\0\1' >> %t @@ -23,8 +20,9 @@ 
RUN: printf '\3\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\3' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\344\023\165\112\031\035\265\067' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t @@ -33,8 +31,9 @@ RUN: printf '\2\xff\xff\xd3' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\2' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\023' >> %t RUN: printf '\0\0\0\0\0\0\0\067' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test index e9569bec1178b..9352ae132380d 100644 --- a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test +++ b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test @@ -1,5 +1,5 @@ RUN: printf '\201Rforpl\377' > %t -RUN: printf '\12\0\0\0\0\0\0\0' >> %t +RUN: printf '\11\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\2\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,8 +12,6 @@ RUN: printf '\0\0\0\1\0\0\0\0' >> %t RUN: printf '\0\0\0\3\0\0\0\0' >> %t RUN: printf '\0\0\0\2\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\254\275\030\333\114\302\370\134' >> %t RUN: printf '\1\0\0\0\0\0\0\0' >> %t @@ -22,8 +20,9 @@ RUN: printf '\0\0\0\3' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\3\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\067\265\035\031\112\165\023\344' >> %t RUN: printf '\02\0\0\0\0\0\0\0' >> %t @@ -32,8 +31,9 @@ RUN: printf '\xd3\xff\xff\2' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf 
'\2\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\023\0\0\0\0\0\0\0' >> %t RUN: printf '\067\0\0\0\0\0\0\0' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test index 0bc579eec58ab..c3e995add6ff2 100644 --- a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test +++ b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test @@ -1,5 +1,5 @@ RUN: printf '\377lprofr\201' > %t -RUN: printf '\0\0\0\0\0\0\0\12' >> %t +RUN: printf '\0\0\0\0\0\0\0\11' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,8 +12,6 @@ RUN: printf '\0\0\0\1\0\4\0\0' >> %t RUN: printf '\0\0\0\3\0\4\0\0' >> %t RUN: printf '\0\0\0\2\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\134\370\302\114\333\030\275\254' >> %t RUN: printf '\0\0\0\0\0\0\0\1' >> %t @@ -22,8 +20,9 @@ RUN: printf '\0\0\0\3\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\3' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\344\023\165\112\031\035\265\067' >> %t RUN: printf '\0\0\0\0\0\0\0\02' >> %t @@ -32,8 +31,9 @@ RUN: printf '\0\0\0\3\0\3\xff\xc3' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\02' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\023' >> %t RUN: printf '\0\0\0\0\0\0\0\067' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test index ca9ea54c3f014..0b3ef2a89abe5 100644 --- 
a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test +++ b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t -RUN: printf '\12\0\0\0\0\0\0\0' >> %t +RUN: printf '\11\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\2\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,8 +12,6 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t RUN: printf '\0\0\4\0\3\0\0\0' >> %t RUN: printf '\0\0\4\0\2\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\254\275\030\333\114\302\370\134' >> %t RUN: printf '\1\0\0\0\0\0\0\0' >> %t @@ -22,8 +20,9 @@ RUN: printf '\0\0\4\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\3\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\067\265\035\031\112\165\023\344' >> %t RUN: printf '\02\0\0\0\0\0\0\0' >> %t @@ -32,8 +31,9 @@ RUN: printf '\xc3\xff\3\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\02\0\0\0' >> %t -RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t RUN: printf '\023\0\0\0\0\0\0\0' >> %t RUN: printf '\067\0\0\0\0\0\0\0' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-two-profiles.test b/llvm/test/tools/llvm-profdata/raw-two-profiles.test index 70a4210dea9f8..f4a9aa8e1bbc3 100644 --- a/llvm/test/tools/llvm-profdata/raw-two-profiles.test +++ b/llvm/test/tools/llvm-profdata/raw-two-profiles.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t-foo.profraw -RUN: printf '\12\0\0\0\0\0\0\0' >> %t-foo.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> 
%t-foo.profraw @@ -12,8 +12,6 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\254\275\030\333\114\302\370\134' >> %t-foo.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw @@ -28,7 +26,7 @@ RUN: printf '\023\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\3\0foo\0\0\0' >> %t-foo.profraw RUN: printf '\201rforpl\377' > %t-bar.profraw -RUN: printf '\12\0\0\0\0\0\0\0' >> %t-bar.profraw +RUN: printf '\11\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw @@ -41,8 +39,6 @@ RUN: printf '\0\0\6\0\1\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\6\0\2\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw -RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\067\265\035\031\112\165\023\344' >> %t-bar.profraw RUN: printf '\02\0\0\0\0\0\0\0' >> %t-bar.profraw From 386aa7b16977150da917a78423fd05cb19609850 Mon Sep 17 00:00:00 2001 From: Diego Caballero Date: Wed, 21 Feb 2024 22:52:02 -0800 Subject: [PATCH 004/546] [mlir][Vector] Replace `vector.shuffle` with `vector.interleave` in vector narrow type emulation (#82550) This PR replaces the generation of `vector.shuffle` with `vector.interleave` in the i4 conversions in vector narrow type emulation. The multi dimensional semantics of `vector.interleave` allow us to enable these conversion emulations also for multi dimensional vectors. 
--- .../Transforms/VectorEmulateNarrowType.cpp | 27 ++---- .../Vector/vector-rewrite-narrow-types.mlir | 82 +++++++++++++------ 2 files changed, 68 insertions(+), 41 deletions(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp index 36fb66708407b..fc11ae63e718a 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp @@ -724,9 +724,8 @@ BitCastRewriter::BitCastRewriter(VectorType sourceVectorType, static LogicalResult commonConversionPrecondition(PatternRewriter &rewriter, VectorType preconditionType, Operation *op) { - if (!preconditionType || preconditionType.getRank() != 1 || - preconditionType.isScalable()) - return rewriter.notifyMatchFailure(op, "scalable or >1-D vector"); + if (!preconditionType || preconditionType.isScalable()) + return rewriter.notifyMatchFailure(op, "scalable vector"); // TODO: consider relaxing this restriction in the future if we find ways // to really work with subbyte elements across the MLIR/LLVM boundary. @@ -743,6 +742,9 @@ LogicalResult BitCastRewriter::commonPrecondition(PatternRewriter &rewriter, if (!enumerator.sourceVectorType || !enumerator.targetVectorType) return rewriter.notifyMatchFailure(op, "types are not vector"); + if (!preconditionType || preconditionType.getRank() != 1) + return rewriter.notifyMatchFailure(op, "unsupported >1-D vector"); + return commonConversionPrecondition(rewriter, preconditionType, op); } @@ -855,7 +857,6 @@ static Value rewriteI4ToI8SignedExt(PatternRewriter &rewriter, Location loc, "Expected i4 type"); // 1. Generate a bitcast vector -> vector. 
- int64_t vecDimSize = srcVecType.getShape().back(); SmallVector i8VecShape = llvm::to_vector(srcVecType.getShape()); constexpr int64_t i4Toi8BitwidthFactor = 2; i8VecShape.back() = i8VecShape.back() / i4Toi8BitwidthFactor; @@ -871,16 +872,8 @@ static Value rewriteI4ToI8SignedExt(PatternRewriter &rewriter, Location loc, Value low = rewriter.create(loc, shl, shiftValues); Value high = rewriter.create(loc, i8Vector, shiftValues); - // 3. Interleave low and high i8 elements using a shuffle. - SmallVector interleaveMaskValues; - interleaveMaskValues.reserve(vecDimSize); - for (int i = 0, end = vecDimSize / 2; i < end; ++i) { - interleaveMaskValues.push_back(i); - interleaveMaskValues.push_back(i + (vecDimSize / 2)); - } - - return rewriter.create( - loc, low, high, rewriter.getI64ArrayAttr(interleaveMaskValues)); + // 3. Interleave low and high i8 elements. + return rewriter.create(loc, low, high); } namespace { @@ -1008,8 +1001,7 @@ struct RewriteExtOfBitCast : OpRewritePattern { /// %1 = arith.shli %0, 4 : vector<4xi8> /// %2 = arith.shrsi %1, 4 : vector<4xi8> /// %3 = arith.shrsi %0, 4 : vector<4xi8> -/// %4 = vector.shuffle %2, %3 [0, 4, 1, 5, 2, 6, 3, 7] -/// : vector<4xi8>, vector<4xi8> +/// %4 = vector.interleave %2, %3 : vector<4xi8> /// %5 = arith.extsi %4 : vector<8xi8> to vector<8xi32> /// /// arith.sitofp %in : vector<8xi4> to vector<8xf32> @@ -1018,8 +1010,7 @@ struct RewriteExtOfBitCast : OpRewritePattern { /// %1 = arith.shli %0, 4 : vector<4xi8> /// %2 = arith.shrsi %1, 4 : vector<4xi8> /// %3 = arith.shrsi %0, 4 : vector<4xi8> -/// %4 = vector.shuffle %2, %3 [0, 4, 1, 5, 2, 6, 3, 7] -/// : vector<4xi8>, vector<4xi8> +/// %4 = vector.interleave %2, %3 : vector<4xi8> /// %5 = arith.sitofp %4 : vector<8xi8> to vector<8xf32> /// template diff --git a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir index 02063a81664b8..94e78ce40a3c1 100644 --- 
a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir +++ b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir @@ -195,53 +195,89 @@ func.func @f3ext(%a: vector<5xi8>) -> vector<8xi17> { // CHECK-LABEL: func.func @aligned_extsi( func.func @aligned_extsi(%a: vector<8xi4>) -> vector<8xi32> { - // CHECK: arith.shli - // CHECK: arith.shrsi - // CHECK: arith.shrsi - // CHECK: vector.shuffle - // CHECK: arith.extsi %{{.*}} : vector<8xi8> to vector<8xi32> +// CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi32> { +// CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi4> to vector<4xi8> +// CHECK: %[[SHL_LOW:.*]] = arith.shli %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[LOW:.*]] = arith.shrsi %[[SHL_LOW]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[HIGH:.*]] = arith.shrsi %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<4xi8> +// CHECK: %[[I32:.*]] = arith.extsi %[[INTERLEAVE]] : vector<8xi8> to vector<8xi32> %0 = arith.extsi %a : vector<8xi4> to vector<8xi32> return %0 : vector<8xi32> } +// CHECK-LABEL: func.func @aligned_extsi_2d( +func.func @aligned_extsi_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { +// CHECK-SAME: %[[IN:.*]]: vector<8x32xi4>) -> vector<8x32xi32> { +// CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<8x16xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8x32xi4> to vector<8x16xi8> +// CHECK: %[[SHL_LOW:.*]] = arith.shli %[[BITCAST]], %[[I4_BITS]] : vector<8x16xi8> +// CHECK: %[[LOW:.*]] = arith.shrsi %[[SHL_LOW]], %[[I4_BITS]] : vector<8x16xi8> +// CHECK: %[[HIGH:.*]] = arith.shrsi %[[BITCAST]], %[[I4_BITS]] : vector<8x16xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<8x16xi8> +// CHECK: %[[I32:.*]] = arith.extsi %[[INTERLEAVE]] : vector<8x32xi8> to vector<8x32xi32> + %0 = arith.extsi %a : vector<8x32xi4> to 
vector<8x32xi32> + return %0 : vector<8x32xi32> +} + // CHECK-LABEL: func.func @aligned_extsi_base_case( func.func @aligned_extsi_base_case(%a: vector<8xi4>) -> vector<8xi8> { - // CHECK: arith.shli - // CHECK: arith.shrsi - // CHECK: arith.shrsi - // CHECK: vector.shuffle - // CHECK-NOT: arith.extsi +// CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi8> { +// CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi4> to vector<4xi8> +// CHECK: %[[SHL_LOW:.*]] = arith.shli %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[LOW:.*]] = arith.shrsi %[[SHL_LOW]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[HIGH:.*]] = arith.shrsi %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<4xi8> %0 = arith.extsi %a : vector<8xi4> to vector<8xi8> return %0 : vector<8xi8> } // CHECK-LABEL: func.func @aligned_sitofp( func.func @aligned_sitofp(%a: vector<8xi4>) -> vector<8xf32> { - // CHECK: arith.shli - // CHECK: arith.shrsi - // CHECK: arith.shrsi - // CHECK: shuffle - // CHECK: arith.sitofp %{{.*}} : vector<8xi8> to vector<8xf32> +// CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xf32> { +// CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi4> to vector<4xi8> +// CHECK: %[[SHL_LOW:.*]] = arith.shli %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[LOW:.*]] = arith.shrsi %[[SHL_LOW]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[HIGH:.*]] = arith.shrsi %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<4xi8> +// CHECK: %[[F32:.*]] = arith.sitofp %[[INTERLEAVE]] : vector<8xi8> to vector<8xf32> %0 = arith.sitofp %a : vector<8xi4> to vector<8xf32> return %0 : vector<8xf32> } +// CHECK-LABEL: func.func @aligned_sitofp_2d( +func.func @aligned_sitofp_2d(%a: vector<8x32xi4>) -> 
vector<8x32xf32> { +// CHECK-SAME: %[[IN:.*]]: vector<8x32xi4>) -> vector<8x32xf32> { +// CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<8x16xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8x32xi4> to vector<8x16xi8> +// CHECK: %[[SHL_LOW:.*]] = arith.shli %[[BITCAST]], %[[I4_BITS]] : vector<8x16xi8> +// CHECK: %[[LOW:.*]] = arith.shrsi %[[SHL_LOW]], %[[I4_BITS]] : vector<8x16xi8> +// CHECK: %[[HIGH:.*]] = arith.shrsi %[[BITCAST]], %[[I4_BITS]] : vector<8x16xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<8x16xi8> +// CHECK: %[[F32:.*]] = arith.sitofp %[[INTERLEAVE]] : vector<8x32xi8> to vector<8x32xf32> + %0 = arith.sitofp %a : vector<8x32xi4> to vector<8x32xf32> + return %0 : vector<8x32xf32> +} + // CHECK-LABEL: func.func @i4_transpose( -// CHECK-SAME: %[[A:[0-9a-z]*]] func.func @i4_transpose(%a: vector<8x16xi4>) -> vector<16x8xi4> { - // CHECK: %[[EXT:.*]] = arith.extsi %[[A]] : vector<8x16xi4> to vector<8x16xi8> - // CHECK: %[[TRANS:.*]] = vector.transpose %[[EXT]], [1, 0] : vector<8x16xi8> to vector<16x8xi8> - // CHECK: %[[TRUNC:.*]] = arith.trunci %[[TRANS]] : vector<16x8xi8> to vector<16x8xi4> +// CHECK-SAME: %[[IN:.*]]: vector<8x16xi4>) -> vector<16x8xi4> { +// CHECK: %[[EXT:.*]] = vector.interleave +// CHECK: %[[TRANS:.*]] = vector.transpose %[[EXT]], [1, 0] : vector<8x16xi8> to vector<16x8xi8> +// CHECK: %[[TRUNC:.*]] = arith.trunci %[[TRANS]] : vector<16x8xi8> to vector<16x8xi4> %0 = vector.transpose %a, [1, 0] : vector<8x16xi4> to vector<16x8xi4> return %0 : vector<16x8xi4> } // CHECK-LABEL: func.func @i7_transpose( -// CHECK-SAME: %[[A:[0-9a-z]*]] func.func @i7_transpose(%a: vector<8x16xi7>) -> vector<16x8xi7> { - // CHECK: %[[EXT:.*]] = arith.extsi %[[A]] : vector<8x16xi7> to vector<8x16xi8> - // CHECK: %[[TRANS:.*]] = vector.transpose %[[EXT]], [1, 0] : vector<8x16xi8> to vector<16x8xi8> - // CHECK: %[[TRUNC:.*]] = arith.trunci %[[TRANS]] : vector<16x8xi8> to vector<16x8xi7> +// 
CHECK-SAME: %[[IN:.*]]: vector<8x16xi7>) -> vector<16x8xi7> { +// CHECK: %[[EXT:.*]] = arith.extsi %[[IN]] : vector<8x16xi7> to vector<8x16xi8> +// CHECK: %[[TRANS:.*]] = vector.transpose %[[EXT]], [1, 0] : vector<8x16xi8> to vector<16x8xi8> +// CHECK: %[[TRUNC:.*]] = arith.trunci %[[TRANS]] : vector<16x8xi8> to vector<16x8xi7> %0 = vector.transpose %a, [1, 0] : vector<8x16xi7> to vector<16x8xi7> return %0 : vector<16x8xi7> } From 675791335285fa86434dc46e5c92f543e0e79d19 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Wed, 21 Feb 2024 22:59:03 -0800 Subject: [PATCH 005/546] [lldb][test] Fix PythonDataObjectsTest This is using `FileSystem::Instance()` w/o calling `FileSystem::Initialize()`. Use `SubsystemRAII` to do that. --- .../Python/PythonDataObjectsTests.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonDataObjectsTests.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonDataObjectsTests.cpp index a4db4627f935b..b90fbb7830995 100644 --- a/lldb/unittests/ScriptInterpreter/Python/PythonDataObjectsTests.cpp +++ b/lldb/unittests/ScriptInterpreter/Python/PythonDataObjectsTests.cpp @@ -11,6 +11,7 @@ #include "Plugins/ScriptInterpreter/Python/PythonDataObjects.h" #include "Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.h" +#include "TestingSupport/SubsystemRAII.h" #include "lldb/Host/File.h" #include "lldb/Host/FileSystem.h" #include "lldb/Host/HostInfo.h" @@ -26,6 +27,8 @@ using namespace lldb_private::python; using llvm::Expected; class PythonDataObjectsTest : public PythonTestSuite { + SubsystemRAII subsystems; + public: void SetUp() override { PythonTestSuite::SetUp(); @@ -209,8 +212,8 @@ TEST_F(PythonDataObjectsTest, TestPythonBoolean) { }; // Test PythonBoolean constructed from long integer values. - test_from_long(0); // Test 'false' value. - test_from_long(1); // Test 'true' value. + test_from_long(0); // Test 'false' value. + test_from_long(1); // Test 'true' value. 
test_from_long(~0); // Any value != 0 is 'true'. } @@ -811,7 +814,8 @@ main = foo testing::ContainsRegex("line 7, in baz"), testing::ContainsRegex("ZeroDivisionError"))))); -#if !((defined(_WIN32) || defined(_WIN64)) && (defined(__aarch64__) || defined(_M_ARM64))) +#if !((defined(_WIN32) || defined(_WIN64)) && \ + (defined(__aarch64__) || defined(_M_ARM64))) static const char script2[] = R"( class MyError(Exception): From 6676f67e3103bb6779d226de6bb4f0f8f8ab99f2 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 22 Feb 2024 07:20:47 +0000 Subject: [PATCH 006/546] [mlir][Bazel] Remove stub target which is not needed anymore. --- .../llvm-project-overlay/mlir/BUILD.bazel | 40 ------------------- 1 file changed, 40 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index bb7a34ef76772..694602b1a7cbf 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -5540,7 +5540,6 @@ cc_library( ":SCFDialect", ":SPIRVDialect", ":SPIRVTarget", - ":SerializeToCubin_stub", ":SideEffectInterfaces", ":Support", ":ToLLVMIRTranslation", @@ -5579,44 +5578,6 @@ cc_library( ]), ) -write_file( - name = "SerializeToCubin_stub_cc", - out = "SerializeToCubin_stub.cc", - content = [ - """ -#include "mlir/Dialect/GPU/Transforms/Passes.h" - -// Provide a weak registration stub in case the real SerializeToCubin is not -// linked in. - -#if defined(_MSC_VER) -// This might not work correctly, but it avoids a compilation error because -// MSVC does not support __attribute__((weak)). 
-void mlir::registerGpuSerializeToCubinPass() {} -#else -__attribute__((weak)) void mlir::registerGpuSerializeToCubinPass() {} -#endif -""", - ], -) - -cc_library( - name = "SerializeToCubin_stub", - srcs = [":SerializeToCubin_stub_cc"], - hdrs = glob(["include/mlir/Dialect/GPU/Transforms/*.h"]), - includes = ["include"], - deps = [ - ":GPUDialect", - ":GPUPassIncGen", - ":IR", - ":Pass", - ":SPIRVDialect", - ":Support", - ":VectorDialect", - "//llvm:Support", - ], -) - td_library( name = "GPUTransformOpsTdFiles", srcs = [ @@ -13193,7 +13154,6 @@ cc_library( ], ) - ##---------------------------------------------------------------------------## # Allocation interfaces ##---------------------------------------------------------------------------## From bc1c86b810e518a8e3fa90d5c26908c43788873d Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 22 Feb 2024 07:24:46 +0000 Subject: [PATCH 007/546] [mlir][Bazel] Also remove SerializeToCubin target. --- .../llvm-project-overlay/mlir/BUILD.bazel | 30 +------------------ 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 694602b1a7cbf..a34874efa5b19 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3231,9 +3231,7 @@ cc_library( ":Transforms", ":VectorToLLVM", ":VectorTransforms", - ] + if_cuda_available([ - ":SerializeToCubin", - ]), + ], ) ##---------------------------------------------------------------------------## @@ -5504,9 +5502,6 @@ cc_library( "lib/Dialect/GPU/Transforms/*.cpp", "lib/Dialect/GPU/Transforms/*.h", ], - exclude = [ - "lib/Dialect/GPU/Transforms/SerializeToCubin.cpp", - ], ), hdrs = glob(["include/mlir/Dialect/GPU/Transforms/*.h"]), includes = ["include"], @@ -5556,28 +5551,6 @@ cc_library( ]), ) -cc_library( - name = "SerializeToCubin", - srcs = [ - "lib/Dialect/GPU/Transforms/SerializeToCubin.cpp", - ], - 
local_defines = if_cuda_available(["MLIR_GPU_TO_CUBIN_PASS_ENABLE"]), - deps = [ - ":GPUDialect", - ":GPUPassIncGen", - ":GPUTransforms", - ":NVVMDialect", - ":NVVMToLLVMIRTranslation", - ":Pass", - ":Support", - ":ToLLVMIRTranslation", - "//llvm:Support", - ] + if_cuda_available([ - "@cuda//:cuda_headers", - "@cuda//:libcuda", - ]), -) - td_library( name = "GPUTransformOpsTdFiles", srcs = [ @@ -9190,7 +9163,6 @@ cc_binary( ":Pass", ":QuantOps", ":SCFToGPU", - ":SerializeToCubin", ":Support", ":Transforms", "//llvm:AllTargetsCodeGens", From 7e97ae35ae2d1c38d149e670139a538bdba86e93 Mon Sep 17 00:00:00 2001 From: Yeting Kuo <46629943+yetingk@users.noreply.github.com> Date: Thu, 22 Feb 2024 15:51:19 +0800 Subject: [PATCH 008/546] [RISCV] Teach RISCVMakeCompressible handle Zca/Zcf/Zce/Zcd. (#81844) Make targets which don't have C but have Zca/Zcf/Zce/Zcd benefit from this pass. --- .../Target/RISCV/RISCVMakeCompressible.cpp | 31 +- llvm/lib/Target/RISCV/RISCVSubtarget.h | 4 + llvm/test/CodeGen/RISCV/make-compressible.mir | 499 +++++++++++++----- 3 files changed, 400 insertions(+), 134 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp index ff21fe1d40646..af864ba0fbc46 100644 --- a/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp +++ b/llvm/lib/Target/RISCV/RISCVMakeCompressible.cpp @@ -143,19 +143,35 @@ static bool isCompressedReg(Register Reg) { // Return true if MI is a load for which there exists a compressed version. 
static bool isCompressibleLoad(const MachineInstr &MI) { const RISCVSubtarget &STI = MI.getMF()->getSubtarget(); - const unsigned Opcode = MI.getOpcode(); - return Opcode == RISCV::LW || (!STI.is64Bit() && Opcode == RISCV::FLW) || - Opcode == RISCV::LD || Opcode == RISCV::FLD; + switch (MI.getOpcode()) { + default: + return false; + case RISCV::LW: + case RISCV::LD: + return STI.hasStdExtCOrZca(); + case RISCV::FLW: + return !STI.is64Bit() && STI.hasStdExtCOrZcfOrZce(); + case RISCV::FLD: + return STI.hasStdExtCOrZcd(); + } } // Return true if MI is a store for which there exists a compressed version. static bool isCompressibleStore(const MachineInstr &MI) { const RISCVSubtarget &STI = MI.getMF()->getSubtarget(); - const unsigned Opcode = MI.getOpcode(); - return Opcode == RISCV::SW || (!STI.is64Bit() && Opcode == RISCV::FSW) || - Opcode == RISCV::SD || Opcode == RISCV::FSD; + switch (MI.getOpcode()) { + default: + return false; + case RISCV::SW: + case RISCV::SD: + return STI.hasStdExtCOrZca(); + case RISCV::FSW: + return !STI.is64Bit() && STI.hasStdExtCOrZcfOrZce(); + case RISCV::FSD: + return STI.hasStdExtCOrZcd(); + } } // Find a single register and/or large offset which, if compressible, would @@ -324,8 +340,7 @@ bool RISCVMakeCompressibleOpt::runOnMachineFunction(MachineFunction &Fn) { const RISCVInstrInfo &TII = *STI.getInstrInfo(); // This optimization only makes sense if compressed instructions are emitted. - // FIXME: Support Zca, Zcf, Zcd granularity. 
- if (!STI.hasStdExtC()) + if (!STI.hasStdExtCOrZca()) return false; for (MachineBasicBlock &MBB : Fn) { diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 4b60d7aff22a0..9ebf278d6749f 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -143,6 +143,10 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { #include "RISCVGenSubtargetInfo.inc" bool hasStdExtCOrZca() const { return HasStdExtC || HasStdExtZca; } + bool hasStdExtCOrZcd() const { return HasStdExtC || HasStdExtZcd; } + bool hasStdExtCOrZcfOrZce() const { + return HasStdExtC || HasStdExtZcf || HasStdExtZce; + } bool hasStdExtZvl() const { return ZvlLen != 0; } bool hasStdExtFOrZfinx() const { return HasStdExtF || HasStdExtZfinx; } bool hasStdExtDOrZdinx() const { return HasStdExtD || HasStdExtZdinx; } diff --git a/llvm/test/CodeGen/RISCV/make-compressible.mir b/llvm/test/CodeGen/RISCV/make-compressible.mir index 2105a13bc8c7b..03da38a6863e7 100644 --- a/llvm/test/CodeGen/RISCV/make-compressible.mir +++ b/llvm/test/CodeGen/RISCV/make-compressible.mir @@ -1,8 +1,14 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -o - %s -mtriple=riscv32 -mattr=+c,+f,+d -simplify-mir \ -# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefix=RV32 %s +# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV32,RV32C %s # RUN: llc -o - %s -mtriple=riscv64 -mattr=+c,+f,+d -simplify-mir \ -# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefix=RV64 %s +# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV64,RV64C %s +# RUN: llc -o - %s -mtriple=riscv32 -mattr=+d,+zcf -simplify-mir \ +# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV32,RV32ZCF %s +# RUN: llc -o - %s -mtriple=riscv32 -mattr=+d,+zca -simplify-mir \ +# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV32,RV32ZCA %s +# RUN: llc 
-o - %s -mtriple=riscv64 -mattr=+d,+zca -simplify-mir \ +# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV64,RV64ZCA %s --- | define void @store_common_value(ptr %a, ptr %b, ptr %c) #0 { @@ -288,7 +294,7 @@ ret { double, double } %3 } - attributes #0 = { minsize "target-features"="+c,+f,+d" } + attributes #0 = { minsize } ... --- @@ -306,6 +312,7 @@ body: | ; RV32-NEXT: SW $x13, killed renamable $x11, 0 :: (store (s32) into %ir.b) ; RV32-NEXT: SW $x13, killed renamable $x12, 0 :: (store (s32) into %ir.c) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_value ; RV64: liveins: $x10, $x11, $x12 ; RV64-NEXT: {{ $}} @@ -327,14 +334,15 @@ body: | bb.0.entry: liveins: $x10, $x11, $x12, $f16_f - ; RV32-LABEL: name: store_common_value_float - ; RV32: liveins: $x10, $x11, $x12, $f16_f - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $f15_f = FSGNJ_S $f16_f, $f16_f - ; RV32-NEXT: FSW $f15_f, killed renamable $x10, 0 :: (store (s32) into %ir.a) - ; RV32-NEXT: FSW $f15_f, killed renamable $x11, 0 :: (store (s32) into %ir.b) - ; RV32-NEXT: FSW killed $f15_f, killed renamable $x12, 0 :: (store (s32) into %ir.c) - ; RV32-NEXT: PseudoRET + ; RV32C-LABEL: name: store_common_value_float + ; RV32C: liveins: $x10, $x11, $x12, $f16_f + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $f15_f = FSGNJ_S $f16_f, $f16_f + ; RV32C-NEXT: FSW $f15_f, killed renamable $x10, 0 :: (store (s32) into %ir.a) + ; RV32C-NEXT: FSW $f15_f, killed renamable $x11, 0 :: (store (s32) into %ir.b) + ; RV32C-NEXT: FSW killed $f15_f, killed renamable $x12, 0 :: (store (s32) into %ir.c) + ; RV32C-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_value_float ; RV64: liveins: $x10, $x11, $x12, $f16_f ; RV64-NEXT: {{ $}} @@ -342,6 +350,23 @@ body: | ; RV64-NEXT: FSW renamable $f16_f, killed renamable $x11, 0 :: (store (s32) into %ir.b) ; RV64-NEXT: FSW killed renamable $f16_f, killed renamable $x12, 0 :: (store (s32) into %ir.c) ; RV64-NEXT: PseudoRET + ; + ; RV32ZCF-LABEL: name: 
store_common_value_float + ; RV32ZCF: liveins: $x10, $x11, $x12, $f16_f + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: $f15_f = FSGNJ_S $f16_f, $f16_f + ; RV32ZCF-NEXT: FSW $f15_f, killed renamable $x10, 0 :: (store (s32) into %ir.a) + ; RV32ZCF-NEXT: FSW $f15_f, killed renamable $x11, 0 :: (store (s32) into %ir.b) + ; RV32ZCF-NEXT: FSW killed $f15_f, killed renamable $x12, 0 :: (store (s32) into %ir.c) + ; RV32ZCF-NEXT: PseudoRET + ; + ; RV32ZCA-LABEL: name: store_common_value_float + ; RV32ZCA: liveins: $x10, $x11, $x12, $f16_f + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: FSW renamable $f16_f, killed renamable $x10, 0 :: (store (s32) into %ir.a) + ; RV32ZCA-NEXT: FSW renamable $f16_f, killed renamable $x11, 0 :: (store (s32) into %ir.b) + ; RV32ZCA-NEXT: FSW killed renamable $f16_f, killed renamable $x12, 0 :: (store (s32) into %ir.c) + ; RV32ZCA-NEXT: PseudoRET FSW renamable $f16_f, killed renamable $x10, 0 :: (store (s32) into %ir.a) FSW renamable $f16_f, killed renamable $x11, 0 :: (store (s32) into %ir.b) FSW killed renamable $f16_f, killed renamable $x12, 0 :: (store (s32) into %ir.c) @@ -355,22 +380,47 @@ body: | bb.0.entry: liveins: $x10, $x11, $x12, $f16_d - ; RV32-LABEL: name: store_common_value_double - ; RV32: liveins: $x10, $x11, $x12, $f16_d - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $f15_d = FSGNJ_D $f16_d, $f16_d - ; RV32-NEXT: FSD $f15_d, killed renamable $x10, 0 :: (store (s64) into %ir.a) - ; RV32-NEXT: FSD $f15_d, killed renamable $x11, 0 :: (store (s64) into %ir.b) - ; RV32-NEXT: FSD killed $f15_d, killed renamable $x12, 0 :: (store (s64) into %ir.c) - ; RV32-NEXT: PseudoRET - ; RV64-LABEL: name: store_common_value_double - ; RV64: liveins: $x10, $x11, $x12, $f16_d - ; RV64-NEXT: {{ $}} - ; RV64-NEXT: $f15_d = FSGNJ_D $f16_d, $f16_d - ; RV64-NEXT: FSD $f15_d, killed renamable $x10, 0 :: (store (s64) into %ir.a) - ; RV64-NEXT: FSD $f15_d, killed renamable $x11, 0 :: (store (s64) into %ir.b) - ; RV64-NEXT: FSD killed $f15_d, killed renamable $x12, 0 :: 
(store (s64) into %ir.c) - ; RV64-NEXT: PseudoRET + ; RV32C-LABEL: name: store_common_value_double + ; RV32C: liveins: $x10, $x11, $x12, $f16_d + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $f15_d = FSGNJ_D $f16_d, $f16_d + ; RV32C-NEXT: FSD $f15_d, killed renamable $x10, 0 :: (store (s64) into %ir.a) + ; RV32C-NEXT: FSD $f15_d, killed renamable $x11, 0 :: (store (s64) into %ir.b) + ; RV32C-NEXT: FSD killed $f15_d, killed renamable $x12, 0 :: (store (s64) into %ir.c) + ; RV32C-NEXT: PseudoRET + ; + ; RV64C-LABEL: name: store_common_value_double + ; RV64C: liveins: $x10, $x11, $x12, $f16_d + ; RV64C-NEXT: {{ $}} + ; RV64C-NEXT: $f15_d = FSGNJ_D $f16_d, $f16_d + ; RV64C-NEXT: FSD $f15_d, killed renamable $x10, 0 :: (store (s64) into %ir.a) + ; RV64C-NEXT: FSD $f15_d, killed renamable $x11, 0 :: (store (s64) into %ir.b) + ; RV64C-NEXT: FSD killed $f15_d, killed renamable $x12, 0 :: (store (s64) into %ir.c) + ; RV64C-NEXT: PseudoRET + ; + ; RV32ZCF-LABEL: name: store_common_value_double + ; RV32ZCF: liveins: $x10, $x11, $x12, $f16_d + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: FSD renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a) + ; RV32ZCF-NEXT: FSD renamable $f16_d, killed renamable $x11, 0 :: (store (s64) into %ir.b) + ; RV32ZCF-NEXT: FSD killed renamable $f16_d, killed renamable $x12, 0 :: (store (s64) into %ir.c) + ; RV32ZCF-NEXT: PseudoRET + ; + ; RV32ZCA-LABEL: name: store_common_value_double + ; RV32ZCA: liveins: $x10, $x11, $x12, $f16_d + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: FSD renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a) + ; RV32ZCA-NEXT: FSD renamable $f16_d, killed renamable $x11, 0 :: (store (s64) into %ir.b) + ; RV32ZCA-NEXT: FSD killed renamable $f16_d, killed renamable $x12, 0 :: (store (s64) into %ir.c) + ; RV32ZCA-NEXT: PseudoRET + ; + ; RV64ZCA-LABEL: name: store_common_value_double + ; RV64ZCA: liveins: $x10, $x11, $x12, $f16_d + ; RV64ZCA-NEXT: {{ $}} + ; RV64ZCA-NEXT: FSD renamable $f16_d, killed 
renamable $x10, 0 :: (store (s64) into %ir.a) + ; RV64ZCA-NEXT: FSD renamable $f16_d, killed renamable $x11, 0 :: (store (s64) into %ir.b) + ; RV64ZCA-NEXT: FSD killed renamable $f16_d, killed renamable $x12, 0 :: (store (s64) into %ir.c) + ; RV64ZCA-NEXT: PseudoRET FSD renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a) FSD renamable $f16_d, killed renamable $x11, 0 :: (store (s64) into %ir.b) FSD killed renamable $f16_d, killed renamable $x12, 0 :: (store (s64) into %ir.c) @@ -395,6 +445,7 @@ body: | ; RV32-NEXT: renamable $x10 = ADDI $x0, 5 ; RV32-NEXT: SW killed renamable $x10, killed $x11, 0 :: (volatile store (s32) into %ir.p) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_ptr ; RV64: liveins: $x16 ; RV64-NEXT: {{ $}} @@ -432,6 +483,7 @@ body: | ; RV32-NEXT: SW killed renamable $x10, $x11, 0 :: (volatile store (s32) into %ir.p) ; RV32-NEXT: SW killed $x11, $x11, 0 :: (volatile store (s32) into %ir.q) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_ptr_self ; RV64: liveins: $x16 ; RV64-NEXT: {{ $}} @@ -457,14 +509,15 @@ body: | bb.0.entry: liveins: $x16, $f10_f, $f11_f, $f12_f - ; RV32-LABEL: name: store_common_ptr_float - ; RV32: liveins: $x16, $f10_f, $f11_f, $f12_f - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $x10 = ADDI $x16, 0 - ; RV32-NEXT: FSW killed renamable $f10_f, $x10, 0 :: (volatile store (s32) into %ir.p) - ; RV32-NEXT: FSW killed renamable $f11_f, $x10, 0 :: (volatile store (s32) into %ir.p) - ; RV32-NEXT: FSW killed renamable $f12_f, killed $x10, 0 :: (volatile store (s32) into %ir.p) - ; RV32-NEXT: PseudoRET + ; RV32C-LABEL: name: store_common_ptr_float + ; RV32C: liveins: $x16, $f10_f, $f11_f, $f12_f + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $x10 = ADDI $x16, 0 + ; RV32C-NEXT: FSW killed renamable $f10_f, $x10, 0 :: (volatile store (s32) into %ir.p) + ; RV32C-NEXT: FSW killed renamable $f11_f, $x10, 0 :: (volatile store (s32) into %ir.p) + ; RV32C-NEXT: FSW killed renamable $f12_f, killed $x10, 0 :: (volatile 
store (s32) into %ir.p) + ; RV32C-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_ptr_float ; RV64: liveins: $x16, $f10_f, $f11_f, $f12_f ; RV64-NEXT: {{ $}} @@ -472,6 +525,23 @@ body: | ; RV64-NEXT: FSW killed renamable $f11_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p) ; RV64-NEXT: FSW killed renamable $f12_f, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p) ; RV64-NEXT: PseudoRET + ; + ; RV32ZCF-LABEL: name: store_common_ptr_float + ; RV32ZCF: liveins: $x16, $f10_f, $f11_f, $f12_f + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: $x10 = ADDI $x16, 0 + ; RV32ZCF-NEXT: FSW killed renamable $f10_f, $x10, 0 :: (volatile store (s32) into %ir.p) + ; RV32ZCF-NEXT: FSW killed renamable $f11_f, $x10, 0 :: (volatile store (s32) into %ir.p) + ; RV32ZCF-NEXT: FSW killed renamable $f12_f, killed $x10, 0 :: (volatile store (s32) into %ir.p) + ; RV32ZCF-NEXT: PseudoRET + ; + ; RV32ZCA-LABEL: name: store_common_ptr_float + ; RV32ZCA: liveins: $x16, $f10_f, $f11_f, $f12_f + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: FSW killed renamable $f10_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p) + ; RV32ZCA-NEXT: FSW killed renamable $f11_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p) + ; RV32ZCA-NEXT: FSW killed renamable $f12_f, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p) + ; RV32ZCA-NEXT: PseudoRET FSW killed renamable $f10_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p) FSW killed renamable $f11_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p) FSW killed renamable $f12_f, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p) @@ -485,22 +555,47 @@ body: | bb.0.entry: liveins: $x16, $f10_d, $f11_d, $f12_d - ; RV32-LABEL: name: store_common_ptr_double - ; RV32: liveins: $x16, $f10_d, $f11_d, $f12_d - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $x10 = ADDI $x16, 0 - ; RV32-NEXT: FSD killed renamable $f10_d, $x10, 0 :: (volatile store (s64) into %ir.p) - ; RV32-NEXT: FSD killed renamable $f11_d, $x10, 0 :: 
(volatile store (s64) into %ir.p) - ; RV32-NEXT: FSD killed renamable $f12_d, killed $x10, 0 :: (volatile store (s64) into %ir.p) - ; RV32-NEXT: PseudoRET - ; RV64-LABEL: name: store_common_ptr_double - ; RV64: liveins: $x16, $f10_d, $f11_d, $f12_d - ; RV64-NEXT: {{ $}} - ; RV64-NEXT: $x10 = ADDI $x16, 0 - ; RV64-NEXT: FSD killed renamable $f10_d, $x10, 0 :: (volatile store (s64) into %ir.p) - ; RV64-NEXT: FSD killed renamable $f11_d, $x10, 0 :: (volatile store (s64) into %ir.p) - ; RV64-NEXT: FSD killed renamable $f12_d, killed $x10, 0 :: (volatile store (s64) into %ir.p) - ; RV64-NEXT: PseudoRET + ; RV32C-LABEL: name: store_common_ptr_double + ; RV32C: liveins: $x16, $f10_d, $f11_d, $f12_d + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $x10 = ADDI $x16, 0 + ; RV32C-NEXT: FSD killed renamable $f10_d, $x10, 0 :: (volatile store (s64) into %ir.p) + ; RV32C-NEXT: FSD killed renamable $f11_d, $x10, 0 :: (volatile store (s64) into %ir.p) + ; RV32C-NEXT: FSD killed renamable $f12_d, killed $x10, 0 :: (volatile store (s64) into %ir.p) + ; RV32C-NEXT: PseudoRET + ; + ; RV64C-LABEL: name: store_common_ptr_double + ; RV64C: liveins: $x16, $f10_d, $f11_d, $f12_d + ; RV64C-NEXT: {{ $}} + ; RV64C-NEXT: $x10 = ADDI $x16, 0 + ; RV64C-NEXT: FSD killed renamable $f10_d, $x10, 0 :: (volatile store (s64) into %ir.p) + ; RV64C-NEXT: FSD killed renamable $f11_d, $x10, 0 :: (volatile store (s64) into %ir.p) + ; RV64C-NEXT: FSD killed renamable $f12_d, killed $x10, 0 :: (volatile store (s64) into %ir.p) + ; RV64C-NEXT: PseudoRET + ; + ; RV32ZCF-LABEL: name: store_common_ptr_double + ; RV32ZCF: liveins: $x16, $f10_d, $f11_d, $f12_d + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: FSD killed renamable $f10_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV32ZCF-NEXT: FSD killed renamable $f11_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV32ZCF-NEXT: FSD killed renamable $f12_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV32ZCF-NEXT: PseudoRET + ; 
+ ; RV32ZCA-LABEL: name: store_common_ptr_double + ; RV32ZCA: liveins: $x16, $f10_d, $f11_d, $f12_d + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: FSD killed renamable $f10_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV32ZCA-NEXT: FSD killed renamable $f11_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV32ZCA-NEXT: FSD killed renamable $f12_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV32ZCA-NEXT: PseudoRET + ; + ; RV64ZCA-LABEL: name: store_common_ptr_double + ; RV64ZCA: liveins: $x16, $f10_d, $f11_d, $f12_d + ; RV64ZCA-NEXT: {{ $}} + ; RV64ZCA-NEXT: FSD killed renamable $f10_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV64ZCA-NEXT: FSD killed renamable $f11_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV64ZCA-NEXT: FSD killed renamable $f12_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p) + ; RV64ZCA-NEXT: PseudoRET FSD killed renamable $f10_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p) FSD killed renamable $f11_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p) FSD killed renamable $f12_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p) @@ -522,6 +617,7 @@ body: | ; RV32-NEXT: dead renamable $x10 = LW $x11, 0 :: (volatile load (s32) from %ir.p) ; RV32-NEXT: dead renamable $x10 = LW killed $x11, 0 :: (volatile load (s32) from %ir.p) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: load_common_ptr ; RV64: liveins: $x16 ; RV64-NEXT: {{ $}} @@ -543,14 +639,15 @@ body: | bb.0.entry: liveins: $x16 - ; RV32-LABEL: name: load_common_ptr_float - ; RV32: liveins: $x16 - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $x10 = ADDI $x16, 0 - ; RV32-NEXT: renamable $f10_f = FLW $x10, 0 :: (load (s32) from %ir.g) - ; RV32-NEXT: renamable $f11_f = FLW $x10, 4 :: (load (s32) from %ir.arrayidx1) - ; RV32-NEXT: renamable $f12_f = FLW killed $x10, 8 :: (load (s32) from %ir.arrayidx2) - ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, 
implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f + ; RV32C-LABEL: name: load_common_ptr_float + ; RV32C: liveins: $x16 + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $x10 = ADDI $x16, 0 + ; RV32C-NEXT: renamable $f10_f = FLW $x10, 0 :: (load (s32) from %ir.g) + ; RV32C-NEXT: renamable $f11_f = FLW $x10, 4 :: (load (s32) from %ir.arrayidx1) + ; RV32C-NEXT: renamable $f12_f = FLW killed $x10, 8 :: (load (s32) from %ir.arrayidx2) + ; RV32C-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f + ; ; RV64-LABEL: name: load_common_ptr_float ; RV64: liveins: $x16 ; RV64-NEXT: {{ $}} @@ -558,6 +655,23 @@ body: | ; RV64-NEXT: renamable $f11_f = FLW renamable $x16, 4 :: (load (s32) from %ir.arrayidx1) ; RV64-NEXT: renamable $f12_f = FLW killed renamable $x16, 8 :: (load (s32) from %ir.arrayidx2) ; RV64-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f + ; + ; RV32ZCF-LABEL: name: load_common_ptr_float + ; RV32ZCF: liveins: $x16 + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: $x10 = ADDI $x16, 0 + ; RV32ZCF-NEXT: renamable $f10_f = FLW $x10, 0 :: (load (s32) from %ir.g) + ; RV32ZCF-NEXT: renamable $f11_f = FLW $x10, 4 :: (load (s32) from %ir.arrayidx1) + ; RV32ZCF-NEXT: renamable $f12_f = FLW killed $x10, 8 :: (load (s32) from %ir.arrayidx2) + ; RV32ZCF-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f + ; + ; RV32ZCA-LABEL: name: load_common_ptr_float + ; RV32ZCA: liveins: $x16 + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: renamable $f10_f = FLW renamable $x16, 0 :: (load (s32) from %ir.g) + ; RV32ZCA-NEXT: renamable $f11_f = FLW renamable $x16, 4 :: (load (s32) from %ir.arrayidx1) + ; RV32ZCA-NEXT: renamable $f12_f = FLW killed renamable $x16, 8 :: (load (s32) from %ir.arrayidx2) + ; RV32ZCA-NEXT: PseudoTAIL target-flags(riscv-call) 
@load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f renamable $f10_f = FLW renamable $x16, 0 :: (load (s32) from %ir.g) renamable $f11_f = FLW renamable $x16, 4 :: (load (s32) from %ir.arrayidx1) renamable $f12_f = FLW killed renamable $x16, 8 :: (load (s32) from %ir.arrayidx2) @@ -571,22 +685,47 @@ body: | bb.0.entry: liveins: $x16 - ; RV32-LABEL: name: load_common_ptr_double - ; RV32: liveins: $x16 - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $x10 = ADDI $x16, 0 - ; RV32-NEXT: renamable $f10_d = FLD $x10, 0 :: (load (s64) from %ir.g) - ; RV32-NEXT: renamable $f11_d = FLD $x10, 8 :: (load (s64) from %ir.arrayidx1) - ; RV32-NEXT: renamable $f12_d = FLD killed $x10, 16 :: (load (s64) from %ir.arrayidx2) - ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d - ; RV64-LABEL: name: load_common_ptr_double - ; RV64: liveins: $x16 - ; RV64-NEXT: {{ $}} - ; RV64-NEXT: $x10 = ADDI $x16, 0 - ; RV64-NEXT: renamable $f10_d = FLD $x10, 0 :: (load (s64) from %ir.g) - ; RV64-NEXT: renamable $f11_d = FLD $x10, 8 :: (load (s64) from %ir.arrayidx1) - ; RV64-NEXT: renamable $f12_d = FLD killed $x10, 16 :: (load (s64) from %ir.arrayidx2) - ; RV64-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; RV32C-LABEL: name: load_common_ptr_double + ; RV32C: liveins: $x16 + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $x10 = ADDI $x16, 0 + ; RV32C-NEXT: renamable $f10_d = FLD $x10, 0 :: (load (s64) from %ir.g) + ; RV32C-NEXT: renamable $f11_d = FLD $x10, 8 :: (load (s64) from %ir.arrayidx1) + ; RV32C-NEXT: renamable $f12_d = FLD killed $x10, 16 :: (load (s64) from %ir.arrayidx2) + ; RV32C-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; + ; RV64C-LABEL: name: load_common_ptr_double + ; RV64C: liveins: $x16 + ; 
RV64C-NEXT: {{ $}} + ; RV64C-NEXT: $x10 = ADDI $x16, 0 + ; RV64C-NEXT: renamable $f10_d = FLD $x10, 0 :: (load (s64) from %ir.g) + ; RV64C-NEXT: renamable $f11_d = FLD $x10, 8 :: (load (s64) from %ir.arrayidx1) + ; RV64C-NEXT: renamable $f12_d = FLD killed $x10, 16 :: (load (s64) from %ir.arrayidx2) + ; RV64C-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; + ; RV32ZCF-LABEL: name: load_common_ptr_double + ; RV32ZCF: liveins: $x16 + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: renamable $f10_d = FLD renamable $x16, 0 :: (load (s64) from %ir.g) + ; RV32ZCF-NEXT: renamable $f11_d = FLD renamable $x16, 8 :: (load (s64) from %ir.arrayidx1) + ; RV32ZCF-NEXT: renamable $f12_d = FLD killed renamable $x16, 16 :: (load (s64) from %ir.arrayidx2) + ; RV32ZCF-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; + ; RV32ZCA-LABEL: name: load_common_ptr_double + ; RV32ZCA: liveins: $x16 + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: renamable $f10_d = FLD renamable $x16, 0 :: (load (s64) from %ir.g) + ; RV32ZCA-NEXT: renamable $f11_d = FLD renamable $x16, 8 :: (load (s64) from %ir.arrayidx1) + ; RV32ZCA-NEXT: renamable $f12_d = FLD killed renamable $x16, 16 :: (load (s64) from %ir.arrayidx2) + ; RV32ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; + ; RV64ZCA-LABEL: name: load_common_ptr_double + ; RV64ZCA: liveins: $x16 + ; RV64ZCA-NEXT: {{ $}} + ; RV64ZCA-NEXT: renamable $f10_d = FLD renamable $x16, 0 :: (load (s64) from %ir.g) + ; RV64ZCA-NEXT: renamable $f11_d = FLD renamable $x16, 8 :: (load (s64) from %ir.arrayidx1) + ; RV64ZCA-NEXT: renamable $f12_d = FLD killed renamable $x16, 16 :: (load (s64) from %ir.arrayidx2) + ; RV64ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit 
$f10_d, implicit $f11_d, implicit $f12_d renamable $f10_d = FLD renamable $x16, 0 :: (load (s64) from %ir.g) renamable $f11_d = FLD renamable $x16, 8 :: (load (s64) from %ir.arrayidx1) renamable $f12_d = FLD killed renamable $x16, 16 :: (load (s64) from %ir.arrayidx2) @@ -613,6 +752,7 @@ body: | ; RV32-NEXT: renamable $x11 = ADDI $x0, 7 ; RV32-NEXT: SW killed renamable $x11, killed $x12, 28 :: (volatile store (s32) into %ir.3) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_large_offset ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} @@ -644,15 +784,16 @@ body: | bb.0.entry: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f - ; RV32-LABEL: name: store_large_offset_float - ; RV32: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $x11 = ADDI $x10, 384 - ; RV32-NEXT: FSW killed renamable $f10_f, $x11, 16 :: (volatile store (s32) into %ir.0) - ; RV32-NEXT: FSW killed renamable $f11_f, $x11, 20 :: (volatile store (s32) into %ir.1) - ; RV32-NEXT: FSW killed renamable $f12_f, $x11, 24 :: (volatile store (s32) into %ir.2) - ; RV32-NEXT: FSW killed renamable $f13_f, killed $x11, 28 :: (volatile store (s32) into %ir.3) - ; RV32-NEXT: PseudoRET + ; RV32C-LABEL: name: store_large_offset_float + ; RV32C: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $x11 = ADDI $x10, 384 + ; RV32C-NEXT: FSW killed renamable $f10_f, $x11, 16 :: (volatile store (s32) into %ir.0) + ; RV32C-NEXT: FSW killed renamable $f11_f, $x11, 20 :: (volatile store (s32) into %ir.1) + ; RV32C-NEXT: FSW killed renamable $f12_f, $x11, 24 :: (volatile store (s32) into %ir.2) + ; RV32C-NEXT: FSW killed renamable $f13_f, killed $x11, 28 :: (volatile store (s32) into %ir.3) + ; RV32C-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_large_offset_float ; RV64: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f ; RV64-NEXT: {{ $}} @@ -661,6 +802,25 @@ body: | ; RV64-NEXT: FSW killed renamable $f12_f, renamable $x10, 408 :: (volatile store (s32) into %ir.2) ; 
RV64-NEXT: FSW killed renamable $f13_f, killed renamable $x10, 412 :: (volatile store (s32) into %ir.3) ; RV64-NEXT: PseudoRET + ; + ; RV32ZCF-LABEL: name: store_large_offset_float + ; RV32ZCF: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: $x11 = ADDI $x10, 384 + ; RV32ZCF-NEXT: FSW killed renamable $f10_f, $x11, 16 :: (volatile store (s32) into %ir.0) + ; RV32ZCF-NEXT: FSW killed renamable $f11_f, $x11, 20 :: (volatile store (s32) into %ir.1) + ; RV32ZCF-NEXT: FSW killed renamable $f12_f, $x11, 24 :: (volatile store (s32) into %ir.2) + ; RV32ZCF-NEXT: FSW killed renamable $f13_f, killed $x11, 28 :: (volatile store (s32) into %ir.3) + ; RV32ZCF-NEXT: PseudoRET + ; + ; RV32ZCA-LABEL: name: store_large_offset_float + ; RV32ZCA: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: FSW killed renamable $f10_f, renamable $x10, 400 :: (volatile store (s32) into %ir.0) + ; RV32ZCA-NEXT: FSW killed renamable $f11_f, renamable $x10, 404 :: (volatile store (s32) into %ir.1) + ; RV32ZCA-NEXT: FSW killed renamable $f12_f, renamable $x10, 408 :: (volatile store (s32) into %ir.2) + ; RV32ZCA-NEXT: FSW killed renamable $f13_f, killed renamable $x10, 412 :: (volatile store (s32) into %ir.3) + ; RV32ZCA-NEXT: PseudoRET FSW killed renamable $f10_f, renamable $x10, 400 :: (volatile store (s32) into %ir.0) FSW killed renamable $f11_f, renamable $x10, 404 :: (volatile store (s32) into %ir.1) FSW killed renamable $f12_f, renamable $x10, 408 :: (volatile store (s32) into %ir.2) @@ -675,24 +835,52 @@ body: | bb.0.entry: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d - ; RV32-LABEL: name: store_large_offset_double - ; RV32: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $x11 = ADDI $x10, 768 - ; RV32-NEXT: FSD killed renamable $f10_d, $x11, 32 :: (volatile store (s64) into %ir.0) - ; RV32-NEXT: FSD killed renamable $f11_d, $x11, 40 :: (volatile store (s64) into %ir.1) - ; 
RV32-NEXT: FSD killed renamable $f12_d, $x11, 48 :: (volatile store (s64) into %ir.2) - ; RV32-NEXT: FSD killed renamable $f13_d, killed $x11, 56 :: (volatile store (s64) into %ir.3) - ; RV32-NEXT: PseudoRET - ; RV64-LABEL: name: store_large_offset_double - ; RV64: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d - ; RV64-NEXT: {{ $}} - ; RV64-NEXT: $x11 = ADDI $x10, 768 - ; RV64-NEXT: FSD killed renamable $f10_d, $x11, 32 :: (volatile store (s64) into %ir.0) - ; RV64-NEXT: FSD killed renamable $f11_d, $x11, 40 :: (volatile store (s64) into %ir.1) - ; RV64-NEXT: FSD killed renamable $f12_d, $x11, 48 :: (volatile store (s64) into %ir.2) - ; RV64-NEXT: FSD killed renamable $f13_d, killed $x11, 56 :: (volatile store (s64) into %ir.3) - ; RV64-NEXT: PseudoRET + ; RV32C-LABEL: name: store_large_offset_double + ; RV32C: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $x11 = ADDI $x10, 768 + ; RV32C-NEXT: FSD killed renamable $f10_d, $x11, 32 :: (volatile store (s64) into %ir.0) + ; RV32C-NEXT: FSD killed renamable $f11_d, $x11, 40 :: (volatile store (s64) into %ir.1) + ; RV32C-NEXT: FSD killed renamable $f12_d, $x11, 48 :: (volatile store (s64) into %ir.2) + ; RV32C-NEXT: FSD killed renamable $f13_d, killed $x11, 56 :: (volatile store (s64) into %ir.3) + ; RV32C-NEXT: PseudoRET + ; + ; RV64C-LABEL: name: store_large_offset_double + ; RV64C: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d + ; RV64C-NEXT: {{ $}} + ; RV64C-NEXT: $x11 = ADDI $x10, 768 + ; RV64C-NEXT: FSD killed renamable $f10_d, $x11, 32 :: (volatile store (s64) into %ir.0) + ; RV64C-NEXT: FSD killed renamable $f11_d, $x11, 40 :: (volatile store (s64) into %ir.1) + ; RV64C-NEXT: FSD killed renamable $f12_d, $x11, 48 :: (volatile store (s64) into %ir.2) + ; RV64C-NEXT: FSD killed renamable $f13_d, killed $x11, 56 :: (volatile store (s64) into %ir.3) + ; RV64C-NEXT: PseudoRET + ; + ; RV32ZCF-LABEL: name: store_large_offset_double + ; RV32ZCF: liveins: $x10, $f10_d, $f11_d, 
$f12_d, $f13_d + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0) + ; RV32ZCF-NEXT: FSD killed renamable $f11_d, renamable $x10, 808 :: (volatile store (s64) into %ir.1) + ; RV32ZCF-NEXT: FSD killed renamable $f12_d, renamable $x10, 816 :: (volatile store (s64) into %ir.2) + ; RV32ZCF-NEXT: FSD killed renamable $f13_d, killed renamable $x10, 824 :: (volatile store (s64) into %ir.3) + ; RV32ZCF-NEXT: PseudoRET + ; + ; RV32ZCA-LABEL: name: store_large_offset_double + ; RV32ZCA: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0) + ; RV32ZCA-NEXT: FSD killed renamable $f11_d, renamable $x10, 808 :: (volatile store (s64) into %ir.1) + ; RV32ZCA-NEXT: FSD killed renamable $f12_d, renamable $x10, 816 :: (volatile store (s64) into %ir.2) + ; RV32ZCA-NEXT: FSD killed renamable $f13_d, killed renamable $x10, 824 :: (volatile store (s64) into %ir.3) + ; RV32ZCA-NEXT: PseudoRET + ; + ; RV64ZCA-LABEL: name: store_large_offset_double + ; RV64ZCA: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d + ; RV64ZCA-NEXT: {{ $}} + ; RV64ZCA-NEXT: FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0) + ; RV64ZCA-NEXT: FSD killed renamable $f11_d, renamable $x10, 808 :: (volatile store (s64) into %ir.1) + ; RV64ZCA-NEXT: FSD killed renamable $f12_d, renamable $x10, 816 :: (volatile store (s64) into %ir.2) + ; RV64ZCA-NEXT: FSD killed renamable $f13_d, killed renamable $x10, 824 :: (volatile store (s64) into %ir.3) + ; RV64ZCA-NEXT: PseudoRET FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0) FSD killed renamable $f11_d, renamable $x10, 808 :: (volatile store (s64) into %ir.1) FSD killed renamable $f12_d, renamable $x10, 816 :: (volatile store (s64) into %ir.2) @@ -716,6 +904,7 @@ body: | ; RV32-NEXT: dead renamable $x11 = LW 
$x12, 24 :: (volatile load (s32) from %ir.2) ; RV32-NEXT: dead renamable $x10 = LW killed $x12, 28 :: (volatile load (s32) from %ir.3) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: load_large_offset ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} @@ -739,14 +928,15 @@ body: | bb.0.entry: liveins: $x10 - ; RV32-LABEL: name: load_large_offset_float - ; RV32: liveins: $x10 - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $x11 = ADDI $x10, 384 - ; RV32-NEXT: renamable $f10_f = FLW $x11, 16 :: (load (s32) from %ir.arrayidx) - ; RV32-NEXT: renamable $f11_f = FLW $x11, 20 :: (load (s32) from %ir.arrayidx1) - ; RV32-NEXT: renamable $f12_f = FLW killed $x11, 24 :: (load (s32) from %ir.arrayidx2) - ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f + ; RV32C-LABEL: name: load_large_offset_float + ; RV32C: liveins: $x10 + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $x11 = ADDI $x10, 384 + ; RV32C-NEXT: renamable $f10_f = FLW $x11, 16 :: (load (s32) from %ir.arrayidx) + ; RV32C-NEXT: renamable $f11_f = FLW $x11, 20 :: (load (s32) from %ir.arrayidx1) + ; RV32C-NEXT: renamable $f12_f = FLW killed $x11, 24 :: (load (s32) from %ir.arrayidx2) + ; RV32C-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f + ; ; RV64-LABEL: name: load_large_offset_float ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} @@ -754,6 +944,23 @@ body: | ; RV64-NEXT: renamable $f11_f = FLW renamable $x10, 404 :: (load (s32) from %ir.arrayidx1) ; RV64-NEXT: renamable $f12_f = FLW killed renamable $x10, 408 :: (load (s32) from %ir.arrayidx2) ; RV64-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f + ; + ; RV32ZCF-LABEL: name: load_large_offset_float + ; RV32ZCF: liveins: $x10 + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: $x11 = ADDI $x10, 384 + ; RV32ZCF-NEXT: renamable $f10_f = FLW $x11, 16 
:: (load (s32) from %ir.arrayidx) + ; RV32ZCF-NEXT: renamable $f11_f = FLW $x11, 20 :: (load (s32) from %ir.arrayidx1) + ; RV32ZCF-NEXT: renamable $f12_f = FLW killed $x11, 24 :: (load (s32) from %ir.arrayidx2) + ; RV32ZCF-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f + ; + ; RV32ZCA-LABEL: name: load_large_offset_float + ; RV32ZCA: liveins: $x10 + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: renamable $f10_f = FLW renamable $x10, 400 :: (load (s32) from %ir.arrayidx) + ; RV32ZCA-NEXT: renamable $f11_f = FLW renamable $x10, 404 :: (load (s32) from %ir.arrayidx1) + ; RV32ZCA-NEXT: renamable $f12_f = FLW killed renamable $x10, 408 :: (load (s32) from %ir.arrayidx2) + ; RV32ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f renamable $f10_f = FLW renamable $x10, 400 :: (load (s32) from %ir.arrayidx) renamable $f11_f = FLW renamable $x10, 404 :: (load (s32) from %ir.arrayidx1) renamable $f12_f = FLW killed renamable $x10, 408 :: (load (s32) from %ir.arrayidx2) @@ -767,22 +974,47 @@ body: | bb.0.entry: liveins: $x10 - ; RV32-LABEL: name: load_large_offset_double - ; RV32: liveins: $x10 - ; RV32-NEXT: {{ $}} - ; RV32-NEXT: $x11 = ADDI $x10, 768 - ; RV32-NEXT: renamable $f10_d = FLD $x11, 32 :: (load (s64) from %ir.arrayidx) - ; RV32-NEXT: renamable $f11_d = FLD $x11, 40 :: (load (s64) from %ir.arrayidx1) - ; RV32-NEXT: renamable $f12_d = FLD killed $x11, 48 :: (load (s64) from %ir.arrayidx2) - ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d - ; RV64-LABEL: name: load_large_offset_double - ; RV64: liveins: $x10 - ; RV64-NEXT: {{ $}} - ; RV64-NEXT: $x11 = ADDI $x10, 768 - ; RV64-NEXT: renamable $f10_d = FLD $x11, 32 :: (load (s64) from %ir.arrayidx) - ; RV64-NEXT: renamable $f11_d = FLD $x11, 40 :: (load (s64) 
from %ir.arrayidx1) - ; RV64-NEXT: renamable $f12_d = FLD killed $x11, 48 :: (load (s64) from %ir.arrayidx2) - ; RV64-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; RV32C-LABEL: name: load_large_offset_double + ; RV32C: liveins: $x10 + ; RV32C-NEXT: {{ $}} + ; RV32C-NEXT: $x11 = ADDI $x10, 768 + ; RV32C-NEXT: renamable $f10_d = FLD $x11, 32 :: (load (s64) from %ir.arrayidx) + ; RV32C-NEXT: renamable $f11_d = FLD $x11, 40 :: (load (s64) from %ir.arrayidx1) + ; RV32C-NEXT: renamable $f12_d = FLD killed $x11, 48 :: (load (s64) from %ir.arrayidx2) + ; RV32C-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; + ; RV64C-LABEL: name: load_large_offset_double + ; RV64C: liveins: $x10 + ; RV64C-NEXT: {{ $}} + ; RV64C-NEXT: $x11 = ADDI $x10, 768 + ; RV64C-NEXT: renamable $f10_d = FLD $x11, 32 :: (load (s64) from %ir.arrayidx) + ; RV64C-NEXT: renamable $f11_d = FLD $x11, 40 :: (load (s64) from %ir.arrayidx1) + ; RV64C-NEXT: renamable $f12_d = FLD killed $x11, 48 :: (load (s64) from %ir.arrayidx2) + ; RV64C-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; + ; RV32ZCF-LABEL: name: load_large_offset_double + ; RV32ZCF: liveins: $x10 + ; RV32ZCF-NEXT: {{ $}} + ; RV32ZCF-NEXT: renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx) + ; RV32ZCF-NEXT: renamable $f11_d = FLD renamable $x10, 808 :: (load (s64) from %ir.arrayidx1) + ; RV32ZCF-NEXT: renamable $f12_d = FLD killed renamable $x10, 816 :: (load (s64) from %ir.arrayidx2) + ; RV32ZCF-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; + ; RV32ZCA-LABEL: name: load_large_offset_double + ; RV32ZCA: liveins: $x10 + ; RV32ZCA-NEXT: {{ $}} + ; RV32ZCA-NEXT: 
renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx) + ; RV32ZCA-NEXT: renamable $f11_d = FLD renamable $x10, 808 :: (load (s64) from %ir.arrayidx1) + ; RV32ZCA-NEXT: renamable $f12_d = FLD killed renamable $x10, 816 :: (load (s64) from %ir.arrayidx2) + ; RV32ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d + ; + ; RV64ZCA-LABEL: name: load_large_offset_double + ; RV64ZCA: liveins: $x10 + ; RV64ZCA-NEXT: {{ $}} + ; RV64ZCA-NEXT: renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx) + ; RV64ZCA-NEXT: renamable $f11_d = FLD renamable $x10, 808 :: (load (s64) from %ir.arrayidx1) + ; RV64ZCA-NEXT: renamable $f12_d = FLD killed renamable $x10, 816 :: (load (s64) from %ir.arrayidx2) + ; RV64ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx) renamable $f11_d = FLD renamable $x10, 808 :: (load (s64) from %ir.arrayidx1) renamable $f12_d = FLD killed renamable $x10, 816 :: (load (s64) from %ir.arrayidx2) @@ -801,6 +1033,7 @@ body: | ; RV32-NEXT: {{ $}} ; RV32-NEXT: SW $x0, killed renamable $x10, 0 :: (store (s32) into %ir.a) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_value_no_opt ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} @@ -822,6 +1055,7 @@ body: | ; RV32-NEXT: {{ $}} ; RV32-NEXT: FSW killed renamable $f16_f, killed renamable $x10, 0 :: (store (s32) into %ir.a) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_value_float_no_opt ; RV64: liveins: $x10, $f16_f ; RV64-NEXT: {{ $}} @@ -843,6 +1077,7 @@ body: | ; RV32-NEXT: {{ $}} ; RV32-NEXT: FSD killed renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_value_double_no_opt ; RV64: liveins: $x10, $f16_d ; RV64-NEXT: {{ $}} @@ 
-865,6 +1100,7 @@ body: | ; RV32-NEXT: renamable $x10 = ADDI $x0, 1 ; RV32-NEXT: SW killed renamable $x10, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_ptr_no_opt ; RV64: liveins: $x16 ; RV64-NEXT: {{ $}} @@ -888,6 +1124,7 @@ body: | ; RV32-NEXT: {{ $}} ; RV32-NEXT: FSW killed renamable $f10_f, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_ptr_float_no_opt ; RV64: liveins: $x16, $f10_f ; RV64-NEXT: {{ $}} @@ -909,6 +1146,7 @@ body: | ; RV32-NEXT: {{ $}} ; RV32-NEXT: FSD killed renamable $f10_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_common_ptr_double_no_opt ; RV64: liveins: $x16, $f10_d ; RV64-NEXT: {{ $}} @@ -930,6 +1168,7 @@ body: | ; RV32-NEXT: {{ $}} ; RV32-NEXT: dead renamable $x10 = LW killed renamable $x16, 0 :: (volatile load (s32) from %ir.p) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: load_common_ptr_no_opt ; RV64: liveins: $x16 ; RV64-NEXT: {{ $}} @@ -951,6 +1190,7 @@ body: | ; RV32-NEXT: {{ $}} ; RV32-NEXT: renamable $f10_f = FLW killed renamable $x16, 0 :: (load (s32) from %ir.g) ; RV32-NEXT: PseudoRET implicit $f10_f + ; ; RV64-LABEL: name: load_common_ptr_float_no_opt ; RV64: liveins: $x16 ; RV64-NEXT: {{ $}} @@ -972,6 +1212,7 @@ body: | ; RV32-NEXT: {{ $}} ; RV32-NEXT: renamable $f10_d = FLD killed renamable $x16, 0 :: (load (s64) from %ir.g) ; RV32-NEXT: PseudoRET implicit $f10_d + ; ; RV64-LABEL: name: load_common_ptr_double_no_opt ; RV64: liveins: $x16 ; RV64-NEXT: {{ $}} @@ -996,6 +1237,7 @@ body: | ; RV32-NEXT: renamable $x11 = ADDI $x0, 3 ; RV32-NEXT: SW killed renamable $x11, killed renamable $x10, 404 :: (volatile store (s32) into %ir.1) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_large_offset_no_opt ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} @@ -1024,6 +1266,7 @@ body: | ; RV32-NEXT: FSW killed renamable 
$f10_f, renamable $x10, 400 :: (volatile store (s32) into %ir.0) ; RV32-NEXT: FSW killed renamable $f11_f, killed renamable $x10, 404 :: (volatile store (s32) into %ir.1) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_large_offset_float_no_opt ; RV64: liveins: $x10, $f10_f, $f11_f ; RV64-NEXT: {{ $}} @@ -1048,6 +1291,7 @@ body: | ; RV32-NEXT: FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0) ; RV32-NEXT: FSD killed renamable $f11_d, killed renamable $x10, 808 :: (volatile store (s64) into %ir.1) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: store_large_offset_double_no_opt ; RV64: liveins: $x10, $f10_d, $f11_d ; RV64-NEXT: {{ $}} @@ -1072,6 +1316,7 @@ body: | ; RV32-NEXT: dead renamable $x11 = LW renamable $x10, 400 :: (volatile load (s32) from %ir.0) ; RV32-NEXT: dead renamable $x10 = LW killed renamable $x10, 404 :: (volatile load (s32) from %ir.1) ; RV32-NEXT: PseudoRET + ; ; RV64-LABEL: name: load_large_offset_no_opt ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} @@ -1096,6 +1341,7 @@ body: | ; RV32-NEXT: renamable $f10_f = FLW renamable $x10, 400 :: (load (s32) from %ir.arrayidx) ; RV32-NEXT: renamable $f11_f = FLW killed renamable $x10, 404 :: (load (s32) from %ir.arrayidx1) ; RV32-NEXT: PseudoRET implicit $f10_f, implicit $f11_f + ; ; RV64-LABEL: name: load_large_offset_float_no_opt ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} @@ -1120,6 +1366,7 @@ body: | ; RV32-NEXT: renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx) ; RV32-NEXT: renamable $f11_d = FLD killed renamable $x10, 808 :: (load (s64) from %ir.arrayidx1) ; RV32-NEXT: PseudoRET implicit $f10_d, implicit $f11_d + ; ; RV64-LABEL: name: load_large_offset_double_no_opt ; RV64: liveins: $x10 ; RV64-NEXT: {{ $}} From edd4aee4dd9b5b98b2576a6f783e4086173d902a Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 22 Feb 2024 15:57:57 +0800 Subject: [PATCH 009/546] [RISCV] Compute integers once in isSimpleVIDSequence. 
NFCI (#82590) We need to iterate through the integers twice in isSimpleVIDSequence, so instead of computing them twice just compute them once at the start. This also replaces the individual checks that each element is constant with a single call to BuildVectorSDNode::isConstant. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 64 ++++++++++----------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 75be97ff32bbe..cf0dc36a51b61 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3242,44 +3242,47 @@ static std::optional getExactInteger(const APFloat &APF, // determine whether this is worth generating code for. static std::optional isSimpleVIDSequence(SDValue Op, unsigned EltSizeInBits) { - unsigned NumElts = Op.getNumOperands(); assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unexpected BUILD_VECTOR"); + if (!cast(Op)->isConstant()) + return std::nullopt; bool IsInteger = Op.getValueType().isInteger(); std::optional SeqStepDenom; std::optional SeqStepNum, SeqAddend; std::optional> PrevElt; assert(EltSizeInBits >= Op.getValueType().getScalarSizeInBits()); - for (unsigned Idx = 0; Idx < NumElts; Idx++) { - // Assume undef elements match the sequence; we just have to be careful - // when interpolating across them. - if (Op.getOperand(Idx).isUndef()) - continue; - uint64_t Val; + // First extract the ops into a list of constant integer values. This may not + // be possible for floats if they're not all representable as integers. + SmallVector> Elts(Op.getNumOperands()); + const unsigned OpSize = Op.getScalarValueSizeInBits(); + for (auto [Idx, Elt] : enumerate(Op->op_values())) { + if (Elt.isUndef()) { + Elts[Idx] = std::nullopt; + continue; + } if (IsInteger) { - // The BUILD_VECTOR must be all constants. 
- if (!isa(Op.getOperand(Idx))) - return std::nullopt; - Val = Op.getConstantOperandVal(Idx) & - maskTrailingOnes(Op.getScalarValueSizeInBits()); + Elts[Idx] = Elt->getAsZExtVal() & maskTrailingOnes(OpSize); } else { - // The BUILD_VECTOR must be all constants. - if (!isa(Op.getOperand(Idx))) - return std::nullopt; - if (auto ExactInteger = getExactInteger( - cast(Op.getOperand(Idx))->getValueAPF(), - Op.getScalarValueSizeInBits())) - Val = *ExactInteger; - else + auto ExactInteger = + getExactInteger(cast(Elt)->getValueAPF(), OpSize); + if (!ExactInteger) return std::nullopt; + Elts[Idx] = *ExactInteger; } + } + + for (auto [Idx, Elt] : enumerate(Elts)) { + // Assume undef elements match the sequence; we just have to be careful + // when interpolating across them. + if (!Elt) + continue; if (PrevElt) { // Calculate the step since the last non-undef element, and ensure // it's consistent across the entire sequence. unsigned IdxDiff = Idx - PrevElt->second; - int64_t ValDiff = SignExtend64(Val - PrevElt->first, EltSizeInBits); + int64_t ValDiff = SignExtend64(*Elt - PrevElt->first, EltSizeInBits); // A zero-value value difference means that we're somewhere in the middle // of a fractional step, e.g. <0,0,0*,0,1,1,1,1>. Wait until we notice a @@ -3309,8 +3312,8 @@ static std::optional isSimpleVIDSequence(SDValue Op, } // Record this non-undef element for later. - if (!PrevElt || PrevElt->first != Val) - PrevElt = std::make_pair(Val, Idx); + if (!PrevElt || PrevElt->first != *Elt) + PrevElt = std::make_pair(*Elt, Idx); } // We need to have logged a step for this to count as a legal index sequence. @@ -3319,21 +3322,12 @@ static std::optional isSimpleVIDSequence(SDValue Op, // Loop back through the sequence and validate elements we might have skipped // while waiting for a valid step. While doing this, log any sequence addend. 
- for (unsigned Idx = 0; Idx < NumElts; Idx++) { - if (Op.getOperand(Idx).isUndef()) + for (auto [Idx, Elt] : enumerate(Elts)) { + if (!Elt) continue; - uint64_t Val; - if (IsInteger) { - Val = Op.getConstantOperandVal(Idx) & - maskTrailingOnes(Op.getScalarValueSizeInBits()); - } else { - Val = *getExactInteger( - cast(Op.getOperand(Idx))->getValueAPF(), - Op.getScalarValueSizeInBits()); - } uint64_t ExpectedVal = (int64_t)(Idx * (uint64_t)*SeqStepNum) / *SeqStepDenom; - int64_t Addend = SignExtend64(Val - ExpectedVal, EltSizeInBits); + int64_t Addend = SignExtend64(*Elt - ExpectedVal, EltSizeInBits); if (!SeqAddend) SeqAddend = Addend; else if (Addend != SeqAddend) From e899641df2391179e8ec29ca14c53b09ae7ce85c Mon Sep 17 00:00:00 2001 From: martinboehme Date: Thu, 22 Feb 2024 09:00:20 +0100 Subject: [PATCH 010/546] [clang][dataflow] Fix inaccuracies in `buildStmtToBasicBlockMap()`. (#82496) See the comments added to the code for details on the inaccuracies that have now been fixed. The patch adds tests that fail with the old implementation. --- .../FlowSensitive/ControlFlowContext.cpp | 31 +++- .../TypeErasedDataflowAnalysisTest.cpp | 143 ++++++++++++++---- 2 files changed, 140 insertions(+), 34 deletions(-) diff --git a/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp b/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp index c9ebffe6f3780..8aed19544be6a 100644 --- a/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp +++ b/clang/lib/Analysis/FlowSensitive/ControlFlowContext.cpp @@ -39,8 +39,35 @@ buildStmtToBasicBlockMap(const CFG &Cfg) { StmtToBlock[Stmt->getStmt()] = Block; } - if (const Stmt *TerminatorStmt = Block->getTerminatorStmt()) - StmtToBlock[TerminatorStmt] = Block; + } + // Some terminator conditions don't appear as a `CFGElement` anywhere else - + // for example, this is true if the terminator condition is a `&&` or `||` + // operator. 
+ // We associate these conditions with the block the terminator appears in, + // but only if the condition has not already appeared as a regular + // `CFGElement`. (The `insert()` below does nothing if the key already exists + // in the map.) + for (const CFGBlock *Block : Cfg) { + if (Block != nullptr) + if (const Stmt *TerminatorCond = Block->getTerminatorCondition()) + StmtToBlock.insert({TerminatorCond, Block}); + } + // Terminator statements typically don't appear as a `CFGElement` anywhere + // else, so we want to associate them with the block that they terminate. + // However, there are some important special cases: + // - The conditional operator is a type of terminator, but it also appears + // as a regular `CFGElement`, and we want to associate it with the block + // in which it appears as a `CFGElement`. + // - The `&&` and `||` operators are types of terminators, but like the + // conditional operator, they can appear as a regular `CFGElement` or + // as a terminator condition (see above). + // We process terminators last to make sure that we only associate them with + // the block they terminate if they haven't previously occurred as a regular + // `CFGElement` or as a terminator condition. + for (const CFGBlock *Block : Cfg) { + if (Block != nullptr) + if (const Stmt *TerminatorStmt = Block->getTerminatorStmt()) + StmtToBlock.insert({TerminatorStmt, Block}); } return StmtToBlock; } diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp index 3bca9cced8d6f..34f9b0b23719f 100644 --- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp @@ -77,17 +77,33 @@ class DataflowAnalysisTest : public Test { return runDataflowAnalysis(*CFCtx, Analysis, Env); } + /// Returns the `CFGBlock` containing `S` (and asserts that it exists). 
+ const CFGBlock *blockForStmt(const Stmt &S) { + const CFGBlock *Block = CFCtx->getStmtToBlock().lookup(&S); + assert(Block != nullptr); + return Block; + } + template const StateT & blockStateForStmt(const std::vector> &BlockStates, - const Stmt *S) { - const CFGBlock *Block = CFCtx->getStmtToBlock().lookup(S); - assert(Block != nullptr); - const std::optional &MaybeState = BlockStates[Block->getBlockID()]; + const Stmt &S) { + const std::optional &MaybeState = + BlockStates[blockForStmt(S)->getBlockID()]; assert(MaybeState.has_value()); return *MaybeState; } + /// Returns the first node that matches `Matcher` (and asserts that the match + /// was successful, i.e. the returned node is not null). + template + const NodeT &matchNode(MatcherT Matcher) { + const auto *Node = selectFirst( + "node", match(Matcher.bind("node"), AST->getASTContext())); + assert(Node != nullptr); + return *Node; + } + std::unique_ptr AST; std::unique_ptr CFCtx; std::unique_ptr DACtx; @@ -130,6 +146,79 @@ TEST_F(DataflowAnalysisTest, DiagnoseFunctionDiagnoserCalledOnEachElement) { " (Lifetime ends)\n"))); } +// Tests for the statement-to-block map. +using StmtToBlockTest = DataflowAnalysisTest; + +TEST_F(StmtToBlockTest, ConditionalOperator) { + std::string Code = R"( + void target(bool b) { + int i = b ? 1 : 0; + } + )"; + ASSERT_THAT_ERROR(runAnalysis( + Code, [](ASTContext &C) { return NoopAnalysis(C); }) + .takeError(), + llvm::Succeeded()); + + const auto &IDecl = matchNode(declStmt(has(varDecl(hasName("i"))))); + const auto &ConditionalOp = + matchNode(conditionalOperator()); + + // The conditional operator should be associated with the same block as the + // `DeclStmt` for `i`. (Specifically, the conditional operator should not be + // associated with the block for which it is the terminator.) 
+ EXPECT_EQ(blockForStmt(IDecl), blockForStmt(ConditionalOp)); +} + +TEST_F(StmtToBlockTest, LogicalAnd) { + std::string Code = R"( + void target(bool b1, bool b2) { + bool b = b1 && b2; + } + )"; + ASSERT_THAT_ERROR(runAnalysis( + Code, [](ASTContext &C) { return NoopAnalysis(C); }) + .takeError(), + llvm::Succeeded()); + + const auto &BDecl = matchNode(declStmt(has(varDecl(hasName("b"))))); + const auto &AndOp = + matchNode(binaryOperator(hasOperatorName("&&"))); + + // The `&&` operator should be associated with the same block as the + // `DeclStmt` for `b`. (Specifically, the `&&` operator should not be + // associated with the block for which it is the terminator.) + EXPECT_EQ(blockForStmt(BDecl), blockForStmt(AndOp)); +} + +TEST_F(StmtToBlockTest, IfStatementWithLogicalAnd) { + std::string Code = R"( + void target(bool b1, bool b2) { + if (b1 && b2) + ; + } + )"; + ASSERT_THAT_ERROR(runAnalysis( + Code, [](ASTContext &C) { return NoopAnalysis(C); }) + .takeError(), + llvm::Succeeded()); + + const auto &If = matchNode(ifStmt()); + const auto &B2 = + matchNode(declRefExpr(to(varDecl(hasName("b2"))))); + const auto &AndOp = + matchNode(binaryOperator(hasOperatorName("&&"))); + + // The if statement is the terminator for the block that contains both `b2` + // and the `&&` operator (which appears only as a terminator condition, not + // as a regular `CFGElement`). + const CFGBlock *IfBlock = blockForStmt(If); + const CFGBlock *B2Block = blockForStmt(B2); + const CFGBlock *AndOpBlock = blockForStmt(AndOp); + EXPECT_EQ(IfBlock, B2Block); + EXPECT_EQ(IfBlock, AndOpBlock); +} + // Tests that check we discard state for expressions correctly. 
using DiscardExprStateTest = DataflowAnalysisTest; @@ -144,25 +233,20 @@ TEST_F(DiscardExprStateTest, WhileStatement) { auto BlockStates = llvm::cantFail(runAnalysis( Code, [](ASTContext &C) { return NoopAnalysis(C); })); - auto *NotEqOp = selectFirst( - "op", match(binaryOperator(hasOperatorName("!=")).bind("op"), - AST->getASTContext())); - ASSERT_NE(NotEqOp, nullptr); - - auto *CallFoo = selectFirst( - "call", match(callExpr(callee(functionDecl(hasName("foo")))).bind("call"), - AST->getASTContext())); - ASSERT_NE(CallFoo, nullptr); + const auto &NotEqOp = + matchNode(binaryOperator(hasOperatorName("!="))); + const auto &CallFoo = + matchNode(callExpr(callee(functionDecl(hasName("foo"))))); // In the block that evaluates the expression `p != nullptr`, this expression // is associated with a value. const auto &NotEqOpState = blockStateForStmt(BlockStates, NotEqOp); - EXPECT_NE(NotEqOpState.Env.getValue(*NotEqOp), nullptr); + EXPECT_NE(NotEqOpState.Env.getValue(NotEqOp), nullptr); // In the block that calls `foo(p)`, the value for `p != nullptr` is discarded // because it is not consumed by this block. 
const auto &CallFooState = blockStateForStmt(BlockStates, CallFoo); - EXPECT_EQ(CallFooState.Env.getValue(*NotEqOp), nullptr); + EXPECT_EQ(CallFooState.Env.getValue(NotEqOp), nullptr); } TEST_F(DiscardExprStateTest, BooleanOperator) { @@ -174,29 +258,24 @@ TEST_F(DiscardExprStateTest, BooleanOperator) { auto BlockStates = llvm::cantFail(runAnalysis( Code, [](ASTContext &C) { return NoopAnalysis(C); })); - auto *AndOp = selectFirst( - "op", match(binaryOperator(hasOperatorName("&&")).bind("op"), - AST->getASTContext())); - ASSERT_NE(AndOp, nullptr); - - auto *Return = selectFirst( - "return", match(returnStmt().bind("return"), AST->getASTContext())); - ASSERT_NE(Return, nullptr); + const auto &AndOp = + matchNode(binaryOperator(hasOperatorName("&&"))); + const auto &Return = matchNode(returnStmt()); // In the block that evaluates the LHS of the `&&` operator, the LHS is // associated with a value, while the right-hand side is not (unsurprisingly, // as it hasn't been evaluated yet). - const auto &LHSState = blockStateForStmt(BlockStates, AndOp->getLHS()); - auto *LHSValue = cast(LHSState.Env.getValue(*AndOp->getLHS())); + const auto &LHSState = blockStateForStmt(BlockStates, *AndOp.getLHS()); + auto *LHSValue = cast(LHSState.Env.getValue(*AndOp.getLHS())); ASSERT_NE(LHSValue, nullptr); - EXPECT_EQ(LHSState.Env.getValue(*AndOp->getRHS()), nullptr); + EXPECT_EQ(LHSState.Env.getValue(*AndOp.getRHS()), nullptr); // In the block that evaluates the RHS, the RHS is associated with a // value. The value for the LHS has been discarded as it is not consumed by // this block. 
- const auto &RHSState = blockStateForStmt(BlockStates, AndOp->getRHS()); - EXPECT_EQ(RHSState.Env.getValue(*AndOp->getLHS()), nullptr); - auto *RHSValue = cast(RHSState.Env.getValue(*AndOp->getRHS())); + const auto &RHSState = blockStateForStmt(BlockStates, *AndOp.getRHS()); + EXPECT_EQ(RHSState.Env.getValue(*AndOp.getLHS()), nullptr); + auto *RHSValue = cast(RHSState.Env.getValue(*AndOp.getRHS())); ASSERT_NE(RHSValue, nullptr); // In the block that evaluates the return statement, the expression `b1 && b2` @@ -217,9 +296,9 @@ TEST_F(DiscardExprStateTest, BooleanOperator) { // operands, rather than from the environment for the block that contains the // `&&`. const auto &ReturnState = blockStateForStmt(BlockStates, Return); - EXPECT_EQ(ReturnState.Env.getValue(*AndOp->getLHS()), nullptr); - EXPECT_EQ(ReturnState.Env.getValue(*AndOp->getRHS()), nullptr); - EXPECT_EQ(ReturnState.Env.getValue(*AndOp), + EXPECT_EQ(ReturnState.Env.getValue(*AndOp.getLHS()), nullptr); + EXPECT_EQ(ReturnState.Env.getValue(*AndOp.getRHS()), nullptr); + EXPECT_EQ(ReturnState.Env.getValue(AndOp), &ReturnState.Env.makeAnd(*LHSValue, *RHSValue)); } From 8bd327d6fed5a4ae99bdbd039f5503700030cf53 Mon Sep 17 00:00:00 2001 From: Nick Anderson Date: Thu, 22 Feb 2024 00:47:36 -0800 Subject: [PATCH 011/546] [AMDGPU][GlobalISel] Add fdiv / sqrt to rsq combine (#78673) Fixes #64743 --- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 8 +- .../AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 23 + .../GlobalISel/combine-fdiv-sqrt-to-rsq.mir | 584 ++++++++++++++++++ 3 files changed, 614 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index b9411e2052120..9218760538dc5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -33,6 +33,12 @@ def rcp_sqrt_to_rsq : GICombineRule< [{ return matchRcpSqrtToRsq(*${rcp}, 
${matchinfo}); }]), (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>; +def fdiv_by_sqrt_to_rsq_f16 : GICombineRule< + (defs root:$root), + (match (G_FSQRT f16:$sqrt, $x, (MIFlags FmContract)), + (G_FDIV f16:$dst, $y, $sqrt, (MIFlags FmContract)):$root, + [{ return matchFDivSqrtToRsqF16(*${root}); }]), + (apply [{ applyFDivSqrtToRsqF16(*${root}, ${x}.getReg()); }])>; def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">; @@ -156,7 +162,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner< "AMDGPUPostLegalizerCombinerImpl", [all_combines, gfx6gfx7_combines, gfx8_combines, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, - rcp_sqrt_to_rsq, sign_extension_in_reg, smulu64]> { + rcp_sqrt_to_rsq, fdiv_by_sqrt_to_rsq_f16, sign_extension_in_reg, smulu64]> { let CombineAllMethodName = "tryCombineAllImpl"; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index a1c34e92a57f3..82e17ddad851f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -83,6 +83,9 @@ class AMDGPUPostLegalizerCombinerImpl : public Combiner { matchRcpSqrtToRsq(MachineInstr &MI, std::function &MatchInfo) const; + bool matchFDivSqrtToRsqF16(MachineInstr &MI) const; + void applyFDivSqrtToRsqF16(MachineInstr &MI, const Register &X) const; + // FIXME: Should be able to have 2 separate matchdatas rather than custom // struct boilerplate. 
struct CvtF32UByteMatchInfo { @@ -334,6 +337,26 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq( return false; } +bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16( + MachineInstr &MI) const { + Register Sqrt = MI.getOperand(2).getReg(); + return MRI.hasOneNonDBGUse(Sqrt); +} + +void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16( + MachineInstr &MI, const Register &X) const { + Register Dst = MI.getOperand(0).getReg(); + Register Y = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(Dst); + uint32_t Flags = MI.getFlags(); + Register RSQ = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy}) + .addUse(X) + .setMIFlags(Flags) + .getReg(0); + B.buildFMul(Dst, RSQ, Y, Flags); + MI.eraseFromParent(); +} + bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN( MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const { Register SrcReg = MI.getOperand(1).getReg(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir new file mode 100644 index 0000000000000..6c5339e36c77f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir @@ -0,0 +1,584 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: rsq_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_f16 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16) + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT [[INT]](s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + 
%one:_(s16) = G_FCONSTANT half 1.0 + %rsq:_(s16) = contract G_FDIV %one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... + +--- +name: rsq_f16_missing_contract0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_f16_missing_contract0 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s16) = G_FSQRT %x + ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00 + ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = G_FSQRT %x + %one:_(s16) = G_FCONSTANT half 1.0 + %rsq:_(s16) = contract G_FDIV %one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... + +--- +name: rsq_f16_missing_contract1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_f16_missing_contract1 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x + ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00 + ; GCN-NEXT: %rsq:_(s16) = G_FDIV %one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %one:_(s16) = G_FCONSTANT half 1.0 + %rsq:_(s16) = G_FDIV %one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... 
+ +--- +name: neg_rsq_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: neg_rsq_f16 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16) + ; GCN-NEXT: %rsq:_(s16) = contract G_FNEG [[INT]] + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %neg_one:_(s16) = G_FCONSTANT half -1.0 + %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... + +--- +name: neg_rsq_f16_missing_contract0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: neg_rsq_f16_missing_contract0 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s16) = G_FSQRT %x + ; GCN-NEXT: %neg_one:_(s16) = G_FCONSTANT half 0xHBC00 + ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = G_FSQRT %x + %neg_one:_(s16) = G_FCONSTANT half -1.0 + %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... 
+ +--- +name: neg_rsq_f16_missing_contract1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: neg_rsq_f16_missing_contract1 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x + ; GCN-NEXT: %neg_one:_(s16) = G_FCONSTANT half 0xHBC00 + ; GCN-NEXT: %rsq:_(s16) = G_FDIV %neg_one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %neg_one:_(s16) = G_FCONSTANT half -1.0 + %rsq:_(s16) = G_FDIV %neg_one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... + +--- +name: rsq_f16_multi_use +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_f16_multi_use + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x + ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00 + ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %one:_(s16) = G_FCONSTANT half 1.0 + %rsq:_(s16) = contract G_FDIV %one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + S_ENDPGM 0, implicit %sqrt + +... 
+ +--- +name: rsq_f16_multi_use_missing_contract0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_f16_multi_use_missing_contract0 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s16) = G_FSQRT %x + ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00 + ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = G_FSQRT %x + %one:_(s16) = G_FCONSTANT half 1.0 + %rsq:_(s16) = contract G_FDIV %one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + S_ENDPGM 0, implicit %sqrt + +... + +--- +name: rsq_f16_multi_use_missing_contract1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_f16_multi_use_missing_contract1 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x + ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00 + ; GCN-NEXT: %rsq:_(s16) = G_FDIV %one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %one:_(s16) = G_FCONSTANT half 1.0 + %rsq:_(s16) = G_FDIV %one, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + S_ENDPGM 0, implicit %sqrt + +... 
+ +--- +name: rsq_f32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_f32 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %x:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %sqrt:_(s32) = contract G_FSQRT %x + ; GCN-NEXT: %one:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GCN-NEXT: %rsq:_(s32) = contract G_FDIV %one, %sqrt + ; GCN-NEXT: $vgpr0 = COPY %rsq(s32) + %x:_(s32) = COPY $vgpr0 + %sqrt:_(s32) = contract G_FSQRT %x + %one:_(s32) = G_FCONSTANT float 1.0 + %rsq:_(s32) = contract G_FDIV %one, %sqrt + $vgpr0 = COPY %rsq + +... + +--- +name: neg_rsq_f32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: neg_rsq_f32 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %x:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %sqrt:_(s32) = contract G_FSQRT %x + ; GCN-NEXT: %neg_one:_(s32) = G_FCONSTANT float -1.000000e+00 + ; GCN-NEXT: %rsq:_(s32) = contract G_FDIV %neg_one, %sqrt + ; GCN-NEXT: $vgpr0 = COPY %rsq(s32) + %x:_(s32) = COPY $vgpr0 + %sqrt:_(s32) = contract G_FSQRT %x + %neg_one:_(s32) = G_FCONSTANT float -1.0 + %rsq:_(s32) = contract G_FDIV %neg_one, %sqrt + $vgpr0 = COPY %rsq + +... + +--- +name: afn_rsq_f32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: afn_rsq_f32 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %x:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %sqrt:_(s32) = contract afn G_FSQRT %x + ; GCN-NEXT: %one:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GCN-NEXT: %rsq:_(s32) = contract afn G_FDIV %one, %sqrt + ; GCN-NEXT: $vgpr0 = COPY %rsq(s32) + %x:_(s32) = COPY $vgpr0 + %sqrt:_(s32) = contract afn G_FSQRT %x + %one:_(s32) = G_FCONSTANT float 1.0 + %rsq:_(s32) = contract afn G_FDIV %one, %sqrt + $vgpr0 = COPY %rsq + +... 
+ +--- +name: afn_rsq_f32_multi_use +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: afn_rsq_f32_multi_use + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %x:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %sqrt:_(s32) = contract afn G_FSQRT %x + ; GCN-NEXT: %one:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GCN-NEXT: %rsq:_(s32) = contract afn G_FDIV %one, %sqrt + ; GCN-NEXT: %ret:_(s32) = G_FSUB %sqrt, %rsq + ; GCN-NEXT: $vgpr0 = COPY %ret(s32) + %x:_(s32) = COPY $vgpr0 + %sqrt:_(s32) = contract afn G_FSQRT %x + %one:_(s32) = G_FCONSTANT float 1.0 + %rsq:_(s32) = contract afn G_FDIV %one, %sqrt + %ret:_(s32) = G_FSUB %sqrt, %rsq + $vgpr0 = COPY %ret + +... + +--- +name: afn_neg_rsq_f32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: afn_neg_rsq_f32 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: %x:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %sqrt:_(s32) = contract afn G_FSQRT %x + ; GCN-NEXT: %neg_one:_(s32) = G_FCONSTANT float -1.000000e+00 + ; GCN-NEXT: %rsq:_(s32) = contract afn G_FDIV %neg_one, %sqrt + ; GCN-NEXT: $vgpr0 = COPY %rsq(s32) + %x:_(s32) = COPY $vgpr0 + %sqrt:_(s32) = contract afn G_FSQRT %x + %neg_one:_(s32) = G_FCONSTANT float -1.0 + %rsq:_(s32) = contract afn G_FDIV %neg_one, %sqrt + $vgpr0 = COPY %rsq + +... 
+ + +--- +name: rsq_f64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_f64 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s64) = contract G_FSQRT %x + ; GCN-NEXT: %one:_(s64) = G_FCONSTANT double 1.000000e+00 + ; GCN-NEXT: %rsq:_(s64) = contract G_FDIV %one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s64) = G_ANYEXT %0:_(s32) + %sqrt:_(s64) = contract G_FSQRT %x + %one:_(s64) = G_FCONSTANT double 1.0 + %rsq:_(s64) = contract G_FDIV %one, %sqrt + %ext:_(s32) = G_TRUNC %rsq:_(s64) + $vgpr0 = COPY %ext + +... + +--- +name: neg_rsq_f64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: neg_rsq_f64 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s64) = contract G_FSQRT %x + ; GCN-NEXT: %neg_one:_(s64) = G_FCONSTANT double -1.000000e+00 + ; GCN-NEXT: %rsq:_(s64) = contract G_FDIV %neg_one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s64) = G_ANYEXT %0:_(s32) + %sqrt:_(s64) = contract G_FSQRT %x + %neg_one:_(s64) = G_FCONSTANT double -1.0 + %rsq:_(s64) = contract G_FDIV %neg_one, %sqrt + %ext:_(s32) = G_TRUNC %rsq:_(s64) + $vgpr0 = COPY %ext + +... 
+ +--- +name: afn_rsq_f64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: afn_rsq_f64 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s64) = contract afn G_FSQRT %x + ; GCN-NEXT: %one:_(s64) = G_FCONSTANT double 1.000000e+00 + ; GCN-NEXT: %rsq:_(s64) = contract afn G_FDIV %one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s64) = G_ANYEXT %0:_(s32) + %sqrt:_(s64) = contract afn G_FSQRT %x + %one:_(s64) = G_FCONSTANT double 1.0 + %rsq:_(s64) = contract afn G_FDIV %one, %sqrt + %ext:_(s32) = G_TRUNC %rsq:_(s64) + $vgpr0 = COPY %ext + +... + +--- +name: afn_neg_rsq_f64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: afn_neg_rsq_f64 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32) + ; GCN-NEXT: %sqrt:_(s64) = contract afn G_FSQRT %x + ; GCN-NEXT: %neg_one:_(s64) = G_FCONSTANT double -1.000000e+00 + ; GCN-NEXT: %rsq:_(s64) = contract afn G_FDIV %neg_one, %sqrt + ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s64) = G_ANYEXT %0:_(s32) + %sqrt:_(s64) = contract afn G_FSQRT %x + %neg_one:_(s64) = G_FCONSTANT double -1.0 + %rsq:_(s64) = contract afn G_FDIV %neg_one, %sqrt + %ext:_(s32) = G_TRUNC %rsq:_(s64) + $vgpr0 = COPY %ext + +... 
+ + +--- +name: rsq_fract_num_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_fract_num_f16 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %fract:_(s16) = G_FCONSTANT half 0xH3800 + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16) + ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %fract + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %fract:_(s16) = G_FCONSTANT half 0.5 + %rsq:_(s16) = contract G_FDIV %fract, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... + +--- +name: neg_rsq_fract_num_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: neg_rsq_fract_num_f16 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %neg_fract:_(s16) = G_FCONSTANT half 0xHB800 + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16) + ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %neg_fract + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %neg_fract:_(s16) = G_FCONSTANT half -0.5 + %rsq:_(s16) = contract G_FDIV %neg_fract, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + + +... 
+ +--- +name: rsq_large_num_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: rsq_large_num_f16 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %ten:_(s16) = G_FCONSTANT half 0xH4900 + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16) + ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %ten + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %ten:_(s16) = G_FCONSTANT half 10.0 + %rsq:_(s16) = contract G_FDIV %ten, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... + +--- +name: neg_rsq_large_num_f16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: neg_rsq_large_num_f16 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32) + ; GCN-NEXT: %neg_ten:_(s16) = G_FCONSTANT half 0xHC900 + ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16) + ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %neg_ten + ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16) + ; GCN-NEXT: $vgpr0 = COPY %ext(s32) + %0:_(s32) = COPY $vgpr0 + %x:_(s16) = G_TRUNC %0:_(s32) + %sqrt:_(s16) = contract G_FSQRT %x + %neg_ten:_(s16) = G_FCONSTANT half -10.0 + %rsq:_(s16) = contract G_FDIV %neg_ten, %sqrt + %ext:_(s32) = G_ANYEXT %rsq:_(s16) + $vgpr0 = COPY %ext + +... From fde344aef20bc4280f01294ac6e14a5c2db2d572 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Thu, 22 Feb 2024 09:55:50 +0100 Subject: [PATCH 012/546] [mlir][Transforms] Dialect conversion: Improve signature conversion API (#81997) This commit improves the block signature conversion API of the dialect conversion. 
There is the following comment in `ArgConverter::applySignatureConversion`: ``` // If no arguments are being changed or added, there is nothing to do. ``` However, the implementation actually used to replace a block with a new block even if the block argument types do not change (i.e., there is "nothing to do"). This is fixed in this commit. The documentation of the public `ConversionPatternRewriter` API is updated accordingly. This commit also removes a check that used to *sometimes* skip a block signature conversion if the block was already converted. This is not consistent with the public `ConversionPatternRewriter` API; blocks should always be converted, regardless of whether they were already converted or not. Block signature conversion also used to be silently skipped when the specified block was detached. Instead of silently skipping, an assertion is triggered. Attempting to convert a detached block (which is likely an erased block) is invalid API usage. --- mlir/include/mlir/Transforms/DialectConversion.h | 12 +++++++++--- mlir/lib/Transforms/Utils/DialectConversion.cpp | 10 +++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 0d7722aa07ee3..2575be4cdea1a 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -663,6 +663,8 @@ class ConversionPatternRewriter final : public PatternRewriter { /// Apply a signature conversion to the entry block of the given region. This /// replaces the entry block with a new block containing the updated /// signature. The new entry block to the region is returned for convenience. + /// If no block argument types are changing, the entry original block will be + /// left in place and returned. /// /// If provided, `converter` will be used for any materializations. 
Block * @@ -671,8 +673,11 @@ class ConversionPatternRewriter final : public PatternRewriter { const TypeConverter *converter = nullptr); /// Convert the types of block arguments within the given region. This - /// replaces each block with a new block containing the updated signature. The - /// entry block may have a special conversion if `entryConversion` is + /// replaces each block with a new block containing the updated signature. If + /// an updated signature would match the current signature, the respective + /// block is left in place as is. + /// + /// The entry block may have a special conversion if `entryConversion` is /// provided. On success, the new entry block to the region is returned for /// convenience. Otherwise, failure is returned. FailureOr convertRegionTypes( @@ -681,7 +686,8 @@ class ConversionPatternRewriter final : public PatternRewriter { /// Convert the types of block arguments within the given region except for /// the entry region. This replaces each non-entry block with a new block - /// containing the updated signature. + /// containing the updated signature. If an updated signature would match the + /// current signature, the respective block is left in place as is. /// /// If special conversion behavior is needed for the non-entry blocks (for /// example, we need to convert only a subset of a BB arguments), such diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 4989ddc3ec94f..afdd31a748c8c 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -544,12 +544,8 @@ FailureOr ArgConverter::convertSignature( Block *block, const TypeConverter *converter, ConversionValueMapping &mapping, SmallVectorImpl &argReplacements) { - // Check if the block was already converted. - // * If the block is mapped in `conversionInfo`, it is a converted block. 
- // * If the block is detached, conservatively assume that it is going to be - // deleted; it is likely the old block (before it was converted). - if (conversionInfo.count(block) || !block->getParent()) - return block; + assert(block->getParent() && "cannot convert signature of detached block"); + // If a converter wasn't provided, and the block wasn't already converted, // there is nothing we can do. if (!converter) @@ -570,7 +566,7 @@ Block *ArgConverter::applySignatureConversion( // If no arguments are being changed or added, there is nothing to do. unsigned origArgCount = block->getNumArguments(); auto convertedTypes = signatureConversion.getConvertedTypes(); - if (origArgCount == 0 && convertedTypes.empty()) + if (llvm::equal(block->getArgumentTypes(), convertedTypes)) return block; // Split the block at the beginning to get a new block to use for the updated From 25e7e8d993f12f391ad90d23b5c3e2385ebafc81 Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Tue, 20 Feb 2024 22:13:46 +0100 Subject: [PATCH 013/546] [CGP] Permit tail call optimization on undefined return value We may freely allow tail call optzs on undef values as well. Fixes: https://github.com/llvm/llvm-project/issues/82387. --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 5 +- llvm/test/CodeGen/AArch64/addsub.ll | 6 +- .../CodeGen/AArch64/callbr-asm-obj-file.ll | 2 +- llvm/test/CodeGen/RISCV/pr51206.ll | 12 ++-- llvm/test/CodeGen/X86/tailcall-cgp-dup.ll | 58 ++++++++++++++++++- 5 files changed, 66 insertions(+), 17 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 4036f18dbc679..feefe87f40636 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2686,8 +2686,9 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, attributesPermitTailCall(F, CI, RetI, *TLI)) { // Either we return void or the return value must be the first // argument of a known intrinsic or library function. 
- if (!V || (isIntrinsicOrLFToBeTailCalled(TLInfo, CI) && - V == CI->getArgOperand(0))) { + if (!V || isa(V) || + (isIntrinsicOrLFToBeTailCalled(TLInfo, CI) && + V == CI->getArgOperand(0))) { TailCallBBs.push_back(Pred); } } diff --git a/llvm/test/CodeGen/AArch64/addsub.ll b/llvm/test/CodeGen/AArch64/addsub.ll index 1b86fe6c707c8..20215fe914692 100644 --- a/llvm/test/CodeGen/AArch64/addsub.ll +++ b/llvm/test/CodeGen/AArch64/addsub.ll @@ -662,17 +662,13 @@ define dso_local i32 @_extract_crng_crng() { ; CHECK-NEXT: cmn x8, #1272 ; CHECK-NEXT: b.pl .LBB36_3 ; CHECK-NEXT: .LBB36_2: // %if.then -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: adrp x8, primary_crng ; CHECK-NEXT: ldr w8, [x8, :lo12:primary_crng] ; CHECK-NEXT: cmp w8, #0 ; CHECK-NEXT: adrp x8, input_pool ; CHECK-NEXT: add x8, x8, :lo12:input_pool ; CHECK-NEXT: csel x0, xzr, x8, eq -; CHECK-NEXT: bl crng_reseed -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: b crng_reseed ; CHECK-NEXT: .LBB36_3: // %if.end ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll b/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll index 94041bf00218c..e601f03d524a4 100644 --- a/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll +++ b/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll @@ -40,7 +40,7 @@ declare dso_local i32 @g(...) local_unnamed_addr declare dso_local i32 @i(...) 
local_unnamed_addr ; CHECK-LABEL: : -; CHECK: bl {{.*}} +; CHECK: b {{.*}} ; CHECK-LABEL: <$d.5>: ; CHECK-LABEL: <$x.6>: ; CHECK-NEXT: b {{.*}} diff --git a/llvm/test/CodeGen/RISCV/pr51206.ll b/llvm/test/CodeGen/RISCV/pr51206.ll index f54031af0de5e..8aa145f6ac5ef 100644 --- a/llvm/test/CodeGen/RISCV/pr51206.ll +++ b/llvm/test/CodeGen/RISCV/pr51206.ll @@ -27,16 +27,12 @@ define signext i32 @wobble() nounwind { ; CHECK-NEXT: lui a2, %hi(global.3) ; CHECK-NEXT: li a3, 5 ; CHECK-NEXT: sw a1, %lo(global.3)(a2) -; CHECK-NEXT: bltu a0, a3, .LBB0_2 -; CHECK-NEXT: # %bb.1: # %bb10 -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: call quux -; CHECK-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .LBB0_2: # %bb12 +; CHECK-NEXT: bgeu a0, a3, .LBB0_2 +; CHECK-NEXT: # %bb.1: # %bb12 ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_2: # %bb10 +; CHECK-NEXT: tail quux bb: %tmp = load i8, ptr @global, align 1 %tmp1 = zext i8 %tmp to i32 diff --git a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll index 401ed9f7bc5a9..8a9ee60f341c2 100644 --- a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll +++ b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll @@ -339,7 +339,7 @@ return: define ptr @strcpy_illegal_tailc(ptr %dest, i64 %sz, ptr readonly returned %src) nounwind { ; CHECK-LABEL: strcpy_illegal_tailc: -; CHECK: ## %bb.0: +; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movq %rdx, %rbx ; CHECK-NEXT: testq %rsi, %rsi @@ -351,6 +351,7 @@ define ptr @strcpy_illegal_tailc(ptr %dest, i64 %sz, ptr readonly returned %src) ; CHECK-NEXT: movq %rbx, %rax ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq +entry: %cmp = icmp eq i64 %sz, 0 br i1 %cmp, label %return, label %if.then @@ -362,8 +363,63 @@ return: ret ptr %src } +@i = global i32 0, align 4 + +define i32 @undef_tailc() nounwind { +; CHECK-LABEL: undef_tailc: +; CHECK: ## %bb.0: ## %entry +; 
CHECK-NEXT: cmpl $0, _i(%rip) +; CHECK-NEXT: jne _qux ## TAILCALL +; CHECK-NEXT: ## %bb.1: ## %return +; CHECK-NEXT: retq +entry: + %val = load i32, ptr @i, align 4 + %cmp = icmp eq i32 %val, 0 + br i1 %cmp, label %return, label %if.then + +if.then: + %rv_unused = tail call i32 @qux() + br label %return + +return: + ret i32 undef +} + +define i32 @undef_and_known_tailc() nounwind { +; CHECK-LABEL: undef_and_known_tailc: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: movl _i(%rip), %eax +; CHECK-NEXT: cmpl $5, %eax +; CHECK-NEXT: je _qux ## TAILCALL +; CHECK-NEXT: ## %bb.1: ## %entry +; CHECK-NEXT: cmpl $2, %eax +; CHECK-NEXT: je _quux ## TAILCALL +; CHECK-NEXT: ## %bb.2: ## %return +; CHECK-NEXT: retq +entry: + %val = load i32, ptr @i, align 4 + switch i32 %val, label %return [ + i32 2, label %case_2 + i32 5, label %case_5 + ] + +case_2: + %rv_unused = tail call i32 @quux() + br label %return + +case_5: + %rv = tail call i32 @qux() + br label %return + +return: + %phi = phi i32 [ undef, %case_2 ], [ %rv, %case_5 ], [ undef, %entry ] + ret i32 %phi +} + declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1) declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1) declare noalias ptr @malloc(i64) declare ptr @strcpy(ptr noalias returned writeonly, ptr noalias nocapture readonly) declare ptr @baz(ptr, ptr) +declare i32 @qux() +declare i32 @quux() From c5253aa136ac6ba683b367b2bae0dde1a543d1df Mon Sep 17 00:00:00 2001 From: CarolineConcatto Date: Thu, 22 Feb 2024 09:19:48 +0000 Subject: [PATCH 014/546] [AArch64] Restore Z-registers before P-registers (#79623) (#82492) This is needed by PR#77665[1] that uses a P-register while restoring Z-registers. The reverse for SVE register restore in the epilogue was added to guarantee performance, but further work was done to improve sve frame restore and besides that the schedule also may change the order of the restore, undoing the reverse restore. 
This also fix the problem reported in (PR #79623) on Windows with std::reverse and .base(). [1]https://github.com/llvm/llvm-project/pull/77665 --- .../Target/AArch64/AArch64FrameLowering.cpp | 19 ++-- .../framelayout-sve-calleesaves-fix.mir | 2 +- llvm/test/CodeGen/AArch64/framelayout-sve.mir | 24 ++--- .../sme-streaming-compatible-interface.ll | 32 +++---- .../AArch64/sme-streaming-interface.ll | 32 +++---- .../CodeGen/AArch64/sme2-intrinsics-ld1.ll | 32 +++---- .../CodeGen/AArch64/sme2-intrinsics-ldnt1.ll | 32 +++---- .../test/CodeGen/AArch64/stack-probing-sve.ll | 4 +- llvm/test/CodeGen/AArch64/sve-alloca.ll | 16 ++-- .../AArch64/sve-calling-convention-mixed.ll | 32 +++---- llvm/test/CodeGen/AArch64/sve-tailcall.ll | 32 +++---- llvm/test/CodeGen/AArch64/unwind-preserved.ll | 96 +++++++++---------- 12 files changed, 177 insertions(+), 176 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 3485edb69c910..503b1c199650f 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -3195,11 +3195,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( return MIB->getIterator(); }; - // SVE objects are always restored in reverse order. 
- for (const RegPairInfo &RPI : reverse(RegPairs)) - if (RPI.isScalable()) - EmitMI(RPI); - if (homogeneousPrologEpilog(MF, &MBB)) { auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog)) .setMIFlag(MachineInstr::FrameDestroy); @@ -3210,11 +3205,19 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( return true; } + // For performance reasons restore SVE register in increasing order + auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; }; + auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR); + auto PPREnd = std::find_if_not(PPRBegin, RegPairs.end(), IsPPR); + std::reverse(PPRBegin, PPREnd); + auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; }; + auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR); + auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR); + std::reverse(ZPRBegin, ZPREnd); + if (ReverseCSRRestoreSeq) { MachineBasicBlock::iterator First = MBB.end(); for (const RegPairInfo &RPI : reverse(RegPairs)) { - if (RPI.isScalable()) - continue; MachineBasicBlock::iterator It = EmitMI(RPI); if (First == MBB.end()) First = It; @@ -3223,8 +3226,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( MBB.splice(MBBI, &MBB, First); } else { for (const RegPairInfo &RPI : RegPairs) { - if (RPI.isScalable()) - continue; (void)EmitMI(RPI); } } diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir index 3dba21d59b408..aed3145073619 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir @@ -19,8 +19,8 @@ ; CHECK-NEXT: // implicit-def: $p4 ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG - ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, 
#1, mul vl] // 16-byte Folded Reload + ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: .cfi_restore z8 diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir index 213d7919e4a72..f7920e595e44b 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir @@ -772,9 +772,9 @@ body: | # CHECK: $sp = frame-destroy ADDXri $sp, 32, 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 -# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0 +# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0 # CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 -# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 +# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 @@ -873,14 +873,14 @@ body: | # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 # CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 -# CHECK: $p15 = frame-destroy LDR_PXI $sp, 4 -# CHECK: $p14 = frame-destroy LDR_PXI $sp, 5 -# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 -# CHECK: $p4 = frame-destroy LDR_PXI $sp, 15 # CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2 # CHECK: $z22 = frame-destroy LDR_ZXI $sp, 3 # CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16 # CHECK: $z8 = frame-destroy LDR_ZXI $sp, 17 +# CHECK: $p15 = frame-destroy LDR_PXI $sp, 4 +# CHECK: $p14 = frame-destroy LDR_PXI $sp, 5 +# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 +# CHECK: $p4 = frame-destroy LDR_PXI $sp, 15 # CHECK: $sp = 
frame-destroy ADDVL_XXI $sp, 18 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 32 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 @@ -1037,14 +1037,14 @@ body: | # CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]] # CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18 +# CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2 +# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3 +# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16 +# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17 # CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4 # CHECK-NEXT: $p14 = frame-destroy LDR_PXI $sp, 5 # CHECK: $p5 = frame-destroy LDR_PXI $sp, 14 # CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 15 -# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2 -# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3 -# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16 -# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z9 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z10 @@ -1198,10 +1198,10 @@ body: | # CHECK: $sp = frame-destroy ADDVL_XXI $sp, 7 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 -# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 6 -# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7 # CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 1 # CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 +# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 6 +# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7 # CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index 296f2be9cfee5..6d2abf7e18419 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ 
b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -226,30 +226,30 @@ define @streaming_compatible_with_scalable_vectors( @streaming_compatible_with_predicate_vectors( @smstart_clobber_sve( %x) nounwind { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul 
vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload @@ -267,30 +267,30 @@ define @smstart_clobber_sve_duplicate( %x) ; CHECK-NEXT: smstop sm ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 
2-byte Folded Reload ; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll index b7119fc082567..ea7808d73093e 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll @@ -129,7 +129,6 @@ define @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -611,6 +610,7 @@ define 
@ld1_x2_i16_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -751,7 +751,6 @@ define @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -922,6 +921,7 @@ define @ld1_x2_i32_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1062,7 +1062,6 @@ define @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1233,6 +1232,7 @@ define @ld1_x2_i64_z0_z8_scalar( %unused, < ; 
CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1380,7 +1380,6 @@ define @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1560,6 +1559,7 @@ define @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1711,7 +1711,6 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1726,6 +1725,7 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte 
Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1877,7 +1877,6 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1892,6 +1891,7 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -2043,7 +2043,6 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -2058,6 +2057,7 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] 
// 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -2209,7 +2209,6 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -2224,6 +2223,7 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -2375,7 +2375,6 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -2390,6 +2389,7 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, 
sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -2541,7 +2541,6 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -2556,6 +2555,7 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll index 1fb251a4f628e..7e2d28fbf7982 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll @@ -82,7 +82,6 @@ define @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -206,6 +205,7 @@ define @ldnt1_x2_i8_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 
16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -299,7 +299,6 @@ define @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8_scalar( %unused ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -423,6 +422,7 @@ define @ldnt1_x2_i16_z0_z8_scalar( %unused ; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -516,7 +516,6 @@ define @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -640,6 +639,7 @@ define @ldnt1_x2_i32_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; 
CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -733,7 +733,6 @@ define @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -857,6 +856,7 @@ define @ldnt1_x2_i64_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -955,7 +955,6 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -970,6 +969,7 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1071,7 +1071,6 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1086,6 +1085,7 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1188,7 +1188,6 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1203,6 +1202,7 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr 
x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1304,7 +1304,6 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1319,6 +1318,7 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1421,7 +1421,6 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1436,6 +1435,7 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1537,7 +1537,6 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; 
CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1552,6 +1551,7 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1654,7 +1654,6 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1669,6 +1668,7 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret @@ -1770,7 +1770,6 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; 
CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -1785,6 +1784,7 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CONTIGUOUS-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll index 1ad78709d5012..56d865ef83e6b 100644 --- a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll +++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll @@ -380,7 +380,6 @@ define void @sve_16v_1p_csr( %a) #0 { ; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP -; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload @@ -397,6 +396,7 @@ define void @sve_16v_1p_csr( %a) #0 { ; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #17 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: .cfi_restore z8 @@ -697,10 +697,10 @@ define void @sve_unprobed_area( %a, i32 %n) #0 { ; CHECK-NEXT: 
//NO_APP ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG -; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #4 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: .cfi_restore z8 diff --git a/llvm/test/CodeGen/AArch64/sve-alloca.ll b/llvm/test/CodeGen/AArch64/sve-alloca.ll index 47e49b84aaaff..d227538043fce 100644 --- a/llvm/test/CodeGen/AArch64/sve-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-alloca.ll @@ -66,30 +66,30 @@ define void @foo( %dst, i1 %cond) { ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: bl bar ; CHECK-NEXT: addvl sp, x29, #-18 -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded 
Reload ; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll index 9851583b950eb..3965af6a9066d 100644 --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -567,30 +567,30 @@ define @sve_caller_non_sve_callee_high_range( @sve_ret_caller_non_sve_callee_high_range() { ; CHECK-NEXT: fmov s7, #7.00000000 ; CHECK-NEXT: bl non_sve_callee_high_range ; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul 
vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr 
p5, [sp, #14, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-tailcall.ll b/llvm/test/CodeGen/AArch64/sve-tailcall.ll index f32c80d392b63..4ddf007768fd2 100644 --- a/llvm/test/CodeGen/AArch64/sve-tailcall.ll +++ b/llvm/test/CodeGen/AArch64/sve-tailcall.ll @@ -83,30 +83,30 @@ define i32 @sve_caller_non_sve_callee( %arg) nounwind { ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: bl non_sve_callee -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload 
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload @@ -158,30 +158,30 @@ define i32 @sve_caller_non_sve_callee_fastcc( %arg) nounwind { ; CHECK-NEXT: //APP ; CHECK-NEXT: //NO_APP ; CHECK-NEXT: bl non_sve_callee -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr 
z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/unwind-preserved.ll b/llvm/test/CodeGen/AArch64/unwind-preserved.ll index f3c4d217e6fca..822be14faaeb1 100644 --- a/llvm/test/CodeGen/AArch64/unwind-preserved.ll +++ b/llvm/test/CodeGen/AArch64/unwind-preserved.ll @@ -63,18 +63,6 @@ define @invoke_callee_may_throw_sve( %v) uw ; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul 
vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -91,6 +79,18 @@ define @invoke_callee_may_throw_sve( %v) uw ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: .cfi_restore z8 @@ -112,18 +112,6 @@ define @invoke_callee_may_throw_sve( %v) uw ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 
0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG -; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -140,6 +128,18 @@ define @invoke_callee_may_throw_sve( %v) uw ; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: 
ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: .cfi_restore z8 @@ -215,18 +215,6 @@ define @invoke_callee_may_throw_sve( %v) uw ; GISEL-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: addvl sp, sp, #2 ; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG -; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -243,6 +231,18 @@ define @invoke_callee_may_throw_sve( %v) uw ; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded 
Reload +; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; GISEL-NEXT: addvl sp, sp, #18 ; GISEL-NEXT: .cfi_def_cfa wsp, 16 ; GISEL-NEXT: .cfi_restore z8 @@ -264,18 +264,6 @@ define @invoke_callee_may_throw_sve( %v) uw ; GISEL-NEXT: ldr z0, [sp] // 16-byte Folded Reload ; GISEL-NEXT: addvl sp, sp, #2 ; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG -; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload @@ -292,6 +280,18 @@ define @invoke_callee_may_throw_sve( %v) uw ; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 
16-byte Folded Reload ; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload ; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload ; GISEL-NEXT: addvl sp, sp, #18 ; GISEL-NEXT: .cfi_def_cfa wsp, 16 ; GISEL-NEXT: .cfi_restore z8 From 55558cd05c998f1b287b0af97aa6db0db0bdfaa0 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Thu, 22 Feb 2024 10:22:27 +0100 Subject: [PATCH 015/546] [mlir][Transforms][NFC] Turn block type conversion into `IRRewrite` (#81756) This commit is a refactoring of the dialect conversion. The dialect conversion maintains a list of "IR rewrites" that can be committed (upon success) or rolled back (upon failure). Until now, the signature conversion of a block was only a "partial" IR rewrite. Rollbacks were triggered via `BlockTypeConversionRewrite::rollback`, but there was no `BlockTypeConversionRewrite::commit` equivalent. Overview of changes: * Remove `ArgConverter`, an internal helper class that kept track of all block type conversions. There is now a separate `BlockTypeConversionRewrite` for each block type conversion. * No more special handling for block type conversions. 
They are now normal "IR rewrites", just like "block creation" or "block movement". In particular, trigger "commits" of block type conversion via `BlockTypeConversionRewrite::commit`. * Remove `ArgConverter::notifyOpRemoved`. This function was used to inform the `ArgConverter` that an operation was erased, to prevent a double-free of operations in certain situations. It would be unpractical to add a `notifyOpRemoved` API to `IRRewrite`. Instead, erasing ops/block should go through a new `SingleEraseRewriter` (that is owned by the `ConversionPatternRewriterImpl`) if there is chance of double-free. This rewriter ignores `eraseOp`/`eraseBlock` if the op/block was already freed. --- .../Transforms/Utils/DialectConversion.cpp | 794 ++++++++---------- 1 file changed, 364 insertions(+), 430 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index afdd31a748c8c..db41b9f19e7e8 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -154,12 +154,13 @@ namespace { struct RewriterState { RewriterState(unsigned numCreatedOps, unsigned numUnresolvedMaterializations, unsigned numReplacements, unsigned numArgReplacements, - unsigned numRewrites, unsigned numIgnoredOperations) + unsigned numRewrites, unsigned numIgnoredOperations, + unsigned numErased) : numCreatedOps(numCreatedOps), numUnresolvedMaterializations(numUnresolvedMaterializations), numReplacements(numReplacements), numArgReplacements(numArgReplacements), numRewrites(numRewrites), - numIgnoredOperations(numIgnoredOperations) {} + numIgnoredOperations(numIgnoredOperations), numErased(numErased) {} /// The current number of created operations. unsigned numCreatedOps; @@ -178,6 +179,9 @@ struct RewriterState { /// The current number of ignored operations. unsigned numIgnoredOperations; + + /// The current number of erased operations/blocks. 
+ unsigned numErased; }; //===----------------------------------------------------------------------===// @@ -292,370 +296,6 @@ static Value buildUnresolvedTargetMaterialization( outputType, outputType, converter, unresolvedMaterializations); } -//===----------------------------------------------------------------------===// -// ArgConverter -//===----------------------------------------------------------------------===// -namespace { -/// This class provides a simple interface for converting the types of block -/// arguments. This is done by creating a new block that contains the new legal -/// types and extracting the block that contains the old illegal types to allow -/// for undoing pending rewrites in the case of failure. -struct ArgConverter { - ArgConverter( - PatternRewriter &rewriter, - SmallVectorImpl &unresolvedMaterializations) - : rewriter(rewriter), - unresolvedMaterializations(unresolvedMaterializations) {} - - /// This structure contains the information pertaining to an argument that has - /// been converted. - struct ConvertedArgInfo { - ConvertedArgInfo(unsigned newArgIdx, unsigned newArgSize, - Value castValue = nullptr) - : newArgIdx(newArgIdx), newArgSize(newArgSize), castValue(castValue) {} - - /// The start index of in the new argument list that contains arguments that - /// replace the original. - unsigned newArgIdx; - - /// The number of arguments that replaced the original argument. - unsigned newArgSize; - - /// The cast value that was created to cast from the new arguments to the - /// old. This only used if 'newArgSize' > 1. - Value castValue; - }; - - /// This structure contains information pertaining to a block that has had its - /// signature converted. - struct ConvertedBlockInfo { - ConvertedBlockInfo(Block *origBlock, const TypeConverter *converter) - : origBlock(origBlock), converter(converter) {} - - /// The original block that was requested to have its signature converted. 
- Block *origBlock; - - /// The conversion information for each of the arguments. The information is - /// std::nullopt if the argument was dropped during conversion. - SmallVector, 1> argInfo; - - /// The type converter used to convert the arguments. - const TypeConverter *converter; - }; - - //===--------------------------------------------------------------------===// - // Rewrite Application - //===--------------------------------------------------------------------===// - - /// Erase any rewrites registered for the blocks within the given operation - /// which is about to be removed. This merely drops the rewrites without - /// undoing them. - void notifyOpRemoved(Operation *op); - - /// Cleanup and undo any generated conversions for the arguments of block. - /// This method replaces the new block with the original, reverting the IR to - /// its original state. - void discardRewrites(Block *block); - - /// Fully replace uses of the old arguments with the new. - void applyRewrites(ConversionValueMapping &mapping); - - /// Materialize any necessary conversions for converted arguments that have - /// live users, using the provided `findLiveUser` to search for a user that - /// survives the conversion process. - LogicalResult - materializeLiveConversions(ConversionValueMapping &mapping, - OpBuilder &builder, - function_ref findLiveUser); - - //===--------------------------------------------------------------------===// - // Conversion - //===--------------------------------------------------------------------===// - - /// Attempt to convert the signature of the given block, if successful a new - /// block is returned containing the new arguments. Returns `block` if it did - /// not require conversion. - FailureOr - convertSignature(Block *block, const TypeConverter *converter, - ConversionValueMapping &mapping, - SmallVectorImpl &argReplacements); - - /// Apply the given signature conversion on the given block. 
The new block - /// containing the updated signature is returned. If no conversions were - /// necessary, e.g. if the block has no arguments, `block` is returned. - /// `converter` is used to generate any necessary cast operations that - /// translate between the origin argument types and those specified in the - /// signature conversion. - Block *applySignatureConversion( - Block *block, const TypeConverter *converter, - TypeConverter::SignatureConversion &signatureConversion, - ConversionValueMapping &mapping, - SmallVectorImpl &argReplacements); - - /// A collection of blocks that have had their arguments converted. This is a - /// map from the new replacement block, back to the original block. - llvm::MapVector conversionInfo; - - /// The pattern rewriter to use when materializing conversions. - PatternRewriter &rewriter; - - /// An ordered set of unresolved materializations during conversion. - SmallVectorImpl &unresolvedMaterializations; -}; -} // namespace - -//===----------------------------------------------------------------------===// -// Rewrite Application - -void ArgConverter::notifyOpRemoved(Operation *op) { - if (conversionInfo.empty()) - return; - - for (Region ®ion : op->getRegions()) { - for (Block &block : region) { - // Drop any rewrites from within. - for (Operation &nestedOp : block) - if (nestedOp.getNumRegions()) - notifyOpRemoved(&nestedOp); - - // Check if this block was converted. - auto *it = conversionInfo.find(&block); - if (it == conversionInfo.end()) - continue; - - // Drop all uses of the original arguments and delete the original block. 
- Block *origBlock = it->second.origBlock; - for (BlockArgument arg : origBlock->getArguments()) - arg.dropAllUses(); - conversionInfo.erase(it); - } - } -} - -void ArgConverter::discardRewrites(Block *block) { - auto *it = conversionInfo.find(block); - if (it == conversionInfo.end()) - return; - Block *origBlock = it->second.origBlock; - - // Drop all uses of the new block arguments and replace uses of the new block. - for (int i = block->getNumArguments() - 1; i >= 0; --i) - block->getArgument(i).dropAllUses(); - block->replaceAllUsesWith(origBlock); - - // Move the operations back the original block, move the original block back - // into its original location and the delete the new block. - origBlock->getOperations().splice(origBlock->end(), block->getOperations()); - block->getParent()->getBlocks().insert(Region::iterator(block), origBlock); - block->erase(); - - conversionInfo.erase(it); -} - -void ArgConverter::applyRewrites(ConversionValueMapping &mapping) { - for (auto &info : conversionInfo) { - ConvertedBlockInfo &blockInfo = info.second; - Block *origBlock = blockInfo.origBlock; - - // Process the remapping for each of the original arguments. - for (unsigned i = 0, e = origBlock->getNumArguments(); i != e; ++i) { - std::optional &argInfo = blockInfo.argInfo[i]; - BlockArgument origArg = origBlock->getArgument(i); - - // Handle the case of a 1->0 value mapping. - if (!argInfo) { - if (Value newArg = mapping.lookupOrNull(origArg, origArg.getType())) - origArg.replaceAllUsesWith(newArg); - continue; - } - - // Otherwise this is a 1->1+ value mapping. - Value castValue = argInfo->castValue; - assert(argInfo->newArgSize >= 1 && castValue && "expected 1->1+ mapping"); - - // If the argument is still used, replace it with the generated cast. 
- if (!origArg.use_empty()) { - origArg.replaceAllUsesWith( - mapping.lookupOrDefault(castValue, origArg.getType())); - } - } - - delete origBlock; - blockInfo.origBlock = nullptr; - } -} - -LogicalResult ArgConverter::materializeLiveConversions( - ConversionValueMapping &mapping, OpBuilder &builder, - function_ref findLiveUser) { - for (auto &info : conversionInfo) { - Block *newBlock = info.first; - ConvertedBlockInfo &blockInfo = info.second; - Block *origBlock = blockInfo.origBlock; - - // Process the remapping for each of the original arguments. - for (unsigned i = 0, e = origBlock->getNumArguments(); i != e; ++i) { - // If the type of this argument changed and the argument is still live, we - // need to materialize a conversion. - BlockArgument origArg = origBlock->getArgument(i); - if (mapping.lookupOrNull(origArg, origArg.getType())) - continue; - Operation *liveUser = findLiveUser(origArg); - if (!liveUser) - continue; - - Value replacementValue = mapping.lookupOrDefault(origArg); - bool isDroppedArg = replacementValue == origArg; - if (isDroppedArg) - rewriter.setInsertionPointToStart(newBlock); - else - rewriter.setInsertionPointAfterValue(replacementValue); - Value newArg; - if (blockInfo.converter) { - newArg = blockInfo.converter->materializeSourceConversion( - rewriter, origArg.getLoc(), origArg.getType(), - isDroppedArg ? 
ValueRange() : ValueRange(replacementValue)); - assert((!newArg || newArg.getType() == origArg.getType()) && - "materialization hook did not provide a value of the expected " - "type"); - } - if (!newArg) { - InFlightDiagnostic diag = - emitError(origArg.getLoc()) - << "failed to materialize conversion for block argument #" << i - << " that remained live after conversion, type was " - << origArg.getType(); - if (!isDroppedArg) - diag << ", with target type " << replacementValue.getType(); - diag.attachNote(liveUser->getLoc()) - << "see existing live user here: " << *liveUser; - return failure(); - } - mapping.map(origArg, newArg); - } - } - return success(); -} - -//===----------------------------------------------------------------------===// -// Conversion - -FailureOr ArgConverter::convertSignature( - Block *block, const TypeConverter *converter, - ConversionValueMapping &mapping, - SmallVectorImpl &argReplacements) { - assert(block->getParent() && "cannot convert signature of detached block"); - - // If a converter wasn't provided, and the block wasn't already converted, - // there is nothing we can do. - if (!converter) - return failure(); - - // Try to convert the signature for the block with the provided converter. - if (auto conversion = converter->convertBlockSignature(block)) - return applySignatureConversion(block, converter, *conversion, mapping, - argReplacements); - return failure(); -} - -Block *ArgConverter::applySignatureConversion( - Block *block, const TypeConverter *converter, - TypeConverter::SignatureConversion &signatureConversion, - ConversionValueMapping &mapping, - SmallVectorImpl &argReplacements) { - // If no arguments are being changed or added, there is nothing to do. 
- unsigned origArgCount = block->getNumArguments(); - auto convertedTypes = signatureConversion.getConvertedTypes(); - if (llvm::equal(block->getArgumentTypes(), convertedTypes)) - return block; - - // Split the block at the beginning to get a new block to use for the updated - // signature. - Block *newBlock = block->splitBlock(block->begin()); - block->replaceAllUsesWith(newBlock); - // Unlink the block, but do not erase it yet, so that the change can be rolled - // back. - block->getParent()->getBlocks().remove(block); - - // Map all new arguments to the location of the argument they originate from. - SmallVector newLocs(convertedTypes.size(), - rewriter.getUnknownLoc()); - for (unsigned i = 0; i < origArgCount; ++i) { - auto inputMap = signatureConversion.getInputMapping(i); - if (!inputMap || inputMap->replacementValue) - continue; - Location origLoc = block->getArgument(i).getLoc(); - for (unsigned j = 0; j < inputMap->size; ++j) - newLocs[inputMap->inputNo + j] = origLoc; - } - - SmallVector newArgRange( - newBlock->addArguments(convertedTypes, newLocs)); - ArrayRef newArgs(newArgRange); - - // Remap each of the original arguments as determined by the signature - // conversion. - ConvertedBlockInfo info(block, converter); - info.argInfo.resize(origArgCount); - - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(newBlock); - for (unsigned i = 0; i != origArgCount; ++i) { - auto inputMap = signatureConversion.getInputMapping(i); - if (!inputMap) - continue; - BlockArgument origArg = block->getArgument(i); - - // If inputMap->replacementValue is not nullptr, then the argument is - // dropped and a replacement value is provided to be the remappedValue. 
- if (inputMap->replacementValue) { - assert(inputMap->size == 0 && - "invalid to provide a replacement value when the argument isn't " - "dropped"); - mapping.map(origArg, inputMap->replacementValue); - argReplacements.push_back(origArg); - continue; - } - - // Otherwise, this is a 1->1+ mapping. - auto replArgs = newArgs.slice(inputMap->inputNo, inputMap->size); - Value newArg; - - // If this is a 1->1 mapping and the types of new and replacement arguments - // match (i.e. it's an identity map), then the argument is mapped to its - // original type. - // FIXME: We simply pass through the replacement argument if there wasn't a - // converter, which isn't great as it allows implicit type conversions to - // appear. We should properly restructure this code to handle cases where a - // converter isn't provided and also to properly handle the case where an - // argument materialization is actually a temporary source materialization - // (e.g. in the case of 1->N). - if (replArgs.size() == 1 && - (!converter || replArgs[0].getType() == origArg.getType())) { - newArg = replArgs.front(); - } else { - Type origOutputType = origArg.getType(); - - // Legalize the argument output type. 
- Type outputType = origOutputType; - if (Type legalOutputType = converter->convertType(outputType)) - outputType = legalOutputType; - - newArg = buildUnresolvedArgumentMaterialization( - rewriter, origArg.getLoc(), replArgs, origOutputType, outputType, - converter, unresolvedMaterializations); - } - - mapping.map(origArg, newArg); - argReplacements.push_back(origArg); - info.argInfo[i] = - ConvertedArgInfo(inputMap->inputNo, inputMap->size, newArg); - } - - conversionInfo.insert({newBlock, std::move(info)}); - return newBlock; -} - //===----------------------------------------------------------------------===// // IR rewrites //===----------------------------------------------------------------------===// @@ -702,6 +342,12 @@ class IRRewrite { IRRewrite(Kind kind, ConversionPatternRewriterImpl &rewriterImpl) : kind(kind), rewriterImpl(rewriterImpl) {} + /// Erase the given op (unless it was already erased). + void eraseOp(Operation *op); + + /// Erase the given block (unless it was already erased). + void eraseBlock(Block *block); + const Kind kind; ConversionPatternRewriterImpl &rewriterImpl; }; @@ -744,8 +390,7 @@ class CreateBlockRewrite : public BlockRewrite { auto &blockOps = block->getOperations(); while (!blockOps.empty()) blockOps.remove(blockOps.begin()); - block->dropAllDefinedValueUses(); - block->erase(); + eraseBlock(block); } }; @@ -881,8 +526,7 @@ class SplitBlockRewrite : public BlockRewrite { // Merge back the block that was split out. originalBlock->getOperations().splice(originalBlock->end(), block->getOperations()); - block->dropAllDefinedValueUses(); - block->erase(); + eraseBlock(block); } private: @@ -890,20 +534,59 @@ class SplitBlockRewrite : public BlockRewrite { Block *originalBlock; }; +/// This structure contains the information pertaining to an argument that has +/// been converted. 
+struct ConvertedArgInfo { + ConvertedArgInfo(unsigned newArgIdx, unsigned newArgSize, + Value castValue = nullptr) + : newArgIdx(newArgIdx), newArgSize(newArgSize), castValue(castValue) {} + + /// The start index of in the new argument list that contains arguments that + /// replace the original. + unsigned newArgIdx; + + /// The number of arguments that replaced the original argument. + unsigned newArgSize; + + /// The cast value that was created to cast from the new arguments to the + /// old. This only used if 'newArgSize' > 1. + Value castValue; +}; + /// Block type conversion. This rewrite is partially reflected in the IR. class BlockTypeConversionRewrite : public BlockRewrite { public: - BlockTypeConversionRewrite(ConversionPatternRewriterImpl &rewriterImpl, - Block *block) - : BlockRewrite(Kind::BlockTypeConversion, rewriterImpl, block) {} + BlockTypeConversionRewrite( + ConversionPatternRewriterImpl &rewriterImpl, Block *block, + Block *origBlock, SmallVector, 1> argInfo, + const TypeConverter *converter) + : BlockRewrite(Kind::BlockTypeConversion, rewriterImpl, block), + origBlock(origBlock), argInfo(argInfo), converter(converter) {} static bool classof(const IRRewrite *rewrite) { return rewrite->getKind() == Kind::BlockTypeConversion; } - // TODO: Block type conversions are currently committed in - // `ArgConverter::applyRewrites`. This should be done in the "commit" method. + /// Materialize any necessary conversions for converted arguments that have + /// live users, using the provided `findLiveUser` to search for a user that + /// survives the conversion process. + LogicalResult + materializeLiveConversions(function_ref findLiveUser); + + void commit() override; + void rollback() override; + +private: + /// The original block that was requested to have its signature converted. + Block *origBlock; + + /// The conversion information for each of the arguments. The information is + /// std::nullopt if the argument was dropped during conversion. 
+ SmallVector, 1> argInfo; + + /// The type converter used to convert the arguments. + const TypeConverter *converter; }; /// An operation rewrite. @@ -949,8 +632,8 @@ class MoveOperationRewrite : public OperationRewrite { // The block in which this operation was previously contained. Block *block; - // The original successor of this operation before it was moved. "nullptr" if - // this operation was the only operation in the region. + // The original successor of this operation before it was moved. "nullptr" + // if this operation was the only operation in the region. Operation *insertBeforeOp; }; @@ -1027,6 +710,26 @@ static bool hasRewrite(R &&rewrites, Operation *op) { }); } +/// Find the single rewrite object of the specified type and block among the +/// given rewrites. In debug mode, asserts that there is no more than one such +/// object. Return "nullptr" if no object was found. +template +static RewriteTy *findSingleRewrite(R &&rewrites, Block *block) { + RewriteTy *result = nullptr; + for (auto &rewrite : rewrites) { + auto *rewriteTy = dyn_cast(rewrite.get()); + if (rewriteTy && rewriteTy->getBlock() == block) { +#ifndef NDEBUG + assert(!result && "expected single matching rewrite"); + result = rewriteTy; +#else + return rewriteTy; +#endif // NDEBUG + } + } + return result; +} + //===----------------------------------------------------------------------===// // ConversionPatternRewriterImpl //===----------------------------------------------------------------------===// @@ -1034,7 +737,7 @@ namespace mlir { namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { explicit ConversionPatternRewriterImpl(PatternRewriter &rewriter) - : argConverter(rewriter, unresolvedMaterializations), + : rewriter(rewriter), eraseRewriter(rewriter.getContext()), notifyCallback(nullptr) {} /// Cleanup and destroy any generated rewrite operations. 
This method is @@ -1084,15 +787,33 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// removes them from being considered for legalization. void markNestedOpsIgnored(Operation *op); + /// Detach any operations nested in the given operation from their parent + /// blocks, and erase the given operation. This can be used when the nested + /// operations are scheduled for erasure themselves, so deleting the regions + /// of the given operation together with their content would result in + /// double-free. This happens, for example, when rolling back op creation in + /// the reverse order and if the nested ops were created before the parent op. + /// This function does not need to collect nested ops recursively because it + /// is expected to also be called for each nested op when it is about to be + /// deleted. + void detachNestedAndErase(Operation *op); + //===--------------------------------------------------------------------===// // Type Conversion //===--------------------------------------------------------------------===// - /// Convert the signature of the given block. + /// Attempt to convert the signature of the given block, if successful a new + /// block is returned containing the new arguments. Returns `block` if it did + /// not require conversion. FailureOr convertBlockSignature( Block *block, const TypeConverter *converter, TypeConverter::SignatureConversion *conversion = nullptr); + /// Convert the types of non-entry block arguments within the given region. + LogicalResult convertNonEntryRegionTypes( + Region *region, const TypeConverter &converter, + ArrayRef blockConversions = {}); + /// Apply a signature conversion on the given region, using `converter` for /// materializations if not null. 
Block * @@ -1105,10 +826,15 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { convertRegionTypes(Region *region, const TypeConverter &converter, TypeConverter::SignatureConversion *entryConversion); - /// Convert the types of non-entry block arguments within the given region. - LogicalResult convertNonEntryRegionTypes( - Region *region, const TypeConverter &converter, - ArrayRef blockConversions = {}); + /// Apply the given signature conversion on the given block. The new block + /// containing the updated signature is returned. If no conversions were + /// necessary, e.g. if the block has no arguments, `block` is returned. + /// `converter` is used to generate any necessary cast operations that + /// translate between the origin argument types and those specified in the + /// signature conversion. + Block *applySignatureConversion( + Block *block, const TypeConverter *converter, + TypeConverter::SignatureConversion &signatureConversion); //===--------------------------------------------------------------------===// // Rewriter Notification Hooks @@ -1140,17 +866,54 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { notifyMatchFailure(Location loc, function_ref reasonCallback) override; + //===--------------------------------------------------------------------===// + // IR Erasure + //===--------------------------------------------------------------------===// + + /// A rewriter that keeps track of erased ops and blocks. It ensures that no + /// operation or block is erased multiple times. This rewriter assumes that + /// no new IR is created between calls to `eraseOp`/`eraseBlock`. + struct SingleEraseRewriter : public RewriterBase, RewriterBase::Listener { + public: + SingleEraseRewriter(MLIRContext *context) + : RewriterBase(context, /*listener=*/this) {} + + /// Erase the given op (unless it was already erased). 
+ void eraseOp(Operation *op) override { + if (erased.contains(op)) + return; + op->dropAllUses(); + RewriterBase::eraseOp(op); + } + + /// Erase the given block (unless it was already erased). + void eraseBlock(Block *block) override { + if (erased.contains(block)) + return; + block->dropAllDefinedValueUses(); + RewriterBase::eraseBlock(block); + } + + void notifyOperationErased(Operation *op) override { erased.insert(op); } + void notifyBlockErased(Block *block) override { erased.insert(block); } + + /// Pointers to all erased operations and blocks. + SetVector erased; + }; + //===--------------------------------------------------------------------===// // State //===--------------------------------------------------------------------===// + PatternRewriter &rewriter; + + /// This rewriter must be used for erasing ops/blocks. + SingleEraseRewriter eraseRewriter; + // Mapping between replaced values that differ in type. This happens when // replacing a value with one of a different type. ConversionValueMapping mapping; - /// Utility used to convert block arguments. - ArgConverter argConverter; - /// Ordered vector of all of the newly created operations during conversion. SmallVector createdOps; @@ -1207,20 +970,100 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { } // namespace detail } // namespace mlir +void IRRewrite::eraseOp(Operation *op) { + rewriterImpl.eraseRewriter.eraseOp(op); +} + +void IRRewrite::eraseBlock(Block *block) { + rewriterImpl.eraseRewriter.eraseBlock(block); +} + +void BlockTypeConversionRewrite::commit() { + // Process the remapping for each of the original arguments. + for (auto [origArg, info] : + llvm::zip_equal(origBlock->getArguments(), argInfo)) { + // Handle the case of a 1->0 value mapping. + if (!info) { + if (Value newArg = + rewriterImpl.mapping.lookupOrNull(origArg, origArg.getType())) + origArg.replaceAllUsesWith(newArg); + continue; + } + + // Otherwise this is a 1->1+ value mapping. 
+ Value castValue = info->castValue; + assert(info->newArgSize >= 1 && castValue && "expected 1->1+ mapping"); + + // If the argument is still used, replace it with the generated cast. + if (!origArg.use_empty()) { + origArg.replaceAllUsesWith( + rewriterImpl.mapping.lookupOrDefault(castValue, origArg.getType())); + } + } + + delete origBlock; + origBlock = nullptr; +} + void BlockTypeConversionRewrite::rollback() { - // Undo the type conversion. - rewriterImpl.argConverter.discardRewrites(block); -} - -/// Detach any operations nested in the given operation from their parent -/// blocks, and erase the given operation. This can be used when the nested -/// operations are scheduled for erasure themselves, so deleting the regions of -/// the given operation together with their content would result in double-free. -/// This happens, for example, when rolling back op creation in the reverse -/// order and if the nested ops were created before the parent op. This function -/// does not need to collect nested ops recursively because it is expected to -/// also be called for each nested op when it is about to be deleted. -static void detachNestedAndErase(Operation *op) { + // Drop all uses of the new block arguments and replace uses of the new block. + for (int i = block->getNumArguments() - 1; i >= 0; --i) + block->getArgument(i).dropAllUses(); + block->replaceAllUsesWith(origBlock); + + // Move the operations back into the original block, move the original block + // back into its original location and then delete the new block. + origBlock->getOperations().splice(origBlock->end(), block->getOperations()); + block->getParent()->getBlocks().insert(Region::iterator(block), origBlock); + eraseBlock(block); +} + +LogicalResult BlockTypeConversionRewrite::materializeLiveConversions( + function_ref findLiveUser) { + // Process the remapping for each of the original arguments. 
+ for (auto it : llvm::enumerate(origBlock->getArguments())) { + // If the type of this argument changed and the argument is still live, we + // need to materialize a conversion. + BlockArgument origArg = it.value(); + if (rewriterImpl.mapping.lookupOrNull(origArg, origArg.getType())) + continue; + Operation *liveUser = findLiveUser(origArg); + if (!liveUser) + continue; + + Value replacementValue = rewriterImpl.mapping.lookupOrDefault(origArg); + bool isDroppedArg = replacementValue == origArg; + if (isDroppedArg) + rewriterImpl.rewriter.setInsertionPointToStart(getBlock()); + else + rewriterImpl.rewriter.setInsertionPointAfterValue(replacementValue); + Value newArg; + if (converter) { + newArg = converter->materializeSourceConversion( + rewriterImpl.rewriter, origArg.getLoc(), origArg.getType(), + isDroppedArg ? ValueRange() : ValueRange(replacementValue)); + assert((!newArg || newArg.getType() == origArg.getType()) && + "materialization hook did not provide a value of the expected " + "type"); + } + if (!newArg) { + InFlightDiagnostic diag = + emitError(origArg.getLoc()) + << "failed to materialize conversion for block argument #" + << it.index() << " that remained live after conversion, type was " + << origArg.getType(); + if (!isDroppedArg) + diag << ", with target type " << replacementValue.getType(); + diag.attachNote(liveUser->getLoc()) + << "see existing live user here: " << *liveUser; + return failure(); + } + rewriterImpl.mapping.map(origArg, newArg); + } + return success(); +} + +void ConversionPatternRewriterImpl::detachNestedAndErase(Operation *op) { for (Region ®ion : op->getRegions()) { for (Block &block : region.getBlocks()) { while (!block.getOperations().empty()) @@ -1228,8 +1071,7 @@ static void detachNestedAndErase(Operation *op) { block.dropAllDefinedValueUses(); } } - op->dropAllUses(); - op->erase(); + eraseRewriter.eraseOp(op); } void ConversionPatternRewriterImpl::discardRewrites() { @@ -1248,11 +1090,6 @@ void 
ConversionPatternRewriterImpl::applyRewrites() { for (OpResult result : repl.first->getResults()) if (Value newValue = mapping.lookupOrNull(result, result.getType())) result.replaceAllUsesWith(newValue); - - // If this operation defines any regions, drop any pending argument - // rewrites. - if (repl.first->getNumRegions()) - argConverter.notifyOpRemoved(repl.first); } // Apply all of the requested argument replacements. @@ -1279,22 +1116,16 @@ void ConversionPatternRewriterImpl::applyRewrites() { // Drop all of the unresolved materialization operations created during // conversion. - for (auto &mat : unresolvedMaterializations) { - mat.getOp()->dropAllUses(); - mat.getOp()->erase(); - } + for (auto &mat : unresolvedMaterializations) + eraseRewriter.eraseOp(mat.getOp()); // In a second pass, erase all of the replaced operations in reverse. This // allows processing nested operations before their parent region is // destroyed. Because we process in reverse order, producers may be deleted // before their users (a pattern deleting a producer and then the consumer) // so we first drop all uses explicitly. - for (auto &repl : llvm::reverse(replacements)) { - repl.first->dropAllUses(); - repl.first->erase(); - } - - argConverter.applyRewrites(mapping); + for (auto &repl : llvm::reverse(replacements)) + eraseRewriter.eraseOp(repl.first); // Commit all rewrites. 
for (auto &rewrite : rewrites) @@ -1307,7 +1138,8 @@ void ConversionPatternRewriterImpl::applyRewrites() { RewriterState ConversionPatternRewriterImpl::getCurrentState() { return RewriterState(createdOps.size(), unresolvedMaterializations.size(), replacements.size(), argReplacements.size(), - rewrites.size(), ignoredOps.size()); + rewrites.size(), ignoredOps.size(), + eraseRewriter.erased.size()); } void ConversionPatternRewriterImpl::resetState(RewriterState state) { @@ -1355,6 +1187,9 @@ void ConversionPatternRewriterImpl::resetState(RewriterState state) { while (!operationsWithChangedResults.empty() && operationsWithChangedResults.back() >= state.numReplacements) operationsWithChangedResults.pop_back(); + + while (eraseRewriter.erased.size() != state.numErased) + eraseRewriter.erased.pop_back(); } void ConversionPatternRewriterImpl::undoRewrites(unsigned numRewritesToKeep) { @@ -1443,18 +1278,18 @@ void ConversionPatternRewriterImpl::markNestedOpsIgnored(Operation *op) { FailureOr ConversionPatternRewriterImpl::convertBlockSignature( Block *block, const TypeConverter *converter, TypeConverter::SignatureConversion *conversion) { - FailureOr result = - conversion ? argConverter.applySignatureConversion( - block, converter, *conversion, mapping, argReplacements) - : argConverter.convertSignature(block, converter, mapping, - argReplacements); - if (failed(result)) + if (conversion) + return applySignatureConversion(block, converter, *conversion); + + // If a converter wasn't provided, and the block wasn't already converted, + // there is nothing we can do. + if (!converter) return failure(); - if (Block *newBlock = *result) { - if (newBlock != block) - appendRewrite(newBlock); - } - return result; + + // Try to convert the signature for the block with the provided converter. 
+ if (auto conversion = converter->convertBlockSignature(block)) + return applySignatureConversion(block, converter, *conversion); + return failure(); } Block *ConversionPatternRewriterImpl::applySignatureConversion( @@ -1508,6 +1343,102 @@ LogicalResult ConversionPatternRewriterImpl::convertNonEntryRegionTypes( return success(); } +Block *ConversionPatternRewriterImpl::applySignatureConversion( + Block *block, const TypeConverter *converter, + TypeConverter::SignatureConversion &signatureConversion) { + // If no arguments are being changed or added, there is nothing to do. + unsigned origArgCount = block->getNumArguments(); + auto convertedTypes = signatureConversion.getConvertedTypes(); + if (llvm::equal(block->getArgumentTypes(), convertedTypes)) + return block; + + // Split the block at the beginning to get a new block to use for the updated + // signature. + Block *newBlock = block->splitBlock(block->begin()); + block->replaceAllUsesWith(newBlock); + // Unlink the block, but do not erase it yet, so that the change can be rolled + // back. + block->getParent()->getBlocks().remove(block); + + // Map all new arguments to the location of the argument they originate from. + SmallVector newLocs(convertedTypes.size(), + rewriter.getUnknownLoc()); + for (unsigned i = 0; i < origArgCount; ++i) { + auto inputMap = signatureConversion.getInputMapping(i); + if (!inputMap || inputMap->replacementValue) + continue; + Location origLoc = block->getArgument(i).getLoc(); + for (unsigned j = 0; j < inputMap->size; ++j) + newLocs[inputMap->inputNo + j] = origLoc; + } + + SmallVector newArgRange( + newBlock->addArguments(convertedTypes, newLocs)); + ArrayRef newArgs(newArgRange); + + // Remap each of the original arguments as determined by the signature + // conversion. 
+ SmallVector, 1> argInfo; + argInfo.resize(origArgCount); + + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(newBlock); + for (unsigned i = 0; i != origArgCount; ++i) { + auto inputMap = signatureConversion.getInputMapping(i); + if (!inputMap) + continue; + BlockArgument origArg = block->getArgument(i); + + // If inputMap->replacementValue is not nullptr, then the argument is + // dropped and a replacement value is provided to be the remappedValue. + if (inputMap->replacementValue) { + assert(inputMap->size == 0 && + "invalid to provide a replacement value when the argument isn't " + "dropped"); + mapping.map(origArg, inputMap->replacementValue); + argReplacements.push_back(origArg); + continue; + } + + // Otherwise, this is a 1->1+ mapping. + auto replArgs = newArgs.slice(inputMap->inputNo, inputMap->size); + Value newArg; + + // If this is a 1->1 mapping and the types of new and replacement arguments + // match (i.e. it's an identity map), then the argument is mapped to its + // original type. + // FIXME: We simply pass through the replacement argument if there wasn't a + // converter, which isn't great as it allows implicit type conversions to + // appear. We should properly restructure this code to handle cases where a + // converter isn't provided and also to properly handle the case where an + // argument materialization is actually a temporary source materialization + // (e.g. in the case of 1->N). + if (replArgs.size() == 1 && + (!converter || replArgs[0].getType() == origArg.getType())) { + newArg = replArgs.front(); + } else { + Type origOutputType = origArg.getType(); + + // Legalize the argument output type. 
+ Type outputType = origOutputType; + if (Type legalOutputType = converter->convertType(outputType)) + outputType = legalOutputType; + + newArg = buildUnresolvedArgumentMaterialization( + rewriter, origArg.getLoc(), replArgs, origOutputType, outputType, + converter, unresolvedMaterializations); + } + + mapping.map(origArg, newArg); + argReplacements.push_back(origArg); + argInfo[i] = ConvertedArgInfo(inputMap->inputNo, inputMap->size, newArg); + } + + appendRewrite(newBlock, block, argInfo, + converter); + return newBlock; +} + //===----------------------------------------------------------------------===// // Rewriter Notification Hooks @@ -2635,8 +2566,11 @@ LogicalResult OperationConverter::legalizeConvertedArgumentTypes( }); return liveUserIt == val.user_end() ? nullptr : *liveUserIt; }; - return rewriterImpl.argConverter.materializeLiveConversions( - rewriterImpl.mapping, rewriter, findLiveUser); + for (auto &r : rewriterImpl.rewrites) + if (auto *rewrite = dyn_cast(r.get())) + if (failed(rewrite->materializeLiveConversions(findLiveUser))) + return failure(); + return success(); } /// Replace the results of a materialization operation with the given values. 
From fddf23c6f4478fc39b0077538d288082f983ce80 Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy <89994100+VyacheslavLevytskyy@users.noreply.github.com> Date: Thu, 22 Feb 2024 10:27:59 +0100 Subject: [PATCH 016/546] [SPIRV] Add support for the SPV_KHR_subgroup_rotate extension (#82374) This PR adds support for the SPV_KHR_subgroup_rotate extension that enables rotating values across invocations within a subgroup: * https://github.com/KhronosGroup/SPIRV-Registry/blob/main/extensions/KHR/SPV_KHR_subgroup_rotate.asciidoc --- llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 7 +- llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 5 + llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 9 + llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp | 4 + .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 1 + .../subgroup-rotate.ll | 357 ++++++++++++++++++ 6 files changed, 382 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index e6e3560d02f58..28a63b93b43b6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -619,7 +619,8 @@ class GroupBuiltin { !eq(operation, OpGroupNonUniformShuffleDown), !eq(operation, OpGroupBroadcast), !eq(operation, OpGroupNonUniformBroadcast), - !eq(operation, OpGroupNonUniformBroadcastFirst)); + !eq(operation, OpGroupNonUniformBroadcastFirst), + !eq(operation, OpGroupNonUniformRotateKHR)); bit HasBoolArg = !or(!and(IsAllOrAny, !eq(IsAllEqual, false)), IsBallot, IsLogical); } @@ -877,6 +878,10 @@ defm : DemangledGroupBuiltin<"group_non_uniform_scan_inclusive_logical_xors", Wo defm : DemangledGroupBuiltin<"group_non_uniform_scan_exclusive_logical_xors", WorkOrSub, OpGroupNonUniformLogicalXor>; defm : DemangledGroupBuiltin<"group_clustered_reduce_logical_xor", WorkOrSub, OpGroupNonUniformLogicalXor>; +// cl_khr_subgroup_rotate / SPV_KHR_subgroup_rotate +defm : 
DemangledGroupBuiltin<"group_rotate", OnlySub, OpGroupNonUniformRotateKHR>; +defm : DemangledGroupBuiltin<"group_clustered_rotate", OnlySub, OpGroupNonUniformRotateKHR>; + // cl_khr_work_group_uniform_arithmetic / SPV_KHR_uniform_group_instructions defm : DemangledGroupBuiltin<"group_reduce_imul", OnlyWork, OpGroupIMulKHR>; defm : DemangledGroupBuiltin<"group_reduce_mulu", OnlyWork, OpGroupIMulKHR>; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index 0f11bc34d176f..86f65b6320d53 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -765,6 +765,11 @@ def OpGroupNonUniformLogicalAnd: OpGroupNUGroup<"LogicalAnd", 362>; def OpGroupNonUniformLogicalOr: OpGroupNUGroup<"LogicalOr", 363>; def OpGroupNonUniformLogicalXor: OpGroupNUGroup<"LogicalXor", 364>; +// SPV_KHR_subgroup_rotate +def OpGroupNonUniformRotateKHR: Op<4431, (outs ID:$res), + (ins TYPE:$type, ID:$scope, ID:$value, ID:$delta, variable_ops), + "$res = OpGroupNonUniformRotateKHR $type $scope $value $delta">; + // 3.49.7, Constant-Creation Instructions // - SPV_INTEL_function_pointers diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index dbda2871e153d..9b9575b987994 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1069,6 +1069,15 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::FunctionPointersINTEL); } break; + case SPIRV::OpGroupNonUniformRotateKHR: + if (!ST.canUseExtension(SPIRV::Extension::SPV_KHR_subgroup_rotate)) + report_fatal_error("OpGroupNonUniformRotateKHR instruction requires the " + "following SPIR-V extension: SPV_KHR_subgroup_rotate", + false); + Reqs.addExtension(SPIRV::Extension::SPV_KHR_subgroup_rotate); + Reqs.addCapability(SPIRV::Capability::GroupNonUniformRotateKHR); + Reqs.addCapability(SPIRV::Capability::GroupNonUniform); + 
break; case SPIRV::OpGroupIMulKHR: case SPIRV::OpGroupFMulKHR: case SPIRV::OpGroupBitwiseAndKHR: diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index e186154aa408b..4694363614ef6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -75,6 +75,10 @@ cl::list Extensions( "Allows to use the LinkOnceODR linkage type that is to let " "a function or global variable to be merged with other functions " "or global variables of the same name when linkage occurs."), + clEnumValN(SPIRV::Extension::SPV_KHR_subgroup_rotate, + "SPV_KHR_subgroup_rotate", + "Adds a new instruction that enables rotating values across " + "invocations within a subgroup."), clEnumValN(SPIRV::Extension::SPV_INTEL_function_pointers, "SPV_INTEL_function_pointers", "Allows translation of function pointers."))); diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 4e5ac0d531b2d..6c36087baa85e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -455,6 +455,7 @@ defm BitInstructions : CapabilityOperand<6025, 0, 0, [SPV_KHR_bit_instructions], defm ExpectAssumeKHR : CapabilityOperand<5629, 0, 0, [SPV_KHR_expect_assume], []>; defm FunctionPointersINTEL : CapabilityOperand<5603, 0, 0, [SPV_INTEL_function_pointers], []>; defm IndirectReferencesINTEL : CapabilityOperand<5604, 0, 0, [SPV_INTEL_function_pointers], []>; +defm GroupNonUniformRotateKHR : CapabilityOperand<6026, 0, 0, [SPV_KHR_subgroup_rotate], [GroupNonUniform]>; defm AtomicFloat32AddEXT : CapabilityOperand<6033, 0, 0, [SPV_EXT_shader_atomic_float_add], []>; defm AtomicFloat64AddEXT : CapabilityOperand<6034, 0, 0, [SPV_EXT_shader_atomic_float_add], []>; defm AtomicFloat16AddEXT : CapabilityOperand<6095, 0, 0, [SPV_EXT_shader_atomic_float16_add], []>; diff --git 
a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll new file mode 100644 index 0000000000000..b1d6a09c7fe35 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll @@ -0,0 +1,357 @@ +; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_KHR_subgroup_rotate %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-extensions=SPV_KHR_subgroup_rotate %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: OpGroupNonUniformRotateKHR instruction requires the following SPIR-V extension: SPV_KHR_subgroup_rotate + +; CHECK: OpCapability GroupNonUniformRotateKHR +; CHECK: OpExtension "SPV_KHR_subgroup_rotate" + +; CHECK-DAG: %[[TyInt8:.*]] = OpTypeInt 8 0 +; CHECK-DAG: %[[TyInt16:.*]] = OpTypeInt 16 0 +; CHECK-DAG: %[[TyInt32:.*]] = OpTypeInt 32 0 +; CHECK-DAG: %[[TyInt64:.*]] = OpTypeInt 64 0 +; CHECK-DAG: %[[TyFloat:.*]] = OpTypeFloat 32 +; CHECK-DAG: %[[TyHalf:.*]] = OpTypeFloat 16 +; CHECK-DAG: %[[TyDouble:.*]] = OpTypeFloat 64 +; CHECK-DAG: %[[ScopeSubgroup:.*]] = OpConstant %[[TyInt32]] 3 +; CHECK-DAG: %[[ConstInt2:.*]] = OpConstant %[[TyInt32]] 2 +; CHECK-DAG: %[[ConstInt4:.*]] = OpConstant %[[TyInt32]] 4 + +target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir" + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateChar(ptr addrspace(1) noundef align 1 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca i8, align 1 + store ptr addrspace(1) %dst, ptr 
%dst.addr, align 4 + store i8 0, ptr %v, align 1 + %value = load i8, ptr %v, align 1 +; CHECK: OpGroupNonUniformRotateKHR %[[TyInt8]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func signext i8 @_Z16sub_group_rotateci(i8 noundef signext %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %data, i32 0 + store i8 %call, ptr addrspace(1) %arrayidx, align 1 + %value_clustered = load i8, ptr %v, align 1 +; CHECK: OpGroupNonUniformRotateKHR %[[TyInt8]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func signext i8 @_Z26sub_group_clustered_rotatecij(i8 noundef signext %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %data2, i32 1 + store i8 %call1, ptr addrspace(1) %arrayidx2, align 1 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func signext i8 @_Z16sub_group_rotateci(i8 noundef signext, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func signext i8 @_Z26sub_group_clustered_rotatecij(i8 noundef signext, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateUChar(ptr addrspace(1) noundef align 1 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca i8, align 1 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store i8 0, ptr %v, align 1 + %value = load i8, ptr %v, align 1 +; CHECK: OpGroupNonUniformRotateKHR %[[TyInt8]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func zeroext i8 @_Z16sub_group_rotatehi(i8 noundef zeroext %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = 
getelementptr inbounds i8, ptr addrspace(1) %data, i32 0 + store i8 %call, ptr addrspace(1) %arrayidx, align 1 + %value_clustered = load i8, ptr %v, align 1 +; CHECK: OpGroupNonUniformRotateKHR %[[TyInt8]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func zeroext i8 @_Z26sub_group_clustered_rotatehij(i8 noundef zeroext %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %data2, i32 1 + store i8 %call1, ptr addrspace(1) %arrayidx2, align 1 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func zeroext i8 @_Z16sub_group_rotatehi(i8 noundef zeroext, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func zeroext i8 @_Z26sub_group_clustered_rotatehij(i8 noundef zeroext, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateShort(ptr addrspace(1) noundef align 2 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !8 !kernel_arg_base_type !8 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca i16, align 2 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store i16 0, ptr %v, align 2 + %value = load i16, ptr %v, align 2 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt16]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func signext i16 @_Z16sub_group_rotatesi(i16 noundef signext %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %data, i32 0 + store i16 %call, ptr addrspace(1) %arrayidx, align 2 + %value_clustered = load i16, ptr %v, align 2 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt16]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func signext i16 @_Z26sub_group_clustered_rotatesij(i16 
noundef signext %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds i16, ptr addrspace(1) %data2, i32 1 + store i16 %call1, ptr addrspace(1) %arrayidx2, align 2 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func signext i16 @_Z16sub_group_rotatesi(i16 noundef signext, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func signext i16 @_Z26sub_group_clustered_rotatesij(i16 noundef signext, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateUShort(ptr addrspace(1) noundef align 2 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca i16, align 2 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store i16 0, ptr %v, align 2 + %value = load i16, ptr %v, align 2 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt16]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func zeroext i16 @_Z16sub_group_rotateti(i16 noundef zeroext %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %data, i32 0 + store i16 %call, ptr addrspace(1) %arrayidx, align 2 + %value_clustered = load i16, ptr %v, align 2 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt16]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func zeroext i16 @_Z26sub_group_clustered_rotatetij(i16 noundef zeroext %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds i16, ptr addrspace(1) %data2, i32 1 + store i16 %call1, ptr addrspace(1) %arrayidx2, align 2 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func 
zeroext i16 @_Z16sub_group_rotateti(i16 noundef zeroext, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func zeroext i16 @_Z26sub_group_clustered_rotatetij(i16 noundef zeroext, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateInt(ptr addrspace(1) noundef align 4 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca i32, align 4 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store i32 0, ptr %v, align 4 + %value = load i32, ptr %v, align 4 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt32]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func i32 @_Z16sub_group_rotateii(i32 noundef %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %data, i32 0 + store i32 %call, ptr addrspace(1) %arrayidx, align 4 + %value_clustered = load i32, ptr %v, align 4 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt32]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func i32 @_Z26sub_group_clustered_rotateiij(i32 noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %data2, i32 1 + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func i32 @_Z16sub_group_rotateii(i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func i32 @_Z26sub_group_clustered_rotateiij(i32 noundef, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateUInt(ptr addrspace(1) noundef align 4 %dst) #0 
!kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca i32, align 4 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store i32 0, ptr %v, align 4 + %value = load i32, ptr %v, align 4 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt32]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func i32 @_Z16sub_group_rotateji(i32 noundef %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %data, i32 0 + store i32 %call, ptr addrspace(1) %arrayidx, align 4 + %value_clustered = load i32, ptr %v, align 4 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt32]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func i32 @_Z26sub_group_clustered_rotatejij(i32 noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %data2, i32 1 + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func i32 @_Z16sub_group_rotateji(i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func i32 @_Z26sub_group_clustered_rotatejij(i32 noundef, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateLong(ptr addrspace(1) noundef align 8 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !12 !kernel_arg_base_type !12 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca i64, align 8 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store i64 0, ptr %v, align 8 + %value = load i64, ptr %v, align 8 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt64]] 
%[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func i64 @_Z16sub_group_rotateli(i64 noundef %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %data, i32 0 + store i64 %call, ptr addrspace(1) %arrayidx, align 8 + %value_clustered = load i64, ptr %v, align 8 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt64]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func i64 @_Z26sub_group_clustered_rotatelij(i64 noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds i64, ptr addrspace(1) %data2, i32 1 + store i64 %call1, ptr addrspace(1) %arrayidx2, align 8 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func i64 @_Z16sub_group_rotateli(i64 noundef, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func i64 @_Z26sub_group_clustered_rotatelij(i64 noundef, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateULong(ptr addrspace(1) noundef align 8 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !13 !kernel_arg_base_type !13 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca i64, align 8 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store i64 0, ptr %v, align 8 + %value = load i64, ptr %v, align 8 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt64]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func i64 @_Z16sub_group_rotatemi(i64 noundef %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %data, i32 0 + store i64 %call, ptr addrspace(1) %arrayidx, align 8 + %value_clustered = load i64, ptr %v, align 8 + ; CHECK: 
OpGroupNonUniformRotateKHR %[[TyInt64]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func i64 @_Z26sub_group_clustered_rotatemij(i64 noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds i64, ptr addrspace(1) %data2, i32 1 + store i64 %call1, ptr addrspace(1) %arrayidx2, align 8 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func i64 @_Z16sub_group_rotatemi(i64 noundef, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func i64 @_Z26sub_group_clustered_rotatemij(i64 noundef, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateFloat(ptr addrspace(1) noundef align 4 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca float, align 4 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store float 0.000000e+00, ptr %v, align 4 + %value = load float, ptr %v, align 4 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyFloat]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func float @_Z16sub_group_rotatefi(float noundef %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %data, i32 0 + store float %call, ptr addrspace(1) %arrayidx, align 4 + %value_clustered = load float, ptr %v, align 4 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyFloat]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func float @_Z26sub_group_clustered_rotatefij(float noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %data2, i32 1 + store 
float %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func float @_Z16sub_group_rotatefi(float noundef, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func float @_Z26sub_group_clustered_rotatefij(float noundef, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline norecurse nounwind optnone +define dso_local spir_kernel void @testRotateHalf(ptr addrspace(1) noundef align 2 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca half, align 2 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store half 0xH0000, ptr %v, align 2 + %value = load half, ptr %v, align 2 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyHalf]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func half @_Z16sub_group_rotateDhi(half noundef %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds half, ptr addrspace(1) %data, i32 0 + store half %call, ptr addrspace(1) %arrayidx, align 2 + %value_clustered = load half, ptr %v, align 2 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyHalf]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func half @_Z26sub_group_clustered_rotateDhij(half noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds half, ptr addrspace(1) %data2, i32 1 + store half %call1, ptr addrspace(1) %arrayidx2, align 2 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func half @_Z16sub_group_rotateDhi(half noundef, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func half @_Z26sub_group_clustered_rotateDhij(half noundef, i32 noundef, i32 noundef) #1 + +; Function Attrs: convergent noinline 
norecurse nounwind optnone +define dso_local spir_kernel void @testRotateDouble(ptr addrspace(1) noundef align 8 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !6 { +entry: + %dst.addr = alloca ptr addrspace(1), align 4 + %v = alloca double, align 8 + store ptr addrspace(1) %dst, ptr %dst.addr, align 4 + store double 0.000000e+00, ptr %v, align 8 + %value = load double, ptr %v, align 8 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyDouble]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] + %call = call spir_func double @_Z16sub_group_rotatedi(double noundef %value, i32 noundef 2) #2 + %data = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx = getelementptr inbounds double, ptr addrspace(1) %data, i32 0 + store double %call, ptr addrspace(1) %arrayidx, align 8 + %value_clustered = load double, ptr %v, align 8 + ; CHECK: OpGroupNonUniformRotateKHR %[[TyDouble]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]] + %call1 = call spir_func double @_Z26sub_group_clustered_rotatedij(double noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2 + %data2 = load ptr addrspace(1), ptr %dst.addr, align 4 + %arrayidx2 = getelementptr inbounds double, ptr addrspace(1) %data2, i32 1 + store double %call1, ptr addrspace(1) %arrayidx2, align 8 + ret void +} + +; Function Attrs: convergent nounwind +declare spir_func double @_Z16sub_group_rotatedi(double noundef, i32 noundef) #1 + +; Function Attrs: convergent nounwind +declare spir_func double @_Z26sub_group_clustered_rotatedij(double noundef, i32 noundef, i32 noundef) #1 + +attributes #0 = { convergent noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } +attributes #1 = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #2 = { convergent nounwind } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} 
+!opencl.spir.version = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 2, i32 0} +!2 = !{!"clang version 19.0.0"} +!3 = !{i32 1} +!4 = !{!"none"} +!5 = !{!"char*"} +!6 = !{!""} +!7 = !{!"uchar*"} +!8 = !{!"short*"} +!9 = !{!"ushort*"} +!10 = !{!"int*"} +!11 = !{!"uint*"} +!12 = !{!"long*"} +!13 = !{!"ulong*"} +!14 = !{!"float*"} +!15 = !{!"half*"} +!16 = !{!"double*"} From 6cca23a3b91e12c0b6639449bc1e5eb564067db3 Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy <89994100+VyacheslavLevytskyy@users.noreply.github.com> Date: Thu, 22 Feb 2024 10:30:00 +0100 Subject: [PATCH 017/546] [SPIRV] Prevent creation of jump tables from switch (#82287) This PR is to prevent creation of jump tables from switch. The reason is that SPIR-V doesn't know how to lower jump tables, and a sequence of commands that IRTranslator generates for switch via jump tables breaks SPIR-V Backend code generation with complains to G_BRJT. The next example is the shortest code to break SPIR-V Backend code generation in this way: ``` target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" target triple = "spir64-unknown-unknown" define spir_func void @foo(i32 noundef %val) { entry: switch i32 %val, label %sw.epilog [ i32 0, label %sw.bb i32 1, label %sw.bb2 i32 2, label %sw.bb3 i32 3, label %sw.bb4 ] sw.bb: br label %sw.epilog sw.bb2: br label %sw.epilog sw.bb3: br label %sw.epilog sw.bb4: br label %sw.epilog sw.epilog: ret void } ``` To resolve the issue we set a high lower limit for number of blocks in a jump table via getMinimumJumpTableEntries() and prevent undesirable (or rather unsupported at the moment) path of code generation. 
--- llvm/lib/Target/SPIRV/SPIRVISelLowering.h | 3 ++ .../CodeGen/SPIRV/switch-no-jump-table.ll | 30 +++++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 llvm/test/CodeGen/SPIRV/switch-no-jump-table.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h index f317b26207195..d34f802e9d889 100644 --- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h +++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h @@ -31,6 +31,9 @@ class SPIRVTargetLowering : public TargetLowering { return true; } + // prevent creation of jump tables + bool areJTsAllowed(const Function *) const override { return false; } + // This is to prevent sexts of non-i64 vector indices which are generated // within general IRTranslator hence type generation for it is omitted. MVT getVectorIdxTy(const DataLayout &DL) const override { diff --git a/llvm/test/CodeGen/SPIRV/switch-no-jump-table.ll b/llvm/test/CodeGen/SPIRV/switch-no-jump-table.ll new file mode 100644 index 0000000000000..c9c0f17f0b91e --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/switch-no-jump-table.ll @@ -0,0 +1,30 @@ +; The test is to check that jump tables are not generated from switch + +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpSwitch %[[#]] %[[Label:]] +; CHECK-4: OpBranch %[[Label]] + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define spir_func void @foo(i32 noundef %val) { +entry: + switch i32 %val, label %sw.epilog [ + i32 0, label %sw.bb + i32 1, label %sw.bb2 + i32 2, label %sw.bb3 + i32 3, label %sw.bb4 + ] +sw.bb: + br label %sw.epilog +sw.bb2: + br label %sw.epilog +sw.bb3: + br label %sw.epilog +sw.bb4: + br label %sw.epilog +sw.epilog: + ret void +} From bcbffd99c48ed0cabd1b94e9ff252680f0968fc3 Mon Sep 17 00:00:00 
2001 From: Jay Foad Date: Thu, 22 Feb 2024 09:40:46 +0000 Subject: [PATCH 018/546] [AMDGPU] Split Dpp8FI and Dpp16FI operands (#82379) Split Dpp8FI and Dpp16FI into two different operands sharing an AsmOperandClass. They are parsed and rendered identically as fi:1 but the encoding is different: for DPP16 FI is a single bit, but for DPP8 it uses two different special values in the src0 field. Having a dedicated decoder for Dpp8FI allows it to reject other (non-special) src0 values so that AMDGPUDisassembler::getInstruction no longer needs to call isValidDPP8 to do post hoc validation of decoded DPP8 instructions. --- .../Disassembler/AMDGPUDisassembler.cpp | 33 ++++++++----------- .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 1 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 19 ++++++----- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 4 +-- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 18 +++++----- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 8 ++--- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 4 +-- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 2 +- 8 files changed, 43 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 894607dfdd8c4..53abb3e3f9aea 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -119,6 +119,12 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val, return addOperand(Inst, DAsm->decodeSplitBarrier(Val)); } +static DecodeStatus decodeDpp8FI(MCInst &Inst, unsigned Val, uint64_t Addr, + const MCDisassembler *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeDpp8FI(Val)); +} + #define DECODE_OPERAND(StaticDecoderName, DecoderName) \ static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm, \ uint64_t /*Addr*/, \ @@ -440,19 +446,6 @@ static inline DecoderUInt128 eat12Bytes(ArrayRef &Bytes) { return 
DecoderUInt128(Lo, Hi); } -// The disassembler is greedy, so we need to check FI operand value to -// not parse a dpp if the correct literal is not set. For dpp16 the -// autogenerated decoder checks the dpp literal -static bool isValidDPP8(const MCInst &MI) { - using namespace llvm::AMDGPU::DPP; - int FiIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::fi); - assert(FiIdx != -1); - if ((unsigned)FiIdx >= MI.getNumOperands()) - return false; - unsigned Fi = MI.getOperand(FiIdx).getImm(); - return Fi == DPP8_FI_0 || Fi == DPP8_FI_1; -} - DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef Bytes_, uint64_t Address, @@ -474,13 +467,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, MI, DecW, Address, CS); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; - MI = MCInst(); // clear Res = tryDecodeInst(DecoderTableDPP8GFX1296, DecoderTableDPP8GFX12_FAKE1696, MI, DecW, Address, CS); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; - MI = MCInst(); // clear const auto convertVOPDPP = [&]() { if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) { @@ -530,26 +521,22 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, break; if (convertDPP8Inst(MI) == MCDisassembler::Success) break; - MI = MCInst(); // clear } } Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address, CS); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; - MI = MCInst(); // clear Res = tryDecodeInst(DecoderTableDPP8GFX1164, DecoderTableDPP8GFX11_FAKE1664, MI, QW, Address, CS); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; - MI = MCInst(); // clear Res = tryDecodeInst(DecoderTableDPP8GFX1264, DecoderTableDPP8GFX12_FAKE1664, MI, QW, Address, CS); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; - MI = MCInst(); // clear Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS); if (Res) break; @@ -982,7 +969,7 
@@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { AMDGPU::OpName::src1_modifiers); } } - return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail; + return MCDisassembler::Success; } DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { @@ -1831,6 +1818,12 @@ MCOperand AMDGPUDisassembler::decodeSplitBarrier(unsigned Val) const { return decodeSrcOp(OPW32, Val); } +MCOperand AMDGPUDisassembler::decodeDpp8FI(unsigned Val) const { + if (Val != AMDGPU::DPP::DPP8_FI_0 && Val != AMDGPU::DPP::DPP8_FI_1) + return MCOperand(); + return MCOperand::createImm(Val); +} + bool AMDGPUDisassembler::isVI() const { return STI.hasFeature(AMDGPU::FeatureVolcanicIslands); } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 3142b8a14a4dd..dd0581576bd22 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -261,6 +261,7 @@ class AMDGPUDisassembler : public MCDisassembler { MCOperand decodeBoolReg(unsigned Val) const; MCOperand decodeSplitBarrier(unsigned Val) const; + MCOperand decodeDpp8FI(unsigned Val) const; int getTTmpIdx(unsigned Val) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 97c723752b70b..34cdb09b0e15d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -987,8 +987,8 @@ def SDWAVopcDst : BoolRC { } class NamedIntOperand - : CustomOperand { + string name = NAME, string ConvertMethod = "nullptr"> + : CustomOperand { let ParserMethod = "[this](OperandVector &Operands) -> ParseStatus { "# "return parseIntWithPrefix(\""#Prefix#"\", Operands, "# @@ -1090,9 +1090,12 @@ let DefaultValue = "0xf" in { def DppRowMask : NamedIntOperand; def DppBankMask : NamedIntOperand; } -def DppBoundCtrl : NamedIntOperand bool { return convertDppBoundCtrl(BC); }">; -def DppFI : 
NamedIntOperand; + +let DecoderMethod = "decodeDpp8FI" in +def Dpp8FI : NamedIntOperand; +def Dpp16FI : NamedIntOperand; def blgp : CustomOperand; def CBSZ : NamedIntOperand; @@ -1823,7 +1826,7 @@ class getInsDPP16 { dag ret = !con(getInsDPP.ret, - (ins DppFI:$fi)); + (ins Dpp16FI:$fi)); } class getInsDPP8 { dag ret = !con(getInsDPPBase.ret, - (ins dpp8:$dpp8, DppFI:$fi)); + (ins dpp8:$dpp8, Dpp8FI:$fi)); } class getInsVOP3DPPBase { @@ -1851,12 +1854,12 @@ class getInsVOP3DPP { dag ret = !con(getInsVOP3DPP.ret, - (ins DppFI:$fi)); + (ins Dpp16FI:$fi)); } class getInsVOP3DPP8 { dag ret = !con(getInsVOP3DPPBase.ret, - (ins dpp8:$dpp8, DppFI:$fi)); + (ins dpp8:$dpp8, Dpp8FI:$fi)); } // Ins for SDWA diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 99f8e8ede4ace..576ad32a70cf3 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -380,9 +380,9 @@ class VOP_MOVREL : VOPProfile<[untyped, i32, untyped, un let OutsDPP = (outs Src0RC32:$vdst); let InsDPP16 = (ins Src0RC32:$old, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, - DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi); + DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi); let AsmDPP16 = getAsmDPP16<1, 1, 0>.ret; - let InsDPP8 = (ins Src0RC32:$old, Src0RC32:$src0, dpp8:$dpp8, DppFI:$fi); + let InsDPP8 = (ins Src0RC32:$old, Src0RC32:$src0, dpp8:$dpp8, Dpp8FI:$fi); let AsmDPP8 = getAsmDPP8<1, 1, 0>.ret; let OutsVOP3DPP = (outs Src0RC64:$vdst); diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 4437d5f2a0333..9f54e69f6d55e 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -430,7 +430,7 @@ class VOP_MAC : VOPProfile <[vt0, vt1, vt1, v getVregSrcForVT.ret:$src2, // stub argument dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, 
DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsVOP3Base = getInsVOP3Base, 3, 0, HasModifiers, HasModifiers, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP, Src2Mod, HasOpSel>.ret; @@ -447,7 +447,7 @@ class VOP_MAC : VOPProfile <[vt0, vt1, vt1, v let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, getVregSrcForVT.ret:$src2, // stub argument - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, getVregSrcForVT.ret:$src2, // stub argument @@ -500,7 +500,7 @@ def VOP_MAC_F16_t16 : VOP_MAC { let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, getVregSrcForVT.ret:$src2, // stub argument - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let Src2Mod = FP32InputMods; // dummy unused modifiers let Src2RC64 = VGPRSrc_32; // stub argument } @@ -552,11 +552,11 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped], /*EnableClamp=*/ Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsDPP8 = (ins DstRCDPP:$old, Src0DPP:$src0, Src1DPP:$src1, - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); let OutsVOP3DPP = Outs64; @@ -594,11 +594,11 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1], /*EnableClamp=*/1> Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsDPP8 = (ins DstRCDPP:$old, Src0DPP:$src0, Src1DPP:$src1, - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); 
let HasExt = 1; let HasExtDPP = 1; @@ -645,11 +645,11 @@ class VOP2e_SGPR ArgVT> : VOPProfile { FPVRegInputMods:$src1_modifiers, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); let InsDPP8 = (ins DstRCDPP:$old, FPVRegInputMods:$src0_modifiers, Src0DPP:$src0, FPVRegInputMods:$src1_modifiers, Src1DPP:$src1, - dpp8:$dpp8, DppFI:$fi); + dpp8:$dpp8, Dpp8FI:$fi); let Src0ModVOP3DPP = FPVRegInputMods; let Src1ModVOP3DPP = FPVRegInputMods; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 396ae9c9d92ee..7198a4022dae8 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -532,11 +532,11 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile { FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, VGPR_32:$vdst_in, op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, - DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi); + DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi); let InsVOP3DPP8 = (ins VGPR_32:$old, FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, - VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, DppFI:$fi); + VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, Dpp8FI:$fi); let HasClamp = 0; let HasExtVOP3DPP = 1; @@ -553,12 +553,12 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, FP32InputMods:$src2_modifiers, VGPR_32:$src2, op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, - DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, DppFI:$fi); + DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi); let InsVOP3DPP8 = (ins VGPR_32:$old, FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0, FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1, FP32InputMods:$src2_modifiers, VGPR_32:$src2, 
- op_sel0:$op_sel, dpp8:$dpp8, DppFI:$fi); + op_sel0:$op_sel, dpp8:$dpp8, Dpp8FI:$fi); let HasClamp = 0; let HasSrc2 = 0; let HasSrc2Mods = 1; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 74f451b6d4f7f..a0090f3e8d1db 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -461,13 +461,13 @@ def VOP3P_DOTF8_Profile : VOP3P_Profile, let InsVOP3DPP8 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1, PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2, - neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, DppFI:$fi); + neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, Dpp8FI:$fi); let InsVOP3DPP16 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1, PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2, neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, - DppBoundCtrl:$bound_ctrl, DppFI:$fi); + DppBoundCtrl:$bound_ctrl, Dpp16FI:$fi); } multiclass VOP3PDOTF8Inst { diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index fe52a0e39e4f1..508f06c4739a5 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -766,7 +766,7 @@ class VOPC_Class_Profile sched, ValueType src0VT, ValueType let AsmDPP = "$src0_modifiers, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let AsmDPP16 = AsmDPP#"$fi"; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask, DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl); - let InsDPP16 = !con(InsDPP, (ins DppFI:$fi)); + let InsDPP16 = !con(InsDPP, (ins Dpp16FI:$fi)); // DPP8 forbids modifiers and can inherit from VOPC_Profile let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); From 6193233540e55de61baeb80208b06c6808b14dbc Mon Sep 17 00:00:00 2001 From: Yury Gribov Date: Thu, 22 Feb 2024 13:01:37 +0300 Subject: 
[PATCH 019/546] [AArch64] Fix sched model for TSV110 core. (#82343) Accumulator operand of MADD instruction can be bypassed from another MUL-like operation. Before this fix bypassing was incorrectly applied to multiplier operand. Co-authored-by: Yury Gribov --- llvm/lib/Target/AArch64/AArch64SchedTSV110.td | 6 +- .../AArch64/HiSilicon/tsv110-forwarding.s | 83 +++++++++++++++++++ 2 files changed, 86 insertions(+), 3 deletions(-) create mode 100644 llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-forwarding.s diff --git a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td index 0ae9a69fd4826..1c577a25bf739 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedTSV110.td +++ b/llvm/lib/Target/AArch64/AArch64SchedTSV110.td @@ -419,10 +419,10 @@ def : InstRW<[TSV110Wr_12cyc_1MDU], (instregex "^(S|U)DIVWr$")>; def : InstRW<[TSV110Wr_20cyc_1MDU], (instregex "^(S|U)DIVXr$")>; def TSV110ReadMAW : SchedReadAdvance<2, [TSV110Wr_3cyc_1MDU]>; -def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>; +def : InstRW<[TSV110Wr_3cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAW], (instrs MADDWrrr, MSUBWrrr)>; def TSV110ReadMAQ : SchedReadAdvance<3, [TSV110Wr_4cyc_1MDU]>; -def : InstRW<[TSV110Wr_4cyc_1MDU, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>; -def : InstRW<[TSV110Wr_3cyc_1MDU, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>; +def : InstRW<[TSV110Wr_4cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAQ], (instrs MADDXrrr, MSUBXrrr)>; +def : InstRW<[TSV110Wr_3cyc_1MDU, ReadIM, ReadIM, TSV110ReadMAW], (instregex "(S|U)(MADDL|MSUBL)rrr")>; def : InstRW<[TSV110Wr_4cyc_1MDU], (instregex "^(S|U)MULHrr$")>; diff --git a/llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-forwarding.s new file mode 100644 index 0000000000000..207822b618396 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/HiSilicon/tsv110-forwarding.s @@ -0,0 +1,83 @@ +# NOTE: Assertions have been 
autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=tsv110 --instruction-info=0 --resource-pressure=0 --timeline --iterations=1 < %s | FileCheck %s + +# LLVM-MCA-BEGIN madd nobypass +mul x0, x1, x2 +add x0, x0, x1 +add x0, x0, x1 +add x0, x0, x1 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN madd bypass +mul x0, x1, x2 +madd x0, x1, x2, x0 +madd x0, x1, x2, x0 +madd x0, x0, x0, x0 +# LLVM-MCA-END + +# CHECK: [0] Code Region - madd nobypass + +# CHECK: Iterations: 1 +# CHECK-NEXT: Instructions: 4 +# CHECK-NEXT: Total Cycles: 10 +# CHECK-NEXT: Total uOps: 4 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 0.40 +# CHECK-NEXT: IPC: 0.40 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeeeER . mul x0, x1, x2 +# CHECK-NEXT: [0,1] D====eER . add x0, x0, x1 +# CHECK-NEXT: [0,2] D=====eER. add x0, x0, x1 +# CHECK-NEXT: [0,3] D======eER add x0, x0, x1 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 mul x0, x1, x2 +# CHECK-NEXT: 1. 1 5.0 0.0 0.0 add x0, x0, x1 +# CHECK-NEXT: 2. 1 6.0 0.0 0.0 add x0, x0, x1 +# CHECK-NEXT: 3. 1 7.0 0.0 0.0 add x0, x0, x1 +# CHECK-NEXT: 1 4.8 0.3 0.0 + +# CHECK: [1] Code Region - madd bypass + +# CHECK: Iterations: 1 +# CHECK-NEXT: Instructions: 4 +# CHECK-NEXT: Total Cycles: 13 +# CHECK-NEXT: Total uOps: 4 + +# CHECK: Dispatch Width: 4 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 4.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 012 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeeeER . . mul x0, x1, x2 +# CHECK-NEXT: [0,1] D=eeeeER . . 
madd x0, x1, x2, x0 +# CHECK-NEXT: [0,2] D==eeeeER . . madd x0, x1, x2, x0 +# CHECK-NEXT: [0,3] D======eeeeER madd x0, x0, x0, x0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 1 1.0 1.0 0.0 mul x0, x1, x2 +# CHECK-NEXT: 1. 1 2.0 0.0 0.0 madd x0, x1, x2, x0 +# CHECK-NEXT: 2. 1 3.0 0.0 0.0 madd x0, x1, x2, x0 +# CHECK-NEXT: 3. 1 7.0 0.0 0.0 madd x0, x0, x0, x0 +# CHECK-NEXT: 1 3.3 0.3 0.0 From 4a602d9250e1eb3c729d0421d11be2be8693cbf2 Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy <89994100+VyacheslavLevytskyy@users.noreply.github.com> Date: Thu, 22 Feb 2024 11:05:19 +0100 Subject: [PATCH 020/546] Add support for the SPV_INTEL_usm_storage_classes extension (#82247) Add support for the SPV_INTEL_usm_storage_classes extension: * https://github.com/intel/llvm/blob/sycl/sycl/doc/design/spirv-extensions/SPV_INTEL_usm_storage_classes.asciidoc --- llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 17 ++-- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 5 +- llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 4 + .../Target/SPIRV/SPIRVInstructionSelector.cpp | 36 ++++++-- llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 16 ++-- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 7 ++ llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 11 ++- llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp | 6 ++ .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 3 + llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 19 ++++- llvm/lib/Target/SPIRV/SPIRVUtils.h | 3 +- .../intel-usm-addrspaces.ll | 84 +++++++++++++++++++ 12 files changed, 183 insertions(+), 28 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_usm_storage_classes/intel-usm-addrspaces.ll diff --git 
a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index cc438b2bb8d4d..10569ef0468bd 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -150,7 +150,8 @@ getKernelArgTypeQual(const Function &F, unsigned ArgIdx) { static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx, SPIRVGlobalRegistry *GR, - MachineIRBuilder &MIRBuilder) { + MachineIRBuilder &MIRBuilder, + const SPIRVSubtarget &ST) { // Read argument's access qualifier from metadata or default. SPIRV::AccessQualifier::AccessQualifier ArgAccessQual = getArgAccessQual(F, ArgIdx); @@ -169,8 +170,8 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx, if (MDTypeStr.ends_with("*")) ResArgType = GR->getOrCreateSPIRVTypeByName( MDTypeStr, MIRBuilder, - addressSpaceToStorageClass( - OriginalArgType->getPointerAddressSpace())); + addressSpaceToStorageClass(OriginalArgType->getPointerAddressSpace(), + ST)); else if (MDTypeStr.ends_with("_t")) ResArgType = GR->getOrCreateSPIRVTypeByName( "opencl." + MDTypeStr.str(), MIRBuilder, @@ -206,6 +207,10 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, assert(GR && "Must initialize the SPIRV type registry before lowering args."); GR->setCurrentFunc(MIRBuilder.getMF()); + // Get access to information about available extensions + const SPIRVSubtarget *ST = + static_cast(&MIRBuilder.getMF().getSubtarget()); + // Assign types and names to all args, and store their types for later. FunctionType *FTy = getOriginalFunctionType(F); SmallVector ArgTypeVRegs; @@ -216,7 +221,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, // TODO: handle the case of multiple registers. 
if (VRegs[i].size() > 1) return false; - auto *SpirvTy = getArgSPIRVType(F, i, GR, MIRBuilder); + auto *SpirvTy = getArgSPIRVType(F, i, GR, MIRBuilder, *ST); GR->assignSPIRVTypeToVReg(SpirvTy, VRegs[i][0], MIRBuilder.getMF()); ArgTypeVRegs.push_back(SpirvTy); @@ -318,10 +323,6 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, if (F.hasName()) buildOpName(FuncVReg, F.getName(), MIRBuilder); - // Get access to information about available extensions - const auto *ST = - static_cast(&MIRBuilder.getMF().getSubtarget()); - // Handle entry points and function linkage. if (isEntryPoint(F)) { const auto &STI = MIRBuilder.getMF().getSubtarget(); diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 47fec745c3f18..a1cb630f1aa47 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -709,7 +709,10 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType( // TODO: change the implementation once opaque pointers are supported // in the SPIR-V specification. SpvElementType = getOrCreateSPIRVIntegerType(8, MIRBuilder); - auto SC = addressSpaceToStorageClass(PType->getAddressSpace()); + // Get access to information about available extensions + const SPIRVSubtarget *ST = + static_cast(&MIRBuilder.getMF().getSubtarget()); + auto SC = addressSpaceToStorageClass(PType->getAddressSpace(), *ST); // Null pointer means we have a loop in type definitions, make and // return corresponding OpTypeForwardPointer. 
if (SpvElementType == nullptr) { diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index 86f65b6320d53..7c5252e8cb372 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -430,6 +430,10 @@ def OpGenericCastToPtrExplicit : Op<123, (outs ID:$r), (ins TYPE:$t, ID:$p, Stor "$r = OpGenericCastToPtrExplicit $t $p $s">; def OpBitcast : UnOp<"OpBitcast", 124>; +// SPV_INTEL_usm_storage_classes +def OpPtrCastToCrossWorkgroupINTEL : UnOp<"OpPtrCastToCrossWorkgroupINTEL", 5934>; +def OpCrossWorkgroupCastToPtrINTEL : UnOp<"OpCrossWorkgroupCastToPtrINTEL", 5938>; + // 3.42.12 Composite Instructions def OpVectorExtractDynamic: Op<77, (outs ID:$res), (ins TYPE:$type, vID:$vec, ID:$idx), diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 53d19a1e31382..7258d3b4d88ed 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -828,8 +828,18 @@ static bool isGenericCastablePtr(SPIRV::StorageClass::StorageClass SC) { } } +static bool isUSMStorageClass(SPIRV::StorageClass::StorageClass SC) { + switch (SC) { + case SPIRV::StorageClass::DeviceOnlyINTEL: + case SPIRV::StorageClass::HostOnlyINTEL: + return true; + default: + return false; + } +} + // In SPIR-V address space casting can only happen to and from the Generic -// storage class. We can also only case Workgroup, CrossWorkgroup, or Function +// storage class. We can also only cast Workgroup, CrossWorkgroup, or Function // pointers to and from Generic pointers. As such, we can convert e.g. from // Workgroup to Function by going via a Generic pointer as an intermediary. All // other combinations can only be done by a bitcast, and are probably not safe. 
@@ -862,13 +872,17 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg, SPIRV::StorageClass::StorageClass SrcSC = GR.getPointerStorageClass(SrcPtr); SPIRV::StorageClass::StorageClass DstSC = GR.getPointerStorageClass(ResVReg); - // Casting from an eligable pointer to Generic. + // don't generate a cast between identical storage classes + if (SrcSC == DstSC) + return true; + + // Casting from an eligible pointer to Generic. if (DstSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(SrcSC)) return selectUnOp(ResVReg, ResType, I, SPIRV::OpPtrCastToGeneric); - // Casting from Generic to an eligable pointer. + // Casting from Generic to an eligible pointer. if (SrcSC == SPIRV::StorageClass::Generic && isGenericCastablePtr(DstSC)) return selectUnOp(ResVReg, ResType, I, SPIRV::OpGenericCastToPtr); - // Casting between 2 eligable pointers using Generic as an intermediary. + // Casting between 2 eligible pointers using Generic as an intermediary. if (isGenericCastablePtr(SrcSC) && isGenericCastablePtr(DstSC)) { Register Tmp = MRI->createVirtualRegister(&SPIRV::IDRegClass); SPIRVType *GenericPtrTy = GR.getOrCreateSPIRVPointerType( @@ -886,6 +900,16 @@ bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg, .addUse(Tmp) .constrainAllUses(TII, TRI, RBI); } + + // Check if instructions from the SPV_INTEL_usm_storage_classes extension may + // be applied + if (isUSMStorageClass(SrcSC) && DstSC == SPIRV::StorageClass::CrossWorkgroup) + return selectUnOp(ResVReg, ResType, I, + SPIRV::OpPtrCastToCrossWorkgroupINTEL); + if (SrcSC == SPIRV::StorageClass::CrossWorkgroup && isUSMStorageClass(DstSC)) + return selectUnOp(ResVReg, ResType, I, + SPIRV::OpCrossWorkgroupCastToPtrINTEL); + // TODO Should this case just be disallowed completely? // We're casting 2 other arbitrary address spaces, so have to bitcast. 
return selectUnOp(ResVReg, ResType, I, SPIRV::OpBitcast); @@ -1545,7 +1569,7 @@ bool SPIRVInstructionSelector::selectGlobalValue( } SPIRVType *ResType = GR.getOrCreateSPIRVPointerType( PointerBaseType, I, TII, - addressSpaceToStorageClass(GV->getAddressSpace())); + addressSpaceToStorageClass(GV->getAddressSpace(), STI)); std::string GlobalIdent; if (!GV->hasName()) { @@ -1618,7 +1642,7 @@ bool SPIRVInstructionSelector::selectGlobalValue( unsigned AddrSpace = GV->getAddressSpace(); SPIRV::StorageClass::StorageClass Storage = - addressSpaceToStorageClass(AddrSpace); + addressSpaceToStorageClass(AddrSpace, STI); bool HasLnkTy = GV->getLinkage() != GlobalValue::InternalLinkage && Storage != SPIRV::StorageClass::Function; SPIRV::LinkageType::LinkageType LnkType = diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 011a550a7b3d9..4f2e7a240fc2c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -102,14 +102,16 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { const LLT p2 = LLT::pointer(2, PSize); // UniformConstant const LLT p3 = LLT::pointer(3, PSize); // Workgroup const LLT p4 = LLT::pointer(4, PSize); // Generic - const LLT p5 = LLT::pointer(5, PSize); // Input + const LLT p5 = + LLT::pointer(5, PSize); // Input, SPV_INTEL_usm_storage_classes (Device) + const LLT p6 = LLT::pointer(6, PSize); // SPV_INTEL_usm_storage_classes (Host) // TODO: remove copy-pasting here by using concatenation in some way. 
auto allPtrsScalarsAndVectors = { - p0, p1, p2, p3, p4, p5, s1, s8, s16, - s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, v3s1, v3s8, - v3s16, v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, v8s1, - v8s8, v8s16, v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64}; + p0, p1, p2, p3, p4, p5, p6, s1, s8, s16, + s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, v3s1, v3s8, v3s16, + v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, v8s1, v8s8, v8s16, + v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64}; auto allScalarsAndVectors = { s1, s8, s16, s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, @@ -133,8 +135,8 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { auto allFloatAndIntScalars = allIntScalars; - auto allPtrs = {p0, p1, p2, p3, p4, p5}; - auto allWritablePtrs = {p0, p1, p3, p4}; + auto allPtrs = {p0, p1, p2, p3, p4, p5, p6}; + auto allWritablePtrs = {p0, p1, p3, p4, p5, p6}; for (auto Opc : TypeFoldingSupportingOpcs) getActionDefinitionsBuilder(Opc).custom(); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 9b9575b987994..3be28c97d9538 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1063,6 +1063,13 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::ExpectAssumeKHR); } break; + case SPIRV::OpPtrCastToCrossWorkgroupINTEL: + case SPIRV::OpCrossWorkgroupCastToPtrINTEL: + if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes)) { + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes); + Reqs.addCapability(SPIRV::Capability::USMStorageClassesINTEL); + } + break; case SPIRV::OpConstantFunctionPointerINTEL: if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_function_pointers)) { Reqs.addExtension(SPIRV::Extension::SPV_INTEL_function_pointers); diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 
cbc16fa986614..144216896eb68 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -122,6 +122,9 @@ static void foldConstantsIntoIntrinsics(MachineFunction &MF) { static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineIRBuilder MIB) { + // Get access to information about available extensions + const SPIRVSubtarget *ST = + static_cast(&MIB.getMF().getSubtarget()); SmallVector ToErase; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { @@ -141,7 +144,7 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, getMDOperandAsType(MI.getOperand(3).getMetadata(), 0), MIB); SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType( BaseTy, MI, *MF.getSubtarget().getInstrInfo(), - addressSpaceToStorageClass(MI.getOperand(4).getImm())); + addressSpaceToStorageClass(MI.getOperand(4).getImm(), *ST)); // If the bitcast would be redundant, replace all uses with the source // register. 
@@ -250,6 +253,10 @@ Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpirvTy, static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineIRBuilder MIB) { + // Get access to information about available extensions + const SPIRVSubtarget *ST = + static_cast(&MIB.getMF().getSubtarget()); + MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector ToErase; @@ -269,7 +276,7 @@ static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, getMDOperandAsType(MI.getOperand(2).getMetadata(), 0), MIB); SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType( BaseTy, MI, *MF.getSubtarget().getInstrInfo(), - addressSpaceToStorageClass(MI.getOperand(3).getImm())); + addressSpaceToStorageClass(MI.getOperand(3).getImm(), *ST)); MachineInstr *Def = MRI.getVRegDef(Reg); assert(Def && "Expecting an instruction that defines the register"); insertAssignInstr(Reg, nullptr, AssignedPtrType, GR, MIB, diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index 4694363614ef6..79f16146ccd94 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -49,6 +49,12 @@ cl::list Extensions( clEnumValN(SPIRV::Extension::SPV_INTEL_optnone, "SPV_INTEL_optnone", "Adds OptNoneINTEL value for Function Control mask that " "indicates a request to not optimize the function."), + clEnumValN(SPIRV::Extension::SPV_INTEL_usm_storage_classes, + "SPV_INTEL_usm_storage_classes", + "Introduces two new storage classes that are sub classes of " + "the CrossWorkgroup storage class " + "that provides additional information that can enable " + "optimization."), clEnumValN(SPIRV::Extension::SPV_INTEL_subgroups, "SPV_INTEL_subgroups", "Allows work items in a subgroup to share data without the " "use of local memory and work group barriers, and to " diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 
6c36087baa85e..b022b97408d7d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -463,6 +463,7 @@ defm AtomicFloat16MinMaxEXT : CapabilityOperand<5616, 0, 0, [SPV_EXT_shader_atom defm AtomicFloat32MinMaxEXT : CapabilityOperand<5612, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>; +defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>; //===----------------------------------------------------------------------===// // Multiclass used to define SourceLanguage enum values and at the same time @@ -700,6 +701,8 @@ defm IncomingRayPayloadNV : StorageClassOperand<5342, [RayTracingNV]>; defm ShaderRecordBufferNV : StorageClassOperand<5343, [RayTracingNV]>; defm PhysicalStorageBufferEXT : StorageClassOperand<5349, [PhysicalStorageBufferAddressesEXT]>; defm CodeSectionINTEL : StorageClassOperand<5605, [FunctionPointersINTEL]>; +defm DeviceOnlyINTEL : StorageClassOperand<5936, [USMStorageClassesINTEL]>; +defm HostOnlyINTEL : StorageClassOperand<5937, [USMStorageClassesINTEL]>; //===----------------------------------------------------------------------===// // Multiclass used to define Dim enum values and at the same time diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index 05f766d3ec548..169d7cc93897e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -14,6 +14,7 @@ #include "MCTargetDesc/SPIRVBaseInfo.h" #include "SPIRV.h" #include "SPIRVInstrInfo.h" +#include "SPIRVSubtarget.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" @@ -146,15 +147,19 @@ unsigned 
storageClassToAddressSpace(SPIRV::StorageClass::StorageClass SC) { return 3; case SPIRV::StorageClass::Generic: return 4; + case SPIRV::StorageClass::DeviceOnlyINTEL: + return 5; + case SPIRV::StorageClass::HostOnlyINTEL: + return 6; case SPIRV::StorageClass::Input: return 7; default: - llvm_unreachable("Unable to get address space id"); + report_fatal_error("Unable to get address space id"); } } SPIRV::StorageClass::StorageClass -addressSpaceToStorageClass(unsigned AddrSpace) { +addressSpaceToStorageClass(unsigned AddrSpace, const SPIRVSubtarget &STI) { switch (AddrSpace) { case 0: return SPIRV::StorageClass::Function; @@ -166,10 +171,18 @@ addressSpaceToStorageClass(unsigned AddrSpace) { return SPIRV::StorageClass::Workgroup; case 4: return SPIRV::StorageClass::Generic; + case 5: + return STI.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes) + ? SPIRV::StorageClass::DeviceOnlyINTEL + : SPIRV::StorageClass::CrossWorkgroup; + case 6: + return STI.canUseExtension(SPIRV::Extension::SPV_INTEL_usm_storage_classes) + ? SPIRV::StorageClass::HostOnlyINTEL + : SPIRV::StorageClass::CrossWorkgroup; case 7: return SPIRV::StorageClass::Input; default: - llvm_unreachable("Unknown address space"); + report_fatal_error("Unknown address space"); } } diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h index a33dc02f854f5..1af53dcd0c4cd 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.h +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h @@ -27,6 +27,7 @@ class MachineRegisterInfo; class Register; class StringRef; class SPIRVInstrInfo; +class SPIRVSubtarget; // Add the given string as a series of integer operand, inserting null // terminators and padding to make sure the operands all have 32-bit @@ -62,7 +63,7 @@ unsigned storageClassToAddressSpace(SPIRV::StorageClass::StorageClass SC); // Convert an LLVM IR address space to a SPIR-V storage class. 
SPIRV::StorageClass::StorageClass -addressSpaceToStorageClass(unsigned AddrSpace); +addressSpaceToStorageClass(unsigned AddrSpace, const SPIRVSubtarget &STI); SPIRV::MemorySemantics::MemorySemantics getMemSemanticsForStorageClass(SPIRV::StorageClass::StorageClass SC); diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_usm_storage_classes/intel-usm-addrspaces.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_usm_storage_classes/intel-usm-addrspaces.ll new file mode 100644 index 0000000000000..30c16350bf2b1 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_usm_storage_classes/intel-usm-addrspaces.ll @@ -0,0 +1,84 @@ +; Modified from: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/test/extensions/INTEL/SPV_INTEL_usm_storage_classes/intel_usm_addrspaces.ll + +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_INTEL_usm_storage_classes %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-EXT +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-extensions=SPV_INTEL_usm_storage_classes %s -o - -filetype=obj | spirv-val %} +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-WITHOUT +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-: Capability USMStorageClassesINTEL +; CHECK-SPIRV-WITHOUT-NO: Capability USMStorageClassesINTEL +; CHECK-SPIRV-EXT-DAG: %[[DevTy:[0-9]+]] = OpTypePointer DeviceOnlyINTEL %[[#]] +; CHECK-SPIRV-EXT-DAG: %[[HostTy:[0-9]+]] = OpTypePointer HostOnlyINTEL %[[#]] +; CHECK-SPIRV-DAG: %[[CrsWrkTy:[0-9]+]] = OpTypePointer CrossWorkgroup %[[#]] + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @foo_kernel() { +entry: + ret void +} + +; CHECK-SPIRV: %[[Ptr1:[0-9]+]] = OpLoad %[[CrsWrkTy]] %[[#]] +; 
CHECK-SPIRV-EXT: %[[CastedPtr1:[0-9]+]] = OpCrossWorkgroupCastToPtrINTEL %[[DevTy]] %[[Ptr1]] +; CHECK-SPIRV-WITHOUT-NOT: OpCrossWorkgroupCastToPtrINTEL +; CHECK-SPIRV-EXT: OpStore %[[#]] %[[CastedPtr1]] +define spir_func void @test1(ptr addrspace(1) %arg_glob, ptr addrspace(5) %arg_dev) { +entry: + %arg_glob.addr = alloca ptr addrspace(1), align 4 + %arg_dev.addr = alloca ptr addrspace(5), align 4 + store ptr addrspace(1) %arg_glob, ptr %arg_glob.addr, align 4 + store ptr addrspace(5) %arg_dev, ptr %arg_dev.addr, align 4 + %loaded_glob = load ptr addrspace(1), ptr %arg_glob.addr, align 4 + %casted_ptr = addrspacecast ptr addrspace(1) %loaded_glob to ptr addrspace(5) + store ptr addrspace(5) %casted_ptr, ptr %arg_dev.addr, align 4 + ret void +} + +; CHECK-SPIRV: %[[Ptr2:[0-9]+]] = OpLoad %[[CrsWrkTy]] %[[#]] +; CHECK-SPIRV-EXT: %[[CastedPtr2:[0-9]+]] = OpCrossWorkgroupCastToPtrINTEL %[[HostTy]] %[[Ptr2]] +; CHECK-SPIRV-WITHOUT-NOT: OpCrossWorkgroupCastToPtrINTEL +; CHECK-SPIRV-EXT: OpStore %[[#]] %[[CastedPtr2]] +define spir_func void @test2(ptr addrspace(1) %arg_glob, ptr addrspace(6) %arg_host) { +entry: + %arg_glob.addr = alloca ptr addrspace(1), align 4 + %arg_host.addr = alloca ptr addrspace(6), align 4 + store ptr addrspace(1) %arg_glob, ptr %arg_glob.addr, align 4 + store ptr addrspace(6) %arg_host, ptr %arg_host.addr, align 4 + %loaded_glob = load ptr addrspace(1), ptr %arg_glob.addr, align 4 + %casted_ptr = addrspacecast ptr addrspace(1) %loaded_glob to ptr addrspace(6) + store ptr addrspace(6) %casted_ptr, ptr %arg_host.addr, align 4 + ret void +} + +; CHECK-SPIRV-EXT: %[[Ptr3:[0-9]+]] = OpLoad %[[DevTy]] %[[#]] +; CHECK-SPIRV-EXT: %[[CastedPtr3:[0-9]+]] = OpPtrCastToCrossWorkgroupINTEL %[[CrsWrkTy]] %[[Ptr3]] +; CHECK-SPIRV-WITHOUT-NOT: OpPtrCastToCrossWorkgroupINTEL +; CHECK-SPIRV-EXT: OpStore %[[#]] %[[CastedPtr3]] +define spir_func void @test3(ptr addrspace(1) %arg_glob, ptr addrspace(5) %arg_dev) { +entry: + %arg_glob.addr = alloca ptr addrspace(1), 
align 4 + %arg_dev.addr = alloca ptr addrspace(5), align 4 + store ptr addrspace(1) %arg_glob, ptr %arg_glob.addr, align 4 + store ptr addrspace(5) %arg_dev, ptr %arg_dev.addr, align 4 + %loaded_dev = load ptr addrspace(5), ptr %arg_dev.addr, align 4 + %casted_ptr = addrspacecast ptr addrspace(5) %loaded_dev to ptr addrspace(1) + store ptr addrspace(1) %casted_ptr, ptr %arg_glob.addr, align 4 + ret void +} + +; CHECK-SPIRV-EXT: %[[Ptr4:[0-9]+]] = OpLoad %[[HostTy]] %[[#]] +; CHECK-SPIRV-EXT: %[[CastedPtr4:[0-9]+]] = OpPtrCastToCrossWorkgroupINTEL %[[CrsWrkTy]] %[[Ptr4]] +; CHECK-SPIRV-WITHOUT-NOT: OpPtrCastToCrossWorkgroupINTEL +; CHECK-SPIRV-EXT: OpStore %[[#]] %[[CastedPtr4]] +define spir_func void @test4(ptr addrspace(1) %arg_glob, ptr addrspace(6) %arg_host) { +entry: + %arg_glob.addr = alloca ptr addrspace(1), align 4 + %arg_host.addr = alloca ptr addrspace(6), align 4 + store ptr addrspace(1) %arg_glob, ptr %arg_glob.addr, align 4 + store ptr addrspace(6) %arg_host, ptr %arg_host.addr, align 4 + %loaded_host = load ptr addrspace(6), ptr %arg_host.addr, align 4 + %casted_ptr = addrspacecast ptr addrspace(6) %loaded_host to ptr addrspace(1) + store ptr addrspace(1) %casted_ptr, ptr %arg_glob.addr, align 4 + ret void +} From f01719afaae9a208ac272d99760d18e4c16d9241 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 22 Feb 2024 10:21:12 +0000 Subject: [PATCH 021/546] [mlir][test] Add integration tests for vector.interleave (#80969) --- .../CPU/ArmSVE/test-scalable-interleave.mlir | 24 +++++++++++++++++++ .../Dialect/Vector/CPU/test-interleave.mlir | 24 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/test-interleave.mlir diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir new 
file mode 100644 index 0000000000000..8ae3eee6462ca --- /dev/null +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir @@ -0,0 +1,24 @@ +// RUN: mlir-opt %s -test-lower-to-llvm | \ +// RUN: %mcr_aarch64_cmd -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_c_runner_utils,%mlir_arm_runner_utils | \ +// RUN: FileCheck %s + +func.func @entry() { + %f1 = arith.constant 1.0 : f32 + %f2 = arith.constant 2.0 : f32 + %v1 = vector.splat %f1 : vector<[4]xf32> + %v2 = vector.splat %f2 : vector<[4]xf32> + vector.print %v1 : vector<[4]xf32> + vector.print %v2 : vector<[4]xf32> + // + // Test vectors: + // + // CHECK: ( 1, 1, 1, 1 + // CHECK: ( 2, 2, 2, 2 + + %v3 = vector.interleave %v1, %v2 : vector<[4]xf32> + vector.print %v3 : vector<[8]xf32> + // CHECK: ( 1, 2, 1, 2, 1, 2, 1, 2 + + return +} diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-interleave.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-interleave.mlir new file mode 100644 index 0000000000000..0bc78af6aba03 --- /dev/null +++ b/mlir/test/Integration/Dialect/Vector/CPU/test-interleave.mlir @@ -0,0 +1,24 @@ +// RUN: mlir-opt %s -test-lower-to-llvm | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_c_runner_utils | \ +// RUN: FileCheck %s + +func.func @entry() { + %f1 = arith.constant 1.0 : f32 + %f2 = arith.constant 2.0 : f32 + %v1 = vector.splat %f1 : vector<2x4xf32> + %v2 = vector.splat %f2 : vector<2x4xf32> + vector.print %v1 : vector<2x4xf32> + vector.print %v2 : vector<2x4xf32> + // + // Test vectors: + // + // CHECK: ( ( 1, 1, 1, 1 ), ( 1, 1, 1, 1 ) ) + // CHECK: ( ( 2, 2, 2, 2 ), ( 2, 2, 2, 2 ) ) + + %v3 = vector.interleave %v1, %v2 : vector<2x4xf32> + vector.print %v3 : vector<2x8xf32> + // CHECK: ( ( 1, 2, 1, 2, 1, 2, 1, 2 ), ( 1, 2, 1, 2, 1, 2, 1, 2 ) ) + + return +} From e4d4ebe0415b9f1fd8cb034ac68f0616f12facf2 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 22 Feb 2024 10:22:07 +0000 Subject: 
[PATCH 022/546] [llvm][llvm-jitlink] Disable test on Windows on Arm This fails on one of our bots: https://lab.llvm.org/buildbot/#/builders/120/builds/6309 llvm-jitlink error: Unsupported target machine architecture in COFF object The other bot doesn't run the test at all it seems but I can't explain why. It's also possible that I'm mistaken and the mostly native but still "cross compiling" setup we have on WoA means an x86 object is produced sometimes (perhaps because a default triple is still x86). --- llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test b/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test index 33ad5515a6357..ec71011d545eb 100644 --- a/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test +++ b/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test @@ -5,4 +5,7 @@ # # Use -sectcreate to create a section from a data file. +# Jitlink does not support ARM64 COFF files. +# UNSUPPORTED: target=aarch64-pc-windows-{{.*}} + # jitlink-check: *{4}foo = 0x2a2a5a5a \ No newline at end of file From b9ce237980b5a636e87e3578609c812833f7537f Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 22 Feb 2024 10:39:43 +0000 Subject: [PATCH 023/546] [AMDGPU] Clean up conversion of DPP instructions in AMDGPUDisassembler (#82480) Convert DPP instructions after all calls to tryDecodeInst, just like we do for all other instruction types. NFCI. 
--- .../Disassembler/AMDGPUDisassembler.cpp | 127 ++++++++---------- 1 file changed, 53 insertions(+), 74 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 53abb3e3f9aea..c5d06dea92c30 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -465,36 +465,25 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableDPP8GFX1196, DecoderTableDPP8GFX11_FAKE1696, MI, DecW, Address, CS); - if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + if (Res) break; + Res = tryDecodeInst(DecoderTableDPP8GFX1296, DecoderTableDPP8GFX12_FAKE1696, MI, DecW, Address, CS); - if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + if (Res) break; - const auto convertVOPDPP = [&]() { - if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) { - convertVOP3PDPPInst(MI); - } else if (AMDGPU::isVOPC64DPP(MI.getOpcode())) { - convertVOPCDPPInst(MI); // Special VOP3 case - } else { - assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3); - convertVOP3DPPInst(MI); // Regular VOP3 case - } - }; Res = tryDecodeInst(DecoderTableDPPGFX1196, DecoderTableDPPGFX11_FAKE1696, MI, DecW, Address, CS); - if (Res) { - convertVOPDPP(); + if (Res) break; - } + Res = tryDecodeInst(DecoderTableDPPGFX1296, DecoderTableDPPGFX12_FAKE1696, MI, DecW, Address, CS); - if (Res) { - convertVOPDPP(); + if (Res) break; - } + Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address, CS); if (Res) break; @@ -515,27 +504,22 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) { Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS); - if (Res) { - if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8) - == -1) - break; - if (convertDPP8Inst(MI) == 
MCDisassembler::Success) - break; - } + if (Res) + break; } Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address, CS); - if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + if (Res) break; Res = tryDecodeInst(DecoderTableDPP8GFX1164, DecoderTableDPP8GFX11_FAKE1664, MI, QW, Address, CS); - if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + if (Res) break; Res = tryDecodeInst(DecoderTableDPP8GFX1264, DecoderTableDPP8GFX12_FAKE1664, MI, QW, Address, CS); - if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) + if (Res) break; Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS); @@ -543,19 +527,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableDPPGFX1164, DecoderTableDPPGFX11_FAKE1664, MI, QW, Address, CS); - if (Res) { - if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) - convertVOPCDPPInst(MI); + if (Res) break; - } Res = tryDecodeInst(DecoderTableDPPGFX1264, DecoderTableDPPGFX12_FAKE1664, MI, QW, Address, CS); - if (Res) { - if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) - convertVOPCDPPInst(MI); + if (Res) break; - } if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem)) { Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS); @@ -652,6 +630,22 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Address, CS); } while (false); + if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DPP)) { + if (isMacDPP(MI)) + convertMacDPPInst(MI); + + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) + convertVOP3PDPPInst(MI); + else if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) || + AMDGPU::isVOPC64DPP(MI.getOpcode())) + convertVOPCDPPInst(MI); // Special VOP3 case + else if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8) != + -1) + convertDPP8Inst(MI); + else if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3) + convertVOP3DPPInst(MI); // Regular VOP3 case + 
} + if (Res && AMDGPU::isMAC(MI.getOpcode())) { // Insert dummy unused src2_modifiers. insertNamedMCOperand(MI, MCOperand::createImm(0), @@ -926,56 +920,41 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const { AMDGPU::OpName::src2_modifiers); } -// We must check FI == literal to reject not genuine dpp8 insts, and we must -// first add optional MI operands to check FI DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); - if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) { - convertVOP3PDPPInst(MI); - } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) || - AMDGPU::isVOPC64DPP(Opc)) { - convertVOPCDPPInst(MI); - } else { - if (isMacDPP(MI)) - convertMacDPPInst(MI); + int VDstInIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); + if (VDstInIdx != -1) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in); - int VDstInIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); - if (VDstInIdx != -1) - insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in); + if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12) + insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2); - if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 || - MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12) - insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2); + unsigned DescNumOps = MCII->get(Opc).getNumOperands(); + if (MI.getNumOperands() < DescNumOps && + AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) { + convertTrue16OpSel(MI); + auto Mods = collectVOPModifiers(MI); + insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel), + AMDGPU::OpName::op_sel); + } else { + // Insert dummy unused src modifiers. 
+ if (MI.getNumOperands() < DescNumOps && + AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers)) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src0_modifiers); - unsigned DescNumOps = MCII->get(Opc).getNumOperands(); if (MI.getNumOperands() < DescNumOps && - AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) { - convertTrue16OpSel(MI); - auto Mods = collectVOPModifiers(MI); - insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel), - AMDGPU::OpName::op_sel); - } else { - // Insert dummy unused src modifiers. - if (MI.getNumOperands() < DescNumOps && - AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src0_modifiers)) - insertNamedMCOperand(MI, MCOperand::createImm(0), - AMDGPU::OpName::src0_modifiers); - - if (MI.getNumOperands() < DescNumOps && - AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers)) - insertNamedMCOperand(MI, MCOperand::createImm(0), - AMDGPU::OpName::src1_modifiers); - } + AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers)) + insertNamedMCOperand(MI, MCOperand::createImm(0), + AMDGPU::OpName::src1_modifiers); } return MCDisassembler::Success; } DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { - if (isMacDPP(MI)) - convertMacDPPInst(MI); - convertTrue16OpSel(MI); int VDstInIdx = From 4f12f47550eee85447c9ec37d27a20c6593d3d40 Mon Sep 17 00:00:00 2001 From: Harald van Dijk Date: Thu, 22 Feb 2024 10:45:27 +0000 Subject: [PATCH 024/546] [AArch64] Switch to soft promoting half types. (#80576) The traditional promotion is known to generate wrong code. Like #80440 for ARM, except that far less is affected as on AArch64, hardware floating point support always includes FP16 support and is unaffected by these changes. This only affects `-mgeneral-regs-only` (Clang) / `-mattr=-fp-armv8` (LLVM). 
Because this only affects a configuration where no FP support is available at all, `useFPRegsForHalfType()` has no effect and is not specified: `f32` was getting legalized as a parameter and return type to an integer anyway. --- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 + .../AArch64/strictfp_f16_abi_promote.ll | 140 +++--------------- 2 files changed, 26 insertions(+), 116 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 436b21fd13463..bec13484450d7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1308,6 +1308,8 @@ class AArch64TargetLowering : public TargetLowering { bool preferScalarizeSplat(SDNode *N) const override; unsigned getMinimumJumpTableEntries() const override; + + bool softPromoteHalfType() const override { return true; } }; namespace AArch64 { diff --git a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll index 37186cf22ccc7..a34f7abcc22a3 100644 --- a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll @@ -70,22 +70,20 @@ define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: .cfi_offset w22, -32 ; NOFP16-NEXT: .cfi_offset w30, -48 ; NOFP16-NEXT: mov w21, w0 -; NOFP16-NEXT: and w0, w2, #0xffff +; NOFP16-NEXT: and w0, w1, #0xffff ; NOFP16-NEXT: mov x19, x3 -; NOFP16-NEXT: mov w20, w1 +; NOFP16-NEXT: mov w20, w2 ; NOFP16-NEXT: bl __gnu_h2f_ieee ; NOFP16-NEXT: mov w22, w0 ; NOFP16-NEXT: and w0, w21, #0xffff ; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: mov w8, w0 ; NOFP16-NEXT: and w0, w20, #0xffff +; NOFP16-NEXT: orr x21, x8, x22, lsl #32 ; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w8, w21 -; NOFP16-NEXT: // kill: def $w0 killed $w0 def $x0 -; NOFP16-NEXT: str w22, [x19, #8] -; NOFP16-NEXT: orr x8, x8, x0, lsl #32 +; NOFP16-NEXT: 
str x21, [x19] ; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; NOFP16-NEXT: str x8, [x19] +; NOFP16-NEXT: str w0, [x19, #8] ; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; NOFP16-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload ; NOFP16-NEXT: ret @@ -182,46 +180,17 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { define void @outgoing_v4f16_return(ptr %ptr) #0 { ; NOFP16-LABEL: outgoing_v4f16_return: ; NOFP16: // %bb.0: -; NOFP16-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill -; NOFP16-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; NOFP16-NEXT: .cfi_def_cfa_offset 48 +; NOFP16-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 16 ; NOFP16-NEXT: .cfi_offset w19, -8 -; NOFP16-NEXT: .cfi_offset w20, -16 -; NOFP16-NEXT: .cfi_offset w21, -24 -; NOFP16-NEXT: .cfi_offset w22, -32 -; NOFP16-NEXT: .cfi_offset w23, -40 -; NOFP16-NEXT: .cfi_offset w30, -48 +; NOFP16-NEXT: .cfi_offset w30, -16 ; NOFP16-NEXT: mov x19, x0 ; NOFP16-NEXT: bl v4f16_result -; NOFP16-NEXT: and w0, w0, #0xffff -; NOFP16-NEXT: mov w20, w1 -; NOFP16-NEXT: mov w21, w2 -; NOFP16-NEXT: mov w22, w3 -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w23, w0 -; NOFP16-NEXT: and w0, w20, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w20, w0 -; NOFP16-NEXT: and w0, w21, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w21, w0 -; NOFP16-NEXT: and w0, w22, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #6] -; NOFP16-NEXT: mov w0, w21 -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #4] -; NOFP16-NEXT: mov w0, w20 -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #2] -; NOFP16-NEXT: mov w0, w23 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w2, [x19, #4] +; NOFP16-NEXT: strh w3, [x19, #6] +; NOFP16-NEXT: strh w1, [x19, 
#2] ; NOFP16-NEXT: strh w0, [x19] -; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; NOFP16-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; NOFP16-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload ; NOFP16-NEXT: ret %val = call <4 x half> @v4f16_result() store <4 x half> %val, ptr %ptr @@ -231,82 +200,21 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 { define void @outgoing_v8f16_return(ptr %ptr) #0 { ; NOFP16-LABEL: outgoing_v8f16_return: ; NOFP16: // %bb.0: -; NOFP16-NEXT: stp x30, x27, [sp, #-80]! // 16-byte Folded Spill -; NOFP16-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill -; NOFP16-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill -; NOFP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; NOFP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; NOFP16-NEXT: .cfi_def_cfa_offset 80 +; NOFP16-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 16 ; NOFP16-NEXT: .cfi_offset w19, -8 -; NOFP16-NEXT: .cfi_offset w20, -16 -; NOFP16-NEXT: .cfi_offset w21, -24 -; NOFP16-NEXT: .cfi_offset w22, -32 -; NOFP16-NEXT: .cfi_offset w23, -40 -; NOFP16-NEXT: .cfi_offset w24, -48 -; NOFP16-NEXT: .cfi_offset w25, -56 -; NOFP16-NEXT: .cfi_offset w26, -64 -; NOFP16-NEXT: .cfi_offset w27, -72 -; NOFP16-NEXT: .cfi_offset w30, -80 +; NOFP16-NEXT: .cfi_offset w30, -16 ; NOFP16-NEXT: mov x19, x0 ; NOFP16-NEXT: bl v8f16_result -; NOFP16-NEXT: and w0, w0, #0xffff -; NOFP16-NEXT: mov w21, w1 -; NOFP16-NEXT: mov w22, w2 -; NOFP16-NEXT: mov w23, w3 -; NOFP16-NEXT: mov w24, w4 -; NOFP16-NEXT: mov w25, w5 -; NOFP16-NEXT: mov w26, w6 -; NOFP16-NEXT: mov w27, w7 -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w20, w0 -; NOFP16-NEXT: and w0, w21, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w21, w0 -; NOFP16-NEXT: and w0, w22, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w22, w0 -; 
NOFP16-NEXT: and w0, w23, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w23, w0 -; NOFP16-NEXT: and w0, w24, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w24, w0 -; NOFP16-NEXT: and w0, w25, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w25, w0 -; NOFP16-NEXT: and w0, w26, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: mov w26, w0 -; NOFP16-NEXT: and w0, w27, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #14] -; NOFP16-NEXT: mov w0, w26 -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #12] -; NOFP16-NEXT: mov w0, w25 -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #10] -; NOFP16-NEXT: mov w0, w24 -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #8] -; NOFP16-NEXT: mov w0, w23 -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #6] -; NOFP16-NEXT: mov w0, w22 -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #4] -; NOFP16-NEXT: mov w0, w21 -; NOFP16-NEXT: bl __gnu_f2h_ieee -; NOFP16-NEXT: strh w0, [x19, #2] -; NOFP16-NEXT: mov w0, w20 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w5, [x19, #10] +; NOFP16-NEXT: strh w7, [x19, #14] +; NOFP16-NEXT: strh w6, [x19, #12] +; NOFP16-NEXT: strh w4, [x19, #8] +; NOFP16-NEXT: strh w3, [x19, #6] +; NOFP16-NEXT: strh w2, [x19, #4] +; NOFP16-NEXT: strh w1, [x19, #2] ; NOFP16-NEXT: strh w0, [x19] -; NOFP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; NOFP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; NOFP16-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload -; NOFP16-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload -; NOFP16-NEXT: ldp x30, x27, [sp], #80 // 16-byte Folded Reload +; NOFP16-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload ; NOFP16-NEXT: ret %val = call <8 x half> @v8f16_result() store <8 x half> %val, ptr %ptr From 3b7d43301e3662da4197cef7948c18fab850d9c4 Mon Sep 17 
00:00:00 2001 From: Jay Foad Date: Thu, 22 Feb 2024 11:18:18 +0000 Subject: [PATCH 025/546] [AMDGPU] Remove DPP DecoderNamespaces. NFC. (#82491) Now that there is no special checking for valid DPP encodings, these instructions can use the same DecoderNamespace as other 64- or 96-bit instructions. Also clean up setting DecoderNamespace: in most cases it should be set as a pair with AssemblerPredicate. --- .../Disassembler/AMDGPUDisassembler.cpp | 57 +- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 75 ++- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 36 +- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 6 +- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 498 ++++++++---------- llvm/lib/Target/AMDGPU/VOPInstructions.td | 16 +- 6 files changed, 288 insertions(+), 400 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index c5d06dea92c30..70e2275c58745 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -462,33 +462,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // encodings if (isGFX11Plus() && Bytes.size() >= 12 ) { DecoderUInt128 DecW = eat12Bytes(Bytes); - Res = - tryDecodeInst(DecoderTableDPP8GFX1196, DecoderTableDPP8GFX11_FAKE1696, - MI, DecW, Address, CS); + Res = tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI, + DecW, Address, CS); if (Res) break; - Res = - tryDecodeInst(DecoderTableDPP8GFX1296, DecoderTableDPP8GFX12_FAKE1696, - MI, DecW, Address, CS); - if (Res) - break; - - Res = tryDecodeInst(DecoderTableDPPGFX1196, DecoderTableDPPGFX11_FAKE1696, - MI, DecW, Address, CS); - if (Res) - break; - - Res = tryDecodeInst(DecoderTableDPPGFX1296, DecoderTableDPPGFX12_FAKE1696, - MI, DecW, Address, CS); - if (Res) - break; - - Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address, CS); - if (Res) - break; - - Res = 
tryDecodeInst(DecoderTableGFX1296, MI, DecW, Address, CS); + Res = tryDecodeInst(DecoderTableGFX1296, DecoderTableGFX12_FAKE1696, MI, + DecW, Address, CS); if (Res) break; @@ -508,33 +488,6 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, break; } - Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address, CS); - if (Res) - break; - - Res = tryDecodeInst(DecoderTableDPP8GFX1164, - DecoderTableDPP8GFX11_FAKE1664, MI, QW, Address, CS); - if (Res) - break; - - Res = tryDecodeInst(DecoderTableDPP8GFX1264, - DecoderTableDPP8GFX12_FAKE1664, MI, QW, Address, CS); - if (Res) - break; - - Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS); - if (Res) break; - - Res = tryDecodeInst(DecoderTableDPPGFX1164, DecoderTableDPPGFX11_FAKE1664, - MI, QW, Address, CS); - if (Res) - break; - - Res = tryDecodeInst(DecoderTableDPPGFX1264, DecoderTableDPPGFX12_FAKE1664, - MI, QW, Address, CS); - if (Res) - break; - if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem)) { Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS); if (Res) @@ -593,7 +546,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, break; } - // Reinitialize Bytes as DPP64 could have eaten too much + // Reinitialize Bytes Bytes = Bytes_.slice(0, MaxInstBytesNum); // Try decode 32-bit instruction diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 576ad32a70cf3..f5424cf48d7a5 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -749,7 +749,7 @@ class VOP1_DPP16 op, VOP1_DPP_Pseudo ps, int subtarget, VOPProfile p = p class VOP1_DPP16_Gen op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> : VOP1_DPP16 { let AssemblerPredicate = Gen.AssemblerPredicate; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; } class VOP1_DPP8 op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : @@ -770,7 +770,7 
@@ class VOP1_DPP8 op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : class VOP1_DPP8_Gen op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> : VOP1_DPP8 { let AssemblerPredicate = Gen.AssemblerPredicate; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; } //===----------------------------------------------------------------------===// @@ -816,7 +816,7 @@ multiclass VOP1_Real_dpp_with_name op, string opName, string asmName> { defvar ps = !cast(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP16, - DecoderNamespace = "DPP" # Gen.DecoderNamespace # + DecoderNamespace = Gen.DecoderNamespace # !if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in { defm NAME : VOP1_Real_dpp; } @@ -831,7 +831,7 @@ multiclass VOP1_Real_dpp8_with_name op, string opName, string asmName> { defvar ps = !cast(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP8, - DecoderNamespace = "DPP8" # Gen.DecoderNamespace # + DecoderNamespace = Gen.DecoderNamespace # !if(ps.Pfl.IsRealTrue16, "", "_FAKE16") in { defm NAME : VOP1_Real_dpp8; } @@ -994,9 +994,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } multiclass VOP1_Real_dpp8_gfx10 op> { if !cast(NAME#"_e32").Pfl.HasExt32BitDPP then - def _dpp8_gfx10 : VOP1_DPP8(NAME#"_e32")> { - let DecoderNamespace = "DPP8"; - } + def _dpp8_gfx10 : VOP1_DPP8(NAME#"_e32")>; } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" @@ -1192,16 +1190,14 @@ class VOP1_DPPe op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : let Inst{31-25} = 0x3f; //encoding } -multiclass VOP1Only_Real_vi op> { - let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in { +let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in { + multiclass VOP1Only_Real_vi op> { def _vi : VOP1_Real(NAME), SIEncodingFamily.VI>, VOP1e(NAME).Pfl>; } -} -multiclass VOP1_Real_e32e64_vi op> { - let AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" in { + multiclass VOP1_Real_e32e64_vi op> { 
def _e32_vi : VOP1_Real(NAME#"_e32"), SIEncodingFamily.VI>, VOP1e(NAME#"_e32").Pfl>; @@ -1389,44 +1385,41 @@ def : GCNPat < // GFX9 //===----------------------------------------------------------------------===// -multiclass VOP1_Real_gfx9 op> { - let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { +let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { + multiclass VOP1_Real_gfx9 op> { defm NAME : VOP1_Real_e32e64_vi ; - } - - if !cast(NAME#"_e32").Pfl.HasExtSDWA9 then - def _sdwa_gfx9 : - VOP_SDWA9_Real (NAME#"_sdwa")>, - VOP1_SDWA9Ae (NAME#"_sdwa").Pfl>; - - if !cast(NAME#"_e32").Pfl.HasExtDPP then - def _dpp_gfx9 : - VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.GFX9>, - VOP1_DPPe(NAME#"_dpp")>; - -} -multiclass VOP1_Real_NoDstSel_SDWA_gfx9 op> { - let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { - defm NAME : VOP1_Real_e32e64_vi ; + if !cast(NAME#"_e32").Pfl.HasExtSDWA9 then + def _sdwa_gfx9 : + VOP_SDWA9_Real (NAME#"_sdwa")>, + VOP1_SDWA9Ae (NAME#"_sdwa").Pfl>; + + if !cast(NAME#"_e32").Pfl.HasExtDPP then + def _dpp_gfx9 : + VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.GFX9>, + VOP1_DPPe(NAME#"_dpp")>; } - if !cast(NAME#"_e32").Pfl.HasExtSDWA9 then - def _sdwa_gfx9 : - VOP_SDWA9_Real (NAME#"_sdwa")>, - VOP1_SDWA9Ae (NAME#"_sdwa").Pfl> { - let Inst{42-40} = 6; - } + multiclass VOP1_Real_NoDstSel_SDWA_gfx9 op> { + defm NAME : VOP1_Real_e32e64_vi ; - if !cast(NAME#"_e32").Pfl.HasExtDPP then - def _dpp_gfx9 : - VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.GFX9>, - VOP1_DPPe(NAME#"_dpp")>; + if !cast(NAME#"_e32").Pfl.HasExtSDWA9 then + def _sdwa_gfx9 : + VOP_SDWA9_Real (NAME#"_sdwa")>, + VOP1_SDWA9Ae (NAME#"_sdwa").Pfl> { + let Inst{42-40} = 6; + } + + if !cast(NAME#"_e32").Pfl.HasExtDPP then + def _dpp_gfx9 : + VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.GFX9>, + VOP1_DPPe(NAME#"_dpp")>; + } } defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; -let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in 
+let AssemblerPredicate = isGFX940Plus in defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>; let OtherPredicates = [HasFP8ConversionInsts] in { diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 9f54e69f6d55e..13fe79b475960 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1273,7 +1273,7 @@ class VOP2_DPP16_Gen op, VOP2_DPP_Pseudo ps, GFXGen Gen, VOP2_DPP16 { let AssemblerPredicate = Gen.AssemblerPredicate; let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []); - let DecoderNamespace = "DPP"#Gen.DecoderNamespace# + let DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); } @@ -1302,7 +1302,7 @@ class VOP2_DPP8_Gen op, VOP2_Pseudo ps, GFXGen Gen, VOP2_DPP8 { let AssemblerPredicate = Gen.AssemblerPredicate; let OtherPredicates = !if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], []); - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace# + let DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); } @@ -1748,9 +1748,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } multiclass VOP2_Real_dpp8_gfx10 op> { if !cast(NAME#"_e32").Pfl.HasExt32BitDPP then - def _dpp8_gfx10 : VOP2_DPP8(NAME#"_e32")> { - let DecoderNamespace = "DPP8"; - } + def _dpp8_gfx10 : VOP2_DPP8(NAME#"_e32")>; } //===------------------------- VOP2 (with name) -------------------------===// @@ -1797,7 +1795,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { def _dpp8_gfx10 : VOP2_DPP8(opName#"_e32")> { VOP2_Pseudo ps = !cast(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8"; } } @@ -1876,7 +1873,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { VOP2_DPP8(opName#"_e32")> { string AsmDPP8 = !cast(opName#"_e32").Pfl.AsmDPP8; let AsmString = asmName # !subst(", vcc", "", AsmDPP8); - let DecoderNamespace = "DPP8"; } if 
!cast(opName#"_e32").Pfl.HasExt32BitDPP then def _dpp8_w32_gfx10 : @@ -2231,7 +2227,7 @@ multiclass VOP2_SDWA9_Real op> { VOP2_SDWA9Ae (NAME#"_sdwa").Pfl>; } -let AssemblerPredicate = isGFX8Only in { +let AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8" in { multiclass VOP2be_Real_e32e64_vi_only op, string OpName, string AsmName> { def _e32_vi : @@ -2239,14 +2235,12 @@ multiclass VOP2be_Real_e32e64_vi_only op, string OpName, string AsmName VOP2e(OpName#"_e32").Pfl> { VOP2_Pseudo ps = !cast(OpName#"_e32"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX8"; } def _e64_vi : VOP3_Real(OpName#"_e64"), SIEncodingFamily.VI>, VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast(OpName#"_e64").Pfl> { VOP3_Pseudo ps = !cast(OpName#"_e64"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX8"; } if !cast(OpName#"_e32").Pfl.HasExtSDWA then def _sdwa_vi : @@ -2263,9 +2257,10 @@ multiclass VOP2be_Real_e32e64_vi_only op, string OpName, string AsmName let AsmString = AsmName # ps.AsmOperands; } } -} -let AssemblerPredicate = isGFX9Only in { +} // End AssemblerPredicate = isGFX8Only, DecoderNamespace = "GFX8" + +let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in { multiclass VOP2be_Real_e32e64_gfx9 op, string OpName, string AsmName> { def _e32_gfx9 : @@ -2273,14 +2268,12 @@ multiclass VOP2be_Real_e32e64_gfx9 op, string OpName, string AsmName> { VOP2e(OpName#"_e32").Pfl> { VOP2_Pseudo ps = !cast(OpName#"_e32"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX9"; } def _e64_gfx9 : VOP3_Real(OpName#"_e64"), SIEncodingFamily.GFX9>, VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast(OpName#"_e64").Pfl> { VOP3_Pseudo ps = !cast(OpName#"_e64"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX9"; } if !cast(OpName#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx9 : @@ -2295,21 +2288,16 @@ multiclass VOP2be_Real_e32e64_gfx9 op, string OpName, string AsmName> { VOP2_DPPe(OpName#"_dpp")> { VOP2_DPP_Pseudo ps 
= !cast(OpName#"_dpp"); let AsmString = AsmName # ps.AsmOperands; - let DecoderNamespace = "GFX9"; } } multiclass VOP2_Real_e32e64_gfx9 op> { def _e32_gfx9 : VOP2_Real(NAME#"_e32"), SIEncodingFamily.GFX9>, - VOP2e(NAME#"_e32").Pfl>{ - let DecoderNamespace = "GFX9"; - } + VOP2e(NAME#"_e32").Pfl>; def _e64_gfx9 : VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX9>, - VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl> { - let DecoderNamespace = "GFX9"; - } + VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast(NAME#"_e64").Pfl>; if !cast(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx9 : VOP_SDWA9_Real (NAME#"_sdwa")>, @@ -2318,12 +2306,10 @@ multiclass VOP2_Real_e32e64_gfx9 op> { if !cast(NAME#"_e32").Pfl.HasExtDPP then def _dpp_gfx9 : VOP_DPP_Real(NAME#"_dpp"), SIEncodingFamily.GFX9>, - VOP2_DPPe(NAME#"_dpp")> { - let DecoderNamespace = "GFX9"; - } + VOP2_DPPe(NAME#"_dpp")>; } -} // AssemblerPredicate = isGFX9Only +} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" multiclass VOP2_Real_e32e64_vi op> : Base_VOP2_Real_e32e64_vi, VOP2_SDWA_Real, VOP2_SDWA9_Real { diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index a0090f3e8d1db..cf76de40aef41 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1486,7 +1486,7 @@ multiclass VOP3P_Real_dpp op, string backing_ps_name = NAME, : VOP3P_DPP16(backing_ps_name #"_dpp"), Gen.Subtarget> { let AsmString = asmName #ps.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1496,7 +1496,7 @@ multiclass VOP3P_Real_dpp8 op, string backing_ps_name = NAME defvar ps = !cast(backing_ps_name); def _dpp8#Gen.Suffix : VOP3P_DPP8_Base { let AsmString = asmName #ps.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = 
Gen.AssemblerPredicate; } } @@ -1613,7 +1613,7 @@ multiclass VOP3P_Real_MFMA_gfx940_aliases op, string Name = !cast(NAME#"_e64").Mnemonic, VOP3_Pseudo PS_ACD = !cast(NAME # "_e64"), VOP3_Pseudo PS_VCD = !cast(NAME # "_vgprcd" # "_e64")> { - let SubtargetPredicate = isGFX940Plus, + let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX940", AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in { def _gfx940_acd : VOP3P_Real, diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 508f06c4739a5..e5e82447d55fb 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -222,6 +222,8 @@ class VOPCInstAlias { @@ -1331,196 +1333,176 @@ class VOPC64_DPP8_NoDst op, VOP_Pseudo ps, string opName = ps.OpName> //===----------------------------------------------------------------------===// multiclass VOPC_Real_Base op> { - let AssemblerPredicate = Gen.AssemblerPredicate in { + let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in { defvar ps32 = !cast(NAME#"_e32"); defvar ps64 = !cast(NAME#"_e64"); - let DecoderNamespace = Gen.DecoderNamespace in { - def _e32#Gen.Suffix : VOPC_Real, - VOPCe; - def _e64#Gen.Suffix : VOP3_Real, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { - // Encoding used for VOPC instructions encoded as VOP3 differs from - // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. - bits<8> sdst; - let Inst{7-0} = sdst; - } - } // End DecoderNamespace = Gen.DecoderNamespace + def _e32#Gen.Suffix : VOPC_Real, + VOPCe; + def _e64#Gen.Suffix : VOP3_Real, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. 
+ bits<8> sdst; + let Inst{7-0} = sdst; + } defm : VOPCInstAliases; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast(NAME #"_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC; - def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16 { - let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16 { - let AsmString = psDPP.OpName # " vcc, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC; + def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16 { + let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16 { + let AsmString = psDPP.OpName # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } defvar AsmDPP8 = ps32.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e32_dpp8#Gen.Suffix : VOPC_DPP8; - def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8 { - let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8 { - let AsmString = ps32.OpName # " vcc, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e32_dpp8#Gen.Suffix : VOPC_DPP8; + def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8 { + let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8 { + let AsmString = ps32.OpName # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast(NAME #"_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def 
_e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>, - SIMCInstr; - def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { - let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { - let AsmString = psDPP.OpName # " vcc, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>, + SIMCInstr; + def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { + let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> { + let AsmString = psDPP.OpName # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>; - def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { - let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { - let AsmString = ps32.OpName # " vcc, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>; + def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { + let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> { + let AsmString = ps32.OpName # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } } - } // AssemblerPredicate = Gen.AssemblerPredicate + } // AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace 
} multiclass VOPC_Real_with_name op, string OpName, string asm_name, string pseudo_mnemonic = ""> { - let AssemblerPredicate = Gen.AssemblerPredicate in { + let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in { defvar ps32 = !cast(OpName#"_e32"); defvar ps64 = !cast(OpName#"_e64"); - let DecoderNamespace = Gen.DecoderNamespace in { - def _e32#Gen.Suffix : - // 32 and 64 bit forms of the instruction have _e32 and _e64 - // respectively appended to their assembly mnemonic. - // _e64 is printed as part of the VOPDstS64orS32 operand, whereas - // the destination-less 32bit forms add it to the asmString here. - VOPC_Real, - VOPCe, - MnemonicAlias, - Requires<[Gen.AssemblerPredicate]>; - def _e64#Gen.Suffix : - VOP3_Real, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl>, - MnemonicAlias, - Requires<[Gen.AssemblerPredicate]> { - // Encoding used for VOPC instructions encoded as VOP3 differs from - // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. - bits<8> sdst; - let Inst{7-0} = sdst; - } - } // End DecoderNamespace = Gen.DecoderNamespace + def _e32#Gen.Suffix : + // 32 and 64 bit forms of the instruction have _e32 and _e64 + // respectively appended to their assembly mnemonic. + // _e64 is printed as part of the VOPDstS64orS32 operand, whereas + // the destination-less 32bit forms add it to the asmString here. + VOPC_Real, + VOPCe, + MnemonicAlias, + Requires<[Gen.AssemblerPredicate]>; + def _e64#Gen.Suffix : + VOP3_Real, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl>, + MnemonicAlias, + Requires<[Gen.AssemblerPredicate]> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. 
+ bits<8> sdst; + let Inst{7-0} = sdst; + } defm : VOPCInstAliases; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast(OpName #"_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC; - def _e32_dpp_w32#Gen.Suffix - : VOPC_DPP16 { - let AsmString = asm_name # " vcc_lo, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e32_dpp_w64#Gen.Suffix - : VOPC_DPP16 { - let AsmString = asm_name # " vcc, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC; + def _e32_dpp_w32#Gen.Suffix + : VOPC_DPP16 { + let AsmString = asm_name # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp_w64#Gen.Suffix + : VOPC_DPP16 { + let AsmString = asm_name # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } defvar AsmDPP8 = ps32.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e32_dpp8#Gen.Suffix : VOPC_DPP8; - def _e32_dpp8_w32#Gen.Suffix - : VOPC_DPP8 { - let AsmString = asm_name # " vcc_lo, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e32_dpp8_w64#Gen.Suffix - : VOPC_DPP8 { - let AsmString = asm_name # " vcc, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e32_dpp8#Gen.Suffix : VOPC_DPP8; + def _e32_dpp8_w32#Gen.Suffix + : VOPC_DPP8 { + let AsmString = asm_name # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e32_dpp8_w64#Gen.Suffix + : VOPC_DPP8 { + let AsmString = asm_name # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast(OpName #"_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def 
_e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>, - SIMCInstr; - def _e64_dpp_w32#Gen.Suffix - : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { - let AsmString = asm_name # " vcc_lo, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e64_dpp_w64#Gen.Suffix - : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { - let AsmString = asm_name # " vcc, " # AsmDPP; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>, + SIMCInstr; + def _e64_dpp_w32#Gen.Suffix + : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp_w64#Gen.Suffix + : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>; - def _e64_dpp8_w32#Gen.Suffix - : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { - let AsmString = asm_name # " vcc_lo, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave32; - } - def _e64_dpp8_w64#Gen.Suffix - : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { - let AsmString = asm_name # " vcc, " # AsmDPP8; - let isAsmParserOnly = 1; - let WaveSizePredicate = isWave64; - } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>; + def _e64_dpp8_w32#Gen.Suffix + : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # " vcc_lo, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave32; + } + def _e64_dpp8_w64#Gen.Suffix + : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # " vcc, " # AsmDPP8; + let isAsmParserOnly = 1; + let WaveSizePredicate = isWave64; } } - } // AssemblerPredicate = 
Gen.AssemblerPredicate + } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace } multiclass VOPC_Real_t16 op, string asm_name, @@ -1528,123 +1510,103 @@ multiclass VOPC_Real_t16 op, string asm_name, VOPC_Real_with_name; multiclass VOPCX_Real op> { - let AssemblerPredicate = Gen.AssemblerPredicate in { + let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in { defvar ps32 = !cast(NAME#"_nosdst_e32"); defvar ps64 = !cast(NAME#"_nosdst_e64"); - let DecoderNamespace = Gen.DecoderNamespace in { - def _e32#Gen.Suffix : - VOPC_Real, - VOPCe { - let AsmString = !subst("_nosdst", "", ps32.PseudoInstr) - # " " # ps32.AsmOperands; - } - def _e64#Gen.Suffix : - VOP3_Real, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { - let Inst{7-0} = ?; // sdst - let AsmString = !subst("_nosdst", "", ps64.Mnemonic) - # "{_e64} " # ps64.AsmOperands; - } - } // End DecoderNamespace = Gen.DecoderNamespace + def _e32#Gen.Suffix : + VOPC_Real, + VOPCe { + let AsmString = !subst("_nosdst", "", ps32.PseudoInstr) + # " " # ps32.AsmOperands; + } + def _e64#Gen.Suffix : + VOP3_Real, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { + let Inst{7-0} = ?; // sdst + let AsmString = !subst("_nosdst", "", ps64.Mnemonic) + # "{_e64} " # ps64.AsmOperands; + } defm : VOPCXInstAliases; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast(NAME #"_nosdst_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e32_dpp#Gen.Suffix - : VOPC_DPP16_SIMC { - let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP; - } + def _e32_dpp#Gen.Suffix + : VOPC_DPP16_SIMC { + let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP; } defvar AsmDPP8 = ps32.Pfl.AsmDPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e32_dpp8#Gen.Suffix : VOPC_DPP8 { - let AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8; - } + def _e32_dpp8#Gen.Suffix : VOPC_DPP8 { + let 
AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8; } } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast(NAME #"_nosdst_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e64_dpp#Gen.Suffix - : VOPC64_DPP16_NoDst<{0, op}, psDPP>, - SIMCInstr { - let AsmString = !subst("_nosdst", "", psDPP.OpName) - # "{_e64_dpp} " # AsmDPP; - } + def _e64_dpp#Gen.Suffix + : VOPC64_DPP16_NoDst<{0, op}, psDPP>, + SIMCInstr { + let AsmString = !subst("_nosdst", "", psDPP.OpName) + # "{_e64_dpp} " # AsmDPP; } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64> { - let AsmString = !subst("_nosdst", "", ps64.OpName) - # "{_e64_dpp} " # AsmDPP8; - } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64> { + let AsmString = !subst("_nosdst", "", ps64.OpName) + # "{_e64_dpp} " # AsmDPP8; } } - } // AssemblerPredicate = Gen.AssemblerPredicate + } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace } multiclass VOPCX_Real_with_name op, string OpName, string asm_name, string pseudo_mnemonic = ""> { - let AssemblerPredicate = Gen.AssemblerPredicate in { + let AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace in { defvar ps32 = !cast(OpName#"_nosdst_e32"); defvar ps64 = !cast(OpName#"_nosdst_e64"); - let DecoderNamespace = Gen.DecoderNamespace in { - def _e32#Gen.Suffix - : VOPC_Real, - MnemonicAlias, - Requires<[Gen.AssemblerPredicate]>, - VOPCe { - let AsmString = asm_name # "{_e32} " # ps32.AsmOperands; - } - def _e64#Gen.Suffix - : VOP3_Real, - MnemonicAlias, - Requires<[Gen.AssemblerPredicate]>, - VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { - let Inst{7-0} = ? 
; // sdst - let AsmString = asm_name # "{_e64} " # ps64.AsmOperands; - } - } // End DecoderNamespace = Gen.DecoderNamespace + def _e32#Gen.Suffix + : VOPC_Real, + MnemonicAlias, + Requires<[Gen.AssemblerPredicate]>, + VOPCe { + let AsmString = asm_name # "{_e32} " # ps32.AsmOperands; + } + def _e64#Gen.Suffix + : VOP3_Real, + MnemonicAlias, + Requires<[Gen.AssemblerPredicate]>, + VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> { + let Inst{7-0} = ? ; // sdst + let AsmString = asm_name # "{_e64} " # ps64.AsmOperands; + } defm : VOPCXInstAliases; if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast(OpName#"_nosdst_e32"#"_dpp"); - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC; - } - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e32_dpp8#Gen.Suffix : VOPC_DPP8; - } + def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC; + def _e32_dpp8#Gen.Suffix : VOPC_DPP8; } if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast(OpName#"_nosdst_e64"#"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; - let DecoderNamespace = "DPP"#Gen.DecoderNamespace in { - def _e64_dpp#Gen.Suffix - : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>, - SIMCInstr { - let AsmString = asm_name # "{_e64_dpp} " # AsmDPP; - } + def _e64_dpp#Gen.Suffix + : VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>, + SIMCInstr { + let AsmString = asm_name # "{_e64_dpp} " # AsmDPP; } defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in { - def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> { - let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8; - } + def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> { + let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8; } } - } // AssemblerPredicate = Gen.AssemblerPredicate + } // End AssemblerPredicate = Gen.AssemblerPredicate, DecoderNamespace = Gen.DecoderNamespace } multiclass VOPCX_Real_t16 op, string asm_name, @@ -1873,21 +1835,19 @@ defm V_CMPX_CLASS_F64 : 
VOPCX_Real_gfx11_gfx12<0x0ff>; // GFX10. //===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX10Only in { +let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { multiclass VOPC_Real_gfx10 op> { - let DecoderNamespace = "GFX10" in { - def _e32_gfx10 : - VOPC_Real(NAME#"_e32"), SIEncodingFamily.GFX10>, - VOPCe; - def _e64_gfx10 : - VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX10>, - VOP3a_gfx10<{0, op}, !cast(NAME#"_e64").Pfl> { - // Encoding used for VOPC instructions encoded as VOP3 differs from - // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. - bits<8> sdst; - let Inst{7-0} = sdst; - } - } // End DecoderNamespace = "GFX10" + def _e32_gfx10 : + VOPC_Real(NAME#"_e32"), SIEncodingFamily.GFX10>, + VOPCe; + def _e64_gfx10 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX10>, + VOP3a_gfx10<{0, op}, !cast(NAME#"_e64").Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. 
+ bits<8> sdst; + let Inst{7-0} = sdst; + } if !cast(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx10 : @@ -1898,22 +1858,20 @@ let AssemblerPredicate = isGFX10Only in { } multiclass VOPCX_Real_gfx10 op> { - let DecoderNamespace = "GFX10" in { - def _e32_gfx10 : - VOPC_Real(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>, - VOPCe { - let AsmString = !subst("_nosdst", "", !cast(NAME#"_nosdst_e32").PseudoInstr) - # " " # !cast(NAME#"_nosdst_e32").AsmOperands; - } - - def _e64_gfx10 : - VOP3_Real(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>, - VOP3a_gfx10<{0, op}, !cast(NAME#"_nosdst_e64").Pfl> { - let Inst{7-0} = ?; // sdst - let AsmString = !subst("_nosdst", "", !cast(NAME#"_nosdst_e64").Mnemonic) - # "{_e64} " # !cast(NAME#"_nosdst_e64").AsmOperands; - } - } // End DecoderNamespace = "GFX10" + def _e32_gfx10 : + VOPC_Real(NAME#"_nosdst_e32"), SIEncodingFamily.GFX10>, + VOPCe { + let AsmString = !subst("_nosdst", "", !cast(NAME#"_nosdst_e32").PseudoInstr) + # " " # !cast(NAME#"_nosdst_e32").AsmOperands; + } + + def _e64_gfx10 : + VOP3_Real(NAME#"_nosdst_e64"), SIEncodingFamily.GFX10>, + VOP3a_gfx10<{0, op}, !cast(NAME#"_nosdst_e64").Pfl> { + let Inst{7-0} = ?; // sdst + let AsmString = !subst("_nosdst", "", !cast(NAME#"_nosdst_e64").Mnemonic) + # "{_e64} " # !cast(NAME#"_nosdst_e64").AsmOperands; + } if !cast(NAME#"_nosdst_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx10 : @@ -1925,7 +1883,7 @@ let AssemblerPredicate = isGFX10Only in { defm : VOPCXInstAliases; } -} // End AssemblerPredicate = isGFX10Only +} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" defm V_CMP_LT_I16 : VOPC_Real_gfx10<0x089>; defm V_CMP_EQ_I16 : VOPC_Real_gfx10<0x08a>; @@ -1990,25 +1948,23 @@ defm V_CMPX_TRU_F16 : VOPCX_Real_gfx10<0x0ff>; // GFX6, GFX7, GFX10. 
//===----------------------------------------------------------------------===// -let AssemblerPredicate = isGFX6GFX7 in { +let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in { multiclass VOPC_Real_gfx6_gfx7 op> { - let DecoderNamespace = "GFX6GFX7" in { - def _e32_gfx6_gfx7 : - VOPC_Real(NAME#"_e32"), SIEncodingFamily.SI>, - VOPCe; - def _e64_gfx6_gfx7 : - VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, - VOP3a_gfx6_gfx7(NAME#"_e64").Pfl> { - // Encoding used for VOPC instructions encoded as VOP3 differs from - // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. - bits<8> sdst; - let Inst{7-0} = sdst; - } - } // End DecoderNamespace = "GFX6GFX7" + def _e32_gfx6_gfx7 : + VOPC_Real(NAME#"_e32"), SIEncodingFamily.SI>, + VOPCe; + def _e64_gfx6_gfx7 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, + VOP3a_gfx6_gfx7(NAME#"_e64").Pfl> { + // Encoding used for VOPC instructions encoded as VOP3 differs from + // VOP3e by destination name (sdst) as VOPC doesn't have vector dst. 
+ bits<8> sdst; + let Inst{7-0} = sdst; + } defm : VOPCInstAliases; } -} // End AssemblerPredicate = isGFX6GFX7 +} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" multiclass VOPC_Real_gfx6_gfx7_gfx10 op> : VOPC_Real_gfx6_gfx7, VOPC_Real_gfx10; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 801afabbdb140..2989d05e968ef 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -835,7 +835,7 @@ class VOP_DPP_Pseudo pattern=[], AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); - let DecoderNamespace = "DPP"; + let DecoderNamespace = "GFX8"; VOPProfile Pfl = P; } @@ -906,7 +906,7 @@ class VOP_DPP_Base op, VOP_DPP_Pseudo ps, GFXGen Gen, VOP3_DPP16 { let AssemblerPredicate = Gen.AssemblerPredicate; let True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate); - let DecoderNamespace = "DPP"#Gen.DecoderNamespace# + let DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); } @@ -1463,7 +1463,7 @@ multiclass VOP3_Real_dpp_with_name op, string opName, multiclass VOP3_Real_dpp8_Base op, string opName = NAME> { defvar ps = !cast(opName#"_e64"); def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8 { - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1473,7 +1473,7 @@ multiclass VOP3Dot_Real_dpp8_Base op, string opName = NAME> def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8 { let Inst{11} = ?; let Inst{12} = ?; - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1482,7 +1482,7 @@ multiclass VOP3_Real_dpp8_with_name op, string opName, string asmName> { defvar ps = !cast(opName#"_e64"); let AsmString = asmName # 
ps.Pfl.AsmVOP3DPP8, - DecoderNamespace = "DPP8"#Gen.DecoderNamespace# + DecoderNamespace = Gen.DecoderNamespace# !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"), True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate) in { @@ -1505,7 +1505,7 @@ multiclass VOP3be_Real_dpp op, string opName, defvar dpp_ps = !cast(opName #"_e64" #"_dpp"); def _e64_dpp#Gen.Suffix : Base_VOP3b_DPP16, SIMCInstr { - let DecoderNamespace = "DPP"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } @@ -1514,7 +1514,7 @@ multiclass VOP3be_Real_dpp8 op, string opName, string asmName> { defvar ps = !cast(opName #"_e64"); def _e64_dpp8#Gen.Suffix : VOP3b_DPP8_Base { - let DecoderNamespace = "DPP8"#Gen.DecoderNamespace; + let DecoderNamespace = Gen.DecoderNamespace; let AssemblerPredicate = Gen.AssemblerPredicate; } } From f17e4151423a798c18533080fe7f8a3e922d7312 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Thu, 22 Feb 2024 11:36:18 +0000 Subject: [PATCH 026/546] [AArch64] Mangle names of all ARM64EC functions with entry thunks (#80996) This better matches MSVC output in cases where static functions have their addresses taken. --- llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp | 2 +- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 3 ++- .../CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll | 6 ++++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index c62582ac01a4c..a99856dcc9439 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -712,7 +712,7 @@ bool AArch64Arm64ECCallLowering::processFunction( // name (emitting the definition) can grab it from the metadata. // // FIXME: Handle functions with weak linkage? 
- if (F.hasExternalLinkage() || F.hasWeakLinkage() || F.hasLinkOnceLinkage()) { + if (!F.hasLocalLinkage() || F.hasAddressTaken()) { if (std::optional MangledName = getArm64ECMangledFunctionName(F.getName().str())) { F.setMetadata("arm64ec_unmangled_name", diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 5b5ffd7b2feb0..4fa719ad67cf3 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -1121,7 +1121,8 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() { TS->emitDirectiveVariantPCS(CurrentFnSym); } - if (TM.getTargetTriple().isWindowsArm64EC()) { + if (TM.getTargetTriple().isWindowsArm64EC() && + !MF->getFunction().hasLocalLinkage()) { // For ARM64EC targets, a function definition's name is mangled differently // from the normal symbol. We emit the alias from the unmangled symbol to // mangled symbol name here. diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll index 00ae34bf4b00f..217f08be05218 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll @@ -2,7 +2,8 @@ ; Validates when local linkage functions get a thunk generated. -; Being called does not cause a thunk to be generated. +; Being called does not cause a thunk to be generated or the symbol name to be mangled. +; CHECK-NOT: "#does_not_have_addr_taken": ; CHECK-NOT: $ientry_thunk$cdecl$v$f; define internal void @does_not_have_addr_taken(float) nounwind { ret void @@ -12,7 +13,8 @@ define void @calls_does_not_have_addr_taken() nounwind { ret void } -; Having an address taken does cause a thunk to be generated. +; Having an address taken does cause a thunk to be generated and the symbol name to be mangled. 
+; CHECK: "#has_addr_taken": ; CHECK: $ientry_thunk$cdecl$v$i8; define internal void @has_addr_taken(i64) nounwind { ret void From 1f99a450127c2404d4f9b8ac24acdb17823c988b Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 20 Feb 2024 15:08:06 +0000 Subject: [PATCH 027/546] [AArch64] Remove unused ReverseCSRRestoreSeq option. (#82326) This patch removes the `-reverse-csr-restore-seq` option from AArch64FrameLowering, since this is no longer used. This patch was reverted because of a crash in PR#79623. Merging it back as it was fixed in PR#82492. --- .../Target/AArch64/AArch64FrameLowering.cpp | 66 ++++-------- .../AArch64/reverse-csr-restore-seq.mir | 101 ------------------ 2 files changed, 21 insertions(+), 146 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 503b1c199650f..5cc612e89162a 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -239,11 +239,6 @@ static cl::opt EnableRedZone("aarch64-redzone", cl::desc("enable use of redzone on AArch64"), cl::init(false), cl::Hidden); -static cl::opt - ReverseCSRRestoreSeq("reverse-csr-restore-seq", - cl::desc("reverse the CSR restore sequence"), - cl::init(false), cl::Hidden); - static cl::opt StackTaggingMergeSetTag( "stack-tagging-merge-settag", cl::desc("merge settag instruction in function epilog"), cl::init(true), @@ -307,8 +302,6 @@ bool AArch64FrameLowering::homogeneousPrologEpilog( return false; if (!EnableHomogeneousPrologEpilog) return false; - if (ReverseCSRRestoreSeq) - return false; if (EnableRedZone) return false; @@ -3117,7 +3110,27 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF)); - auto EmitMI = [&](const RegPairInfo &RPI) -> MachineBasicBlock::iterator { + if (homogeneousPrologEpilog(MF, &MBB)) { + 
auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog)) + .setMIFlag(MachineInstr::FrameDestroy); + for (auto &RPI : RegPairs) { + MIB.addReg(RPI.Reg1, RegState::Define); + MIB.addReg(RPI.Reg2, RegState::Define); + } + return true; + } + + // For performance reasons restore SVE register in increasing order + auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; }; + auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR); + auto PPREnd = std::find_if_not(PPRBegin, RegPairs.end(), IsPPR); + std::reverse(PPRBegin, PPREnd); + auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; }; + auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR); + auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR); + std::reverse(ZPRBegin, ZPREnd); + + for (const RegPairInfo &RPI : RegPairs) { unsigned Reg1 = RPI.Reg1; unsigned Reg2 = RPI.Reg2; @@ -3191,43 +3204,6 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( MachineMemOperand::MOLoad, Size, Alignment)); if (NeedsWinCFI) InsertSEH(MIB, TII, MachineInstr::FrameDestroy); - - return MIB->getIterator(); - }; - - if (homogeneousPrologEpilog(MF, &MBB)) { - auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog)) - .setMIFlag(MachineInstr::FrameDestroy); - for (auto &RPI : RegPairs) { - MIB.addReg(RPI.Reg1, RegState::Define); - MIB.addReg(RPI.Reg2, RegState::Define); - } - return true; - } - - // For performance reasons restore SVE register in increasing order - auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; }; - auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR); - auto PPREnd = std::find_if_not(PPRBegin, RegPairs.end(), IsPPR); - std::reverse(PPRBegin, PPREnd); - auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; }; - auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR); - auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR); - 
std::reverse(ZPRBegin, ZPREnd); - - if (ReverseCSRRestoreSeq) { - MachineBasicBlock::iterator First = MBB.end(); - for (const RegPairInfo &RPI : reverse(RegPairs)) { - MachineBasicBlock::iterator It = EmitMI(RPI); - if (First == MBB.end()) - First = It; - } - if (First != MBB.end()) - MBB.splice(MBBI, &MBB, First); - } else { - for (const RegPairInfo &RPI : RegPairs) { - (void)EmitMI(RPI); - } } return true; diff --git a/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir b/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir deleted file mode 100644 index de4baec50e0c6..0000000000000 --- a/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir +++ /dev/null @@ -1,101 +0,0 @@ -# RUN: llc -run-pass=prologepilog -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK -# RUN: llc -start-before=prologepilog -stop-after=aarch64-ldst-opt -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK -# ---- | - - define void @foo() nounwind { entry: unreachable } - - define void @bar() nounwind { entry: unreachable } - - define void @baz() nounwind { entry: unreachable } - -... ---- -name: foo -# CHECK-LABEL: name: foo -tracksRegLiveness: true -body: | - bb.0: - $x19 = IMPLICIT_DEF - $x20 = IMPLICIT_DEF - $x21 = IMPLICIT_DEF - $x22 = IMPLICIT_DEF - $x23 = IMPLICIT_DEF - $x24 = IMPLICIT_DEF - $x25 = IMPLICIT_DEF - $x26 = IMPLICIT_DEF - - ; The local stack size is 0, so the last ldp in the sequence will also - ; restore the stack. - ; CHECK: $x24, $x23 = frame-destroy LDPXi $sp, 2 - ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 4 - ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 6 - - ; The ldp and the stack increment get merged even before - ; the load-store optimizer. - ; CHECK-NEXT: early-clobber $sp, $x26, $x25 = frame-destroy LDPXpost $sp, 8 - - RET_ReallyLR -... 
---- -name: bar -# CHECK-LABEL: name: bar -tracksRegLiveness: true -stack: - - { id : 0, size: 8, alignment: 4, - stack-id: default, callee-saved-register: '', callee-saved-restored: true, - local-offset: -4, debug-info-variable: '', debug-info-expression: '', - debug-info-location: '' } - -body: | - bb.0: - $x19 = IMPLICIT_DEF - $x20 = IMPLICIT_DEF - $x21 = IMPLICIT_DEF - $x22 = IMPLICIT_DEF - $x23 = IMPLICIT_DEF - $x24 = IMPLICIT_DEF - $x25 = IMPLICIT_DEF - $x26 = IMPLICIT_DEF - - ; The local stack size is not 0, and we can combine the CSR stack size with - ; the local stack size. This results in rewriting the offsets for all the - ; save/restores and forbids us to merge the stack adjustment and the last pop. - ; In this case, there is no point of moving the first CSR pair at the end. - ; We do it anyway, as it's a small price to pay for the resulting - ; simplification in the epilogue emission code. - ; CHECK: $x24, $x23 = frame-destroy LDPXi $sp, 4 - ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 6 - ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 8 - ; CHECK-NEXT: $x26, $x25 = frame-destroy LDPXi $sp, 2 - ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 80, 0 - RET_ReallyLR -... ---- -# Check that the load from the offset 0 is moved at the end even when hasFP is -# false. -name: baz -# CHECK-LABEL: name: baz -alignment: 4 -tracksRegLiveness: true -frameInfo: - adjustsStack: true - hasCalls: true -body: | - bb.0: - successors: %bb.1 - - $x0 = IMPLICIT_DEF - $x20 = IMPLICIT_DEF - $x21 = IMPLICIT_DEF - - ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp - BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $x0 - ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp - B %bb.1 - - bb.1: - ; CHECK: $x21, $x20 = frame-destroy LDPXi $sp, 2 - ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 32 - RET_ReallyLR -... 
From 4235e44d4c37ca738c74def05da8caf124d2464e Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Thu, 22 Feb 2024 13:15:26 +0100 Subject: [PATCH 028/546] [GlobalISel] Constant-fold G_PTR_ADD with different type sizes (#81473) All other opcodes in the list are constrained to have the same type on both operands, but not G_PTR_ADD. Fixes #81464 --- llvm/lib/CodeGen/GlobalISel/Utils.cpp | 5 ++- .../combine-extract-vector-load.mir | 40 +++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 26fd12f9e51c4..23ad68b331c97 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -660,8 +660,11 @@ std::optional llvm::ConstantFoldBinOp(unsigned Opcode, default: break; case TargetOpcode::G_ADD: - case TargetOpcode::G_PTR_ADD: return C1 + C2; + case TargetOpcode::G_PTR_ADD: + // Types can be of different width here. + // Result needs to be the same width as C1, so trunc or sext C2. + return C1 + C2.sextOrTrunc(C1.getBitWidth()); case TargetOpcode::G_AND: return C1 & C2; case TargetOpcode::G_ASHR: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir new file mode 100644 index 0000000000000..aa72a9ec06ede --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir @@ -0,0 +1,40 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s + +# Tries to emit a foldable G_PTR_ADD with (p1, s32) operands. 
+--- +name: test_ptradd_crash__offset_smaller +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: test_ptradd_crash__offset_smaller + ; CHECK: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 12 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[C]](p1) :: (load (s32), addrspace 1) + ; CHECK-NEXT: $sgpr0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + %1:_(p1) = G_CONSTANT i64 0 + %3:_(s32) = G_CONSTANT i32 3 + %0:_(<4 x s32>) = G_LOAD %1 :: (load (<4 x s32>) from `ptr addrspace(1) null`, addrspace 1) + %2:_(s32) = G_EXTRACT_VECTOR_ELT %0, %3 + $sgpr0 = COPY %2 + SI_RETURN_TO_EPILOG implicit $sgpr0 +... + +# Tries to emit a foldable G_PTR_ADD with (p1, s128) operands. +--- +name: test_ptradd_crash__offset_wider +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: test_ptradd_crash__offset_wider + ; CHECK: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 12 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[C]](p1) :: (load (s32), addrspace 1) + ; CHECK-NEXT: $sgpr0 = COPY [[LOAD]](s32) + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 + %1:_(p1) = G_CONSTANT i64 0 + %3:_(s128) = G_CONSTANT i128 3 + %0:_(<4 x s32>) = G_LOAD %1 :: (load (<4 x s32>) from `ptr addrspace(1) null`, addrspace 1) + %2:_(s32) = G_EXTRACT_VECTOR_ELT %0, %3 + $sgpr0 = COPY %2 + SI_RETURN_TO_EPILOG implicit $sgpr0 +... From 3ef63a71adb7fd1c792fd61d00c74159fcef9a2f Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Thu, 22 Feb 2024 20:57:34 +0800 Subject: [PATCH 029/546] [CVP] Refactor `processMinMaxIntrinsic` to check non-strict predicate in both directions (#82596) This patch uses `getConstantRangeAtUse` in `processMinMaxIntrinsic` to address the comment https://github.com/llvm/llvm-project/pull/82478#discussion_r1497300920. After this patch we can reuse the range result in https://github.com/llvm/llvm-project/pull/82478. 
--- .../Scalar/CorrelatedValuePropagation.cpp | 26 +++++--- .../CorrelatedValuePropagation/min-max.ll | 63 +++++++++++++++++-- 2 files changed, 76 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 9235850de92f3..c71870bc1b656 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -530,15 +530,23 @@ static bool processAbsIntrinsic(IntrinsicInst *II, LazyValueInfo *LVI) { // See if this min/max intrinsic always picks it's one specific operand. static bool processMinMaxIntrinsic(MinMaxIntrinsic *MM, LazyValueInfo *LVI) { CmpInst::Predicate Pred = CmpInst::getNonStrictPredicate(MM->getPredicate()); - LazyValueInfo::Tristate Result = LVI->getPredicateAt( - Pred, MM->getLHS(), MM->getRHS(), MM, /*UseBlockValue=*/true); - if (Result == LazyValueInfo::Unknown) - return false; - - ++NumMinMax; - MM->replaceAllUsesWith(MM->getOperand(!Result)); - MM->eraseFromParent(); - return true; + ConstantRange LHS_CR = LVI->getConstantRangeAtUse(MM->getOperandUse(0), + /*UndefAllowed*/ false); + ConstantRange RHS_CR = LVI->getConstantRangeAtUse(MM->getOperandUse(1), + /*UndefAllowed*/ false); + if (LHS_CR.icmp(Pred, RHS_CR)) { + ++NumMinMax; + MM->replaceAllUsesWith(MM->getLHS()); + MM->eraseFromParent(); + return true; + } + if (RHS_CR.icmp(Pred, LHS_CR)) { + ++NumMinMax; + MM->replaceAllUsesWith(MM->getRHS()); + MM->eraseFromParent(); + return true; + } + return false; } // Rewrite this with.overflow intrinsic as non-overflowing. 
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll b/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll index 705b6e96fe9e3..d21b8f2418c2e 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll @@ -71,7 +71,6 @@ define i8 @test6(i8 %x) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: [[LIM:%.*]] = icmp uge i8 [[X:%.*]], 42 ; CHECK-NEXT: call void @llvm.assume(i1 [[LIM]]) -; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.umin.i8(i8 [[X]], i8 42) ; CHECK-NEXT: ret i8 42 ; %lim = icmp uge i8 %x, 42 @@ -119,7 +118,6 @@ define i8 @test10(i8 %x) { ; CHECK-LABEL: @test10( ; CHECK-NEXT: [[LIM:%.*]] = icmp ule i8 [[X:%.*]], 42 ; CHECK-NEXT: call void @llvm.assume(i1 [[LIM]]) -; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.umax.i8(i8 [[X]], i8 42) ; CHECK-NEXT: ret i8 42 ; %lim = icmp ule i8 %x, 42 @@ -167,7 +165,6 @@ define i8 @test14(i8 %x) { ; CHECK-LABEL: @test14( ; CHECK-NEXT: [[LIM:%.*]] = icmp sge i8 [[X:%.*]], 42 ; CHECK-NEXT: call void @llvm.assume(i1 [[LIM]]) -; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.smin.i8(i8 [[X]], i8 42) ; CHECK-NEXT: ret i8 42 ; %lim = icmp sge i8 %x, 42 @@ -215,7 +212,6 @@ define i8 @test18(i8 %x) { ; CHECK-LABEL: @test18( ; CHECK-NEXT: [[LIM:%.*]] = icmp sle i8 [[X:%.*]], 42 ; CHECK-NEXT: call void @llvm.assume(i1 [[LIM]]) -; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.smax.i8(i8 [[X]], i8 42) ; CHECK-NEXT: ret i8 42 ; %lim = icmp sle i8 %x, 42 @@ -235,3 +231,62 @@ define i8 @test19(i8 %x) { %r = call i8 @llvm.smax(i8 %x, i8 42) ret i8 %r } + +declare void @body(i32) + +define void @test_bidirectional() { +; CHECK-LABEL: @test_bidirectional( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: call void @body(i32 65535) +; CHECK-NEXT: [[INC]] = add nsw i32 [[INDVAR]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INDVAR]], 65535 
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT:%.*]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvar = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %smax = call i32 @llvm.smax.i32(i32 %indvar, i32 65535) + call void @body(i32 %smax) + %inc = add nsw i32 %indvar, 1 + %cmp = icmp slt i32 %indvar, 65535 + br i1 %cmp, label %for.body, label %exit + +exit: + ret void +} + +define i64 @test_at_use(i1 %cond, i64 %x) { +; CHECK-LABEL: @test_at_use( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB1:%.*]], label [[IF_END:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[X:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END]] +; CHECK: if.then: +; CHECK-NEXT: ret i64 0 +; CHECK: if.end: +; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[X]], [[BB1]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i64 [[PHI]] +; +entry: + br i1 %cond, label %bb1, label %if.end + +bb1: + %val = call i64 @llvm.smax.i64(i64 %x, i64 -1) + %cmp = icmp slt i64 %x, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + ret i64 0 + +if.end: + %phi = phi i64 [%val, %bb1], [0, %entry] + ret i64 %phi +} From c831d83bb17caa3a8f137052559cb6c54b21b7c1 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Thu, 22 Feb 2024 13:59:04 +0100 Subject: [PATCH 030/546] [InferAddrSpaces] Correctly replace identical operands of insts (#82610) It's important for PHI nodes because if a PHI node has multiple edges coming from the same block, we can have the same incoming value multiple times in the list of incoming values. All of those need to be consistent (exact same Value*) otherwise verifier complains. 
Fixes SWDEV-445797 --- .../Transforms/Scalar/InferAddressSpaces.cpp | 13 ++-- .../AMDGPU/multiple-uses-of-val.ll | 69 +++++++++++++++++++ 2 files changed, 77 insertions(+), 5 deletions(-) create mode 100644 llvm/test/Transforms/InferAddressSpaces/AMDGPU/multiple-uses-of-val.ll diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 1bf50d79e5331..851eab04c8dbb 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -1221,6 +1221,7 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( Value::use_iterator I, E, Next; for (I = V->use_begin(), E = V->use_end(); I != E;) { Use &U = *I; + User *CurUser = U.getUser(); // Some users may see the same pointer operand in multiple operands. Skip // to the next instruction. @@ -1231,11 +1232,10 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( // If V is used as the pointer operand of a compatible memory operation, // sets the pointer operand to NewV. This replacement does not change // the element type, so the resultant load/store is still valid. - U.set(NewV); + CurUser->replaceUsesOfWith(V, NewV); continue; } - User *CurUser = U.getUser(); // Skip if the current user is the new value itself. if (CurUser == NewV) continue; @@ -1311,10 +1311,13 @@ bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces( while (isa(InsertPos)) ++InsertPos; - U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos)); + // This instruction may contain multiple uses of V, update them all. 
+ CurUser->replaceUsesOfWith( + V, new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos)); } else { - U.set(ConstantExpr::getAddrSpaceCast(cast(NewV), - V->getType())); + CurUser->replaceUsesOfWith( + V, ConstantExpr::getAddrSpaceCast(cast(NewV), + V->getType())); } } } diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/multiple-uses-of-val.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/multiple-uses-of-val.ll new file mode 100644 index 0000000000000..717bd09897732 --- /dev/null +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/multiple-uses-of-val.ll @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -S -passes=infer-address-spaces --verify-each %s | FileCheck %s + +; Inst can use a value multiple time. When we're inserting an addrspacecast to flat, +; it's important all the identical uses use an indentical replacement, especially +; for PHIs. + +define amdgpu_kernel void @test_phi() { +; CHECK-LABEL: @test_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOADED_PTR:%.*]] = load ptr, ptr addrspace(4) null, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[LOADED_PTR]] to ptr addrspace(1) +; CHECK-NEXT: br label [[BB0:%.*]] +; CHECK: bb0: +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr addrspace(1) [[TMP0]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[GEP]] to ptr +; CHECK-NEXT: switch i32 0, label [[END:%.*]] [ +; CHECK-NEXT: i32 1, label [[END]] +; CHECK-NEXT: i32 4, label [[END]] +; CHECK-NEXT: i32 5, label [[BB1:%.*]] +; CHECK-NEXT: ] +; CHECK: bb1: +; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr addrspace(1) [[GEP]], align 16 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RETVAL_SROA_0_0_I569_PH:%.*]] = phi ptr [ null, [[BB1]] ], [ [[TMP1]], [[BB0]] ], [ [[TMP1]], [[BB0]] ], [ [[TMP1]], [[BB0]] ] +; CHECK-NEXT: ret void +; +entry: + %loaded.ptr = load ptr, ptr addrspace(4) null, align 8 + br label 
%bb0 + +bb0: + %gep = getelementptr i64, ptr %loaded.ptr, i64 3 + switch i32 0, label %end [ + i32 1, label %end + i32 4, label %end + i32 5, label %bb1 + ] + +bb1: + %0 = load double, ptr %gep, align 16 + br label %end + +end: + %retval.sroa.0.0.i569.ph = phi ptr [ null, %bb1 ], [ %gep, %bb0 ], [ %gep, %bb0 ], [ %gep, %bb0 ] + ret void +} + +declare void @uses_ptrs(ptr, ptr, ptr) + +; We shouldn't treat PHIs differently, even other users should have the same treatment. +; All occurences of %gep are replaced with an identical value. +define amdgpu_kernel void @test_other() { +; CHECK-LABEL: @test_other( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOADED_PTR:%.*]] = load ptr, ptr addrspace(4) null, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[LOADED_PTR]] to ptr addrspace(1) +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[TMP0]] to ptr +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[TMP1]], i64 3 +; CHECK-NEXT: call void @uses_ptrs(ptr [[GEP]], ptr [[GEP]], ptr [[GEP]]) +; CHECK-NEXT: ret void +; +entry: + %loaded.ptr = load ptr, ptr addrspace(4) null, align 8 + %gep = getelementptr i64, ptr %loaded.ptr, i64 3 + call void @uses_ptrs(ptr %gep, ptr %gep, ptr %gep) + ret void +} From 73c646a3b27293f8cb4ba120de7bc01c223b4b5f Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 22 Feb 2024 12:58:10 +0000 Subject: [PATCH 031/546] [flang] Fix warning when with clang-cl/msvc \llvm\flang\lib\Evaluate\fold-integer.cpp(705,35): warning: lambda capture 'FromInt64' is not used [-Wunused-lambda-capture] It is intentionally unused. --- flang/lib/Evaluate/fold-integer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/flang/lib/Evaluate/fold-integer.cpp b/flang/lib/Evaluate/fold-integer.cpp index 0e8706e0f2740..09b2f91debda2 100644 --- a/flang/lib/Evaluate/fold-integer.cpp +++ b/flang/lib/Evaluate/fold-integer.cpp @@ -719,6 +719,7 @@ Expr> FoldIntrinsicFunction( // CharacterUtils<2>::ICHAR(). 
Can't find a work-around, // so remove the FromInt64 error checking lambda that // seems to have caused the proble. + (void)FromInt64; [](const Scalar &c) { return CharacterUtils::ICHAR( CharacterUtils::Resize(c, 1)); From 18f116651af0e328e6f9f6b0619171bd8a2c4817 Mon Sep 17 00:00:00 2001 From: pwprzybyla <121295298+pwprzybyla@users.noreply.github.com> Date: Thu, 22 Feb 2024 14:04:21 +0100 Subject: [PATCH 032/546] Multilib support for libraries with exceptions (#75031) For better multilib matching explicitly match -fno-rtti and -fno-exceptions --- clang/include/clang/Driver/ToolChain.h | 10 ++++++++++ clang/lib/Driver/ToolChain.cpp | 23 ++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h index 2d0c1f826c172..fbe2e8fe8e88d 100644 --- a/clang/include/clang/Driver/ToolChain.h +++ b/clang/include/clang/Driver/ToolChain.h @@ -120,6 +120,11 @@ class ToolChain { RM_Disabled, }; + enum ExceptionsMode { + EM_Enabled, + EM_Disabled, + }; + struct BitCodeLibraryInfo { std::string Path; bool ShouldInternalize; @@ -141,6 +146,8 @@ class ToolChain { const RTTIMode CachedRTTIMode; + const ExceptionsMode CachedExceptionsMode; + /// The list of toolchain specific path prefixes to search for libraries. path_list LibraryPaths; @@ -318,6 +325,9 @@ class ToolChain { // Returns the RTTIMode for the toolchain with the current arguments. RTTIMode getRTTIMode() const { return CachedRTTIMode; } + // Returns the ExceptionsMode for the toolchain with the current arguments. + ExceptionsMode getExceptionsMode() const { return CachedExceptionsMode; } + /// Return any implicit target and/or mode flag for an invocation of /// the compiler driver as `ProgName`. 
/// diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 388030592b483..f8c13c86daf9b 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -77,10 +77,19 @@ static ToolChain::RTTIMode CalculateRTTIMode(const ArgList &Args, return NoRTTI ? ToolChain::RM_Disabled : ToolChain::RM_Enabled; } +static ToolChain::ExceptionsMode CalculateExceptionsMode(const ArgList &Args) { + if (Args.hasFlag(options::OPT_fexceptions, options::OPT_fno_exceptions, + true)) { + return ToolChain::EM_Enabled; + } + return ToolChain::EM_Disabled; +} + ToolChain::ToolChain(const Driver &D, const llvm::Triple &T, const ArgList &Args) : D(D), Triple(T), Args(Args), CachedRTTIArg(GetRTTIArgument(Args)), - CachedRTTIMode(CalculateRTTIMode(Args, Triple, CachedRTTIArg)) { + CachedRTTIMode(CalculateRTTIMode(Args, Triple, CachedRTTIArg)), + CachedExceptionsMode(CalculateExceptionsMode(Args)) { auto addIfExists = [this](path_list &List, const std::string &Path) { if (getVFS().exists(Path)) List.push_back(Path); @@ -264,6 +273,18 @@ ToolChain::getMultilibFlags(const llvm::opt::ArgList &Args) const { break; } + // Include fno-exceptions and fno-rtti + // to improve multilib selection + if (getRTTIMode() == ToolChain::RTTIMode::RM_Disabled) + Result.push_back("-fno-rtti"); + else + Result.push_back("-frtti"); + + if (getExceptionsMode() == ToolChain::ExceptionsMode::EM_Disabled) + Result.push_back("-fno-exceptions"); + else + Result.push_back("-fexceptions"); + // Sort and remove duplicates. 
std::sort(Result.begin(), Result.end()); Result.erase(std::unique(Result.begin(), Result.end()), Result.end()); From b47f63d3c8fedf7c98b7f58e892e784fddee4601 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Thu, 22 Feb 2024 13:07:31 +0000 Subject: [PATCH 033/546] [Clang][SME] Detect always_inline used with mismatched streaming attributes (#77936) This patch adds an error that is emitted when a streaming function is marked as always_inline and is called from a non-streaming function. --- .../clang/Basic/DiagnosticFrontendKinds.td | 4 ++ clang/lib/CodeGen/Targets/AArch64.cpp | 43 +++++++++++++++++ .../aarch64-sme-inline-streaming-attrs.c | 47 +++++++++++++++++++ 3 files changed, 94 insertions(+) create mode 100644 clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td index b1a282f5164a2..dcd2c19fb7ee3 100644 --- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td +++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td @@ -279,6 +279,10 @@ def err_builtin_needs_feature : Error<"%0 needs target feature %1">; def err_function_needs_feature : Error< "always_inline function %1 requires target feature '%2', but would " "be inlined into function %0 that is compiled without support for '%2'">; +def err_function_always_inline_attribute_mismatch : Error< + "always_inline function %1 and its caller %0 have mismatching %2 attributes">; +def err_function_always_inline_new_za : Error< + "always_inline function %0 has new za state">; def warn_avx_calling_convention : Warning<"AVX vector %select{return|argument}0 of type %1 without '%2' " diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp index ee7f95084d2e0..94f8e7be2ee6e 100644 --- a/clang/lib/CodeGen/Targets/AArch64.cpp +++ b/clang/lib/CodeGen/Targets/AArch64.cpp @@ -8,6 +8,7 @@ #include "ABIInfoImpl.h" #include "TargetInfo.h" +#include 
"clang/Basic/DiagnosticFrontend.h" using namespace clang; using namespace clang::CodeGen; @@ -155,6 +156,11 @@ class AArch64TargetCodeGenInfo : public TargetCodeGenInfo { } return TargetCodeGenInfo::isScalarizableAsmOperand(CGF, Ty); } + + void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc, + const FunctionDecl *Caller, + const FunctionDecl *Callee, + const CallArgList &Args) const override; }; class WindowsAArch64TargetCodeGenInfo : public AArch64TargetCodeGenInfo { @@ -814,6 +820,43 @@ Address AArch64ABIInfo::EmitMSVAArg(CodeGenFunction &CGF, Address VAListAddr, /*allowHigherAlign*/ false); } +static bool isStreaming(const FunctionDecl *F) { + if (F->hasAttr()) + return true; + if (const auto *T = F->getType()->getAs()) + return T->getAArch64SMEAttributes() & FunctionType::SME_PStateSMEnabledMask; + return false; +} + +static bool isStreamingCompatible(const FunctionDecl *F) { + if (const auto *T = F->getType()->getAs()) + return T->getAArch64SMEAttributes() & + FunctionType::SME_PStateSMCompatibleMask; + return false; +} + +void AArch64TargetCodeGenInfo::checkFunctionCallABI( + CodeGenModule &CGM, SourceLocation CallLoc, const FunctionDecl *Caller, + const FunctionDecl *Callee, const CallArgList &Args) const { + if (!Caller || !Callee || !Callee->hasAttr()) + return; + + bool CallerIsStreaming = isStreaming(Caller); + bool CalleeIsStreaming = isStreaming(Callee); + bool CallerIsStreamingCompatible = isStreamingCompatible(Caller); + bool CalleeIsStreamingCompatible = isStreamingCompatible(Callee); + + if (!CalleeIsStreamingCompatible && + (CallerIsStreaming != CalleeIsStreaming || CallerIsStreamingCompatible)) + CGM.getDiags().Report(CallLoc, + diag::err_function_always_inline_attribute_mismatch) + << Caller->getDeclName() << Callee->getDeclName() << "streaming"; + if (auto *NewAttr = Callee->getAttr()) + if (NewAttr->isNewZA()) + CGM.getDiags().Report(CallLoc, diag::err_function_always_inline_new_za) + << Callee->getDeclName(); +} + 
std::unique_ptr CodeGen::createAArch64TargetCodeGenInfo(CodeGenModule &CGM, AArch64ABIKind Kind) { diff --git a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c new file mode 100644 index 0000000000000..7eb74f28a1c85 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c @@ -0,0 +1,47 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_NONE %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_COMPATIBLE %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_STREAMING %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_LOCALLY %s + +#define __ai __attribute__((always_inline)) +__ai void inlined_fn(void) {} +__ai void inlined_fn_streaming_compatible(void) __arm_streaming_compatible {} +__ai void inlined_fn_streaming(void) __arm_streaming {} +__ai __arm_locally_streaming void inlined_fn_local(void) {} + +#ifdef TEST_NONE +void caller(void) { + inlined_fn(); + inlined_fn_streaming_compatible(); + inlined_fn_streaming(); // expected-error {{always_inline function 'inlined_fn_streaming' and its caller 'caller' have mismatching streaming attributes}} + inlined_fn_local(); // expected-error {{always_inline function 'inlined_fn_local' and its caller 'caller' have mismatching streaming attributes}} +} +#endif + +#ifdef TEST_COMPATIBLE +void caller_compatible(void) __arm_streaming_compatible { + inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_compatible' have mismatching streaming attributes}} + inlined_fn_streaming_compatible(); + inlined_fn_streaming(); // expected-error {{always_inline function 'inlined_fn_streaming' and its caller 'caller_compatible' have mismatching streaming attributes}} + inlined_fn_local(); // expected-error {{always_inline function 'inlined_fn_local' and its caller 
'caller_compatible' have mismatching streaming attributes}} +} +#endif + +#ifdef TEST_STREAMING +void caller_streaming(void) __arm_streaming { + inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_streaming' have mismatching streaming attributes}} + inlined_fn_streaming_compatible(); + inlined_fn_streaming(); + inlined_fn_local(); +} +#endif + +#ifdef TEST_LOCALLY +__arm_locally_streaming +void caller_local(void) { + inlined_fn(); // expected-error {{always_inline function 'inlined_fn' and its caller 'caller_local' have mismatching streaming attributes}} + inlined_fn_streaming_compatible(); + inlined_fn_streaming(); + inlined_fn_local(); +} +#endif From fa8a21144ec9a6836e9bf1e3bf5cd0b2f058209e Mon Sep 17 00:00:00 2001 From: NagyDonat Date: Thu, 22 Feb 2024 14:19:20 +0100 Subject: [PATCH 034/546] [analyzer] Improve handling of unsigned values in ArrayBoundCheckerV2 (#81034) A memory access is an out of bounds error if the offset is < the extent of the memory region. Notice that here "<" is a _mathematical_ comparison between two numbers and NOT a C/C++ operator that compares two typed C++ values: for example -1 < 1000 is true in mathematics, but if the `-1` is an `int` and the `1000` is a `size_t` value, then evaluating the C/C++ operator `<` will return false because the `-1` will be converted to `SIZE_MAX` by the automatic type conversions. This means that it's incorrect to perform a bounds check with `evalBinOpNN(State, BO_LT, ...)` which performs automatic conversions and can produce wildly incorrect results. ArrayBoundsCheckerV2 already had a special case where it avoided calling `evalBinOpNN` in a situation where it would have performed an automatic conversion; this commit replaces that code with a more general one that covers more situations. (It's still not perfect, but it's better than the previous version and I think it will cover practically all real-world code.) 
Note that this is not a limitation/bug of the simplification algorithm defined in `getSimplifedOffsets()`: the simplification is not applied in the test case `test_comparison_with_extent_symbol` (because the `Extent` is not a concrete int), but without the new code it would still run into a `-1 < UNSIGNED` comparison that evaluates to false because `evalBinOpNN` performs an automatic type conversion. --- .../Core/PathSensitive/SValBuilder.h | 12 ++++-- .../Checkers/ArrayBoundCheckerV2.cpp | 42 +++++++++++++++---- clang/test/Analysis/out-of-bounds.c | 8 ++++ 3 files changed, 49 insertions(+), 13 deletions(-) diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h index d7cff49036cb8..a560f274c43cc 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h @@ -110,12 +110,16 @@ class SValBuilder { /// that value is returned. Otherwise, returns NULL. virtual const llvm::APSInt *getKnownValue(ProgramStateRef state, SVal val) = 0; - /// Tries to get the minimal possible (integer) value of a given SVal. If the - /// constraint manager cannot provide an useful answer, this returns NULL. + /// Tries to get the minimal possible (integer) value of a given SVal. This + /// always returns the value of a ConcreteInt, but may return NULL if the + /// value is symbolic and the constraint manager cannot provide a useful + /// answer. virtual const llvm::APSInt *getMinValue(ProgramStateRef state, SVal val) = 0; - /// Tries to get the maximal possible (integer) value of a given SVal. If the - /// constraint manager cannot provide an useful answer, this returns NULL. + /// Tries to get the maximal possible (integer) value of a given SVal. 
This + /// always returns the value of a ConcreteInt, but may return NULL if the + /// value is symbolic and the constraint manager cannot provide a useful + /// answer. virtual const llvm::APSInt *getMaxValue(ProgramStateRef state, SVal val) = 0; /// Simplify symbolic expressions within a given SVal. Return an SVal diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp index 05fc00a990d52..fdcc46e58580b 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp @@ -268,6 +268,16 @@ getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent, return std::pair(offset, extent); } +static bool isNegative(SValBuilder &SVB, ProgramStateRef State, NonLoc Value) { + const llvm::APSInt *MaxV = SVB.getMaxValue(State, Value); + return MaxV && MaxV->isNegative(); +} + +static bool isUnsigned(SValBuilder &SVB, NonLoc Value) { + QualType T = Value.getType(SVB.getContext()); + return T->isUnsignedIntegerType(); +} + // Evaluate the comparison Value < Threshold with the help of the custom // simplification algorithm defined for this checker. Return a pair of states, // where the first one corresponds to "value below threshold" and the second @@ -281,18 +291,32 @@ compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold, if (auto ConcreteThreshold = Threshold.getAs()) { std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB); } - if (auto ConcreteThreshold = Threshold.getAs()) { - QualType T = Value.getType(SVB.getContext()); - if (T->isUnsignedIntegerType() && ConcreteThreshold->getValue().isNegative()) { - // In this case we reduced the bound check to a comparison of the form - // (symbol or value with unsigned type) < (negative number) - // which is always false. 
We are handling these cases separately because - // evalBinOpNN can perform a signed->unsigned conversion that turns the - // negative number into a huge positive value and leads to wildly - // inaccurate conclusions. + + // We want to perform a _mathematical_ comparison between the numbers `Value` + // and `Threshold`; but `evalBinOpNN` evaluates a C/C++ operator that may + // perform automatic conversions. For example the number -1 is less than the + // number 1000, but -1 < `1000ull` will evaluate to `false` because the `int` + // -1 is converted to ULONGLONG_MAX. + // To avoid automatic conversions, we evaluate the "obvious" cases without + // calling `evalBinOpNN`: + if (isNegative(SVB, State, Value) && isUnsigned(SVB, Threshold)) { + if (CheckEquality) { + // negative_value == unsigned_value is always false return {nullptr, State}; } + // negative_value < unsigned_value is always false + return {State, nullptr}; } + if (isUnsigned(SVB, Value) && isNegative(SVB, State, Threshold)) { + // unsigned_value == negative_value and unsigned_value < negative_value are + // both always false + return {nullptr, State}; + } + // FIXME: these special cases are sufficient for handling real-world + // comparisons, but in theory there could be contrived situations where + // automatic conversion of a symbolic value (which can be negative and can be + // positive) leads to incorrect results. + const BinaryOperatorKind OpKind = CheckEquality ? 
BO_EQ : BO_LT; auto BelowThreshold = SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType()) diff --git a/clang/test/Analysis/out-of-bounds.c b/clang/test/Analysis/out-of-bounds.c index ed457e8696006..1f771c2b3bd13 100644 --- a/clang/test/Analysis/out-of-bounds.c +++ b/clang/test/Analysis/out-of-bounds.c @@ -186,3 +186,11 @@ void test_assume_after_access2(unsigned long x) { clang_analyzer_eval(x <= 99); // expected-warning{{TRUE}} } +struct incomplete; +char test_comparison_with_extent_symbol(struct incomplete *p) { + // Previously this was reported as a (false positive) overflow error because + // the extent symbol of the area pointed by `p` was an unsigned and the '-1' + // was converted to its type by `evalBinOpNN`. + return ((char *)p)[-1]; // no-warning +} + From afa8a2eed0c4ca61ac19abd88022e63e58408af1 Mon Sep 17 00:00:00 2001 From: NagyDonat Date: Thu, 22 Feb 2024 14:29:05 +0100 Subject: [PATCH 035/546] [analyzer] Remove superfluous #include "CallDescription.h" (NFC) (#82614) To fix https://github.com/llvm/llvm-project/issues/81597, I'm planning to refactor the usage of CallDescription; and as I was preparing for this I noticed that there are two superfluous references to this header. 
--- clang/lib/StaticAnalyzer/Checkers/ErrnoChecker.cpp | 2 +- clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/ErrnoChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/ErrnoChecker.cpp index 265185e641072..18e718e085536 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ErrnoChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ErrnoChecker.cpp @@ -17,7 +17,7 @@ #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h" #include "clang/StaticAnalyzer/Core/Checker.h" #include "clang/StaticAnalyzer/Core/CheckerManager.h" -#include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h" #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h" #include "clang/StaticAnalyzer/Core/PathSensitive/SVals.h" diff --git a/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h b/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h index 6de33da107a3f..dec461296fed5 100644 --- a/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h +++ b/clang/lib/StaticAnalyzer/Checkers/TaggedUnionModeling.h @@ -13,7 +13,6 @@ #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h" #include "clang/StaticAnalyzer/Core/Checker.h" #include "clang/StaticAnalyzer/Core/CheckerManager.h" -#include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h" #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h" #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" #include "llvm/ADT/FoldingSet.h" @@ -96,4 +95,4 @@ void handleConstructorAndAssignment(const CallEvent &Call, CheckerContext &C, } // namespace clang::ento::tagged_union_modeling -#endif // LLVM_CLANG_LIB_STATICANALYZER_CHECKERS_TAGGEDUNIONMODELING_H \ No newline at end of file +#endif // LLVM_CLANG_LIB_STATICANALYZER_CHECKERS_TAGGEDUNIONMODELING_H From 
770fd3856660fea6cbaa78d9cb1f03cc92611783 Mon Sep 17 00:00:00 2001 From: Ian Hickson Date: Thu, 22 Feb 2024 05:35:23 -0800 Subject: [PATCH 036/546] [LangRef] Document string literals in LLVM's format (#82529) --- llvm/docs/LangRef.rst | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index fd2e3aacd0169..8f4495e25d0fa 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -61,10 +61,13 @@ run by the parser after parsing input assembly and by the optimizer before it outputs bitcode. The violations pointed out by the verifier pass indicate bugs in transformation passes or input to the parser. +Syntax +====== + .. _identifiers: Identifiers -=========== +----------- LLVM identifiers come in two basic types: global and local. Global identifiers (functions, global variables) begin with the ``'@'`` @@ -140,6 +143,34 @@ It also shows a convention that we follow in this document. When demonstrating instructions, we will follow an instruction with a comment that defines the type and name of value produced. +.. _strings: + +String constants +---------------- + +Strings in LLVM programs are delimited by ``"`` characters. Within a +string, all bytes are treated literally with the exception of ``\`` +characters, which start escapes, and the first ``"`` character, which +ends the string. + +There are two kinds of escapes. + +* ``\\`` represents a single ``\`` character. + +* ``\`` followed by two hexadecimal characters (0-9, a-f, or A-F) + represents the byte with the given value (e.g. \x00 represents a + null byte). + +To represent a ``"`` character, use ``\22``. (``\"`` will end the string +with a trailing ``\``.) + +Newlines do not terminate string constants; strings can span multiple +lines. + +The interpretation of string constants (e.g. their character encoding) +depends on context. 
+ + High Level Structure ==================== From 5b8e5604c297aa8fd09bf641d12d0a663e0ea801 Mon Sep 17 00:00:00 2001 From: zhijian lin Date: Thu, 22 Feb 2024 08:46:08 -0500 Subject: [PATCH 037/546] [AIX] Lower intrinsic __builtin_cpu_is into AIX platform-specific code. (#80069) On AIX OS, __builtin_cpu_is() references the runtime external variable _system_configuration from /usr/include/sys/systemcfg.h. ref issue: https://github.com/llvm/llvm-project/issues/80042 --- .../clang/Basic/DiagnosticSemaKinds.td | 2 + clang/lib/Basic/Targets/PPC.cpp | 10 +++ clang/lib/Basic/Targets/PPC.h | 10 ++- clang/lib/CodeGen/CGBuiltin.cpp | 47 ++++++++++++ clang/lib/Sema/SemaChecking.cpp | 5 +- clang/test/CodeGen/aix-builtin-cpu-is.c | 71 +++++++++++++++++++ clang/test/Sema/aix-builtin-cpu-unsupports.c | 6 ++ .../llvm/TargetParser/PPCTargetParser.def | 57 +++++++++++++++ 8 files changed, 206 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGen/aix-builtin-cpu-is.c create mode 100644 clang/test/Sema/aix-builtin-cpu-unsupports.c diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 11411883e1bfc..a96f69d6ac760 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10364,6 +10364,8 @@ def err_x86_builtin_tile_arg_duplicate : Error< def err_builtin_target_unsupported : Error< "builtin is not supported on this target">; +def err_builtin_aix_os_unsupported : Error< + "this builtin is available only on AIX 7.2 and later operating systems">; def err_builtin_longjmp_unsupported : Error< "__builtin_longjmp is not supported for the current target">; def err_builtin_setjmp_unsupported : Error< diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index 8c891ccdeb59d..aebe51bfa4daa 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -904,6 +904,16 @@ bool 
PPCTargetInfo::validateCpuSupports(StringRef FeatureStr) const { } bool PPCTargetInfo::validateCpuIs(StringRef CPUName) const { + llvm::Triple Triple = getTriple(); + if (Triple.isOSAIX()) { +#define PPC_AIX_CPU(NAME, SUPPORT, INDEX, OP, VALUE) .Case(NAME, true) + return llvm::StringSwitch(CPUName) +#include "llvm/TargetParser/PPCTargetParser.def" + .Default(false); + } + + assert(Triple.isOSLinux() && + "__builtin_cpu_is() is only supported for AIX and Linux."); #define PPC_LNX_CPU(NAME, NUM) .Case(NAME, true) return llvm::StringSwitch(CPUName) #include "llvm/TargetParser/PPCTargetParser.def" diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index a91bdede53e40..70683916a8b04 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -362,8 +362,16 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { // We support __builtin_cpu_supports/__builtin_cpu_is on targets that // have Glibc since it is Glibc that provides the HWCAP[2] in the auxv. + static constexpr int MINIMUM_AIX_OS_MAJOR = 7; + static constexpr int MINIMUM_AIX_OS_MINOR = 2; bool supportsCpuSupports() const override { return getTriple().isOSGlibc(); } - bool supportsCpuIs() const override { return getTriple().isOSGlibc(); } + bool supportsCpuIs() const override { + llvm::Triple Triple = getTriple(); + // AIX 7.2 is the minimum requirement to support __builtin_cpu_is(). 
+ return Triple.isOSGlibc() || + (Triple.isOSAIX() && + !Triple.isOSVersionLT(MINIMUM_AIX_OS_MAJOR, MINIMUM_AIX_OS_MINOR)); + } bool validateCpuSupports(StringRef Feature) const override; bool validateCpuIs(StringRef Name) const override; }; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index d454ccc1dd861..d8b2115f1e5e3 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -16542,12 +16542,59 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID, Intrinsic::ID ID = Intrinsic::not_intrinsic; +#include "llvm/TargetParser/PPCTargetParser.def" + auto GenAIXPPCBuiltinCpuExpr = [&](unsigned SupportMethod, unsigned FieldIdx, + unsigned CompOp, + unsigned OpValue) -> Value * { + if (SupportMethod == AIX_BUILTIN_PPC_FALSE) + return llvm::ConstantInt::getFalse(ConvertType(E->getType())); + + if (SupportMethod == AIX_BUILTIN_PPC_TRUE) + return llvm::ConstantInt::getTrue(ConvertType(E->getType())); + + assert(SupportMethod <= USE_SYS_CONF && "Invalid value for SupportMethod."); + assert((CompOp == COMP_EQ) && "Only equal comparisons are supported."); + + llvm::Type *STy = llvm::StructType::get(PPC_SYSTEMCONFIG_TYPE); + llvm::Constant *SysConf = + CGM.CreateRuntimeVariable(STy, "_system_configuration"); + + // Grab the appropriate field from _system_configuration. 
+ llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0), + ConstantInt::get(Int32Ty, FieldIdx)}; + + llvm::Value *FieldValue = Builder.CreateGEP(STy, SysConf, Idxs); + FieldValue = Builder.CreateAlignedLoad(Int32Ty, FieldValue, + CharUnits::fromQuantity(4)); + assert(FieldValue->getType()->isIntegerTy(32) && + "Only 32-bit integers are supported in GenAIXPPCBuiltinCpuExpr()."); + return Builder.CreateICmp(ICmpInst::ICMP_EQ, FieldValue, + ConstantInt::get(Int32Ty, OpValue)); + }; + switch (BuiltinID) { default: return nullptr; case Builtin::BI__builtin_cpu_is: { const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts(); StringRef CPUStr = cast(CPUExpr)->getString(); + llvm::Triple Triple = getTarget().getTriple(); + + if (Triple.isOSAIX()) { + unsigned IsCpuSupport, FieldIdx, CompareOp, CpuIdValue; + typedef std::tuple CPUType; + std::tie(IsCpuSupport, FieldIdx, CompareOp, CpuIdValue) = + static_cast(StringSwitch(CPUStr) +#define PPC_AIX_CPU(NAME, SUPPORT_MAGIC, INDEX, COMPARE_OP, VALUE) \ + .Case(NAME, {SUPPORT_MAGIC, INDEX, COMPARE_OP, VALUE}) +#include "llvm/TargetParser/PPCTargetParser.def" + ); + return GenAIXPPCBuiltinCpuExpr(IsCpuSupport, FieldIdx, CompareOp, + CpuIdValue); + } + + assert(Triple.isOSLinux() && + "__builtin_cpu_is() is only supported for AIX and Linux."); unsigned NumCPUID = StringSwitch(CPUStr) #define PPC_LNX_CPU(Name, NumericID) .Case(Name, NumericID) #include "llvm/TargetParser/PPCTargetParser.def" diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index e8bfb215a5b4c..710437b354521 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2165,7 +2165,10 @@ static bool SemaBuiltinCpu(Sema &S, const TargetInfo &TI, CallExpr *TheCall, return S.Diag(TheCall->getBeginLoc(), diag::err_builtin_target_unsupported) << SourceRange(TheCall->getBeginLoc(), TheCall->getEndLoc()); if (!IsCPUSupports && !TheTI->supportsCpuIs()) - return S.Diag(TheCall->getBeginLoc(), 
diag::err_builtin_target_unsupported) + return S.Diag(TheCall->getBeginLoc(), + TI.getTriple().isOSAIX() + ? diag::err_builtin_aix_os_unsupported + : diag::err_builtin_target_unsupported) << SourceRange(TheCall->getBeginLoc(), TheCall->getEndLoc()); Expr *Arg = TheCall->getArg(0)->IgnoreParenImpCasts(); diff --git a/clang/test/CodeGen/aix-builtin-cpu-is.c b/clang/test/CodeGen/aix-builtin-cpu-is.c new file mode 100644 index 0000000000000..b0a0dec41b56c --- /dev/null +++ b/clang/test/CodeGen/aix-builtin-cpu-is.c @@ -0,0 +1,71 @@ +// RUN: echo "int main() { return __builtin_cpu_is(\"ppc970\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"ppc-cell-be\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"ppca2\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"ppc405\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"ppc440\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"ppc464\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"ppc476\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"power4\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"power5\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck 
%s + +// RUN: echo "int main() { return __builtin_cpu_is(\"power5+\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"power6\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"power6x\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s + +// RUN: echo "int main() { return __builtin_cpu_is(\"power7\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=32768 \ +// RUN: --check-prefix=CHECKOP + +// RUN: echo "int main() { return __builtin_cpu_is(\"power8\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=65536 \ +// RUN: --check-prefix=CHECKOP + +// RUN: echo "int main() { return __builtin_cpu_is(\"power9\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=131072\ +// RUN: --check-prefix=CHECKOP + +// RUN: echo "int main() { return __builtin_cpu_is(\"power10\");}" > %t.c +// RUN: %clang_cc1 -triple powerpc-ibm-aix7.2.0.0 -emit-llvm -o - %t.c | FileCheck %s -DVALUE=262144 \ +// RUN: --check-prefix=CHECKOP + +// CHECK: define i32 @main() #0 { +// CHECK-NEXT: entry: +// CHECK-NEXT: %retval = alloca i32, align 4 +// CHECK-NEXT: store i32 0, ptr %retval, align 4 +// CHECK-NEXT: ret i32 0 +// CHECK-NEXT: } + +// CHECKOP: @_system_configuration = external global { i32, i32, i32 } +// CHECKOP: define i32 @main() #0 { +// CHECKOP-NEXT: entry: +// CHECKOP-NEXT: %retval = alloca i32, align 4 +// CHECKOP-NEXT: store i32 0, ptr %retval, align 4 +// CHECKOP-NEXT: %0 = load i32, ptr getelementptr inbounds ({ i32, i32, i32 }, ptr @_system_configuration, i32 0, i32 1), align 4 +// CHECKOP-NEXT: %1 = icmp eq i32 %0, [[VALUE]] +// CHECKOP-NEXT: %conv = zext i1 %1 to i32 
+// CHECKOP-NEXT: ret i32 %conv +// CHECKOP-NEXT: } + + diff --git a/clang/test/Sema/aix-builtin-cpu-unsupports.c b/clang/test/Sema/aix-builtin-cpu-unsupports.c new file mode 100644 index 0000000000000..10e21867c3937 --- /dev/null +++ b/clang/test/Sema/aix-builtin-cpu-unsupports.c @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -fsyntax-only -triple powerpc-ibm-aix7.1.0.0 -verify %s + +int main(void) { + if (__builtin_cpu_is("power8")) // expected-error {{this builtin is available only on AIX 7.2 and later operating systems}} + return 1; +} diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.def b/llvm/include/llvm/TargetParser/PPCTargetParser.def index f2c44b46fa673..88c7304659c4d 100644 --- a/llvm/include/llvm/TargetParser/PPCTargetParser.def +++ b/llvm/include/llvm/TargetParser/PPCTargetParser.def @@ -126,4 +126,61 @@ PPC_LNX_CPU("power10",47) #undef PPC_LNX_DEFINE_OFFSETS #undef PPC_LNX_FEATURE #undef PPC_LNX_CPU + +// Definition of the following values are found in the AIX header +// file: . +#ifndef AIX_POWERPC_USE_SYS_CONF + #define AIX_POWERPC_USE_SYS_CONF + #define AIX_SYSCON_IMPL_IDX 1 + #define AIX_PPC7_VALUE 0x00008000 + #define AIX_PPC8_VALUE 0x00010000 + #define AIX_PPC9_VALUE 0x00020000 + #define AIX_PPC10_VALUE 0x00040000 + + // Supported SUPPORT_METHOD values. + #define AIX_BUILTIN_PPC_TRUE 1 + #define AIX_BUILTIN_PPC_FALSE 0 + #define USE_SYS_CONF 2 + + // Supported COMPARE_OP values. + #define COMP_EQ 0 + +#endif + +// The value of SUPPORT_METHOD can be AIX_BUILTIN_PPC_TRUE, +// AIX_BUILTIN_PPC_FALSE, or USE_SYS_CONF. +// When the value of SUPPORT_METHOD is USE_SYS_CONF, the return value +// depends on the result of comparing the data member of +// _system_configuration specified by INDEX with a certain value. + +#ifndef PPC_AIX_CPU + #define PPC_AIX_CPU(NAME, SUPPORT_METHOD, INDEX, COMPARE_OP, VALUE) +#endif + +// __builtin_cpu_is() is supported only on Power7 and up. 
+PPC_AIX_CPU("power4",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("ppc970",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("power5",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("power5+",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("power6",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("ppc-cell-be",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("power6x",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("ppca2",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("ppc405",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("ppc440",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("ppc464",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("ppc476",AIX_BUILTIN_PPC_FALSE,0,0,0) +PPC_AIX_CPU("power7",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,COMP_EQ,AIX_PPC7_VALUE) +PPC_AIX_CPU("power8",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,COMP_EQ,AIX_PPC8_VALUE) +PPC_AIX_CPU("power9",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,COMP_EQ,AIX_PPC9_VALUE) +PPC_AIX_CPU("power10",USE_SYS_CONF,AIX_SYSCON_IMPL_IDX,COMP_EQ,AIX_PPC10_VALUE) +#undef PPC_AIX_CPU + +// PPC_SYSTEMCONFIG_TYPE defines the IR data structure of kernel variable +// `_system_configuration`, that is found in the AIX OS header file: . +#ifndef PPC_SYSTEMCONFIG_TYPE +#define PPC_SYSTEMCONFIG_TYPE \ +Int32Ty, Int32Ty, Int32Ty +#endif + #endif // !PPC_TGT_PARSER_UNDEF_MACROS From cbb24e139d0753d755d17fbe6bfac48ab44d0721 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Thu, 22 Feb 2024 14:07:16 +0000 Subject: [PATCH 038/546] [LLVM][IR] Add native vector support to ConstantInt & ConstantFP. (#74502) NOTE: For brevity the following talks about ConstantInt but everything extends to cover ConstantFP as well. Whilst ConstantInt::get() supports the creation of vectors whereby each lane has the same value, it achieves this via other constants: * ConstantVector for fixed-length vectors * ConstantExprs for scalable vectors However, ConstantExprs are being deprecated and ConstantVector is not space efficient for larger vector types. 
By extending ConstantInt we can represent vector splats by only storing the underlying scalar value. More specifically: * ConstantInt gains an ElementCount variant of get(). * LLVMContext is extended to map ->ConstantInt. * BitcodeReader/Writer support is extended to allow vector types. Whilst this patch adds the base support, more work is required before it's production ready. For example, there's likely to be many places where isa assumes a scalar type. Accordingly the default behaviour of ConstantInt::get() remains unchanged but a set of flags are added to allow wider testing and thus help with the migration: --use-constant-int-for-fixed-length-splat --use-constant-fp-for-fixed-length-splat --use-constant-int-for-scalable-splat --use-constant-fp-for-scalable-splat NOTE: No change is required to the bitcode format because types and values are handled separately. NOTE: For similar reasons as above, code generation doesn't work out-the-box. --- llvm/include/llvm/IR/Constants.h | 18 ++++- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 55 ++++++------- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 2 +- llvm/lib/IR/AsmWriter.cpp | 31 +++++++- llvm/lib/IR/Constants.cpp | 94 +++++++++++++++++++++-- llvm/lib/IR/LLVMContextImpl.cpp | 2 + llvm/lib/IR/LLVMContextImpl.h | 4 + llvm/test/Bitcode/constant-splat.ll | 76 ++++++++++++++++++ 8 files changed, 243 insertions(+), 39 deletions(-) create mode 100644 llvm/test/Bitcode/constant-splat.ll diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h index b5dcc7fbc1d92..c0ac9a4aa6750 100644 --- a/llvm/include/llvm/IR/Constants.h +++ b/llvm/include/llvm/IR/Constants.h @@ -78,13 +78,20 @@ class ConstantData : public Constant { /// Class for constant integers. 
class ConstantInt final : public ConstantData { friend class Constant; + friend class ConstantVector; APInt Val; - ConstantInt(IntegerType *Ty, const APInt &V); + ConstantInt(Type *Ty, const APInt &V); void destroyConstantImpl(); + /// Return a ConstantInt with the specified value and an implied Type. The + /// type is the vector type whose integer element type corresponds to the bit + /// width of the value. + static ConstantInt *get(LLVMContext &Context, ElementCount EC, + const APInt &V); + public: ConstantInt(const ConstantInt &) = delete; @@ -136,7 +143,7 @@ class ConstantInt final : public ConstantData { /// Return the constant's value. inline const APInt &getValue() const { return Val; } - /// getBitWidth - Return the bitwidth of this constant. + /// getBitWidth - Return the scalar bitwidth of this constant. unsigned getBitWidth() const { return Val.getBitWidth(); } /// Return the constant as a 64-bit unsigned integer value after it @@ -259,6 +266,7 @@ class ConstantInt final : public ConstantData { /// class ConstantFP final : public ConstantData { friend class Constant; + friend class ConstantVector; APFloat Val; @@ -266,6 +274,12 @@ class ConstantFP final : public ConstantData { void destroyConstantImpl(); + /// Return a ConstantFP with the specified value and an implied Type. The + /// type is the vector type whose element type has the same floating point + /// semantics as the value. 
+ static ConstantFP *get(LLVMContext &Context, ElementCount EC, + const APFloat &V); + public: ConstantFP(const ConstantFP &) = delete; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 515a1d0caa041..832907a3f53f5 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -3060,48 +3060,49 @@ Error BitcodeReader::parseConstants() { V = Constant::getNullValue(CurTy); break; case bitc::CST_CODE_INTEGER: // INTEGER: [intval] - if (!CurTy->isIntegerTy() || Record.empty()) + if (!CurTy->isIntOrIntVectorTy() || Record.empty()) return error("Invalid integer const record"); V = ConstantInt::get(CurTy, decodeSignRotatedValue(Record[0])); break; case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval] - if (!CurTy->isIntegerTy() || Record.empty()) + if (!CurTy->isIntOrIntVectorTy() || Record.empty()) return error("Invalid wide integer const record"); - APInt VInt = - readWideAPInt(Record, cast(CurTy)->getBitWidth()); - V = ConstantInt::get(Context, VInt); - + auto *ScalarTy = cast(CurTy->getScalarType()); + APInt VInt = readWideAPInt(Record, ScalarTy->getBitWidth()); + V = ConstantInt::get(CurTy, VInt); break; } case bitc::CST_CODE_FLOAT: { // FLOAT: [fpval] if (Record.empty()) return error("Invalid float const record"); - if (CurTy->isHalfTy()) - V = ConstantFP::get(Context, APFloat(APFloat::IEEEhalf(), - APInt(16, (uint16_t)Record[0]))); - else if (CurTy->isBFloatTy()) - V = ConstantFP::get(Context, APFloat(APFloat::BFloat(), - APInt(16, (uint32_t)Record[0]))); - else if (CurTy->isFloatTy()) - V = ConstantFP::get(Context, APFloat(APFloat::IEEEsingle(), - APInt(32, (uint32_t)Record[0]))); - else if (CurTy->isDoubleTy()) - V = ConstantFP::get(Context, APFloat(APFloat::IEEEdouble(), - APInt(64, Record[0]))); - else if (CurTy->isX86_FP80Ty()) { + + auto *ScalarTy = CurTy->getScalarType(); + if (ScalarTy->isHalfTy()) + V = ConstantFP::get(CurTy, 
APFloat(APFloat::IEEEhalf(), + APInt(16, (uint16_t)Record[0]))); + else if (ScalarTy->isBFloatTy()) + V = ConstantFP::get( + CurTy, APFloat(APFloat::BFloat(), APInt(16, (uint32_t)Record[0]))); + else if (ScalarTy->isFloatTy()) + V = ConstantFP::get(CurTy, APFloat(APFloat::IEEEsingle(), + APInt(32, (uint32_t)Record[0]))); + else if (ScalarTy->isDoubleTy()) + V = ConstantFP::get( + CurTy, APFloat(APFloat::IEEEdouble(), APInt(64, Record[0]))); + else if (ScalarTy->isX86_FP80Ty()) { // Bits are not stored the same way as a normal i80 APInt, compensate. uint64_t Rearrange[2]; Rearrange[0] = (Record[1] & 0xffffLL) | (Record[0] << 16); Rearrange[1] = Record[0] >> 48; - V = ConstantFP::get(Context, APFloat(APFloat::x87DoubleExtended(), - APInt(80, Rearrange))); - } else if (CurTy->isFP128Ty()) - V = ConstantFP::get(Context, APFloat(APFloat::IEEEquad(), - APInt(128, Record))); - else if (CurTy->isPPC_FP128Ty()) - V = ConstantFP::get(Context, APFloat(APFloat::PPCDoubleDouble(), - APInt(128, Record))); + V = ConstantFP::get( + CurTy, APFloat(APFloat::x87DoubleExtended(), APInt(80, Rearrange))); + } else if (ScalarTy->isFP128Ty()) + V = ConstantFP::get(CurTy, + APFloat(APFloat::IEEEquad(), APInt(128, Record))); + else if (ScalarTy->isPPC_FP128Ty()) + V = ConstantFP::get( + CurTy, APFloat(APFloat::PPCDoubleDouble(), APInt(128, Record))); else V = UndefValue::get(CurTy); break; diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 13be0b0c3307f..656f2a6ce870f 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -2624,7 +2624,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, } } else if (const ConstantFP *CFP = dyn_cast(C)) { Code = bitc::CST_CODE_FLOAT; - Type *Ty = CFP->getType(); + Type *Ty = CFP->getType()->getScalarType(); if (Ty->isHalfTy() || Ty->isBFloatTy() || Ty->isFloatTy() || Ty->isDoubleTy()) { 
Record.push_back(CFP->getValueAPF().bitcastToAPInt().getZExtValue()); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 251485a403fee..ac0f119b00bde 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -1505,16 +1505,39 @@ static void WriteAPFloatInternal(raw_ostream &Out, const APFloat &APF) { static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, AsmWriterContext &WriterCtx) { if (const ConstantInt *CI = dyn_cast(CV)) { - if (CI->getType()->isIntegerTy(1)) { - Out << (CI->getZExtValue() ? "true" : "false"); - return; + Type *Ty = CI->getType(); + + if (Ty->isVectorTy()) { + Out << "splat ("; + WriterCtx.TypePrinter->print(Ty->getScalarType(), Out); + Out << " "; } - Out << CI->getValue(); + + if (Ty->getScalarType()->isIntegerTy(1)) + Out << (CI->getZExtValue() ? "true" : "false"); + else + Out << CI->getValue(); + + if (Ty->isVectorTy()) + Out << ")"; + return; } if (const ConstantFP *CFP = dyn_cast(CV)) { + Type *Ty = CFP->getType(); + + if (Ty->isVectorTy()) { + Out << "splat ("; + WriterCtx.TypePrinter->print(Ty->getScalarType(), Out); + Out << " "; + } + WriteAPFloatInternal(Out, CFP->getValueAPF()); + + if (Ty->isVectorTy()) + Out << ")"; + return; } diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index a38b912164b13..e6b92aad392f6 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -35,6 +35,20 @@ using namespace llvm; using namespace PatternMatch; +// As set of temporary options to help migrate how splats are represented. 
+static cl::opt UseConstantIntForFixedLengthSplat( + "use-constant-int-for-fixed-length-splat", cl::init(false), cl::Hidden, + cl::desc("Use ConstantInt's native fixed-length vector splat support.")); +static cl::opt UseConstantFPForFixedLengthSplat( + "use-constant-fp-for-fixed-length-splat", cl::init(false), cl::Hidden, + cl::desc("Use ConstantFP's native fixed-length vector splat support.")); +static cl::opt UseConstantIntForScalableSplat( + "use-constant-int-for-scalable-splat", cl::init(false), cl::Hidden, + cl::desc("Use ConstantInt's native scalable vector splat support.")); +static cl::opt UseConstantFPForScalableSplat( + "use-constant-fp-for-scalable-splat", cl::init(false), cl::Hidden, + cl::desc("Use ConstantFP's native scalable vector splat support.")); + //===----------------------------------------------------------------------===// // Constant Class //===----------------------------------------------------------------------===// @@ -825,9 +839,11 @@ bool Constant::isManifestConstant() const { // ConstantInt //===----------------------------------------------------------------------===// -ConstantInt::ConstantInt(IntegerType *Ty, const APInt &V) +ConstantInt::ConstantInt(Type *Ty, const APInt &V) : ConstantData(Ty, ConstantIntVal), Val(V) { - assert(V.getBitWidth() == Ty->getBitWidth() && "Invalid constant for type"); + assert(V.getBitWidth() == + cast(Ty->getScalarType())->getBitWidth() && + "Invalid constant for type"); } ConstantInt *ConstantInt::getTrue(LLVMContext &Context) { @@ -885,6 +901,26 @@ ConstantInt *ConstantInt::get(LLVMContext &Context, const APInt &V) { return Slot.get(); } +// Get a ConstantInt vector with each lane set to the same APInt. +ConstantInt *ConstantInt::get(LLVMContext &Context, ElementCount EC, + const APInt &V) { + // Get an existing value or the insertion position. 
+ std::unique_ptr &Slot = + Context.pImpl->IntSplatConstants[std::make_pair(EC, V)]; + if (!Slot) { + IntegerType *ITy = IntegerType::get(Context, V.getBitWidth()); + VectorType *VTy = VectorType::get(ITy, EC); + Slot.reset(new ConstantInt(VTy, V)); + } + +#ifndef NDEBUG + IntegerType *ITy = IntegerType::get(Context, V.getBitWidth()); + VectorType *VTy = VectorType::get(ITy, EC); + assert(Slot->getType() == VTy); +#endif + return Slot.get(); +} + Constant *ConstantInt::get(Type *Ty, uint64_t V, bool isSigned) { Constant *C = get(cast(Ty->getScalarType()), V, isSigned); @@ -1024,6 +1060,26 @@ ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) { return Slot.get(); } +// Get a ConstantFP vector with each lane set to the same APFloat. +ConstantFP *ConstantFP::get(LLVMContext &Context, ElementCount EC, + const APFloat &V) { + // Get an existing value or the insertion position. + std::unique_ptr &Slot = + Context.pImpl->FPSplatConstants[std::make_pair(EC, V)]; + if (!Slot) { + Type *EltTy = Type::getFloatingPointTy(Context, V.getSemantics()); + VectorType *VTy = VectorType::get(EltTy, EC); + Slot.reset(new ConstantFP(VTy, V)); + } + +#ifndef NDEBUG + Type *EltTy = Type::getFloatingPointTy(Context, V.getSemantics()); + VectorType *VTy = VectorType::get(EltTy, EC); + assert(Slot->getType() == VTy); +#endif + return Slot.get(); +} + Constant *ConstantFP::getInfinity(Type *Ty, bool Negative) { const fltSemantics &Semantics = Ty->getScalarType()->getFltSemantics(); Constant *C = get(Ty->getContext(), APFloat::getInf(Semantics, Negative)); @@ -1036,7 +1092,7 @@ Constant *ConstantFP::getInfinity(Type *Ty, bool Negative) { ConstantFP::ConstantFP(Type *Ty, const APFloat &V) : ConstantData(Ty, ConstantFPVal), Val(V) { - assert(&V.getSemantics() == &Ty->getFltSemantics() && + assert(&V.getSemantics() == &Ty->getScalarType()->getFltSemantics() && "FP type Mismatch"); } @@ -1356,11 +1412,13 @@ Constant *ConstantVector::getImpl(ArrayRef V) { bool isZero = 
C->isNullValue(); bool isUndef = isa(C); bool isPoison = isa(C); + bool isSplatFP = UseConstantFPForFixedLengthSplat && isa(C); + bool isSplatInt = UseConstantIntForFixedLengthSplat && isa(C); - if (isZero || isUndef) { + if (isZero || isUndef || isSplatFP || isSplatInt) { for (unsigned i = 1, e = V.size(); i != e; ++i) if (V[i] != C) { - isZero = isUndef = isPoison = false; + isZero = isUndef = isPoison = isSplatFP = isSplatInt = false; break; } } @@ -1371,6 +1429,12 @@ Constant *ConstantVector::getImpl(ArrayRef V) { return PoisonValue::get(T); if (isUndef) return UndefValue::get(T); + if (isSplatFP) + return ConstantFP::get(C->getContext(), T->getElementCount(), + cast(C)->getValue()); + if (isSplatInt) + return ConstantInt::get(C->getContext(), T->getElementCount(), + cast(C)->getValue()); // Check to see if all of the elements are ConstantFP or ConstantInt and if // the element type is compatible with ConstantDataVector. If so, use it. @@ -1384,6 +1448,16 @@ Constant *ConstantVector::getImpl(ArrayRef V) { Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) { if (!EC.isScalable()) { + // Maintain special handling of zero. + if (!V->isNullValue()) { + if (UseConstantIntForFixedLengthSplat && isa(V)) + return ConstantInt::get(V->getContext(), EC, + cast(V)->getValue()); + if (UseConstantFPForFixedLengthSplat && isa(V)) + return ConstantFP::get(V->getContext(), EC, + cast(V)->getValue()); + } + // If this splat is compatible with ConstantDataVector, use it instead of // ConstantVector. if ((isa(V) || isa(V)) && @@ -1394,6 +1468,16 @@ Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) { return get(Elts); } + // Maintain special handling of zero. 
+ if (!V->isNullValue()) { + if (UseConstantIntForScalableSplat && isa(V)) + return ConstantInt::get(V->getContext(), EC, + cast(V)->getValue()); + if (UseConstantFPForScalableSplat && isa(V)) + return ConstantFP::get(V->getContext(), EC, + cast(V)->getValue()); + } + Type *VTy = VectorType::get(V->getType(), EC); if (V->isNullValue()) diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp index 15c90a4fe7b2e..a0bf9cae7926b 100644 --- a/llvm/lib/IR/LLVMContextImpl.cpp +++ b/llvm/lib/IR/LLVMContextImpl.cpp @@ -119,7 +119,9 @@ LLVMContextImpl::~LLVMContextImpl() { IntZeroConstants.clear(); IntOneConstants.clear(); IntConstants.clear(); + IntSplatConstants.clear(); FPConstants.clear(); + FPSplatConstants.clear(); CDSConstants.clear(); // Destroy attribute node lists. diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index 6a20291344989..2ee1080a1ffa2 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -1488,8 +1488,12 @@ class LLVMContextImpl { DenseMap> IntZeroConstants; DenseMap> IntOneConstants; DenseMap> IntConstants; + DenseMap, std::unique_ptr> + IntSplatConstants; DenseMap> FPConstants; + DenseMap, std::unique_ptr> + FPSplatConstants; FoldingSet AttrsSet; FoldingSet AttrsLists; diff --git a/llvm/test/Bitcode/constant-splat.ll b/llvm/test/Bitcode/constant-splat.ll new file mode 100644 index 0000000000000..2bcc3ddf3e4f3 --- /dev/null +++ b/llvm/test/Bitcode/constant-splat.ll @@ -0,0 +1,76 @@ +; RUN: llvm-as -use-constant-int-for-fixed-length-splat \ +; RUN: -use-constant-fp-for-fixed-length-splat \ +; RUN: -use-constant-int-for-scalable-splat \ +; RUN: -use-constant-fp-for-scalable-splat \ +; RUN: < %s | llvm-dis -use-constant-int-for-fixed-length-splat \ +; RUN: -use-constant-fp-for-fixed-length-splat \ +; RUN: -use-constant-int-for-scalable-splat \ +; RUN: -use-constant-fp-for-scalable-splat \ +; RUN: | FileCheck %s + +; CHECK: @constant.splat.i1 = constant <1 x i1> splat (i1 true) 
+@constant.splat.i1 = constant <1 x i1> splat (i1 true) + +; CHECK: @constant.splat.i32 = constant <5 x i32> splat (i32 7) +@constant.splat.i32 = constant <5 x i32> splat (i32 7) + +; CHECK: @constant.splat.i128 = constant <7 x i128> splat (i128 85070591730234615870450834276742070272) +@constant.splat.i128 = constant <7 x i128> splat (i128 85070591730234615870450834276742070272) + +; CHECK: @constant.splat.f16 = constant <2 x half> splat (half 0xHBC00) +@constant.splat.f16 = constant <2 x half> splat (half 0xHBC00) + +; CHECK: @constant.splat.f32 = constant <4 x float> splat (float -2.000000e+00) +@constant.splat.f32 = constant <4 x float> splat (float -2.000000e+00) + +; CHECK: @constant.splat.f64 = constant <6 x double> splat (double -3.000000e+00) +@constant.splat.f64 = constant <6 x double> splat (double -3.000000e+00) + +; CHECK: @constant.splat.128 = constant <8 x fp128> splat (fp128 0xL00000000000000018000000000000000) +@constant.splat.128 = constant <8 x fp128> splat (fp128 0xL00000000000000018000000000000000) + +; CHECK: @constant.splat.bf16 = constant <1 x bfloat> splat (bfloat 0xRC0A0) +@constant.splat.bf16 = constant <1 x bfloat> splat (bfloat 0xRC0A0) + +; CHECK: @constant.splat.x86_fp80 = constant <3 x x86_fp80> splat (x86_fp80 0xK4000C8F5C28F5C28F800) +@constant.splat.x86_fp80 = constant <3 x x86_fp80> splat (x86_fp80 0xK4000C8F5C28F5C28F800) + +; CHECK: @constant.splat.ppc_fp128 = constant <7 x ppc_fp128> splat (ppc_fp128 0xM80000000000000000000000000000000) +@constant.splat.ppc_fp128 = constant <7 x ppc_fp128> splat (ppc_fp128 0xM80000000000000000000000000000000) + +define void @add_fixed_lenth_vector_splat_i32(<4 x i32> %a) { +; CHECK: %add = add <4 x i32> %a, splat (i32 137) + %add = add <4 x i32> %a, splat (i32 137) + ret void +} + +define <4 x i32> @ret_fixed_lenth_vector_splat_i32() { +; CHECK: ret <4 x i32> splat (i32 56) + ret <4 x i32> splat (i32 56) +} + +define void @add_fixed_lenth_vector_splat_double( %a) { +; CHECK: %add = fadd %a, 
splat (double 5.700000e+00) + %add = fadd %a, splat (double 5.700000e+00) + ret void +} + +define @ret_scalable_vector_splat_i32() { +; CHECK: ret splat (i32 78) + ret splat (i32 78) +} + +define <4 x i32> @canonical_constant_vector() { +; CHECK: ret <4 x i32> splat (i32 7) + ret <4 x i32> +} + +define <4 x i32> @canonical_fixed_lnegth_vector_zero() { +; CHECK: ret <4 x i32> zeroinitializer + ret <4 x i32> zeroinitializer +} + +define @canonical_scalable_lnegth_vector_zero() { +; CHECK: ret zeroinitializer + ret zeroinitializer +} From 88e31f64a034ec6dead2106016ee5b797674edb0 Mon Sep 17 00:00:00 2001 From: Matt Date: Thu, 22 Feb 2024 08:13:41 -0600 Subject: [PATCH 039/546] [OpenMP][FIX] Remove unsound omp_get_thread_limit deduplication (#79524) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The deduplication of the calls to `omp_get_thread_limit` used to be legal when originally added in , as the result (thread_limit) was immutable. However, now that we have `thread_limit` clause, we no longer have immutability; therefore `omp_get_thread_limit()` is not a deduplicable runtime call. Thus, removing `omp_get_thread_limit` from the `DeduplicableRuntimeCallIDs` array. Here's a simple example: ``` #include #include int main() { #pragma omp target thread_limit(4) { printf("\n1:target thread_limit: %d\n", omp_get_thread_limit()); } #pragma omp target thread_limit(3) { printf("\n2:target thread_limit: %d\n", omp_get_thread_limit()); } return 0; } ``` GCC-compiled binary execution: https://gcc.godbolt.org/z/Pjv3TWoTq ``` 1:target thread_limit: 4 2:target thread_limit: 3 ``` Clang/LLVM-compiled binary execution: https://clang.godbolt.org/z/zdPbrdMPn ``` 1:target thread_limit: 4 2:target thread_limit: 4 ``` By my reading of the OpenMP spec GCC does the right thing here; cf. 
: > If a target construct with a thread_limit clause is encountered, the thread-limit-var ICV from the data environment of the generated initial task is instead set to an implementation defined value between one and the value specified in the clause. The common subexpression elimination (CSE) of the second call to `omp_get_thread_limit` by LLVM does not seem to be correct, as it's not an available expression at any program point(s) (in the scope of the clause in question) after the second target construct with a `thread_limit` clause is encountered. Compiling with `-Rpass=openmp-opt -Rpass-analysis=openmp-opt -Rpass-missed=openmp-opt` we have: https://clang.godbolt.org/z/G7dfhP7jh ``` :8:42: remark: OpenMP runtime call omp_get_thread_limit deduplicated. [OMP170] [-Rpass=openmp-opt] 8 | printf("\n1:target thread_limit: %d\n",omp_get_thread_limit()); | ^ ``` OMP170 has the following explanation: https://openmp.llvm.org/remarks/OMP170.html > This optimization remark indicates that a call to an OpenMP runtime call was replaced with the result of an existing one. This occurs when the compiler knows that the result of a runtime call is immutable. Removing duplicate calls is done by replacing all calls to that function with the result of the first call. This cannot be done automatically by the compiler because the implementations of the OpenMP runtime calls live in a separate library the compiler cannot see. This optimization will trigger for known OpenMP runtime calls whose return value will not change. At the same time I do not believe we have an analysis checking whether this precondition holds here: "This occurs when the compiler knows that the result of a runtime call is immutable." 
AFAICT, such analysis doesn't appear to exist in the original patch introducing deduplication, either: - https://github.com/llvm/llvm-project/commit/9548b74a831ea005649465797f359e0521f3b8a9 - https://reviews.llvm.org/D69930 The fix is to remove it from `DeduplicableRuntimeCallIDs`, effectively reverting the addition in this commit (noting that `omp_get_max_threads` is not present in `DeduplicableRuntimeCallIDs`, so it's possible this addition was incorrect in the first place): - [OpenMP][Opt] Annotate known runtime functions and deduplicate more, - https://github.com/llvm/llvm-project/commit/e28936f6137c5a9c4f7673e248c192a9811543b6#diff-de101c82aff66b2bda2d1f53fde3dde7b0d370f14f1ff37b7919ce38531230dfR123 As a result, we're no longer unsoundly deduplicating the OpenMP runtime call `omp_get_thread_limit` as illustrated by the test case: Note the (correctly) repeated `call i32 @omp_get_thread_limit()`. --------- Co-authored-by: Joseph Huber --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 1 - .../OpenMP/deduplication_soundness.ll | 59 +++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/OpenMP/deduplication_soundness.ll diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 4176d561363fb..77ca36d64029f 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1471,7 +1471,6 @@ struct OpenMPOpt { OMPRTL_omp_get_num_threads, OMPRTL_omp_in_parallel, OMPRTL_omp_get_cancellation, - OMPRTL_omp_get_thread_limit, OMPRTL_omp_get_supported_active_levels, OMPRTL_omp_get_level, OMPRTL_omp_get_ancestor_thread_num, diff --git a/llvm/test/Transforms/OpenMP/deduplication_soundness.ll b/llvm/test/Transforms/OpenMP/deduplication_soundness.ll new file mode 100644 index 0000000000000..9dd3219175fea --- /dev/null +++ b/llvm/test/Transforms/OpenMP/deduplication_soundness.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py UTC_ARGS: --function main --scrub-attributes --filter "@omp_get_thread_limit|@use" --version 4 +; RUN: opt -passes=openmp-opt-cgscc -S < %s | FileCheck %s + +declare void @use(i32 noundef) +declare i32 @omp_get_thread_limit() +declare void @__kmpc_set_thread_limit(ptr, i32, i32) +declare i32 @__kmpc_global_thread_num(ptr) +declare noalias ptr @__kmpc_omp_task_alloc(ptr, i32, i32, i64, i64, ptr) +declare void @__kmpc_omp_task_complete_if0(ptr, i32, ptr) +declare void @__kmpc_omp_task_begin_if0(ptr, i32, ptr) + +%struct.ident_t = type { i32, i32, i32, i32, ptr } + +@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @0 }, align 8 + +define i32 @main() local_unnamed_addr { +; CHECK-LABEL: define i32 @main() local_unnamed_addr { +; CHECK: [[CALL_I_I_I:%.*]] = call i32 @omp_get_thread_limit() +; CHECK: call void @use(i32 noundef [[CALL_I_I_I]]) +; CHECK: [[CALL_I_I_I2:%.*]] = call i32 @omp_get_thread_limit() +; CHECK: call void @use(i32 noundef [[CALL_I_I_I2]]) +; +entry: + %0 = call i32 @__kmpc_global_thread_num(ptr nonnull @1) + %1 = call ptr @__kmpc_omp_task_alloc(ptr nonnull @1, i32 %0, i32 1, i64 40, i64 0, ptr nonnull @.omp_task_entry.) 
+ call void @__kmpc_omp_task_begin_if0(ptr nonnull @1, i32 %0, ptr %1) + call void @__kmpc_set_thread_limit(ptr nonnull @1, i32 %0, i32 4) + %call.i.i.i = call i32 @omp_get_thread_limit() + call void @use(i32 noundef %call.i.i.i) + call void @__kmpc_omp_task_complete_if0(ptr nonnull @1, i32 %0, ptr %1) + %2 = call ptr @__kmpc_omp_task_alloc(ptr nonnull @1, i32 %0, i32 1, i64 40, i64 0, ptr nonnull @.omp_task_entry..2) + call void @__kmpc_omp_task_begin_if0(ptr nonnull @1, i32 %0, ptr %2) + call void @__kmpc_set_thread_limit(ptr nonnull @1, i32 %0, i32 3) + %call.i.i.i2 = call i32 @omp_get_thread_limit() + call void @use(i32 noundef %call.i.i.i2) + call void @__kmpc_omp_task_complete_if0(ptr nonnull @1, i32 %0, ptr %2) + ret i32 0 +} + +define internal noundef i32 @.omp_task_entry.(i32 noundef %0, ptr noalias nocapture noundef readonly %1) { +entry: + tail call void @__kmpc_set_thread_limit(ptr nonnull @1, i32 %0, i32 4) + %call.i.i = tail call i32 @omp_get_thread_limit() + tail call void @use(i32 noundef %call.i.i) + ret i32 0 +} + +define internal noundef i32 @.omp_task_entry..2(i32 noundef %0, ptr noalias nocapture noundef readonly %1) { +entry: + tail call void @__kmpc_set_thread_limit(ptr nonnull @1, i32 %0, i32 3) + %call.i.i = tail call i32 @omp_get_thread_limit() + tail call void @use(i32 noundef %call.i.i) + ret i32 0 +} + +!llvm.module.flags = !{!0} + +!0 = !{i32 7, !"openmp", i32 51} From d3f6dd6585f4866a38a794b80db55a62c1050c77 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 22 Feb 2024 15:25:17 +0100 Subject: [PATCH 040/546] [InstCombine] Pick bfloat over half when shrinking ops that started with an fpext from bfloat (#82493) This fixes the case where we would shrink an frem to half and then bitcast to bfloat, producing invalid results. The transformation was written under the assumption that there is only one type with a given bit width. Also add a strategic assert to CastInst::CreateFPCast to turn this miscompilation into a crash. 
--- llvm/lib/IR/Instructions.cpp | 1 + .../InstCombine/InstCombineCasts.cpp | 23 +++++++++++-------- llvm/test/Transforms/InstCombine/fpextend.ll | 11 +++++++++ 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index ce0df53d9ffb9..fc5c9b201487e 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -3525,6 +3525,7 @@ CastInst *CastInst::CreateFPCast(Value *C, Type *Ty, "Invalid cast"); unsigned SrcBits = C->getType()->getScalarSizeInBits(); unsigned DstBits = Ty->getScalarSizeInBits(); + assert((C->getType() == Ty || SrcBits != DstBits) && "Invalid cast"); Instruction::CastOps opcode = (SrcBits == DstBits ? Instruction::BitCast : (SrcBits > DstBits ? Instruction::FPTrunc : Instruction::FPExt)); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index ed47de287302e..33ed1d5575375 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1543,11 +1543,14 @@ static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) { return !losesInfo; } -static Type *shrinkFPConstant(ConstantFP *CFP) { +static Type *shrinkFPConstant(ConstantFP *CFP, bool PreferBFloat) { if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext())) return nullptr; // No constant folding of this. + // See if the value can be truncated to bfloat and then reextended. + if (PreferBFloat && fitsInFPType(CFP, APFloat::BFloat())) + return Type::getBFloatTy(CFP->getContext()); // See if the value can be truncated to half and then reextended. - if (fitsInFPType(CFP, APFloat::IEEEhalf())) + if (!PreferBFloat && fitsInFPType(CFP, APFloat::IEEEhalf())) return Type::getHalfTy(CFP->getContext()); // See if the value can be truncated to float and then reextended. 
if (fitsInFPType(CFP, APFloat::IEEEsingle())) @@ -1562,7 +1565,7 @@ static Type *shrinkFPConstant(ConstantFP *CFP) { // Determine if this is a vector of ConstantFPs and if so, return the minimal // type we can safely truncate all elements to. -static Type *shrinkFPConstantVector(Value *V) { +static Type *shrinkFPConstantVector(Value *V, bool PreferBFloat) { auto *CV = dyn_cast(V); auto *CVVTy = dyn_cast(V->getType()); if (!CV || !CVVTy) @@ -1582,7 +1585,7 @@ static Type *shrinkFPConstantVector(Value *V) { if (!CFP) return nullptr; - Type *T = shrinkFPConstant(CFP); + Type *T = shrinkFPConstant(CFP, PreferBFloat); if (!T) return nullptr; @@ -1597,7 +1600,7 @@ static Type *shrinkFPConstantVector(Value *V) { } /// Find the minimum FP type we can safely truncate to. -static Type *getMinimumFPType(Value *V) { +static Type *getMinimumFPType(Value *V, bool PreferBFloat) { if (auto *FPExt = dyn_cast(V)) return FPExt->getOperand(0)->getType(); @@ -1605,7 +1608,7 @@ static Type *getMinimumFPType(Value *V) { // that can accurately represent it. This allows us to turn // (float)((double)X+2.0) into x+2.0f. if (auto *CFP = dyn_cast(V)) - if (Type *T = shrinkFPConstant(CFP)) + if (Type *T = shrinkFPConstant(CFP, PreferBFloat)) return T; // We can only correctly find a minimum type for a scalable vector when it is @@ -1617,7 +1620,7 @@ static Type *getMinimumFPType(Value *V) { // Try to shrink a vector of FP constants. 
This returns nullptr on scalable // vectors - if (Type *T = shrinkFPConstantVector(V)) + if (Type *T = shrinkFPConstantVector(V, PreferBFloat)) return T; return V->getType(); @@ -1686,8 +1689,10 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) { Type *Ty = FPT.getType(); auto *BO = dyn_cast(FPT.getOperand(0)); if (BO && BO->hasOneUse()) { - Type *LHSMinType = getMinimumFPType(BO->getOperand(0)); - Type *RHSMinType = getMinimumFPType(BO->getOperand(1)); + Type *LHSMinType = + getMinimumFPType(BO->getOperand(0), /*PreferBFloat=*/Ty->isBFloatTy()); + Type *RHSMinType = + getMinimumFPType(BO->getOperand(1), /*PreferBFloat=*/Ty->isBFloatTy()); unsigned OpWidth = BO->getType()->getFPMantissaWidth(); unsigned LHSWidth = LHSMinType->getFPMantissaWidth(); unsigned RHSWidth = RHSMinType->getFPMantissaWidth(); diff --git a/llvm/test/Transforms/InstCombine/fpextend.ll b/llvm/test/Transforms/InstCombine/fpextend.ll index a41f2a4ca300f..19f512d717a97 100644 --- a/llvm/test/Transforms/InstCombine/fpextend.ll +++ b/llvm/test/Transforms/InstCombine/fpextend.ll @@ -437,3 +437,14 @@ define half @bf16_to_f32_to_f16(bfloat %a) nounwind { %z = fptrunc float %y to half ret half %z } + +define bfloat @bf16_frem(bfloat %x) { +; CHECK-LABEL: @bf16_frem( +; CHECK-NEXT: [[FREM:%.*]] = frem bfloat [[X:%.*]], 0xR40C9 +; CHECK-NEXT: ret bfloat [[FREM]] +; + %t1 = fpext bfloat %x to float + %t2 = frem float %t1, 6.281250e+00 + %t3 = fptrunc float %t2 to bfloat + ret bfloat %t3 +} From 9dbedcac1243e8e99103bdff37da51dded67b766 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Thu, 22 Feb 2024 06:28:12 -0800 Subject: [PATCH 041/546] [build] Check RUNTIMES_${target}_LLVM_ENABLE_RUNTIMES for libc also (#82561) When checking whether we need to build libc-hdrgen, we need to check LLVM_ENABLE_RUNTIMES and RUNTIMES_${target}_LLVM_ENABLE_RUNTIMES, just the former is not sufficient since libc may be enabled only for certain targets. 
--- libc/CMakeLists.txt | 17 +++++++++++++++-- llvm/CMakeLists.txt | 14 +++++++++++++- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index 616beae13d9aa..9f9839423499e 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -57,9 +57,21 @@ if(LLVM_LIBC_FULL_BUILD OR LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES) endif() endif() +set(NEED_LIBC_HDRGEN FALSE) +if(NOT LLVM_RUNTIMES_BUILD) + if("libc" IN_LIST LLVM_ENABLE_RUNTIMES) + set(NEED_LIBC_HDRGEN TRUE) + else() + foreach(_name ${LLVM_RUNTIME_TARGETS}) + if("libc" IN_LIST RUNTIMES_${_name}_LLVM_ENABLE_RUNTIMES) + set(NEED_LIBC_HDRGEN TRUE) + break() + endif() + endforeach() + endif() +endif() option(LIBC_HDRGEN_ONLY "Only build the 'libc-hdrgen' executable" OFF) -if(("libc" IN_LIST LLVM_ENABLE_RUNTIMES AND NOT LLVM_RUNTIMES_BUILD) OR - LIBC_HDRGEN_ONLY) +if(LIBC_HDRGEN_ONLY OR NEED_LIBC_HDRGEN) # When libc is build as part of the runtimes/bootstrap build's CMake run, we # only need to build the host tools to build the libc. So, we just do enough # to build libc-hdrgen and return. @@ -70,6 +82,7 @@ if(("libc" IN_LIST LLVM_ENABLE_RUNTIMES AND NOT LLVM_RUNTIMES_BUILD) OR endif() return() endif() +unset(NEED_LIBC_HDRGEN) option(LIBC_CMAKE_VERBOSE_LOGGING "Log details warnings and notifications during CMake configuration." 
OFF) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 98cef005f07e0..dbd5fbf226bd5 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -168,7 +168,18 @@ foreach(proj IN LISTS LLVM_ENABLE_RUNTIMES) endif() endforeach() -if ("libc" IN_LIST LLVM_ENABLE_RUNTIMES) +set(NEED_LIBC_HDRGEN FALSE) +if("libc" IN_LIST LLVM_ENABLE_RUNTIMES) + set(NEED_LIBC_HDRGEN TRUE) +else() + foreach(_name ${LLVM_RUNTIME_TARGETS}) + if("libc" IN_LIST RUNTIMES_${_name}_LLVM_ENABLE_RUNTIMES) + set(NEED_LIBC_HDRGEN TRUE) + break() + endif() + endforeach() +endif() +if(NEED_LIBC_HDRGEN) # To build the libc runtime, we need to be able to build few libc build # tools from the "libc" project. So, we add it to the list of enabled # projects. @@ -177,6 +188,7 @@ if ("libc" IN_LIST LLVM_ENABLE_RUNTIMES) list(APPEND LLVM_ENABLE_PROJECTS "libc") endif() endif() +unset(NEED_LIBC_HDRGEN) # LLVM_ENABLE_PROJECTS_USED is `ON` if the user has ever used the # `LLVM_ENABLE_PROJECTS` CMake cache variable. This exists for From cf8fc53a96f844328be8d20435c5b4151a7b8f92 Mon Sep 17 00:00:00 2001 From: agozillon Date: Thu, 22 Feb 2024 15:33:48 +0100 Subject: [PATCH 042/546] [Flang][LLVM][OpenMP] Relax target data restrictions to be more inline with the specification (#82537) Currently we emit errors whenever a map is not provided on a target data directive, however, I believe that's incorrect behavior, the specification states: "At least one map, use_device_addr or use_device_ptr clause must appear on the directive" So provided one is present, the directive is legal in this case. Slightly different to its siblings (enter/exit/update) which don't have use_device_addr/use_device_ptr. 
--- flang/test/Semantics/OpenMP/device-constructs.f90 | 12 +++++++++++- llvm/include/llvm/Frontend/OpenMP/OMP.td | 8 +++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/flang/test/Semantics/OpenMP/device-constructs.f90 b/flang/test/Semantics/OpenMP/device-constructs.f90 index 51f00700b6daf..1ac00ef922c6b 100644 --- a/flang/test/Semantics/OpenMP/device-constructs.f90 +++ b/flang/test/Semantics/OpenMP/device-constructs.f90 @@ -2,9 +2,11 @@ ! Check OpenMP clause validity for the following directives: ! 2.10 Device constructs program main + use iso_c_binding real(8) :: arrayA(256), arrayB(256) integer :: N + type(c_ptr) :: cptr arrayA = 1.414 arrayB = 3.14 @@ -135,7 +137,15 @@ program main enddo !$omp end target data - !ERROR: At least one of MAP clause must appear on the TARGET DATA directive + !$omp target data device(0) use_device_addr(cptr) + cptr = c_null_ptr + !$omp end target data + + !$omp target data device(0) use_device_addr(cptr) + cptr = c_null_ptr + !$omp end target data + + !ERROR: At least one of MAP, USE_DEVICE_ADDR, USE_DEVICE_PTR clause must appear on the TARGET DATA directive !$omp target data device(0) do i = 1, N a = 3.14 diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index 1481328bf483b..77d207f2b10a8 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -710,16 +710,14 @@ def OMP_Requires : Directive<"requires"> { } def OMP_Nothing : Directive<"nothing"> {} def OMP_TargetData : Directive<"target data"> { - let allowedClauses = [ - VersionedClause, - VersionedClause - ]; let allowedOnceClauses = [ VersionedClause, VersionedClause ]; let requiredClauses = [ - VersionedClause + VersionedClause, + VersionedClause, + VersionedClause ]; } def OMP_TargetEnterData : Directive<"target enter data"> { From 27498e9942dbb8dd005588a03d6777088d2255ce Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Thu, 22 Feb 2024 14:35:05 +0000 Subject: 
[PATCH 043/546] [Flang][OpenMP] Prevent ICE for certain constructs in unnamed programs (#73938) This patch fixes #72748 by modifying the processing of program units to search for a symbol to which OpenMP REQUIRES clauses can bind to. Rather than picking up the first PFT node with a source reference and getting its associated scope, it picks up the last one. This avoids using the source from the first specification construct of a nameless program, which can sometimes not be associated to any scope, causing an ICE due to an invalid source location. --- flang/lib/Semantics/resolve-directives.cpp | 2 +- flang/test/Semantics/OpenMP/struct.f90 | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 flang/test/Semantics/OpenMP/struct.f90 diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index a826f0181e580..215a3c9060a24 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -26,7 +26,7 @@ template static Fortran::semantics::Scope *GetScope( Fortran::semantics::SemanticsContext &context, const T &x) { - std::optional source{GetSource(x)}; + std::optional source{GetLastSource(x)}; return source ? &context.FindScope(*source) : nullptr; } diff --git a/flang/test/Semantics/OpenMP/struct.f90 b/flang/test/Semantics/OpenMP/struct.f90 new file mode 100644 index 0000000000000..8ae1fbe4da86f --- /dev/null +++ b/flang/test/Semantics/OpenMP/struct.f90 @@ -0,0 +1,7 @@ +! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp +! Check OpenMP compatibility with the DEC STRUCTURE extension + +structure /s/ +end structure + +end From 8e28037374934c60602cb8c85874f443e3348b9e Mon Sep 17 00:00:00 2001 From: Kai Nacke Date: Thu, 22 Feb 2024 09:52:44 -0500 Subject: [PATCH 044/546] [SystemZ] Add SystemZ path for the PR labeler (#82515) Similar to #82200: Add paths for SystemZ related changes to the PR labeler. 
There is no pr-subscribers-backend:SystemZ team in the llvm org yet. Much appreciated if some admin can help to create the team. --- .github/new-prs-labeler.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index 7a37a96d6e381..8ed976fbdddc6 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -846,6 +846,26 @@ backend:PowerPC: - clang/lib/Driver/ToolChains/Arch/PPC.* - clang/test/CodeGen/PowerPC/** +backend:SystemZ: + - llvm/include/llvm/BinaryFormat/ELFRelocs/SystemZ* + - llvm/include/llvm/BinaryFormat/GOFF.h + - llvm/include/llvm/IR/IntrinsicsSystemZ.td + - llvm/lib/Target/SystemZ/** + - llvm/test/Analysis/**/SystemZ/** + - llvm/test/CodeGen/SystemZ/** + - llvm/test/DebugInfo/SystemZ/** + - llvm/test/ExecutionEngine/**/SystemZ/** + - llvm/test/MC/Disassembler/SystemZ/** + - llvm/test/MC/GOFF/** + - llvm/test/MC/SystemZ/** + - llvm/test/Transforms/**/SystemZ/** + - clang/include/clang/Basic/BuiltinsSystemZ.* + - clang/lib/Basic/Targets/SystemZ.* + - clang/lib/CodeGen/Targets/SystemZ.cpp + - clang/lib/Driver/ToolChains/ZOS* + - clang/lib/Driver/ToolChains/Arch/SystemZ.* + - clang/test/CodeGen/SystemZ/** + third-party:unittests: - third-party/unittests/** From 307409a8872ff27339d5d5c6a7e7777254972f34 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 22 Feb 2024 14:59:50 +0000 Subject: [PATCH 045/546] [flang] Fix warning fix This fixes 73c646a3b27293f8cb4ba120de7bc01c223b4b5f. I misread the #ifdefs and didn't realise that they were in the middle of passing parameters to a function. Move the workaround outside this. 
--- flang/lib/Evaluate/fold-integer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/lib/Evaluate/fold-integer.cpp b/flang/lib/Evaluate/fold-integer.cpp index 09b2f91debda2..25ae4831ab208 100644 --- a/flang/lib/Evaluate/fold-integer.cpp +++ b/flang/lib/Evaluate/fold-integer.cpp @@ -704,6 +704,7 @@ Expr> FoldIntrinsicFunction( return common::visit( [&funcRef, &context, &FromInt64](const auto &str) -> Expr { using Char = typename std::decay_t::Result; + (void)FromInt64; return FoldElementalIntrinsic(context, std::move(funcRef), ScalarFunc( @@ -719,7 +720,6 @@ Expr> FoldIntrinsicFunction( // CharacterUtils<2>::ICHAR(). Can't find a work-around, // so remove the FromInt64 error checking lambda that // seems to have caused the proble. - (void)FromInt64; [](const Scalar &c) { return CharacterUtils::ICHAR( CharacterUtils::Resize(c, 1)); From 20434bf3731389773fb8569889bd5d06375683bf Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Thu, 22 Feb 2024 15:12:43 +0000 Subject: [PATCH 046/546] [RemoveDIs][NFC] Add DPLabel class [2/3] (#82376) Patch 2 of 3 to add llvm.dbg.label support to the RemoveDIs project. The patch stack adds the DPLabel class, which is the RemoveDIs llvm.dbg.label equivalent. 1. Add DbgRecord base class for DPValue and the not-yet-added DPLabel class. -> 2. Add the DPLabel class. 3. Enable dbg.label conversion and add support to passes. This will be used (and tested) in the final patch(es), coming next. 
--- .../include/llvm/IR/DebugProgramInstruction.h | 32 ++++++++++++-- llvm/lib/IR/AsmWriter.cpp | 43 +++++++++++++++++-- llvm/lib/IR/DebugProgramInstruction.cpp | 23 +++++++--- 3 files changed, 85 insertions(+), 13 deletions(-) diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h index 1fa6b6f519640..1c8619741eb69 100644 --- a/llvm/include/llvm/IR/DebugProgramInstruction.h +++ b/llvm/include/llvm/IR/DebugProgramInstruction.h @@ -79,14 +79,13 @@ class raw_ostream; /// deleteRecord /// clone /// isIdenticalToWhenDefined -/// isEquivalentTo /// both print methods class DbgRecord : public ilist_node { public: /// Marker that this DbgRecord is linked into. DPMarker *Marker = nullptr; /// Subclass discriminator. - enum Kind : uint8_t { ValueKind }; + enum Kind : uint8_t { ValueKind, LabelKind }; protected: DebugLoc DbgLoc; @@ -104,9 +103,11 @@ class DbgRecord : public ilist_node { void print(raw_ostream &O, bool IsForDebug = false) const; void print(raw_ostream &O, ModuleSlotTracker &MST, bool IsForDebug) const; bool isIdenticalToWhenDefined(const DbgRecord &R) const; - bool isEquivalentTo(const DbgRecord &R) const; ///@} + /// Same as isIdenticalToWhenDefined but checks DebugLoc too. + bool isEquivalentTo(const DbgRecord &R) const; + Kind getRecordKind() const { return RecordKind; } void setMarker(DPMarker *M) { Marker = M; } @@ -156,6 +157,31 @@ class DbgRecord : public ilist_node { ~DbgRecord() = default; }; +/// Records a position in IR for a source label (DILabel). Corresponds to the +/// llvm.dbg.label intrinsic. +/// FIXME: Rename DbgLabelRecord when DPValue is renamed to DbgVariableRecord. 
+class DPLabel : public DbgRecord { + DILabel *Label; + +public: + DPLabel(DILabel *Label, DebugLoc DL) + : DbgRecord(LabelKind, DL), Label(Label) { + assert(Label && "Unexpected nullptr"); + } + + DPLabel *clone() const; + void print(raw_ostream &O, bool IsForDebug = false) const; + void print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const; + + void setLabel(DILabel *NewLabel) { Label = NewLabel; } + DILabel *getLabel() const { return Label; } + + /// Support type inquiry through isa, cast, and dyn_cast. + static bool classof(const DbgRecord *E) { + return E->getRecordKind() == LabelKind; + } +}; + /// Record of a variable value-assignment, aka a non instruction representation /// of the dbg.value intrinsic. /// diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index ac0f119b00bde..c2a470c5fc716 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -292,8 +292,8 @@ static const Module *getModuleFromDPI(const DPMarker *Marker) { return M ? M->getParent() : nullptr; } -static const Module *getModuleFromDPI(const DPValue *DPV) { - return DPV->getMarker() ? getModuleFromDPI(DPV->getMarker()) : nullptr; +static const Module *getModuleFromDPI(const DbgRecord *DR) { + return DR->getMarker() ? 
getModuleFromDPI(DR->getMarker()) : nullptr; } static void PrintCallingConv(unsigned cc, raw_ostream &Out) { @@ -2699,6 +2699,7 @@ class AssemblyWriter { void printInstruction(const Instruction &I); void printDPMarker(const DPMarker &DPI); void printDPValue(const DPValue &DPI); + void printDPLabel(const DPLabel &DPL); void printDbgRecord(const DbgRecord &DPI); void printUseListOrder(const Value *V, const std::vector &Shuffle); @@ -4602,8 +4603,10 @@ void AssemblyWriter::printDPMarker(const DPMarker &Marker) { void AssemblyWriter::printDbgRecord(const DbgRecord &DR) { if (auto *DPV = dyn_cast(&DR)) printDPValue(*DPV); + else if (auto *DPL = dyn_cast(&DR)) + printDPLabel(*DPL); else - llvm_unreachable("unsupported dbg record"); + llvm_unreachable("Unexpected DbgRecord kind"); } void AssemblyWriter::printDPValue(const DPValue &Value) { @@ -4645,6 +4648,16 @@ void AssemblyWriter::printDPValue(const DPValue &Value) { Out << " }"; } +void AssemblyWriter::printDPLabel(const DPLabel &Label) { + // There's no formal representation of a DPLabel -- print purely as + // a debugging aid. 
+ Out << " DPLabel { "; + auto WriterCtx = getContext(); + WriteAsOperandInternal(Out, Label.getLabel(), WriterCtx, true); + Out << " marker @" << Label.getMarker(); + Out << " }"; +} + void AssemblyWriter::printMetadataAttachments( const SmallVectorImpl> &MDs, StringRef Separator) { @@ -4908,6 +4921,12 @@ void DPMarker::print(raw_ostream &ROS, ModuleSlotTracker &MST, W.printDPMarker(*this); } +void DPLabel::print(raw_ostream &ROS, bool IsForDebug) const { + + ModuleSlotTracker MST(getModuleFromDPI(this), true); + print(ROS, MST, IsForDebug); +} + void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { // There's no formal representation of a DPValue -- print purely as a @@ -4927,6 +4946,24 @@ void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST, W.printDPValue(*this); } +void DPLabel::print(raw_ostream &ROS, ModuleSlotTracker &MST, + bool IsForDebug) const { + // There's no formal representation of a DbgLabelRecord -- print purely as + // a debugging aid. + formatted_raw_ostream OS(ROS); + SlotTracker EmptySlotTable(static_cast(nullptr)); + SlotTracker &SlotTable = + MST.getMachine() ? *MST.getMachine() : EmptySlotTable; + auto incorporateFunction = [&](const Function *F) { + if (F) + MST.incorporateFunction(*F); + }; + incorporateFunction(Marker->getParent() ? 
Marker->getParent()->getParent() + : nullptr); + AssemblyWriter W(OS, SlotTable, getModuleFromDPI(this), nullptr, IsForDebug); + W.printDPLabel(*this); +} + void Value::print(raw_ostream &ROS, bool IsForDebug) const { bool ShouldInitializeAllMetadata = false; if (auto *I = dyn_cast(this)) diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp index eb18be5d817a9..2ca4533afa96c 100644 --- a/llvm/lib/IR/DebugProgramInstruction.cpp +++ b/llvm/lib/IR/DebugProgramInstruction.cpp @@ -64,6 +64,9 @@ void DbgRecord::deleteRecord() { case ValueKind: delete cast(this); return; + case LabelKind: + delete cast(this); + return; } llvm_unreachable("unsupported DbgRecord kind"); } @@ -73,6 +76,9 @@ void DbgRecord::print(raw_ostream &O, bool IsForDebug) const { case ValueKind: cast(this)->print(O, IsForDebug); return; + case LabelKind: + cast(this)->print(O, IsForDebug); + return; }; llvm_unreachable("unsupported DbgRecord kind"); } @@ -83,6 +89,9 @@ void DbgRecord::print(raw_ostream &O, ModuleSlotTracker &MST, case ValueKind: cast(this)->print(O, MST, IsForDebug); return; + case LabelKind: + cast(this)->print(O, MST, IsForDebug); + return; }; llvm_unreachable("unsupported DbgRecord kind"); } @@ -93,18 +102,14 @@ bool DbgRecord::isIdenticalToWhenDefined(const DbgRecord &R) const { switch (RecordKind) { case ValueKind: return cast(this)->isIdenticalToWhenDefined(*cast(&R)); + case LabelKind: + return cast(this)->getLabel() == cast(R).getLabel(); }; llvm_unreachable("unsupported DbgRecord kind"); } bool DbgRecord::isEquivalentTo(const DbgRecord &R) const { - if (RecordKind != R.RecordKind) - return false; - switch (RecordKind) { - case ValueKind: - return cast(this)->isEquivalentTo(*cast(&R)); - }; - llvm_unreachable("unsupported DbgRecord kind"); + return getDebugLoc() == R.getDebugLoc() && isIdenticalToWhenDefined(R); } DPValue *DPValue::createDPValue(Value *Location, DILocalVariable *DV, @@ -307,12 +312,16 @@ DbgRecord *DbgRecord::clone() 
const { switch (RecordKind) { case ValueKind: return cast(this)->clone(); + case LabelKind: + return cast(this)->clone(); }; llvm_unreachable("unsupported DbgRecord kind"); } DPValue *DPValue::clone() const { return new DPValue(*this); } +DPLabel *DPLabel::clone() const { return new DPLabel(Label, getDebugLoc()); } + DbgVariableIntrinsic * DPValue::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const { [[maybe_unused]] DICompileUnit *Unit = From 601c9bec736739da9160092ef60e3468266816bd Mon Sep 17 00:00:00 2001 From: Victor Campos Date: Thu, 22 Feb 2024 15:25:36 +0000 Subject: [PATCH 047/546] [clang][NFC] Fix arm_acle.h title headers (#82624) Fix some title headers to align them with the actual ACLE document. --- clang/lib/Headers/arm_acle.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h index 9cd34948e3c53..6e557eda1dddc 100644 --- a/clang/lib/Headers/arm_acle.h +++ b/clang/lib/Headers/arm_acle.h @@ -313,7 +313,7 @@ __qdbl(int32_t __t) { } #endif -/* 8.4.3 Accumultating multiplications */ +/* 8.4.3 Accumulating multiplications */ #if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) __smlabb(int32_t __a, int32_t __b, int32_t __c) { @@ -545,7 +545,7 @@ __usub16(uint16x2_t __a, uint16x2_t __b) { } #endif -/* 8.5.10 Parallel 16-bit multiplications */ +/* 8.5.10 Parallel 16-bit multiplication */ #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__)) __smlad(int16x2_t __a, int16x2_t __b, int32_t __c) { @@ -748,7 +748,7 @@ __arm_st64bv0(void *__addr, data512_t __value) { #define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v)) #define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v)) -/* 10.3 Memory Tagging Extensions (MTE) Intrinsics */ +/* 10.3 MTE intrinsics */ #if 
defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE #define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask) #define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset) @@ -757,7 +757,7 @@ __arm_st64bv0(void *__addr, data512_t __value) { #define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr) #define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb) -/* 18 Memory Operations Intrinsics */ +/* 18 memcpy family of operations intrinsics - MOPS */ #define __arm_mops_memset_tag(__tagged_address, __value, __size) \ __builtin_arm_mops_memset_tag(__tagged_address, __value, __size) #endif From 08eced5fccd2f103379292f119834a7a3c3b6b25 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 22 Feb 2024 15:29:04 +0000 Subject: [PATCH 048/546] [mlir][test] Add -march=aarch64 -mattr=+sve to test-scalable-interleave Fix for https://lab.llvm.org/buildbot/#/builders/179/builds/9438 --- .../Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir index 8ae3eee6462ca..07989bd71f501 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-scalable-interleave.mlir @@ -1,6 +1,7 @@ // RUN: mlir-opt %s -test-lower-to-llvm | \ // RUN: %mcr_aarch64_cmd -e entry -entry-point-result=void \ -// RUN: -shared-libs=%mlir_c_runner_utils,%mlir_arm_runner_utils | \ +// RUN: -shared-libs=%mlir_c_runner_utils,%mlir_arm_runner_utils \ +// RUN: -march=aarch64 -mattr=+sve | \ // RUN: FileCheck %s func.func @entry() { From 695a9d84dc1dd003c31d3e5e22af3525c31218c2 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 22 Feb 2024 16:00:33 +0000 Subject: [PATCH 049/546] LoopVectorize: add test for crash in 
#72969 (#74111) --- .../Transforms/LoopVectorize/X86/pr72969.ll | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/X86/pr72969.ll diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll new file mode 100644 index 0000000000000..a54bd39f3ff60 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll @@ -0,0 +1,25 @@ +; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -S < %s +; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -force-vector-width=4 -S < %s + +@h = global i64 0 + +define void @test(ptr %p) { +entry: + br label %for.body + +for.body: + %idx.ext.merge = phi i64 [ 1, %entry ], [ %idx, %for.body ] + %inc.merge = phi i16 [ 1, %entry ], [ %inc, %for.body ] + %idx.merge = phi i64 [ 0, %entry ], [ %idx.ext.merge, %for.body ] + %add = shl i64 %idx.merge, 1 + %arrayidx = getelementptr i64, ptr %p, i64 %add + store i64 0, ptr %arrayidx + %inc = add i16 %inc.merge, 1 + %idx = zext i16 %inc to i64 + %gep = getelementptr i64, ptr %p, i64 %idx + %cmp = icmp ugt ptr %gep, @h + br i1 %cmp, label %exit, label %for.body + +exit: + ret void +} From 9eb5f94f9b47154cf07160a6ba74ab1c31becfa3 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 22 Feb 2024 07:54:51 -0800 Subject: [PATCH 050/546] [RISCV][AArch64] Add vscale_range attribute to tests per architecture minimums Spent a bunch of time tracing down an odd issue "in SCEV" which turned out to be the fact that SCEV doesn't have access to TTI. As a result, the only way for it to get range facts on vscales (to avoid collapsing ranges of element counts and type sizes to trivial ranges on multiplies) is to look at the vscale_range attribute. Since vscale_range is set by clang by default, manually setting it in the tests shouldn't interfere with the test intent. 
--- .../AArch64/clamped-trip-count.ll | 90 +++++++++---------- .../LoopVectorize/RISCV/low-trip-count.ll | 2 +- 2 files changed, 41 insertions(+), 51 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll index 44ace377ac792..3e895edcd4f4f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt -S < %s -passes=loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve 2>&1 | FileCheck %s -define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val){ +define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,16) { ; CHECK-LABEL: define void @clamped_tc_8( ; CHECK-SAME: ptr nocapture [[DST:%.*]], i32 [[N:%.*]], i64 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -18,20 +18,15 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val){ ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]] ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = sub i64 8, [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 8, [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8) -; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv8i64() -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = mul [[TMP13]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, 
[[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 1, [[TMP16]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP17]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv8i64() +; CHECK-NEXT: [[TMP8:%.*]] = add [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = mul [[TMP8]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 8 +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP11]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[VAL]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -40,17 +35,17 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val){ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP20:%.*]] = lshr [[BROADCAST_SPLAT]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = trunc [[TMP20]] to -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], 
i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP21]], ptr [[TMP22]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP15:%.*]] = lshr [[BROADCAST_SPLAT]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = trunc [[TMP15]] to +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP16]], ptr [[TMP17]], i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP11]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8) ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP23:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] @@ -61,8 +56,8 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val){ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[P_OUT_TAIL_09:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP24:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[SHR3:%.*]] = lshr i64 [[VAL]], [[TMP24]] 
+; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3 +; CHECK-NEXT: [[SHR3:%.*]] = lshr i64 [[VAL]], [[TMP19]] ; CHECK-NEXT: [[CONV4:%.*]] = trunc i64 [[SHR3]] to i8 ; CHECK-NEXT: store i8 [[CONV4]], ptr [[P_OUT_TAIL_09]], align 1 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[P_OUT_TAIL_09]], i64 1 @@ -91,7 +86,7 @@ for.cond.cleanup: ; preds = %for.body ret void } -define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val){ +define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,16) { ; CHECK-LABEL: define void @clamped_tc_max_8( ; CHECK-SAME: ptr nocapture [[DST:%.*]], i32 [[N:%.*]], i64 [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: @@ -115,20 +110,15 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val){ ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]] ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 -; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[WIDE_TRIP_COUNT]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) -; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv8i64() -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = mul [[TMP13]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 1, [[TMP16]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP17]], 
i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv8i64() +; CHECK-NEXT: [[TMP8:%.*]] = add [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = mul [[TMP8]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 8 +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP11]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[VAL]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -137,17 +127,17 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val){ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP20:%.*]] = lshr [[BROADCAST_SPLAT]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = trunc [[TMP20]] to -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP21]], ptr [[TMP22]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 
[[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP15:%.*]] = lshr [[BROADCAST_SPLAT]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = trunc [[TMP15]] to +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP16]], ptr [[TMP17]], i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP11]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP23:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -158,8 +148,8 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val){ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[P_OUT_TAIL_09:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP24:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[SHR3:%.*]] = lshr i64 [[VAL]], [[TMP24]] +; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3 +; CHECK-NEXT: [[SHR3:%.*]] = lshr i64 [[VAL]], [[TMP19]] ; CHECK-NEXT: [[CONV4:%.*]] = trunc i64 [[SHR3]] to i8 ; CHECK-NEXT: store i8 
[[CONV4]], ptr [[P_OUT_TAIL_09]], align 1 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[P_OUT_TAIL_09]], i64 1 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index 0c5394cb95a61..acb4489bd76b0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -74,4 +74,4 @@ for.end: ; preds = %for.body ret void } -attributes #0 = { "target-features"="+v,+d" } +attributes #0 = { "target-features"="+v,+d" vscale_range(2, 1024) } From 0107c8824b695db86706bbc3466bbfd585a754aa Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 23 Feb 2024 00:18:56 +0800 Subject: [PATCH 051/546] [RISCV][SDAG] Improve codegen of select with constants if zicond is available (#82456) This patch uses `add + czero.eqz/nez` to lower select with constants if zicond is available. ``` (select c, c1, c2) -> (add (czero_nez c2 - c1, c), c1) (select c, c1, c2) -> (add (czero_eqz c1 - c2, c), c2) ``` The above code sequence is suggested by [RISCV Optimization Guide](https://riscv-optimization-guide-riseproject-c94355ae3e6872252baa952524.gitlab.io/riscv-optimization-guide.html#_avoid_branches_using_conditional_moves). 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 20 ++ llvm/test/CodeGen/RISCV/select.ll | 252 +++++++++++++++++++- 2 files changed, 262 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index cf0dc36a51b61..6bf02cf8c0f87 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -7379,6 +7379,26 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (SDValue V = combineSelectToBinOp(Op.getNode(), DAG, Subtarget)) return V; + // (select c, c1, c2) -> (add (czero_nez c2 - c1, c), c1) + // (select c, c1, c2) -> (add (czero_eqz c1 - c2, c), c2) + if (isa(TrueV) && isa(FalseV)) { + const APInt &TrueVal = TrueV->getAsAPIntVal(); + const APInt &FalseVal = FalseV->getAsAPIntVal(); + const int TrueValCost = RISCVMatInt::getIntMatCost( + TrueVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true); + const int FalseValCost = RISCVMatInt::getIntMatCost( + FalseVal, Subtarget.getXLen(), Subtarget, /*CompressionCost=*/true); + bool IsCZERO_NEZ = TrueValCost <= FalseValCost; + SDValue LHSVal = DAG.getConstant( + IsCZERO_NEZ ? FalseVal - TrueVal : TrueVal - FalseVal, DL, VT); + SDValue RHSVal = + DAG.getConstant(IsCZERO_NEZ ? TrueVal : FalseVal, DL, VT); + SDValue CMOV = + DAG.getNode(IsCZERO_NEZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ, + DL, VT, LHSVal, CondV); + return DAG.getNode(ISD::ADD, DL, VT, CMOV, RHSVal); + } + // (select c, t, f) -> (or (czero_eqz t, c), (czero_nez f, c)) // Unless we have the short forward branch optimization. 
if (!Subtarget.hasConditionalMoveFusion()) diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll index e01984b7c5843..e07e52091e9e7 100644 --- a/llvm/test/CodeGen/RISCV/select.ll +++ b/llvm/test/CodeGen/RISCV/select.ll @@ -1606,23 +1606,255 @@ define i32 @select_cst_unknown(i32 signext %a, i32 signext %b) { ; RV64IMXVTCONDOPS-LABEL: select_cst_unknown: ; RV64IMXVTCONDOPS: # %bb.0: ; RV64IMXVTCONDOPS-NEXT: slt a0, a0, a1 -; RV64IMXVTCONDOPS-NEXT: li a1, -7 -; RV64IMXVTCONDOPS-NEXT: vt.maskcn a1, a1, a0 -; RV64IMXVTCONDOPS-NEXT: li a2, 5 -; RV64IMXVTCONDOPS-NEXT: vt.maskc a0, a2, a0 -; RV64IMXVTCONDOPS-NEXT: or a0, a0, a1 +; RV64IMXVTCONDOPS-NEXT: li a1, -12 +; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0 +; RV64IMXVTCONDOPS-NEXT: addi a0, a0, 5 ; RV64IMXVTCONDOPS-NEXT: ret ; ; CHECKZICOND-LABEL: select_cst_unknown: ; CHECKZICOND: # %bb.0: ; CHECKZICOND-NEXT: slt a0, a0, a1 -; CHECKZICOND-NEXT: li a1, -7 -; CHECKZICOND-NEXT: czero.nez a1, a1, a0 -; CHECKZICOND-NEXT: li a2, 5 -; CHECKZICOND-NEXT: czero.eqz a0, a2, a0 -; CHECKZICOND-NEXT: or a0, a0, a1 +; CHECKZICOND-NEXT: li a1, -12 +; CHECKZICOND-NEXT: czero.nez a0, a1, a0 +; CHECKZICOND-NEXT: addi a0, a0, 5 ; CHECKZICOND-NEXT: ret %cond = icmp slt i32 %a, %b %ret = select i1 %cond, i32 5, i32 -7 ret i32 %ret } + +define i32 @select_cst1(i1 zeroext %cond) { +; RV32IM-LABEL: select_cst1: +; RV32IM: # %bb.0: +; RV32IM-NEXT: mv a1, a0 +; RV32IM-NEXT: li a0, 10 +; RV32IM-NEXT: bnez a1, .LBB43_2 +; RV32IM-NEXT: # %bb.1: +; RV32IM-NEXT: li a0, 20 +; RV32IM-NEXT: .LBB43_2: +; RV32IM-NEXT: ret +; +; RV64IM-LABEL: select_cst1: +; RV64IM: # %bb.0: +; RV64IM-NEXT: mv a1, a0 +; RV64IM-NEXT: li a0, 10 +; RV64IM-NEXT: bnez a1, .LBB43_2 +; RV64IM-NEXT: # %bb.1: +; RV64IM-NEXT: li a0, 20 +; RV64IM-NEXT: .LBB43_2: +; RV64IM-NEXT: ret +; +; RV64IMXVTCONDOPS-LABEL: select_cst1: +; RV64IMXVTCONDOPS: # %bb.0: +; RV64IMXVTCONDOPS-NEXT: li a1, 10 +; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0 +; 
RV64IMXVTCONDOPS-NEXT: addi a0, a0, 10 +; RV64IMXVTCONDOPS-NEXT: ret +; +; CHECKZICOND-LABEL: select_cst1: +; CHECKZICOND: # %bb.0: +; CHECKZICOND-NEXT: li a1, 10 +; CHECKZICOND-NEXT: czero.nez a0, a1, a0 +; CHECKZICOND-NEXT: addi a0, a0, 10 +; CHECKZICOND-NEXT: ret + %ret = select i1 %cond, i32 10, i32 20 + ret i32 %ret +} + +define i32 @select_cst2(i1 zeroext %cond) { +; RV32IM-LABEL: select_cst2: +; RV32IM: # %bb.0: +; RV32IM-NEXT: mv a1, a0 +; RV32IM-NEXT: li a0, 10 +; RV32IM-NEXT: bnez a1, .LBB44_2 +; RV32IM-NEXT: # %bb.1: +; RV32IM-NEXT: lui a0, 5 +; RV32IM-NEXT: addi a0, a0, -480 +; RV32IM-NEXT: .LBB44_2: +; RV32IM-NEXT: ret +; +; RV64IM-LABEL: select_cst2: +; RV64IM: # %bb.0: +; RV64IM-NEXT: mv a1, a0 +; RV64IM-NEXT: li a0, 10 +; RV64IM-NEXT: bnez a1, .LBB44_2 +; RV64IM-NEXT: # %bb.1: +; RV64IM-NEXT: lui a0, 5 +; RV64IM-NEXT: addiw a0, a0, -480 +; RV64IM-NEXT: .LBB44_2: +; RV64IM-NEXT: ret +; +; RV64IMXVTCONDOPS-LABEL: select_cst2: +; RV64IMXVTCONDOPS: # %bb.0: +; RV64IMXVTCONDOPS-NEXT: lui a1, 5 +; RV64IMXVTCONDOPS-NEXT: addiw a1, a1, -490 +; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0 +; RV64IMXVTCONDOPS-NEXT: addi a0, a0, 10 +; RV64IMXVTCONDOPS-NEXT: ret +; +; RV32IMZICOND-LABEL: select_cst2: +; RV32IMZICOND: # %bb.0: +; RV32IMZICOND-NEXT: lui a1, 5 +; RV32IMZICOND-NEXT: addi a1, a1, -490 +; RV32IMZICOND-NEXT: czero.nez a0, a1, a0 +; RV32IMZICOND-NEXT: addi a0, a0, 10 +; RV32IMZICOND-NEXT: ret +; +; RV64IMZICOND-LABEL: select_cst2: +; RV64IMZICOND: # %bb.0: +; RV64IMZICOND-NEXT: lui a1, 5 +; RV64IMZICOND-NEXT: addiw a1, a1, -490 +; RV64IMZICOND-NEXT: czero.nez a0, a1, a0 +; RV64IMZICOND-NEXT: addi a0, a0, 10 +; RV64IMZICOND-NEXT: ret + %ret = select i1 %cond, i32 10, i32 20000 + ret i32 %ret +} + +define i32 @select_cst3(i1 zeroext %cond) { +; RV32IM-LABEL: select_cst3: +; RV32IM: # %bb.0: +; RV32IM-NEXT: bnez a0, .LBB45_2 +; RV32IM-NEXT: # %bb.1: +; RV32IM-NEXT: lui a0, 5 +; RV32IM-NEXT: addi a0, a0, -480 +; RV32IM-NEXT: ret +; RV32IM-NEXT: .LBB45_2: +; 
RV32IM-NEXT: lui a0, 7 +; RV32IM-NEXT: addi a0, a0, 1328 +; RV32IM-NEXT: ret +; +; RV64IM-LABEL: select_cst3: +; RV64IM: # %bb.0: +; RV64IM-NEXT: bnez a0, .LBB45_2 +; RV64IM-NEXT: # %bb.1: +; RV64IM-NEXT: lui a0, 5 +; RV64IM-NEXT: addiw a0, a0, -480 +; RV64IM-NEXT: ret +; RV64IM-NEXT: .LBB45_2: +; RV64IM-NEXT: lui a0, 7 +; RV64IM-NEXT: addiw a0, a0, 1328 +; RV64IM-NEXT: ret +; +; RV64IMXVTCONDOPS-LABEL: select_cst3: +; RV64IMXVTCONDOPS: # %bb.0: +; RV64IMXVTCONDOPS-NEXT: lui a1, 1048574 +; RV64IMXVTCONDOPS-NEXT: addiw a1, a1, -1808 +; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0 +; RV64IMXVTCONDOPS-NEXT: lui a1, 7 +; RV64IMXVTCONDOPS-NEXT: addiw a1, a1, 1328 +; RV64IMXVTCONDOPS-NEXT: add a0, a0, a1 +; RV64IMXVTCONDOPS-NEXT: ret +; +; RV32IMZICOND-LABEL: select_cst3: +; RV32IMZICOND: # %bb.0: +; RV32IMZICOND-NEXT: lui a1, 1048574 +; RV32IMZICOND-NEXT: addi a1, a1, -1808 +; RV32IMZICOND-NEXT: czero.nez a0, a1, a0 +; RV32IMZICOND-NEXT: lui a1, 7 +; RV32IMZICOND-NEXT: addi a1, a1, 1328 +; RV32IMZICOND-NEXT: add a0, a0, a1 +; RV32IMZICOND-NEXT: ret +; +; RV64IMZICOND-LABEL: select_cst3: +; RV64IMZICOND: # %bb.0: +; RV64IMZICOND-NEXT: lui a1, 1048574 +; RV64IMZICOND-NEXT: addiw a1, a1, -1808 +; RV64IMZICOND-NEXT: czero.nez a0, a1, a0 +; RV64IMZICOND-NEXT: lui a1, 7 +; RV64IMZICOND-NEXT: addiw a1, a1, 1328 +; RV64IMZICOND-NEXT: add a0, a0, a1 +; RV64IMZICOND-NEXT: ret + %ret = select i1 %cond, i32 30000, i32 20000 + ret i32 %ret +} + +define i32 @select_cst4(i1 zeroext %cond) { +; CHECK-LABEL: select_cst4: +; CHECK: # %bb.0: +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: xori a0, a0, 2047 +; CHECK-NEXT: ret + %ret = select i1 %cond, i32 -2048, i32 2047 + ret i32 %ret +} + +define i32 @select_cst5(i1 zeroext %cond) { +; RV32IM-LABEL: select_cst5: +; RV32IM: # %bb.0: +; RV32IM-NEXT: mv a1, a0 +; RV32IM-NEXT: li a0, 2047 +; RV32IM-NEXT: bnez a1, .LBB47_2 +; RV32IM-NEXT: # %bb.1: +; RV32IM-NEXT: lui a0, 1 +; RV32IM-NEXT: addi a0, a0, -2047 +; RV32IM-NEXT: .LBB47_2: +; RV32IM-NEXT: 
ret +; +; RV64IM-LABEL: select_cst5: +; RV64IM: # %bb.0: +; RV64IM-NEXT: mv a1, a0 +; RV64IM-NEXT: li a0, 2047 +; RV64IM-NEXT: bnez a1, .LBB47_2 +; RV64IM-NEXT: # %bb.1: +; RV64IM-NEXT: lui a0, 1 +; RV64IM-NEXT: addiw a0, a0, -2047 +; RV64IM-NEXT: .LBB47_2: +; RV64IM-NEXT: ret +; +; RV64IMXVTCONDOPS-LABEL: select_cst5: +; RV64IMXVTCONDOPS: # %bb.0: +; RV64IMXVTCONDOPS-NEXT: li a1, 2 +; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0 +; RV64IMXVTCONDOPS-NEXT: addi a0, a0, 2047 +; RV64IMXVTCONDOPS-NEXT: ret +; +; CHECKZICOND-LABEL: select_cst5: +; CHECKZICOND: # %bb.0: +; CHECKZICOND-NEXT: li a1, 2 +; CHECKZICOND-NEXT: czero.nez a0, a1, a0 +; CHECKZICOND-NEXT: addi a0, a0, 2047 +; CHECKZICOND-NEXT: ret + %ret = select i1 %cond, i32 2047, i32 2049 + ret i32 %ret +} + +define i32 @select_cst6(i1 zeroext %cond) { +; RV32IM-LABEL: select_cst6: +; RV32IM: # %bb.0: +; RV32IM-NEXT: bnez a0, .LBB48_2 +; RV32IM-NEXT: # %bb.1: +; RV32IM-NEXT: li a0, 2047 +; RV32IM-NEXT: ret +; RV32IM-NEXT: .LBB48_2: +; RV32IM-NEXT: lui a0, 1 +; RV32IM-NEXT: addi a0, a0, -2047 +; RV32IM-NEXT: ret +; +; RV64IM-LABEL: select_cst6: +; RV64IM: # %bb.0: +; RV64IM-NEXT: bnez a0, .LBB48_2 +; RV64IM-NEXT: # %bb.1: +; RV64IM-NEXT: li a0, 2047 +; RV64IM-NEXT: ret +; RV64IM-NEXT: .LBB48_2: +; RV64IM-NEXT: lui a0, 1 +; RV64IM-NEXT: addiw a0, a0, -2047 +; RV64IM-NEXT: ret +; +; RV64IMXVTCONDOPS-LABEL: select_cst6: +; RV64IMXVTCONDOPS: # %bb.0: +; RV64IMXVTCONDOPS-NEXT: li a1, 2 +; RV64IMXVTCONDOPS-NEXT: vt.maskc a0, a1, a0 +; RV64IMXVTCONDOPS-NEXT: addi a0, a0, 2047 +; RV64IMXVTCONDOPS-NEXT: ret +; +; CHECKZICOND-LABEL: select_cst6: +; CHECKZICOND: # %bb.0: +; CHECKZICOND-NEXT: li a1, 2 +; CHECKZICOND-NEXT: czero.eqz a0, a1, a0 +; CHECKZICOND-NEXT: addi a0, a0, 2047 +; CHECKZICOND-NEXT: ret + %ret = select i1 %cond, i32 2049, i32 2047 + ret i32 %ret +} From 43f1fa99ca7d05be9545a102e15ad0d607887839 Mon Sep 17 00:00:00 2001 From: cmtice Date: Thu, 22 Feb 2024 08:20:54 -0800 Subject: [PATCH 052/546] 
[LLVM][DebugInfo] Refactor some code for easier sharing. (#82153) Refactor the code that calculates the offsets for the various pieces of the DWARF .debug_names index section, to make it easier to share the code with other tools, such as LLD. --- .../DebugInfo/DWARF/DWARFAcceleratorTable.h | 26 +++++-- .../DebugInfo/DWARF/DWARFAcceleratorTable.cpp | 73 ++++++++++++------- 2 files changed, 65 insertions(+), 34 deletions(-) diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h index a26c44bf7e9c2..d368c7e0ece8f 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h @@ -562,6 +562,17 @@ class DWARFDebugNames : public DWARFAcceleratorTable { uint64_t getEntryOffset() const { return EntryOffset; } }; + /// Offsets for the start of various important tables from the start of the + /// section. + struct DWARFDebugNamesOffsets { + uint64_t CUsBase; + uint64_t BucketsBase; + uint64_t HashesBase; + uint64_t StringOffsetsBase; + uint64_t EntryOffsetsBase; + uint64_t EntriesBase; + }; + /// Represents a single accelerator table within the DWARF v5 .debug_names /// section. class NameIndex { @@ -572,12 +583,7 @@ class DWARFDebugNames : public DWARFAcceleratorTable { // Base of the whole unit and of various important tables, as offsets from // the start of the section. uint64_t Base; - uint64_t CUsBase; - uint64_t BucketsBase; - uint64_t HashesBase; - uint64_t StringOffsetsBase; - uint64_t EntryOffsetsBase; - uint64_t EntriesBase; + DWARFDebugNamesOffsets Offsets; void dumpCUs(ScopedPrinter &W) const; void dumpLocalTUs(ScopedPrinter &W) const; @@ -638,7 +644,7 @@ class DWARFDebugNames : public DWARFAcceleratorTable { /// Returns the Entry at the relative `Offset` from the start of the Entry /// pool. 
Expected getEntryAtRelativeOffset(uint64_t Offset) const { - auto OffsetFromSection = Offset + this->EntriesBase; + auto OffsetFromSection = Offset + this->Offsets.EntriesBase; return getEntry(&OffsetFromSection); } @@ -793,6 +799,12 @@ class DWARFDebugNames : public DWARFAcceleratorTable { const NameIndex *getCUNameIndex(uint64_t CUOffset); }; +/// Calculates the starting offsets for various sections within the +/// .debug_names section. +void findDebugNamesOffsets(DWARFDebugNames::DWARFDebugNamesOffsets &Offsets, + uint64_t HdrSize, const dwarf::DwarfFormat Format, + const DWARFDebugNames::Header &Hdr); + /// If `Name` is the name of a templated function that includes template /// parameters, returns a substring of `Name` containing no template /// parameters. diff --git a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp index 78f819dd052aa..9c65d85985f1b 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp @@ -510,7 +510,7 @@ DWARFDebugNames::Abbrev DWARFDebugNames::AbbrevMapInfo::getTombstoneKey() { Expected DWARFDebugNames::NameIndex::extractAttributeEncoding(uint64_t *Offset) { - if (*Offset >= EntriesBase) { + if (*Offset >= Offsets.EntriesBase) { return createStringError(errc::illegal_byte_sequence, "Incorrectly terminated abbreviation table."); } @@ -536,7 +536,7 @@ DWARFDebugNames::NameIndex::extractAttributeEncodings(uint64_t *Offset) { Expected DWARFDebugNames::NameIndex::extractAbbrev(uint64_t *Offset) { - if (*Offset >= EntriesBase) { + if (*Offset >= Offsets.EntriesBase) { return createStringError(errc::illegal_byte_sequence, "Incorrectly terminated abbreviation table."); } @@ -552,32 +552,50 @@ DWARFDebugNames::NameIndex::extractAbbrev(uint64_t *Offset) { return Abbrev(Code, dwarf::Tag(Tag), AbbrevOffset, std::move(*AttrEncOr)); } +void llvm::findDebugNamesOffsets( + DWARFDebugNames::DWARFDebugNamesOffsets &Offsets, uint64_t 
HdrSize, + dwarf::DwarfFormat Format, const DWARFDebugNames::Header &Hdr) { + uint32_t DwarfSize = (Format == llvm::dwarf::DwarfFormat::DWARF64) ? 8 : 4; + uint64_t Offset = HdrSize; + Offsets.CUsBase = Offset; + Offset += Hdr.CompUnitCount * DwarfSize; + Offset += Hdr.LocalTypeUnitCount * DwarfSize; + Offset += Hdr.ForeignTypeUnitCount * 8; + + Offsets.BucketsBase = Offset; + Offset += Hdr.BucketCount * 4; + + Offsets.HashesBase = Offset; + if (Hdr.BucketCount > 0) + Offset += Hdr.NameCount * 4; + + Offsets.StringOffsetsBase = Offset; + Offset += Hdr.NameCount * DwarfSize; + + Offsets.EntryOffsetsBase = Offset; + Offset += Hdr.NameCount * DwarfSize; + + Offset += Hdr.AbbrevTableSize; + Offsets.EntriesBase = Offset; +} + Error DWARFDebugNames::NameIndex::extract() { const DWARFDataExtractor &AS = Section.AccelSection; - uint64_t Offset = Base; - if (Error E = Hdr.extract(AS, &Offset)) + uint64_t hdrSize = Base; + if (Error E = Hdr.extract(AS, &hdrSize)) return E; const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format); - CUsBase = Offset; - Offset += Hdr.CompUnitCount * SectionOffsetSize; - Offset += Hdr.LocalTypeUnitCount * SectionOffsetSize; - Offset += Hdr.ForeignTypeUnitCount * 8; - BucketsBase = Offset; - Offset += Hdr.BucketCount * 4; - HashesBase = Offset; - if (Hdr.BucketCount > 0) - Offset += Hdr.NameCount * 4; - StringOffsetsBase = Offset; - Offset += Hdr.NameCount * SectionOffsetSize; - EntryOffsetsBase = Offset; - Offset += Hdr.NameCount * SectionOffsetSize; + findDebugNamesOffsets(Offsets, hdrSize, Hdr.Format, Hdr); + + uint64_t Offset = + Offsets.EntryOffsetsBase + (Hdr.NameCount * SectionOffsetSize); if (!AS.isValidOffsetForDataOfSize(Offset, Hdr.AbbrevTableSize)) return createStringError(errc::illegal_byte_sequence, "Section too small: cannot read abbreviations."); - EntriesBase = Offset + Hdr.AbbrevTableSize; + Offsets.EntriesBase = Offset + Hdr.AbbrevTableSize; for (;;) { auto AbbrevOr = extractAbbrev(&Offset); @@ -679,7 
+697,7 @@ void DWARFDebugNames::Entry::dumpParentIdx( return; } - auto AbsoluteOffset = NameIdx->EntriesBase + FormValue.getRawUValue(); + auto AbsoluteOffset = NameIdx->Offsets.EntriesBase + FormValue.getRawUValue(); W.getOStream() << "Entry @ 0x" + Twine::utohexstr(AbsoluteOffset); } @@ -708,14 +726,15 @@ std::error_code DWARFDebugNames::SentinelError::convertToErrorCode() const { uint64_t DWARFDebugNames::NameIndex::getCUOffset(uint32_t CU) const { assert(CU < Hdr.CompUnitCount); const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format); - uint64_t Offset = CUsBase + SectionOffsetSize * CU; + uint64_t Offset = Offsets.CUsBase + SectionOffsetSize * CU; return Section.AccelSection.getRelocatedValue(SectionOffsetSize, &Offset); } uint64_t DWARFDebugNames::NameIndex::getLocalTUOffset(uint32_t TU) const { assert(TU < Hdr.LocalTypeUnitCount); const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format); - uint64_t Offset = CUsBase + SectionOffsetSize * (Hdr.CompUnitCount + TU); + uint64_t Offset = + Offsets.CUsBase + SectionOffsetSize * (Hdr.CompUnitCount + TU); return Section.AccelSection.getRelocatedValue(SectionOffsetSize, &Offset); } @@ -723,7 +742,7 @@ uint64_t DWARFDebugNames::NameIndex::getForeignTUSignature(uint32_t TU) const { assert(TU < Hdr.ForeignTypeUnitCount); const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format); uint64_t Offset = - CUsBase + + Offsets.CUsBase + SectionOffsetSize * (Hdr.CompUnitCount + Hdr.LocalTypeUnitCount) + 8 * TU; return Section.AccelSection.getU64(&Offset); } @@ -759,28 +778,28 @@ DWARFDebugNames::NameIndex::getNameTableEntry(uint32_t Index) const { assert(0 < Index && Index <= Hdr.NameCount); const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format); uint64_t StringOffsetOffset = - StringOffsetsBase + SectionOffsetSize * (Index - 1); + Offsets.StringOffsetsBase + SectionOffsetSize * (Index - 1); uint64_t EntryOffsetOffset = - EntryOffsetsBase + 
SectionOffsetSize * (Index - 1); + Offsets.EntryOffsetsBase + SectionOffsetSize * (Index - 1); const DWARFDataExtractor &AS = Section.AccelSection; uint64_t StringOffset = AS.getRelocatedValue(SectionOffsetSize, &StringOffsetOffset); uint64_t EntryOffset = AS.getUnsigned(&EntryOffsetOffset, SectionOffsetSize); - EntryOffset += EntriesBase; + EntryOffset += Offsets.EntriesBase; return {Section.StringSection, Index, StringOffset, EntryOffset}; } uint32_t DWARFDebugNames::NameIndex::getBucketArrayEntry(uint32_t Bucket) const { assert(Bucket < Hdr.BucketCount); - uint64_t BucketOffset = BucketsBase + 4 * Bucket; + uint64_t BucketOffset = Offsets.BucketsBase + 4 * Bucket; return Section.AccelSection.getU32(&BucketOffset); } uint32_t DWARFDebugNames::NameIndex::getHashArrayEntry(uint32_t Index) const { assert(0 < Index && Index <= Hdr.NameCount); - uint64_t HashOffset = HashesBase + 4 * (Index - 1); + uint64_t HashOffset = Offsets.HashesBase + 4 * (Index - 1); return Section.AccelSection.getU32(&HashOffset); } From f67ef1a8d9841718ce08a69d935ac8fd8e6112f9 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 22 Feb 2024 08:24:38 -0800 Subject: [PATCH 053/546] [RISCV][LV] Add additional small trip count loop coverage --- .../LoopVectorize/RISCV/low-trip-count.ll | 368 +++++++++++++++++- 1 file changed, 366 insertions(+), 2 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index acb4489bd76b0..7ccbc98d26567 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -3,6 +3,116 @@ target triple = "riscv64" +define void @trip1_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { +; CHECK-LABEL: @trip1_i8( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], 
[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[I_08]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP0]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[I_08]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP1]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 1 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08 + %0 = load i8, ptr %arrayidx, align 1 + %mul = shl i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08 + %1 = load i8, ptr %arrayidx1, align 1 + %add = add i8 %mul, %1 + store i8 %add, ptr %arrayidx1, align 1 + %inc = add nuw nsw i64 %i.08, 1 + %exitcond.not = icmp eq i64 %inc, 1 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @trip3_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { +; CHECK-LABEL: @trip3_i8( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 3, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub 
i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP7]], i64 3) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 
[[I_08]] +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 3 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08 + %0 = load i8, ptr %arrayidx, align 1 + %mul = shl i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08 + %1 = load i8, ptr %arrayidx1, align 1 + %add = add i8 %mul, %1 + store i8 %add, ptr %arrayidx1, align 1 + %inc = add nuw nsw i64 %i.08, 1 + %exitcond.not = icmp eq i64 %inc, 3 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { ; CHECK-LABEL: @trip5_i8( ; CHECK-NEXT: entry: @@ -33,7 +143,7 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] ; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 
true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -50,7 +160,7 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -74,4 +184,258 @@ for.end: ; preds = %for.body ret void } +define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { +; CHECK-LABEL: @trip8_i8( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP7]], i64 8) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: 
[[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv8i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP14]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP15]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: 
ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08 + %0 = load i8, ptr %arrayidx, align 1 + %mul = shl i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08 + %1 = load i8, ptr %arrayidx1, align 1 + %add = add i8 %mul, %1 + store i8 %add, ptr %arrayidx1, align 1 + %inc = add nuw nsw i64 %i.08, 1 + %exitcond.not = icmp eq i64 %inc, 8 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { +; CHECK-LABEL: @trip16_i8( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD1]] +; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP5]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label 
[[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP7]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP8]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08 + %0 = load i8, ptr %arrayidx, align 1 + %mul = shl i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08 + %1 = load i8, ptr %arrayidx1, align 1 + %add = add i8 %mul, %1 + store i8 %add, ptr %arrayidx1, align 1 + %inc = add nuw nsw i64 %i.08, 1 + %exitcond.not = icmp eq i64 %inc, 16 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + + +define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { +; CHECK-LABEL: @trip32_i8( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, 
[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = shl <32 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = add <32 x i8> [[TMP3]], [[WIDE_LOAD1]] +; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[TMP5]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] +; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP7]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP8]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 32 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop 
[[LOOP11:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08 + %0 = load i8, ptr %arrayidx, align 1 + %mul = shl i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08 + %1 = load i8, ptr %arrayidx1, align 1 + %add = add i8 %mul, %1 + store i8 %add, ptr %arrayidx1, align 1 + %inc = add nuw nsw i64 %i.08, 1 + %exitcond.not = icmp eq i64 %inc, 32 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 { +; CHECK-LABEL: @trip24_i8( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = shl <8 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i8> [[TMP3]], [[WIDE_LOAD1]] +; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[TMP5]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]] +; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP8]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP9]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %src, i64 %i.08 + %0 = load i8, ptr %arrayidx, align 1 + %mul = shl i8 %0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i.08 + %1 = load i8, ptr %arrayidx1, align 1 + %add = add i8 %mul, %1 + store i8 %add, ptr %arrayidx1, align 1 + %inc = add nuw nsw i64 %i.08, 1 + %exitcond.not = icmp eq i64 %inc, 24 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + attributes #0 = { "target-features"="+v,+d" vscale_range(2, 1024) } + From c9afd1ad783a67210bed4fd2f7108477fc986e15 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 21 Feb 2024 23:48:19 -0800 Subject: [PATCH 054/546] [RISCV] Add test case showing missed 
opportunity to form sextload when sext and zext nneg are both present. NFC --- llvm/test/CodeGen/RISCV/sext-zext-trunc.ll | 84 ++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll index a2a953ca882ba..09516d91771ca 100644 --- a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll +++ b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll @@ -871,3 +871,87 @@ define void @zext_nneg_dominating_icmp_i32_zeroext(i16 signext %0) { 5: ret void } + +; The load is used extended and non-extended in the successor basic block. The +; signed compare will cause the non-extended value to exported out of the first +; basic block using a sext to XLen. We need to CSE the zext nneg with the sext +; so that we can form a sextload. +define void @load_zext_nneg_sext_cse(ptr %p) nounwind { +; RV32I-LABEL: load_zext_nneg_sext_cse: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: lhu s0, 0(a0) +; RV32I-NEXT: slli a0, s0, 16 +; RV32I-NEXT: bltz a0, .LBB50_2 +; RV32I-NEXT: # %bb.1: # %bb1 +; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: call bar_i16 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: tail bar_i32 +; RV32I-NEXT: .LBB50_2: # %bb2 +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; +; RV64I-LABEL: load_zext_nneg_sext_cse: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill +; RV64I-NEXT: lhu s0, 0(a0) +; RV64I-NEXT: slli a0, s0, 48 +; RV64I-NEXT: bltz a0, .LBB50_2 +; RV64I-NEXT: # %bb.1: # %bb1 +; RV64I-NEXT: srai a0, a0, 48 +; RV64I-NEXT: call bar_i16 +; 
RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: tail bar_i32 +; RV64I-NEXT: .LBB50_2: # %bb2 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: load_zext_nneg_sext_cse: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: addi sp, sp, -16 +; RV64ZBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64ZBB-NEXT: sd s0, 0(sp) # 8-byte Folded Spill +; RV64ZBB-NEXT: lhu s0, 0(a0) +; RV64ZBB-NEXT: sext.h a0, s0 +; RV64ZBB-NEXT: bltz a0, .LBB50_2 +; RV64ZBB-NEXT: # %bb.1: # %bb1 +; RV64ZBB-NEXT: call bar_i16 +; RV64ZBB-NEXT: mv a0, s0 +; RV64ZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64ZBB-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64ZBB-NEXT: addi sp, sp, 16 +; RV64ZBB-NEXT: tail bar_i32 +; RV64ZBB-NEXT: .LBB50_2: # %bb2 +; RV64ZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64ZBB-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64ZBB-NEXT: addi sp, sp, 16 +; RV64ZBB-NEXT: ret + %load = load i16, ptr %p + %zext = zext nneg i16 %load to i32 + %cmp = icmp sgt i16 %load, -1 + br i1 %cmp, label %bb1, label %bb2 + +bb1: + tail call void @bar_i16(i16 signext %load) + tail call void @bar_i32(i32 signext %zext) + br label %bb2 + +bb2: + ret void +} +declare void @bar_i16(i16); From a51f4afc5aec8145091fead1d68c81e7d210fc0d Mon Sep 17 00:00:00 2001 From: Shimin Cui Date: Thu, 22 Feb 2024 12:04:08 -0500 Subject: [PATCH 055/546] [HCS] Externd to outline overlapping sub/super cold regions (#80732) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, with hot cold splitting, when a cold region is identified, it is added to the region list of ColdBlocks. 
Then when another cold region (B) identified overlaps with a ColdBlocks region (A) already added to the list, the region B is not added to the list because of the overlapping with region A. The splitting analysis is performed, and the region A may not get split, for example, if it’s considered too expansive. This is to improve the handling the overlapping case when the region A is not considered good for splitting, while the region B is good for splitting.   The change is to move the cold region splitting analysis earlier to allow more cold region splitting. If an identified region cannot be split, it will not be added to the candidate list of ColdBlocks for overlapping check. --- .../llvm/Transforms/IPO/HotColdSplitting.h | 15 +- llvm/lib/Transforms/IPO/HotColdSplitting.cpp | 154 ++++++++++-------- .../assumption-cache-invalidation.ll | 8 +- llvm/test/Transforms/HotColdSplit/eh-pads.ll | 7 +- .../HotColdSplit/outline-disjoint-diamonds.ll | 9 +- .../HotColdSplit/outline-inner-region.ll | 49 ++++++ .../HotColdSplit/outline-outer-region.ll | 52 ++++++ 7 files changed, 212 insertions(+), 82 deletions(-) create mode 100644 llvm/test/Transforms/HotColdSplit/outline-inner-region.ll create mode 100644 llvm/test/Transforms/HotColdSplit/outline-outer-region.ll diff --git a/llvm/include/llvm/Transforms/IPO/HotColdSplitting.h b/llvm/include/llvm/Transforms/IPO/HotColdSplitting.h index c87c6453500c5..13dda6d61284c 100644 --- a/llvm/include/llvm/Transforms/IPO/HotColdSplitting.h +++ b/llvm/include/llvm/Transforms/IPO/HotColdSplitting.h @@ -24,6 +24,7 @@ class TargetTransformInfo; class OptimizationRemarkEmitter; class AssumptionCache; class DominatorTree; +class CodeExtractor; class CodeExtractorAnalysisCache; /// A sequence of basic blocks. 
@@ -43,19 +44,17 @@ class HotColdSplitting { private: bool isFunctionCold(const Function &F) const; - bool isBasicBlockCold(BasicBlock* BB, - BranchProbability ColdProbThresh, - SmallPtrSetImpl &ColdBlocks, + bool isBasicBlockCold(BasicBlock *BB, BranchProbability ColdProbThresh, SmallPtrSetImpl &AnnotatedColdBlocks, BlockFrequencyInfo *BFI) const; bool shouldOutlineFrom(const Function &F) const; bool outlineColdRegions(Function &F, bool HasProfileSummary); - Function *extractColdRegion(const BlockSequence &Region, + bool isSplittingBeneficial(CodeExtractor &CE, const BlockSequence &Region, + TargetTransformInfo &TTI); + Function *extractColdRegion(BasicBlock &EntryPoint, CodeExtractor &CE, const CodeExtractorAnalysisCache &CEAC, - DominatorTree &DT, BlockFrequencyInfo *BFI, - TargetTransformInfo &TTI, - OptimizationRemarkEmitter &ORE, - AssumptionCache *AC, unsigned Count); + BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, + OptimizationRemarkEmitter &ORE); ProfileSummaryInfo *PSI; function_ref GetBFI; function_ref GetTTI; diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index fabb3c5fb921d..5f03bd59b8cd1 100644 --- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -215,15 +215,10 @@ bool HotColdSplitting::isFunctionCold(const Function &F) const { return false; } -bool HotColdSplitting::isBasicBlockCold(BasicBlock *BB, - BranchProbability ColdProbThresh, - SmallPtrSetImpl &ColdBlocks, - SmallPtrSetImpl &AnnotatedColdBlocks, - BlockFrequencyInfo *BFI) const { - // This block is already part of some outlining region. 
- if (ColdBlocks.count(BB)) - return true; - +bool HotColdSplitting::isBasicBlockCold( + BasicBlock *BB, BranchProbability ColdProbThresh, + SmallPtrSetImpl &AnnotatedColdBlocks, + BlockFrequencyInfo *BFI) const { if (BFI) { if (PSI->isColdBlock(BB, BFI)) return true; @@ -372,18 +367,12 @@ static int getOutliningPenalty(ArrayRef Region, return Penalty; } -Function *HotColdSplitting::extractColdRegion( - const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC, - DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, - OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) { +// Determine if it is beneficial to split the \p Region. +bool HotColdSplitting::isSplittingBeneficial(CodeExtractor &CE, + const BlockSequence &Region, + TargetTransformInfo &TTI) { assert(!Region.empty()); - // TODO: Pass BFI and BPI to update profile information. - CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr, - /* BPI */ nullptr, AC, /* AllowVarArgs */ false, - /* AllowAlloca */ false, /* AllocaBlock */ nullptr, - /* Suffix */ "cold." + std::to_string(Count)); - // Perform a simple cost/benefit analysis to decide whether or not to permit // splitting. SetVector Inputs, Outputs, Sinks; @@ -394,9 +383,18 @@ Function *HotColdSplitting::extractColdRegion( LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit << ", penalty = " << OutliningPenalty << "\n"); if (!OutliningBenefit.isValid() || OutliningBenefit <= OutliningPenalty) - return nullptr; + return false; + + return true; +} - Function *OrigF = Region[0]->getParent(); +// Split the single \p EntryPoint cold region. \p CE is the region code +// extractor. 
+Function *HotColdSplitting::extractColdRegion( + BasicBlock &EntryPoint, CodeExtractor &CE, + const CodeExtractorAnalysisCache &CEAC, BlockFrequencyInfo *BFI, + TargetTransformInfo &TTI, OptimizationRemarkEmitter &ORE) { + Function *OrigF = EntryPoint.getParent(); if (Function *OutF = CE.extractCodeRegion(CEAC)) { User *U = *OutF->user_begin(); CallInst *CI = cast(U); @@ -419,7 +417,7 @@ Function *HotColdSplitting::extractColdRegion( LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF); ORE.emit([&]() { return OptimizationRemark(DEBUG_TYPE, "HotColdSplit", - &*Region[0]->begin()) + &*EntryPoint.begin()) << ore::NV("Original", OrigF) << " split cold code into " << ore::NV("Split", OutF); }); @@ -428,9 +426,9 @@ Function *HotColdSplitting::extractColdRegion( ORE.emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", - &*Region[0]->begin()) + &*EntryPoint.begin()) << "Failed to extract region at block " - << ore::NV("Block", Region.front()); + << ore::NV("Block", &EntryPoint); }); return nullptr; } @@ -620,16 +618,18 @@ class OutliningRegion { } // namespace bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { - bool Changed = false; - - // The set of cold blocks. + // The set of cold blocks outlined. SmallPtrSet ColdBlocks; + // The set of cold blocks cannot be outlined. + SmallPtrSet CannotBeOutlinedColdBlocks; + // Set of cold blocks obtained with RPOT. SmallPtrSet AnnotatedColdBlocks; - // The worklist of non-intersecting regions left to outline. - SmallVector OutliningWorklist; + // The worklist of non-intersecting regions left to outline. The first member + // of the pair is the entry point into the region to be outlined. + SmallVector, 2> OutliningWorklist; // Set up an RPO traversal. 
Experimentally, this performs better (outlines // more) than a PO traversal, because we prevent region overlap by keeping @@ -655,10 +655,18 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { if (ColdBranchProbDenom.getNumOccurrences()) ColdProbThresh = BranchProbability(1, ColdBranchProbDenom.getValue()); + unsigned OutlinedFunctionID = 1; // Find all cold regions. for (BasicBlock *BB : RPOT) { - if (!isBasicBlockCold(BB, ColdProbThresh, ColdBlocks, AnnotatedColdBlocks, - BFI)) + // This block is already part of some outlining region. + if (ColdBlocks.count(BB)) + continue; + + // This block is already part of some region cannot be outlined. + if (CannotBeOutlinedColdBlocks.count(BB)) + continue; + + if (!isBasicBlockCold(BB, ColdProbThresh, AnnotatedColdBlocks, BFI)) continue; LLVM_DEBUG({ @@ -681,50 +689,68 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { return markFunctionCold(F); } - // If this outlining region intersects with another, drop the new region. - // - // TODO: It's theoretically possible to outline more by only keeping the - // largest region which contains a block, but the extra bookkeeping to do - // this is tricky/expensive. - bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) { - return !ColdBlocks.insert(Block.first).second; - }); - if (RegionsOverlap) - continue; + do { + BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT); + LLVM_DEBUG({ + dbgs() << "Hot/cold splitting attempting to outline these blocks:\n"; + for (BasicBlock *BB : SubRegion) + BB->dump(); + }); + + // TODO: Pass BFI and BPI to update profile information. + CodeExtractor CE( + SubRegion, &*DT, /* AggregateArgs */ false, /* BFI */ nullptr, + /* BPI */ nullptr, AC, /* AllowVarArgs */ false, + /* AllowAlloca */ false, /* AllocaBlock */ nullptr, + /* Suffix */ "cold." 
+ std::to_string(OutlinedFunctionID)); + + if (CE.isEligible() && isSplittingBeneficial(CE, SubRegion, TTI) && + // If this outlining region intersects with another, drop the new + // region. + // + // TODO: It's theoretically possible to outline more by only keeping + // the largest region which contains a block, but the extra + // bookkeeping to do this is tricky/expensive. + none_of(SubRegion, [&](BasicBlock *Block) { + return ColdBlocks.contains(Block); + })) { + ColdBlocks.insert(SubRegion.begin(), SubRegion.end()); + + for (auto *Block : SubRegion) { + LLVM_DEBUG(dbgs() + << " contains cold block:" << Block->getName() << "\n"); + } + + OutliningWorklist.emplace_back( + std::make_pair(SubRegion[0], std::move(CE))); + ++OutlinedFunctionID; + } else { + // The cold block region cannot be outlined. + for (auto *Block : SubRegion) + if ((DT->dominates(BB, Block) && PDT->dominates(Block, BB)) || + (PDT->dominates(BB, Block) && DT->dominates(Block, BB))) + // Will skip this cold block in the loop to save the compile time + CannotBeOutlinedColdBlocks.insert(Block); + } + } while (!Region.empty()); - OutliningWorklist.emplace_back(std::move(Region)); ++NumColdRegionsFound; } } if (OutliningWorklist.empty()) - return Changed; + return false; // Outline single-entry cold regions, splitting up larger regions as needed. - unsigned OutlinedFunctionID = 1; // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time. 
CodeExtractorAnalysisCache CEAC(F); - do { - OutliningRegion Region = OutliningWorklist.pop_back_val(); - assert(!Region.empty() && "Empty outlining region in worklist"); - do { - BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT); - LLVM_DEBUG({ - dbgs() << "Hot/cold splitting attempting to outline these blocks:\n"; - for (BasicBlock *BB : SubRegion) - BB->dump(); - }); - - Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI, - ORE, AC, OutlinedFunctionID); - if (Outlined) { - ++OutlinedFunctionID; - Changed = true; - } - } while (!Region.empty()); - } while (!OutliningWorklist.empty()); + for (auto &BCE : OutliningWorklist) { + Function *Outlined = + extractColdRegion(*BCE.first, BCE.second, CEAC, BFI, TTI, ORE); + assert(Outlined && "Should be outlined"); + } - return Changed; + return true; } bool HotColdSplitting::run(Module &M) { diff --git a/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll b/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll index 2154fb5cb5bc1..8bc71148352d2 100644 --- a/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll +++ b/llvm/test/Transforms/HotColdSplit/assumption-cache-invalidation.ll @@ -13,13 +13,13 @@ target triple = "aarch64" ; CHECK-NOT: @llvm.assume ; CHECK: } ; CHECK: declare {{.*}}@llvm.assume -; CHECK: define {{.*}}@f.cold.1() -; CHECK-LABEL: newFuncRoot: -; CHECK: } -; CHECK: define {{.*}}@f.cold.2(i64 %load1) +; CHECK: define {{.*}}@f.cold.1(i64 %load1) ; CHECK-LABEL: newFuncRoot: ; CHECK: %cmp1 = icmp eq i64 %load1, 0 ; CHECK-NOT: call void @llvm.assume +; CHECK: define {{.*}}@f.cold.2() +; CHECK-LABEL: newFuncRoot: +; CHECK: } define void @f() { entry: diff --git a/llvm/test/Transforms/HotColdSplit/eh-pads.ll b/llvm/test/Transforms/HotColdSplit/eh-pads.ll index 415c7e4b2bde3..ad7baf97f68d0 100644 --- a/llvm/test/Transforms/HotColdSplit/eh-pads.ll +++ b/llvm/test/Transforms/HotColdSplit/eh-pads.ll @@ -84,13 +84,16 @@ cold4: ; CHECK: sink ; 
CHECK-LABEL: define {{.*}}@bar.cold.1( +; CHECK: sideeffect(i32 0) + +; CHECK-LABEL: define {{.*}}@bar.cold.2( ; CHECK: sideeffect(i32 1) ; CHECK-LABEL: define {{.*}}@baz.cold.1( -; CHECK: sideeffect(i32 1) +; CHECK: sideeffect(i32 0) ; CHECK-LABEL: define {{.*}}@baz.cold.2( -; CHECK: sideeffect(i32 0) +; CHECK: sideeffect(i32 1) declare void @sideeffect(i32) diff --git a/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll index 65f8aad424066..0c055981260b2 100644 --- a/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll +++ b/llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll @@ -1,10 +1,10 @@ ; RUN: opt -S -passes=hotcoldsplit -hotcoldsplit-threshold=-1 < %s 2>&1 | FileCheck %s ; CHECK-LABEL: define {{.*}}@fun -; CHECK: call {{.*}}@fun.cold.2( -; CHECK-NEXT: ret void ; CHECK: call {{.*}}@fun.cold.1( ; CHECK-NEXT: ret void +; CHECK: call {{.*}}@fun.cold.2( +; CHECK-NEXT: ret void define void @fun() { entry: br i1 undef, label %A.then, label %A.else @@ -49,9 +49,10 @@ B.cleanup: } ; CHECK-LABEL: define {{.*}}@fun.cold.1( -; CHECK: %B.cleanup.dest.slot.0 = phi i32 [ 1, %B.then5 ], [ 0, %B.end ] +; CHECK: %A.cleanup.dest.slot.0 = phi i32 [ 1, %A.then5 ], [ 0, %A.end ] ; CHECK-NEXT: unreachable ; CHECK-LABEL: define {{.*}}@fun.cold.2( -; CHECK: %A.cleanup.dest.slot.0 = phi i32 [ 1, %A.then5 ], [ 0, %A.end ] +; CHECK: %B.cleanup.dest.slot.0 = phi i32 [ 1, %B.then5 ], [ 0, %B.end ] ; CHECK-NEXT: unreachable + diff --git a/llvm/test/Transforms/HotColdSplit/outline-inner-region.ll b/llvm/test/Transforms/HotColdSplit/outline-inner-region.ll new file mode 100644 index 0000000000000..73398bf365ff0 --- /dev/null +++ b/llvm/test/Transforms/HotColdSplit/outline-inner-region.ll @@ -0,0 +1,49 @@ +; RUN: opt -S -passes=hotcoldsplit -hotcoldsplit-max-params=1 < %s | FileCheck %s + +target datalayout = "E-m:a-p:32:32-i64:64-n32" +target triple = "powerpc64-ibm-aix7.2.0.0" + +define 
void @foo(i32 %cond) { +; CHECK-LABEL: define {{.*}}@foo( +; CHECK: if.then: +; CHECK: br i1 {{.*}}, label %if.then1, label %codeRepl +; CHECK-LABEL: codeRepl: +; CHECK-NEXT: call void @foo.cold.1 +; +entry: + %cond.addr = alloca i32 + store i32 %cond, ptr %cond.addr + %0 = load i32, ptr %cond.addr + %tobool = icmp ne i32 %0, 0 + br i1 %tobool, label %if.then, label %if.end2 + +if.then: ; preds = %entry + %1 = load i32, ptr %cond.addr + call void @sink(i32 %0) + %cmp = icmp sgt i32 %1, 10 + br i1 %cmp, label %if.then1, label %if.else + +if.then1: ; preds = %if.then + call void @sideeffect(i32 2) + br label %if.end + +if.else: ; preds = %if.then + call void @sink(i32 0) + call void @sideeffect(i32 0) + br label %if.end + +if.end: ; preds = %if.else, %if.then1 + br label %if.end2 + +if.end2: ; preds = %entry + call void @sideeffect(i32 1) + ret void +} + +; CHECK-LABEL: define {{.*}}@foo.cold.1 +; CHECK: call {{.*}}@sink +; CHECK-NEXT: call {{.*}}@sideeffect + +declare void @sideeffect(i32) + +declare void @sink(i32) cold diff --git a/llvm/test/Transforms/HotColdSplit/outline-outer-region.ll b/llvm/test/Transforms/HotColdSplit/outline-outer-region.ll new file mode 100644 index 0000000000000..4a3c96982a87b --- /dev/null +++ b/llvm/test/Transforms/HotColdSplit/outline-outer-region.ll @@ -0,0 +1,52 @@ +; RUN: opt -S -passes=hotcoldsplit -hotcoldsplit-threshold=2 < %s | FileCheck %s + +target datalayout = "E-m:a-p:32:32-i64:64-n32" +target triple = "powerpc64-ibm-aix7.2.0.0" + +define void @foo(i32 %cond, i32 %s0, i32 %s1) { +; CHECK-LABEL: define {{.*}}@foo( +; CHECK: br i1 {{.*}}, label %codeRepl, label %if.end2 +; CHECK-LABEL: codeRepl: +; CHECK-NEXT: call void @foo.cold.1 +; CHECK-LABEL: if.end2: +; CHECK: call void @sideeffect +; +entry: + %cond.addr = alloca i32 + store i32 %cond, ptr %cond.addr + %0 = load i32, ptr %cond.addr + %tobool = icmp ne i32 %0, 0 + br i1 %tobool, label %if.then, label %if.end2 + +if.then: ; preds = %entry + %1 = load i32, ptr %cond.addr + 
%cmp = icmp sgt i32 %1, 10 + br i1 %cmp, label %if.then1, label %if.else + +if.then1: ; preds = %if.then + call void @sideeffect(i32 0) + br label %if.end + +if.else: ; preds = %if.then + call void @sink(i32 %s0) + call void @sideeffect(i32 1) + br label %if.end + +if.end: ; preds = %if.else, %if.then1 + call void @sink(i32 %0) + ret void + +if.end2: ; preds = %entry + call void @sideeffect(i32 %s1) + ret void +} + +; CHECK-LABEL: define {{.*}}@foo.cold.1 +; CHECK: call {{.*}}@sink +; CHECK: call {{.*}}@sideeffect +; CHECK: call {{.*}}@sideeffect +; CHECK: call {{.*}}@sink + +declare void @sideeffect(i32) + +declare void @sink(i32) cold From c1716e3fcf4e43b4a328731920f76b2fce9485d0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 22 Feb 2024 09:06:49 -0800 Subject: [PATCH 056/546] [DAGCombiner][RISCV] CSE zext nneg and sext. (#82597) If we have a sext and a zext nneg with the same types and operand we should combine them into the sext. We can't go the other way because the nneg flag may only be valid in the context of the uses of the zext nneg. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 7 ++ llvm/test/CodeGen/RISCV/sext-zext-trunc.ll | 69 +++++++------------ 2 files changed, 30 insertions(+), 46 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 89ef648ee7d7e..ed43dd7f52882 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13997,6 +13997,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level)) return Res; + // CSE zext nneg with sext if the zext is not free. 
+ if (N->getFlags().hasNonNeg() && !TLI.isZExtFree(N0.getValueType(), VT)) { + SDNode *CSENode = DAG.getNodeIfExists(ISD::SIGN_EXTEND, N->getVTList(), N0); + if (CSENode) + return SDValue(CSENode, 0); + } + return SDValue(); } diff --git a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll index 09516d91771ca..87f2a6306bd60 100644 --- a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll +++ b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll @@ -882,11 +882,10 @@ define void @load_zext_nneg_sext_cse(ptr %p) nounwind { ; RV32I-NEXT: addi sp, sp, -16 ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; RV32I-NEXT: lhu s0, 0(a0) -; RV32I-NEXT: slli a0, s0, 16 -; RV32I-NEXT: bltz a0, .LBB50_2 +; RV32I-NEXT: lh s0, 0(a0) +; RV32I-NEXT: bltz s0, .LBB50_2 ; RV32I-NEXT: # %bb.1: # %bb1 -; RV32I-NEXT: srai a0, a0, 16 +; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: call bar_i16 ; RV32I-NEXT: mv a0, s0 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -899,48 +898,26 @@ define void @load_zext_nneg_sext_cse(ptr %p) nounwind { ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; -; RV64I-LABEL: load_zext_nneg_sext_cse: -; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -16 -; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64I-NEXT: lhu s0, 0(a0) -; RV64I-NEXT: slli a0, s0, 48 -; RV64I-NEXT: bltz a0, .LBB50_2 -; RV64I-NEXT: # %bb.1: # %bb1 -; RV64I-NEXT: srai a0, a0, 48 -; RV64I-NEXT: call bar_i16 -; RV64I-NEXT: mv a0, s0 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 -; RV64I-NEXT: tail bar_i32 -; RV64I-NEXT: .LBB50_2: # %bb2 -; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 16 -; RV64I-NEXT: ret -; -; RV64ZBB-LABEL: load_zext_nneg_sext_cse: -; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: addi sp, sp, -16 -; 
RV64ZBB-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64ZBB-NEXT: sd s0, 0(sp) # 8-byte Folded Spill -; RV64ZBB-NEXT: lhu s0, 0(a0) -; RV64ZBB-NEXT: sext.h a0, s0 -; RV64ZBB-NEXT: bltz a0, .LBB50_2 -; RV64ZBB-NEXT: # %bb.1: # %bb1 -; RV64ZBB-NEXT: call bar_i16 -; RV64ZBB-NEXT: mv a0, s0 -; RV64ZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64ZBB-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64ZBB-NEXT: addi sp, sp, 16 -; RV64ZBB-NEXT: tail bar_i32 -; RV64ZBB-NEXT: .LBB50_2: # %bb2 -; RV64ZBB-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64ZBB-NEXT: ld s0, 0(sp) # 8-byte Folded Reload -; RV64ZBB-NEXT: addi sp, sp, 16 -; RV64ZBB-NEXT: ret +; RV64-LABEL: load_zext_nneg_sext_cse: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 0(sp) # 8-byte Folded Spill +; RV64-NEXT: lh s0, 0(a0) +; RV64-NEXT: bltz s0, .LBB50_2 +; RV64-NEXT: # %bb.1: # %bb1 +; RV64-NEXT: mv a0, s0 +; RV64-NEXT: call bar_i16 +; RV64-NEXT: mv a0, s0 +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: tail bar_i32 +; RV64-NEXT: .LBB50_2: # %bb2 +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret %load = load i16, ptr %p %zext = zext nneg i16 %load to i32 %cmp = icmp sgt i16 %load, -1 From 5b53fa04db33a931b843b32946065490513484bf Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 22 Feb 2024 09:07:21 -0800 Subject: [PATCH 057/546] [RISCV] Enable -riscv-enable-sink-fold by default. (#82026) AArch64 has had it enabled since late November, so hopefully the main issues have been resolved. I see a small reduction in dynamic instruction count on every benchmark in specint2017. The best improvement was 0.3% so nothing amazing. 
--- llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 2 +- llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll | 8 ++++---- .../test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll | 8 ++++---- llvm/test/CodeGen/RISCV/split-offsets.ll | 4 ++-- llvm/test/CodeGen/RISCV/srem-vector-lkk.ll | 8 ++++---- llvm/test/CodeGen/RISCV/urem-vector-lkk.ll | 8 ++++---- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index adef40e19cba4..3e20e451410f6 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -84,7 +84,7 @@ static cl::opt EnableRISCVDeadRegisterElimination( static cl::opt EnableSinkFold("riscv-enable-sink-fold", cl::desc("Enable sinking and folding of instruction copies"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static cl::opt EnableLoopDataPrefetch("riscv-enable-loop-data-prefetch", cl::Hidden, diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll index 91e73992bdfa3..3c2e84689c979 100644 --- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll +++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ -; RUN: -riscv-enable-sink-fold | FileCheck -check-prefix=RV32I %s +; RUN: | FileCheck -check-prefix=RV32I %s ; RUN: llc -mtriple=riscv32 -verify-machineinstrs -code-model=medium < %s \ -; RUN: -riscv-enable-sink-fold | FileCheck -check-prefix=RV32I-MEDIUM %s +; RUN: | FileCheck -check-prefix=RV32I-MEDIUM %s ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ -; RUN: -riscv-enable-sink-fold | FileCheck -check-prefix=RV64I %s +; RUN: | FileCheck -check-prefix=RV64I %s ; RUN: llc -mtriple=riscv64 -verify-machineinstrs -code-model=medium < %s \ -; RUN: -riscv-enable-sink-fold | FileCheck 
-check-prefix=RV64I-MEDIUM %s +; RUN: | FileCheck -check-prefix=RV64I-MEDIUM %s ; We can often fold an ADDI into the offset of load/store instructions: ; (load (addi base, off1), off2) -> (load base, off1+off2) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 88c299a19fb4e..a09ab3ee0252a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \ -; RUN: -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32V +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32V ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \ -; RUN: -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64V +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64V ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+zve32f,+zvl128b -target-abi=ilp32d \ -; RUN: -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVE32F +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVE32F ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+zve32f,+zvl128b -target-abi=lp64d \ -; RUN: -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVE32F +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVE32F declare <1 x i8> @llvm.masked.gather.v1i8.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i8>) diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll index fc35bc4d2a16d..8d065daa2067c 100644 --- a/llvm/test/CodeGen/RISCV/split-offsets.ll 
+++ b/llvm/test/CodeGen/RISCV/split-offsets.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ ; RUN: | FileCheck %s -check-prefix=RV32I -; RUN: llc -mtriple=riscv64 -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck %s -check-prefix=RV64I ; Check that memory accesses to array elements with large offsets have those diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll index ec6e978c2c68e..7fc4713ac2d6e 100644 --- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV32I %s -; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV32IM %s -; RUN: llc -mtriple=riscv64 -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64I %s -; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64IM %s define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind { diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index eea8e64f2dddb..540883fdc517a 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -1,11 
+1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=CHECK,RV32I %s -; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=CHECK,RV32IM %s -; RUN: llc -mtriple=riscv64 -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=CHECK,RV64I %s -; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=CHECK,RV64IM %s From 26cc6f126a3b25644c595b3a5a0417b1e1ab42a8 Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Thu, 22 Feb 2024 09:09:08 -0800 Subject: [PATCH 058/546] [OpenACC] Implement 'break' and 'continue' errors for Compute Cnstrcts (#82543) OpenACC3.3 2.5.4 says: "A program may not branch into or out of a compute construct". While some of this restriction isn't particularly checkable, 'break' and 'continue' are possible and pretty trivial, so this patch implements those limitations. It IS unclear in the case of a 'break' in a 'switch' what should happen (an antagonistic reading of the standard would prevent it from appearing), however we're choosing to special-case the break-in-switch to ensure that this works (albeit, a 'parallel' directive on a 'switch' isn't particularly useful, though permitted). Future implementations of this rule will be in a follow-up patch. 
--- .../clang/Basic/DiagnosticSemaKinds.td | 2 + clang/include/clang/Sema/Scope.h | 17 ++++ clang/lib/Parse/ParseOpenACC.cpp | 17 ++++ clang/lib/Sema/Scope.cpp | 1 + clang/lib/Sema/SemaStmt.cpp | 22 +++++ clang/test/SemaOpenACC/no-branch-in-out.c | 95 +++++++++++++++++++ 6 files changed, 154 insertions(+) create mode 100644 clang/test/SemaOpenACC/no-branch-in-out.c diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index a96f69d6ac760..ebda201361fb0 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12203,4 +12203,6 @@ def warn_acc_clause_unimplemented def err_acc_construct_appertainment : Error<"OpenACC construct '%0' cannot be used here; it can only " "be used in a statement context">; +def err_acc_branch_in_out + : Error<"invalid branch %select{out of|into}0 OpenACC region">; } // end of sema component. diff --git a/clang/include/clang/Sema/Scope.h b/clang/include/clang/Sema/Scope.h index 9e81706cd2aa1..e7f166fe3461f 100644 --- a/clang/include/clang/Sema/Scope.h +++ b/clang/include/clang/Sema/Scope.h @@ -150,6 +150,9 @@ class Scope { /// template scope in between), the outer scope does not increase the /// depth of recursion. LambdaScope = 0x8000000, + /// This is the scope of an OpenACC Compute Construct, which restricts + /// jumping into/out of it. + OpenACCComputeConstructScope = 0x10000000, }; private: @@ -469,6 +472,14 @@ class Scope { return false; } + /// Return true if this scope is a loop. + bool isLoopScope() const { + // 'switch' is the only loop that is not a 'break' scope as well, so we can + // just check BreakScope and not SwitchScope. 
+ return (getFlags() & Scope::BreakScope) && + !(getFlags() & Scope::SwitchScope); + } + /// Determines whether this scope is the OpenMP directive scope bool isOpenMPDirectiveScope() const { return (getFlags() & Scope::OpenMPDirectiveScope); @@ -504,6 +515,12 @@ class Scope { return getFlags() & Scope::OpenMPOrderClauseScope; } + /// Determine whether this scope is the statement associated with an OpenACC + /// Compute construct directive. + bool isOpenACCComputeConstructScope() const { + return getFlags() & Scope::OpenACCComputeConstructScope; + } + /// Determine whether this scope is a while/do/for statement, which can have /// continue statements embedded into it. bool isContinueScope() const { diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index 50e78e8687aea..4946a61fca007 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -560,6 +560,21 @@ bool doesDirectiveHaveAssociatedStmt(OpenACCDirectiveKind DirKind) { llvm_unreachable("Unhandled directive->assoc stmt"); } +unsigned getOpenACCScopeFlags(OpenACCDirectiveKind DirKind) { + switch (DirKind) { + case OpenACCDirectiveKind::Parallel: + // Mark this as a BreakScope/ContinueScope as well as a compute construct + // so that we can diagnose trying to 'break'/'continue' inside of one. 
+ return Scope::BreakScope | Scope::ContinueScope | + Scope::OpenACCComputeConstructScope; + case OpenACCDirectiveKind::Invalid: + llvm_unreachable("Shouldn't be creating a scope for an invalid construct"); + default: + break; + } + return 0; +} + } // namespace // OpenACC 3.3, section 1.7: @@ -1228,6 +1243,8 @@ StmtResult Parser::ParseOpenACCDirectiveStmt() { if (doesDirectiveHaveAssociatedStmt(DirInfo.DirKind)) { ParsingOpenACCDirectiveRAII DirScope(*this, /*Value=*/false); + ParseScope ACCScope(this, getOpenACCScopeFlags(DirInfo.DirKind)); + AssocStmt = getActions().ActOnOpenACCAssociatedStmt(DirInfo.DirKind, ParseStatement()); } diff --git a/clang/lib/Sema/Scope.cpp b/clang/lib/Sema/Scope.cpp index 4570d8c615fe5..cea6a62e34747 100644 --- a/clang/lib/Sema/Scope.cpp +++ b/clang/lib/Sema/Scope.cpp @@ -225,6 +225,7 @@ void Scope::dumpImpl(raw_ostream &OS) const { {CompoundStmtScope, "CompoundStmtScope"}, {ClassInheritanceScope, "ClassInheritanceScope"}, {CatchScope, "CatchScope"}, + {OpenACCComputeConstructScope, "OpenACCComputeConstructScope"}, }; for (auto Info : FlagInfo) { diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index dde3bd84e89f8..fcad09a63662b 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3356,6 +3356,14 @@ Sema::ActOnContinueStmt(SourceLocation ContinueLoc, Scope *CurScope) { // initialization of that variable. return StmtError(Diag(ContinueLoc, diag::err_continue_from_cond_var_init)); } + + // A 'continue' that would normally have execution continue on a block outside + // of a compute construct counts as 'branching out of' the compute construct, + // so diagnose here. 
+ if (S->isOpenACCComputeConstructScope()) + return StmtError(Diag(ContinueLoc, diag::err_acc_branch_in_out) + << /*out of */ 0); + CheckJumpOutOfSEHFinally(*this, ContinueLoc, *S); return new (Context) ContinueStmt(ContinueLoc); @@ -3371,6 +3379,20 @@ Sema::ActOnBreakStmt(SourceLocation BreakLoc, Scope *CurScope) { if (S->isOpenMPLoopScope()) return StmtError(Diag(BreakLoc, diag::err_omp_loop_cannot_use_stmt) << "break"); + + // OpenACC doesn't allow 'break'ing from a compute construct, so diagnose if + // we are trying to do so. This can come in 2 flavors: 1-the break'able thing + // (besides the compute construct) 'contains' the compute construct, at which + // point the 'break' scope will be the compute construct. Else it could be a + // loop of some sort that has a direct parent of the compute construct. + // However, a 'break' in a 'switch' marked as a compute construct doesn't + // count as 'branch out of' the compute construct. + if (S->isOpenACCComputeConstructScope() || + (S->isLoopScope() && S->getParent() && + S->getParent()->isOpenACCComputeConstructScope())) + return StmtError(Diag(BreakLoc, diag::err_acc_branch_in_out) + << /*out of */ 0); + CheckJumpOutOfSEHFinally(*this, BreakLoc, *S); return new (Context) BreakStmt(BreakLoc); diff --git a/clang/test/SemaOpenACC/no-branch-in-out.c b/clang/test/SemaOpenACC/no-branch-in-out.c new file mode 100644 index 0000000000000..622cf55f48473 --- /dev/null +++ b/clang/test/SemaOpenACC/no-branch-in-out.c @@ -0,0 +1,95 @@ +// RUN: %clang_cc1 %s -verify -fopenacc + +void BreakContinue() { + +#pragma acc parallel + for(int i =0; i < 5; ++i) { + switch(i) { + case 0: + break; // leaves switch, not 'for'. 
+ default: + i +=2; + break; + } + if (i == 2) + continue; + + break; // expected-error{{invalid branch out of OpenACC region}} + } + + int j; + switch(j) { + case 0: +#pragma acc parallel + { + break; // expected-error{{invalid branch out of OpenACC region}} + } + case 1: +#pragma acc parallel + { + } + break; + } + +#pragma acc parallel + for(int i = 0; i < 5; ++i) { + if (i > 1) + break; // expected-error{{invalid branch out of OpenACC region}} + } + +#pragma acc parallel + switch(j) { + case 1: + break; + } + +#pragma acc parallel + { + for(int i = 1; i < 100; i++) { + if (i > 4) + break; + } + } + + for (int i =0; i < 5; ++i) { +#pragma acc parallel + { + continue; // expected-error{{invalid branch out of OpenACC region}} + } + } + +#pragma acc parallel + for (int i =0; i < 5; ++i) { + continue; + } + +#pragma acc parallel + for (int i =0; i < 5; ++i) { + { + continue; + } + } + + for (int i =0; i < 5; ++i) { +#pragma acc parallel + { + break; // expected-error{{invalid branch out of OpenACC region}} + } + } + +#pragma acc parallel + while (j) { + --j; + if (j > 4) + break; // expected-error{{invalid branch out of OpenACC region}} + } + +#pragma acc parallel + do { + --j; + if (j > 4) + break; // expected-error{{invalid branch out of OpenACC region}} + } while (j ); + +} + From 87b1e735b28f81d9012fd302cd07385db50a274f Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 23 Feb 2024 01:16:39 +0800 Subject: [PATCH 059/546] [ConstraintElim] Decompose sext-like insts for signed predicates (#82344) Alive2: https://alive2.llvm.org/ce/z/A8dtGp Fixes #82271. 
--- .../Scalar/ConstraintElimination.cpp | 13 ++- .../ConstraintElimination/minmax.ll | 9 +- .../Transforms/ConstraintElimination/sext.ll | 84 +++++++++++++------ 3 files changed, 71 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index db05c63f388fb..9b6a39e98f5ce 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -499,6 +499,8 @@ static Decomposition decompose(Value *V, if (!Ty->isIntegerTy() || Ty->getIntegerBitWidth() > 64) return V; + bool IsKnownNonNegative = false; + // Decompose \p V used with a signed predicate. if (IsSigned) { if (auto *CI = dyn_cast(V)) { @@ -507,6 +509,14 @@ static Decomposition decompose(Value *V, } Value *Op0; Value *Op1; + + if (match(V, m_SExt(m_Value(Op0)))) + V = Op0; + else if (match(V, m_NNegZExt(m_Value(Op0)))) { + V = Op0; + IsKnownNonNegative = true; + } + if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1)))) return MergeResults(Op0, Op1, IsSigned); @@ -529,7 +539,7 @@ static Decomposition decompose(Value *V, } } - return V; + return {V, IsKnownNonNegative}; } if (auto *CI = dyn_cast(V)) { @@ -539,7 +549,6 @@ static Decomposition decompose(Value *V, } Value *Op0; - bool IsKnownNonNegative = false; if (match(V, m_ZExt(m_Value(Op0)))) { IsKnownNonNegative = true; V = Op0; diff --git a/llvm/test/Transforms/ConstraintElimination/minmax.ll b/llvm/test/Transforms/ConstraintElimination/minmax.ll index ab3e9f381245b..029b6508a2106 100644 --- a/llvm/test/Transforms/ConstraintElimination/minmax.ll +++ b/llvm/test/Transforms/ConstraintElimination/minmax.ll @@ -611,8 +611,7 @@ define i64 @pr82271(i32 %a, i32 %b){ ; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A]] to i64 ; CHECK-NEXT: [[SB:%.*]] = sext i32 [[B]] to i64 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[SB]], i64 [[ADD]]) -; CHECK-NEXT: ret i64 
[[SMAX]] +; CHECK-NEXT: ret i64 [[SB]] ; CHECK: else: ; CHECK-NEXT: ret i64 0 ; @@ -641,8 +640,7 @@ define i64 @pr82271_sext_zext_nneg(i32 %a, i32 %b){ ; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A]] to i64 ; CHECK-NEXT: [[SB:%.*]] = zext nneg i32 [[B]] to i64 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[SB]], i64 [[ADD]]) -; CHECK-NEXT: ret i64 [[SMAX]] +; CHECK-NEXT: ret i64 [[SB]] ; CHECK: else: ; CHECK-NEXT: ret i64 0 ; @@ -671,8 +669,7 @@ define i64 @pr82271_zext_nneg(i32 %a, i32 %b){ ; CHECK-NEXT: [[SA:%.*]] = zext nneg i32 [[A]] to i64 ; CHECK-NEXT: [[SB:%.*]] = zext nneg i32 [[B]] to i64 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[SB]], i64 [[ADD]]) -; CHECK-NEXT: ret i64 [[SMAX]] +; CHECK-NEXT: ret i64 [[SB]] ; CHECK: else: ; CHECK-NEXT: ret i64 0 ; diff --git a/llvm/test/Transforms/ConstraintElimination/sext.ll b/llvm/test/Transforms/ConstraintElimination/sext.ll index ed8dd502b6ef9..5a8a37d0d5703 100644 --- a/llvm/test/Transforms/ConstraintElimination/sext.ll +++ b/llvm/test/Transforms/ConstraintElimination/sext.ll @@ -11,8 +11,7 @@ define i1 @cmp_sext(i32 %a, i32 %b){ ; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A]] to i64 ; CHECK-NEXT: [[SB:%.*]] = sext i32 [[B]] to i64 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i64 [[SB]], [[ADD]] -; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK-NEXT: ret i1 true ; CHECK: else: ; CHECK-NEXT: ret i1 false ; @@ -31,33 +30,32 @@ else: ret i1 false } -define i1 @cmp_sext_positive_increment(i32 %a, i32 %b, i64 %c){ -; CHECK-LABEL: define i1 @cmp_sext_positive_increment( -; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) { +define i1 @cmp_sext_add(i32 %a, i32 %b){ +; CHECK-LABEL: define i1 @cmp_sext_add( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[POS:%.*]] = icmp sgt i64 [[C]], 0 -; CHECK-NEXT: call void @llvm.assume(i1 
[[POS]]) ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]] ; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: -; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A]] to i64 -; CHECK-NEXT: [[SB:%.*]] = sext i32 [[B]] to i64 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], [[C]] -; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i64 [[SB]], [[ADD]] -; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK-NEXT: [[A1:%.*]] = add nsw i32 [[A]], 1 +; CHECK-NEXT: [[B1:%.*]] = add nsw i32 [[B]], 1 +; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A1]] to i64 +; CHECK-NEXT: [[SB:%.*]] = sext i32 [[B1]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 +; CHECK-NEXT: ret i1 true ; CHECK: else: ; CHECK-NEXT: ret i1 false ; entry: - %pos = icmp sgt i64 %c, 0 - call void @llvm.assume(i1 %pos) %cmp = icmp slt i32 %a, %b br i1 %cmp, label %then, label %else then: - %sa = sext i32 %a to i64 - %sb = sext i32 %b to i64 - %add = add nsw i64 %sa, %c + %a1 = add nsw i32 %a, 1 + %b1 = add nsw i32 %b, 1 + %sa = sext i32 %a1 to i64 + %sb = sext i32 %b1 to i64 + %add = add nsw i64 %sa, 1 %cmp2 = icmp sge i64 %sb, %add ret i1 %cmp2 @@ -65,30 +63,33 @@ else: ret i1 false } -define i1 @cmp_sext_sgt(i32 %a, i32 %b){ -; CHECK-LABEL: define i1 @cmp_sext_sgt( -; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +define i1 @cmp_sext_dynamic_increment(i32 %a, i32 %b, i64 %c){ +; CHECK-LABEL: define i1 @cmp_sext_dynamic_increment( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[POS:%.*]] = icmp slt i64 [[C]], 2 +; CHECK-NEXT: call void @llvm.assume(i1 [[POS]]) ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]] ; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A]] to i64 ; CHECK-NEXT: [[SB:%.*]] = sext i32 [[B]] to i64 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[SB]], [[ADD]] -; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK-NEXT: [[ADD:%.*]] = add 
nsw i64 [[SA]], [[C]] +; CHECK-NEXT: ret i1 true ; CHECK: else: ; CHECK-NEXT: ret i1 false ; entry: + %pos = icmp slt i64 %c, 2 + call void @llvm.assume(i1 %pos) %cmp = icmp slt i32 %a, %b br i1 %cmp, label %then, label %else then: %sa = sext i32 %a to i64 %sb = sext i32 %b to i64 - %add = add nsw i64 %sa, 1 - %cmp2 = icmp sgt i64 %sb, %add + %add = add nsw i64 %sa, %c + %cmp2 = icmp sge i64 %sb, %add ret i1 %cmp2 else: @@ -105,8 +106,7 @@ define i1 @cmp_zext_nneg(i32 %a, i32 %b){ ; CHECK-NEXT: [[SA:%.*]] = zext nneg i32 [[A]] to i64 ; CHECK-NEXT: [[SB:%.*]] = zext nneg i32 [[B]] to i64 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i64 [[SB]], [[ADD]] -; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK-NEXT: ret i1 true ; CHECK: else: ; CHECK-NEXT: ret i1 false ; @@ -216,3 +216,33 @@ then: else: ret i1 false } + +define i1 @cmp_sext_sgt(i32 %a, i32 %b){ +; CHECK-LABEL: define i1 @cmp_sext_sgt( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A]] to i64 +; CHECK-NEXT: [[SB:%.*]] = sext i32 [[B]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i64 [[SB]], [[ADD]] +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: else: +; CHECK-NEXT: ret i1 false +; +entry: + %cmp = icmp slt i32 %a, %b + br i1 %cmp, label %then, label %else + +then: + %sa = sext i32 %a to i64 + %sb = sext i32 %b to i64 + %add = add nsw i64 %sa, 1 + %cmp2 = icmp sgt i64 %sb, %add + ret i1 %cmp2 + +else: + ret i1 false +} From 26d71d9ed56c4c23e6284dac7a9bdf603a5801f3 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 22 Feb 2024 09:24:21 -0800 Subject: [PATCH 060/546] [llvm-readobj,ELF] Support --decompress/-z (#82594) When a section has the SHF_COMPRESSED flag, -p/-x dump the compressed content by default. 
In GNU readelf, if --decompress/-z is specified, -p/-x will dump the decompressed content. This patch implements the option. Close #82507 --- llvm/docs/CommandGuide/llvm-readelf.rst | 5 ++ llvm/docs/CommandGuide/llvm-readobj.rst | 5 ++ .../ELF/decompress-zlib-unsupported.test | 32 ++++++++ .../llvm-readobj/ELF/decompress-zlib.test | 76 +++++++++++++++++++ .../ELF/decompress-zstd-unsupported.test | 31 ++++++++ .../llvm-readobj/ELF/decompress-zstd.test | 28 +++++++ llvm/tools/llvm-readobj/ObjDumper.cpp | 26 ++++++- llvm/tools/llvm-readobj/ObjDumper.h | 4 +- llvm/tools/llvm-readobj/Opts.td | 2 + llvm/tools/llvm-readobj/llvm-readobj.cpp | 6 +- 10 files changed, 209 insertions(+), 6 deletions(-) create mode 100644 llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test create mode 100644 llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test create mode 100644 llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test create mode 100644 llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test diff --git a/llvm/docs/CommandGuide/llvm-readelf.rst b/llvm/docs/CommandGuide/llvm-readelf.rst index 6ee4a5dfb1591..675628fdda45e 100644 --- a/llvm/docs/CommandGuide/llvm-readelf.rst +++ b/llvm/docs/CommandGuide/llvm-readelf.rst @@ -38,6 +38,11 @@ OPTIONS Display the contents of the basic block address map section(s), which contain the address of each function, along with the relative offset of each basic block. +.. option:: --decompress, -z + + Dump decompressed section content when used with ``-x`` or ``-p``. + If the section(s) are not compressed, they are displayed as is. + .. option:: --demangle, -C Display demangled symbol names in the output. diff --git a/llvm/docs/CommandGuide/llvm-readobj.rst b/llvm/docs/CommandGuide/llvm-readobj.rst index cb9232ef5e560..6d78a03872344 100644 --- a/llvm/docs/CommandGuide/llvm-readobj.rst +++ b/llvm/docs/CommandGuide/llvm-readobj.rst @@ -56,6 +56,11 @@ file formats. Display the address-significance table. +.. 
option:: --decompress, -z + + Dump decompressed section content when used with ``-x`` or ``-p``. + If the section(s) are not compressed, they are displayed as is. + .. option:: --expand-relocs When used with :option:`--relocs`, display each relocation in an expanded diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test new file mode 100644 index 0000000000000..f4c73de7ca6c9 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib-unsupported.test @@ -0,0 +1,32 @@ +# UNSUPPORTED: zlib +# RUN: yaml2obj %s -o %t +# RUN: llvm-readobj -z -p .a -x .b %t 2>&1 | FileCheck %s -DFILE=%t + +# CHECK: String dump of section '.a': +# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZLIB or did not find zlib at build time +# CHECK-NEXT: [ 0] . +# CHECK-NEXT: [ 8] . +# CHECK-NEXT: [ 10] . +# CHECK-NEXT: [ 18] x.c. +# CHECK-NEXT: [ 1e] . +# CHECK-NEXT: [ 20] . +# CHECK-NEXT: Hex dump of section '.b': +# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZLIB or did not find zlib at build time +# CHECK-NEXT: 0x00000000 01000000 00000000 01000000 00000000 ................ +# CHECK-NEXT: 0x00000010 01000000 00000000 789c6304 00000200 ........x.c..... +# CHECK-NEXT: 0x00000020 02 . 
+ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .a + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 010000000000000001000000000000000100000000000000789c63040000020002 + - Name: .b + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 010000000000000001000000000000000100000000000000789c63040000020002 diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test new file mode 100644 index 0000000000000..ea7a8854eb1a0 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zlib.test @@ -0,0 +1,76 @@ +# REQUIRES: zlib +## Test --decompress/-z. + +# RUN: yaml2obj %s -o %t + +# RUN: llvm-readelf -z -x .strings -x .not_null_terminated %t | FileCheck %s --check-prefix=HEX +# RUN: llvm-readobj --decompress -p .strings -p .not_null_terminated %t | FileCheck %s --check-prefix=STR + +# HEX: Hex dump of section '.strings': +# HEX-NEXT: 0x00000000 68657265 00617265 00736f6d 65007374 here.are.some.st +# HEX-NEXT: 0x00000010 72696e67 7300 rings. +# HEX: Hex dump of section '.not_null_terminated': +# HEX-NEXT: 0x00000000 6e6f006e 756c6c no.null + +# STR: String dump of section '.strings': +# STR-NEXT: [ 0] here +# STR-NEXT: [ 5] are +# STR-NEXT: [ 9] some +# STR-NEXT: [ e] strings +# STR-EMPTY: +# STR-NEXT: String dump of section '.not_null_terminated': +# STR-NEXT: [ 0] no +# STR-NEXT: [ 3] null{{$}} +# STR-NOT: {{.}} + +# RUN: llvm-readobj -x .strings -p .not_null_terminated %t | FileCheck %s --check-prefix=COMPRESSED + +# COMPRESSED: String dump of section '.not_null_terminated': +# COMPRESSED-NEXT: [ 0] no +# COMPRESSED-NEXT: [ 3] null +# COMPRESSED-NEXT: Hex dump of section '.strings': +# COMPRESSED-NEXT: 0x00000000 01000000 00000000 16000000 00000000 ................ 
+# COMPRESSED-NEXT: 0x00000010 00000000 00000000 789ccb48 2d4a6548 ........x..H-JeH +# COMPRESSED-NEXT: 0x00000020 04e2e2fc 5c205152 9499975e cc000058 ....\ QR...^...X +# COMPRESSED-NEXT: 0x00000030 2e079b ... + +# RUN: llvm-readelf -z -p .invalid1 -x .invalid2 -x .invalid3 %t 2>&1 | FileCheck %s -DFILE=%t --check-prefix=INVALID + +# INVALID: String dump of section '.invalid1': +# INVALID-NEXT: warning: '[[FILE]]': corrupted compressed section header +# INVALID-NEXT: [ 0] . +# INVALID-NEXT: Hex dump of section '.invalid2': +# INVALID-NEXT: warning: '[[FILE]]': zlib error: Z_DATA_ERROR +# INVALID-NEXT: 0x00000000 01000000 00000000 16000000 00000000 ................ +# INVALID-NEXT: 0x00000010 00000000 00000000 78 ........x +# INVALID-EMPTY: +# INVALID-NEXT: Hex dump of section '.invalid3': +# INVALID-NEXT: warning: '[[FILE]]': unsupported compression type (3) +# INVALID-NEXT: 0x00000000 03000000 00000000 04000000 00000000 ................ +# INVALID-NEXT: 0x00000010 00000000 00000000 789c6360 ........x.c` + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .strings + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 010000000000000016000000000000000000000000000000789ccb482d4a654804e2e2fc5c2051529499975ecc0000582e079b + - Name: .not_null_terminated + Type: SHT_PROGBITS + Content: 6e6f006e756c6c + - Name: .invalid1 + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 01 + - Name: .invalid2 + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 01000000000000001600000000000000000000000000000078 + - Name: .invalid3 + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 030000000000000004000000000000000000000000000000789c6360 diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test new file mode 100644 index 0000000000000..65da952687f52 --- /dev/null +++ 
b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd-unsupported.test @@ -0,0 +1,31 @@ +# UNSUPPORTED: zstd +# RUN: yaml2obj %s -o %t +# RUN: llvm-readobj -z -p .a -x .b %t 2>&1 | FileCheck %s -DFILE=%t + +# CHECK: String dump of section '.a': +# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZSTD or did not find zstd at build time +# CHECK-NEXT: [ 0] . +# CHECK-NEXT: [ 8] . +# CHECK-NEXT: [ 10] . +# CHECK-NEXT: [ 18] (./. .. +# CHECK-NEXT: [ 21] . +# CHECK-NEXT: Hex dump of section '.b': +# CHECK-NEXT: warning: '[[FILE]]': LLVM was not built with LLVM_ENABLE_ZSTD or did not find zstd at build time +# CHECK-NEXT: 0x00000000 02000000 00000000 01000000 00000000 ................ +# CHECK-NEXT: 0x00000010 01000000 00000000 28b52ffd 20010900 ........(./. ... +# CHECK-NEXT: 0x00000020 0001 .. + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .a + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 02000000000000000100000000000000010000000000000028b52ffd200109000001 + - Name: .b + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 02000000000000000100000000000000010000000000000028b52ffd200109000001 diff --git a/llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test new file mode 100644 index 0000000000000..519db879b18c1 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/decompress-zstd.test @@ -0,0 +1,28 @@ +# REQUIRES: zstd +## Test --decompress/-z for zstd. + +# RUN: yaml2obj %s -o %t + +# RUN: llvm-readelf -z -x .strings %t | FileCheck %s --check-prefix=HEX +# RUN: llvm-readobj --decompress -p .strings %t | FileCheck %s --check-prefix=STR + +# HEX: Hex dump of section '.strings': +# HEX-NEXT: 0x00000000 68657265 00617265 00736f6d 65007374 here.are.some.st +# HEX-NEXT: 0x00000010 72696e67 7300 rings. 
+ +# STR: String dump of section '.strings': +# STR-NEXT: [ 0] here +# STR-NEXT: [ 5] are +# STR-NEXT: [ 9] some +# STR-NEXT: [ e] strings + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL +Sections: + - Name: .strings + Type: SHT_PROGBITS + Flags: [SHF_COMPRESSED] + Content: 02000000000000001600000000000000000000000000000028b52ffd2016b10000686572650061726500736f6d6500737472696e677300 diff --git a/llvm/tools/llvm-readobj/ObjDumper.cpp b/llvm/tools/llvm-readobj/ObjDumper.cpp index 59060ac217e32..0d3fea71aafd4 100644 --- a/llvm/tools/llvm-readobj/ObjDumper.cpp +++ b/llvm/tools/llvm-readobj/ObjDumper.cpp @@ -14,6 +14,7 @@ #include "ObjDumper.h" #include "llvm-readobj.h" #include "llvm/Object/Archive.h" +#include "llvm/Object/Decompressor.h" #include "llvm/Object/ObjectFile.h" #include "llvm/Support/Error.h" #include "llvm/Support/FormatVariadic.h" @@ -142,8 +143,23 @@ getSectionRefsByNameOrIndex(const object::ObjectFile &Obj, return Ret; } +static void maybeDecompress(const object::ObjectFile &Obj, + StringRef SectionName, StringRef &SectionContent, + SmallString<0> &Out) { + Expected Decompressor = object::Decompressor::create( + SectionName, SectionContent, Obj.isLittleEndian(), Obj.is64Bit()); + if (!Decompressor) + reportWarning(Decompressor.takeError(), Obj.getFileName()); + else if (auto Err = Decompressor->resizeAndDecompress(Out)) + reportWarning(std::move(Err), Obj.getFileName()); + else + SectionContent = Out; +} + void ObjDumper::printSectionsAsString(const object::ObjectFile &Obj, - ArrayRef Sections) { + ArrayRef Sections, + bool Decompress) { + SmallString<0> Out; bool First = true; for (object::SectionRef Section : getSectionRefsByNameOrIndex(Obj, Sections)) { @@ -156,12 +172,16 @@ void ObjDumper::printSectionsAsString(const object::ObjectFile &Obj, StringRef SectionContent = unwrapOrError(Obj.getFileName(), Section.getContents()); + if (Decompress && Section.isCompressed()) + maybeDecompress(Obj, SectionName, 
SectionContent, Out); printAsStringList(SectionContent); } } void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj, - ArrayRef Sections) { + ArrayRef Sections, + bool Decompress) { + SmallString<0> Out; bool First = true; for (object::SectionRef Section : getSectionRefsByNameOrIndex(Obj, Sections)) { @@ -174,6 +194,8 @@ void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj, StringRef SectionContent = unwrapOrError(Obj.getFileName(), Section.getContents()); + if (Decompress && Section.isCompressed()) + maybeDecompress(Obj, SectionName, SectionContent, Out); const uint8_t *SecContent = SectionContent.bytes_begin(); const uint8_t *SecEnd = SecContent + SectionContent.size(); diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h index 1d679453581bc..3958dd3a33333 100644 --- a/llvm/tools/llvm-readobj/ObjDumper.h +++ b/llvm/tools/llvm-readobj/ObjDumper.h @@ -175,9 +175,9 @@ class ObjDumper { void printAsStringList(StringRef StringContent, size_t StringDataOffset = 0); void printSectionsAsString(const object::ObjectFile &Obj, - ArrayRef Sections); + ArrayRef Sections, bool Decompress); void printSectionsAsHex(const object::ObjectFile &Obj, - ArrayRef Sections); + ArrayRef Sections, bool Decompress); std::function WarningHandler; void reportUniqueWarning(Error Err) const; diff --git a/llvm/tools/llvm-readobj/Opts.td b/llvm/tools/llvm-readobj/Opts.td index e2d93c6ec229e..018facc278e89 100644 --- a/llvm/tools/llvm-readobj/Opts.td +++ b/llvm/tools/llvm-readobj/Opts.td @@ -20,6 +20,7 @@ def all : FF<"all", "Equivalent to setting: --file-header, --program-headers, -- def arch_specific : FF<"arch-specific", "Display architecture-specific information">; def bb_addr_map : FF<"bb-addr-map", "Display the BB address map section">; def cg_profile : FF<"cg-profile", "Display call graph profile section">; +def decompress : FF<"decompress", "Dump decompressed section content when used with -x or -p">; defm demangle : BB<"demangle", 
"Demangle symbol names", "Do not demangle symbol names (default)">; def dependent_libraries : FF<"dependent-libraries", "Display the dependent libraries section">; def dyn_relocations : FF<"dyn-relocations", "Display the dynamic relocation entries in the file">; @@ -139,3 +140,4 @@ def : F<"u", "Alias for --unwind">, Alias; def : F<"X", "Alias for --extra-sym-info">, Alias, Group; def : F<"V", "Alias for --version-info">, Alias, Group; def : JoinedOrSeparate<["-"], "x">, Alias, HelpText<"Alias for --hex-dump">, MetaVarName<"">; +def : F<"z", "Alias for --decompress">, Alias; diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp index f9d605d35244b..979433d69011c 100644 --- a/llvm/tools/llvm-readobj/llvm-readobj.cpp +++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp @@ -97,6 +97,7 @@ static bool ArchSpecificInfo; static bool BBAddrMap; bool ExpandRelocs; static bool CGProfile; +static bool Decompress; bool Demangle; static bool DependentLibraries; static bool DynRelocs; @@ -212,6 +213,7 @@ static void parseOptions(const opt::InputArgList &Args) { opts::ArchSpecificInfo = Args.hasArg(OPT_arch_specific); opts::BBAddrMap = Args.hasArg(OPT_bb_addr_map); opts::CGProfile = Args.hasArg(OPT_cg_profile); + opts::Decompress = Args.hasArg(OPT_decompress); opts::Demangle = Args.hasFlag(OPT_demangle, OPT_no_demangle, false); opts::DependentLibraries = Args.hasArg(OPT_dependent_libraries); opts::DynRelocs = Args.hasArg(OPT_dyn_relocations); @@ -439,9 +441,9 @@ static void dumpObject(ObjectFile &Obj, ScopedPrinter &Writer, Dumper->printSymbols(opts::Symbols, opts::DynamicSymbols, opts::ExtraSymInfo, SymComp); if (!opts::StringDump.empty()) - Dumper->printSectionsAsString(Obj, opts::StringDump); + Dumper->printSectionsAsString(Obj, opts::StringDump, opts::Decompress); if (!opts::HexDump.empty()) - Dumper->printSectionsAsHex(Obj, opts::HexDump); + Dumper->printSectionsAsHex(Obj, opts::HexDump, opts::Decompress); if (opts::HashTable) 
Dumper->printHashTable(); if (opts::GnuHashTable) From 163eaf3bbc24e46a6ec9b71deda8c66f0354d2d7 Mon Sep 17 00:00:00 2001 From: Daniel Hoekwater Date: Thu, 22 Feb 2024 03:30:28 +0000 Subject: [PATCH 061/546] [CodeGen] Clean up MachineFunctionSplitter MBB safety checking (NFC) Move the "is MBB safe to split" check out of `isColdBlock` and update the comment since we're no longer using a temporary hack. --- llvm/lib/CodeGen/MachineFunctionSplitter.cpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp index 38c1c56d2823e..0ddd945896992 100644 --- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp +++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp @@ -109,12 +109,6 @@ static bool isColdBlock(const MachineBasicBlock &MBB, const MachineBlockFrequencyInfo *MBFI, ProfileSummaryInfo *PSI) { std::optional Count = MBFI->getBlockProfileCount(&MBB); - - // Temporary hack to cope with AArch64's jump table encoding - const TargetInstrInfo &TII = *MBB.getParent()->getSubtarget().getInstrInfo(); - if (!TII.isMBBSafeToSplitToCold(MBB)) - return false; - // For instrumentation profiles and sample profiles, we use different ways // to judge whether a block is cold and should be split. if (PSI->hasInstrumentationProfile() || PSI->hasCSInstrumentationProfile()) { @@ -178,7 +172,8 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { if (MBB.isEHPad()) LandingPads.push_back(&MBB); - else if (UseProfileData && isColdBlock(MBB, MBFI, PSI) && !SplitAllEHCode) + else if (UseProfileData && isColdBlock(MBB, MBFI, PSI) && + TII.isMBBSafeToSplitToCold(MBB) && !SplitAllEHCode) MBB.setSectionID(MBBSectionID::ColdSectionID); } @@ -190,7 +185,7 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { // Here we have UseProfileData == true. 
bool HasHotLandingPads = false; for (const MachineBasicBlock *LP : LandingPads) { - if (!isColdBlock(*LP, MBFI, PSI)) + if (!isColdBlock(*LP, MBFI, PSI) || !TII.isMBBSafeToSplitToCold(*LP)) HasHotLandingPads = true; } if (!HasHotLandingPads) { From 6599c022be7c797cd0fafeea4c538e01aae78fd4 Mon Sep 17 00:00:00 2001 From: yandalur Date: Thu, 22 Feb 2024 23:18:06 +0530 Subject: [PATCH 062/546] [HEXAGON] Fix bit boundary for isub_hi in HexagonBitSimplify (#82336) Use bit boundary of 32 for high subregisters in HexagonBitSimplify. This fixes the subregister used in an upper half register store. --- .../lib/Target/Hexagon/HexagonBitSimplify.cpp | 3 ++- .../Hexagon/bit-store-upper-sub-hi.mir | 21 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/Hexagon/bit-store-upper-sub-hi.mir diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 6024d9f7b1547..3b8234c011843 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -1957,7 +1957,8 @@ bool BitSimplification::genStoreUpperHalf(MachineInstr *MI) { return false; const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg); RegHalf H; - if (!matchHalf(0, RC, 0, H)) + unsigned B = (RS.Sub == Hexagon::isub_hi) ? 32 : 0; + if (!matchHalf(0, RC, B, H)) return false; if (H.Low) return false; diff --git a/llvm/test/CodeGen/Hexagon/bit-store-upper-sub-hi.mir b/llvm/test/CodeGen/Hexagon/bit-store-upper-sub-hi.mir new file mode 100644 index 0000000000000..ef84043cf5021 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/bit-store-upper-sub-hi.mir @@ -0,0 +1,21 @@ +# RUN: llc -march=hexagon -run-pass=hexagon-bit-simplify -o - %s | FileCheck %s + +# This test checks if the HexagonBitSimplify pass correctly replaces a +# S2_storerh_io with a S2_storerf_io that stores the upper halfword +# of a high subregister using appropriate subregister boundaries. 
+ +# CHECK: S2_storerf_io %0, 28, %{{[0-9]+}}.isub_hi +# CHECK-NOT: S2_storerf_io %0, 28, %{{[0-9]+}}.isub_lo + +--- +name: test_store +tracksRegLiveness: true +body: | + bb.0: + liveins: $r0 + %0:intregs = COPY $r0 + %1:doubleregs = IMPLICIT_DEF + %2:doubleregs = IMPLICIT_DEF + %3:doubleregs = S2_shuffoh %2, %1 + S2_storerh_io %0, 28, %3.isub_hi +... From b0edc1c45284586fdb12edd666f95d99f5f62b43 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 23 Feb 2024 01:49:19 +0800 Subject: [PATCH 063/546] [Loads] Fix crash in isSafeToLoadUnconditionally with scalable accessed type (#82650) This fixes #82606 by updating isSafeToLoadUnconditionally to handle fixed sized loads from a scalable accessed type. --- llvm/lib/Analysis/Loads.cpp | 6 +++--- .../VectorCombine/RISCV/load-widening.ll | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 6bf0d2f56eb4e..5916d2ab48ece 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -364,7 +364,7 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, APInt &Size, if (Size.getBitWidth() > 64) return false; - const uint64_t LoadSize = Size.getZExtValue(); + const TypeSize LoadSize = TypeSize::getFixed(Size.getZExtValue()); // Otherwise, be a little bit aggressive by scanning the local block where we // want to check to see if the pointer is already being loaded or stored @@ -414,11 +414,11 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, APInt &Size, // Handle trivial cases. 
if (AccessedPtr == V && - LoadSize <= DL.getTypeStoreSize(AccessedTy)) + TypeSize::isKnownLE(LoadSize, DL.getTypeStoreSize(AccessedTy))) return true; if (AreEquivalentAddressValues(AccessedPtr->stripPointerCasts(), V) && - LoadSize <= DL.getTypeStoreSize(AccessedTy)) + TypeSize::isKnownLE(LoadSize, DL.getTypeStoreSize(AccessedTy))) return true; } return false; diff --git a/llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll b/llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll new file mode 100644 index 0000000000000..0a43ad2f9a368 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/RISCV/load-widening.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=vector-combine -S -mtriple=riscv32 -mattr=+v | FileCheck %s +; RUN: opt < %s -passes=vector-combine -S -mtriple=riscv64 -mattr=+v | FileCheck %s + +define void @fixed_load_scalable_src(ptr %p) { +; CHECK-LABEL: define void @fixed_load_scalable_src( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: store zeroinitializer, ptr [[P]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[P]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: ret void +; +entry: + store zeroinitializer, ptr %p + %0 = load <4 x i16>, ptr %p + %1 = shufflevector <4 x i16> %0, <4 x i16> zeroinitializer, <8 x i32> + ret void +} From 5b079af169cd04b457465fd7ca31714efeefe6d9 Mon Sep 17 00:00:00 2001 From: Michael Jones <71531609+michaelrj-google@users.noreply.github.com> Date: Thu, 22 Feb 2024 09:52:16 -0800 Subject: [PATCH 064/546] [libc] add FXBits class (#82065) The FXBits class is what will be used to modify fixed point numbers on a bit level. This patch adds a basic implementation as well as basic tests. 
--- libc/src/__support/fixed_point/CMakeLists.txt | 2 + libc/src/__support/fixed_point/fx_bits.h | 78 ++++ libc/test/src/__support/CMakeLists.txt | 1 + .../test/src/__support/FPUtil/fpbits_test.cpp | 2 +- .../src/__support/fixed_point/CMakeLists.txt | 16 + .../__support/fixed_point/fx_bits_test.cpp | 348 ++++++++++++++++++ 6 files changed, 446 insertions(+), 1 deletion(-) create mode 100644 libc/test/src/__support/fixed_point/CMakeLists.txt create mode 100644 libc/test/src/__support/fixed_point/fx_bits_test.cpp diff --git a/libc/src/__support/fixed_point/CMakeLists.txt b/libc/src/__support/fixed_point/CMakeLists.txt index c6bb9e17adfa8..64f9dacc7ba5f 100644 --- a/libc/src/__support/fixed_point/CMakeLists.txt +++ b/libc/src/__support/fixed_point/CMakeLists.txt @@ -17,5 +17,7 @@ add_header_library( libc.include.llvm-libc-macros.stdfix_macros libc.src.__support.macros.attributes libc.src.__support.macros.optimization + libc.src.__support.CPP.type_traits libc.src.__support.CPP.bit + libc.src.__support.math_extras ) diff --git a/libc/src/__support/fixed_point/fx_bits.h b/libc/src/__support/fixed_point/fx_bits.h index b26be169a593a..fcd47cd72cbb3 100644 --- a/libc/src/__support/fixed_point/fx_bits.h +++ b/libc/src/__support/fixed_point/fx_bits.h @@ -14,6 +14,7 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math_extras.h" #include "fx_rep.h" @@ -21,6 +22,83 @@ namespace LIBC_NAMESPACE::fixed_point { +template struct FXBits { +private: + using fx_rep = FXRep; + using StorageType = typename fx_rep::StorageType; + + StorageType value; + + static_assert(fx_rep::FRACTION_LEN > 0); + + static constexpr size_t FRACTION_OFFSET = 0; // Just for completeness + static constexpr size_t INTEGRAL_OFFSET = + fx_rep::INTEGRAL_LEN == 0 ? 0 : fx_rep::FRACTION_LEN; + static constexpr size_t SIGN_OFFSET = + fx_rep::SIGN_LEN == 0 + ? 
0 + : ((sizeof(StorageType) * CHAR_BIT) - fx_rep::SIGN_LEN); + + static constexpr StorageType FRACTION_MASK = + mask_trailing_ones() + << FRACTION_OFFSET; + static constexpr StorageType INTEGRAL_MASK = + mask_trailing_ones() + << INTEGRAL_OFFSET; + static constexpr StorageType SIGN_MASK = + (fx_rep::SIGN_LEN == 0 ? 0 : StorageType(1) << SIGN_OFFSET); + +public: + LIBC_INLINE constexpr FXBits() = default; + + template LIBC_INLINE constexpr explicit FXBits(XType x) { + using Unqual = typename cpp::remove_cv_t; + if constexpr (cpp::is_same_v) { + value = cpp::bit_cast(x); + } else if constexpr (cpp::is_same_v) { + value = x; + } else { + // We don't want accidental type promotions/conversions, so we require + // exact type match. + static_assert(cpp::always_false); + } + } + + LIBC_INLINE constexpr StorageType get_fraction() { + return (value & FRACTION_MASK) >> FRACTION_OFFSET; + } + + LIBC_INLINE constexpr StorageType get_integral() { + return (value & INTEGRAL_MASK) >> INTEGRAL_OFFSET; + } + + // TODO: replace bool with Sign + LIBC_INLINE constexpr bool get_sign() { + return static_cast((value & SIGN_MASK) >> SIGN_OFFSET); + } + + // This represents the effective negative exponent applied to this number + LIBC_INLINE constexpr int get_exponent() { return fx_rep::FRACTION_LEN; } + + LIBC_INLINE constexpr void set_fraction(StorageType fraction) { + value = (value & (~FRACTION_MASK)) | + ((fraction << FRACTION_OFFSET) & FRACTION_MASK); + } + + LIBC_INLINE constexpr void set_integral(StorageType integral) { + value = (value & (~INTEGRAL_MASK)) | + ((integral << INTEGRAL_OFFSET) & INTEGRAL_MASK); + } + + // TODO: replace bool with Sign + LIBC_INLINE constexpr void set_sign(bool sign) { + value = (value & (~SIGN_MASK)) | + ((static_cast(sign) << SIGN_OFFSET) & SIGN_MASK); + } + + LIBC_INLINE constexpr T get_val() const { return cpp::bit_cast(value); } +}; + // Bit-wise operations are not available for fixed point types yet. 
template LIBC_INLINE constexpr cpp::enable_if_t, T> diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index 493ef9ddabe1e..9801621e6b399 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -188,4 +188,5 @@ add_subdirectory(File) add_subdirectory(RPC) add_subdirectory(OSUtil) add_subdirectory(FPUtil) +add_subdirectory(fixed_point) add_subdirectory(HashTable) diff --git a/libc/test/src/__support/FPUtil/fpbits_test.cpp b/libc/test/src/__support/FPUtil/fpbits_test.cpp index 46f7d25059687..4f9f53afe5478 100644 --- a/libc/test/src/__support/FPUtil/fpbits_test.cpp +++ b/libc/test/src/__support/FPUtil/fpbits_test.cpp @@ -1,4 +1,4 @@ -//===-- Unittests for the DyadicFloat class -------------------------------===// +//===-- Unittests for the FPBits class ------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/libc/test/src/__support/fixed_point/CMakeLists.txt b/libc/test/src/__support/fixed_point/CMakeLists.txt new file mode 100644 index 0000000000000..384cc9394ee79 --- /dev/null +++ b/libc/test/src/__support/fixed_point/CMakeLists.txt @@ -0,0 +1,16 @@ +if(NOT LIBC_COMPILER_HAS_FIXED_POINT) + return() +endif() + +add_custom_target(libc-fixed-point-tests) + +add_libc_test( + fx_bits_test + SUITE + libc-fixed-point-tests + SRCS + fx_bits_test.cpp + DEPENDS + libc.src.__support.fixed_point.fx_bits + libc.src.__support.integer_literals +) diff --git a/libc/test/src/__support/fixed_point/fx_bits_test.cpp b/libc/test/src/__support/fixed_point/fx_bits_test.cpp new file mode 100644 index 0000000000000..58627816eb8d9 --- /dev/null +++ b/libc/test/src/__support/fixed_point/fx_bits_test.cpp @@ -0,0 +1,348 @@ +//===-- Unittests for the FXBits class ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "include/llvm-libc-macros/stdfix-macros.h" + +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/integer_literals.h" +#include "test/UnitTest/Test.h" + +using LIBC_NAMESPACE::fixed_point::FXBits; +using LIBC_NAMESPACE::fixed_point::FXRep; + +using LIBC_NAMESPACE::operator""_u8; +using LIBC_NAMESPACE::operator""_u16; +using LIBC_NAMESPACE::operator""_u32; +using LIBC_NAMESPACE::operator""_u64; + +// -------------------------------- SHORT TESTS -------------------------------- + +TEST(LlvmLibcFxBitsTest, FXBits_UnsignedShortFract) { + auto bits_var = FXBits(0b00000000_u8); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00_u8); + EXPECT_EQ(bits_var.get_fraction(), 0x00_u8); + + // Since an unsigned fract has no sign or integral components, setting either + // should have no effect. 
+ + bits_var.set_sign(true); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00_u8); + EXPECT_EQ(bits_var.get_fraction(), 0x00_u8); + + bits_var.set_integral(0xab); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00_u8); + EXPECT_EQ(bits_var.get_fraction(), 0x00_u8); + + // but setting the fraction should work + + bits_var.set_fraction(0xcd); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00_u8); + EXPECT_EQ(bits_var.get_fraction(), 0xcd_u8); +} + +TEST(LlvmLibcFxBitsTest, FXBits_UnsignedShortAccum) { + auto bits_var = FXBits(0b00000000'00000000_u16); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_sign(true); // 0 sign bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_integral(0xabcd); // 8 integral bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00cd_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_fraction(0x21fe); // 8 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00cd_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x00fe_u16); +} + +TEST(LlvmLibcFxBitsTest, FXBits_ShortFract) { + auto bits_var = FXBits(0b0'0000000_u8); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00_u8); + EXPECT_EQ(bits_var.get_fraction(), 0x00_u8); + + bits_var.set_sign(true); // 1 sign bit used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00_u8); + EXPECT_EQ(bits_var.get_fraction(), 0x00_u8); + + bits_var.set_integral(0xab); // 0 integral bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00_u8); + EXPECT_EQ(bits_var.get_fraction(), 
0x00_u8); + + bits_var.set_fraction(0xcd); // 7 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00_u8); + EXPECT_EQ(bits_var.get_fraction(), 0x4d_u8); +} + +TEST(LlvmLibcFxBitsTest, FXBits_ShortAccum) { + auto bits_var = FXBits(0b0'00000000'0000000_u16); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_sign(true); // 1 sign bit used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_integral(0xabcd); // 8 integral bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00cd_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_fraction(0x21fe); // 7 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00cd_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x007e_u16); +} + +TEST(LlvmLibcFxBitsTest, FXBits_UnsignedFract) { + auto bits_var = FXBits(0b0000000000000000_u16); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_sign(true); // 0 sign bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_integral(0xabcd); // 0 integral bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_fraction(0xef12); // 16 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0xef12_u16); +} + +// -------------------------------- NORMAL TESTS ------------------------------- + 
+TEST(LlvmLibcFxBitsTest, FXBits_UnsignedAccum) { + auto bits_var = + FXBits(0b0000000000000000'0000000000000000_u32); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_sign(true); // 0 sign bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_integral(0xabcd); // 16 integral bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000abcd_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_fraction(0xef12); // 16 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000abcd_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x0000ef12_u32); +} + +TEST(LlvmLibcFxBitsTest, FXBits_Fract) { + auto bits_var = FXBits(0b0'000000000000000_u16); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_sign(true); // 1 sign bit used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_integral(0xabcd); // 0 integral bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x0000_u16); + + bits_var.set_fraction(0xef12); // 15 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x0000_u16); + EXPECT_EQ(bits_var.get_fraction(), 0x6f12_u16); +} + +TEST(LlvmLibcFxBitsTest, FXBits_Accum) { + auto bits_var = FXBits(0b0'0000000000000000'000000000000000_u32); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + 
bits_var.set_sign(true); // 1 sign bit used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_integral(0xabcd); // 16 integral bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x0000abcd_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_fraction(0xef12); // 15 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x0000abcd_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00006f12_u32); +} + +// --------------------------------- LONG TESTS -------------------------------- + +TEST(LlvmLibcFxBitsTest, FXBits_UnsignedLongFract) { + auto bits_var = + FXBits(0b00000000000000000000000000000000_u32); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_sign(true); // 0 sign bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_integral(0xabcdef12); // 0 integral bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_fraction(0xfedcba98); // 32 integral bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0xfedcba98_u32); +} + +TEST(LlvmLibcFxBitsTest, FXBits_UnsignedLongAccum) { + auto bits_var = FXBits( + 0b00000000000000000000000000000000'00000000000000000000000000000000_u64); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000000000000000_u64); + EXPECT_EQ(bits_var.get_fraction(), 0x0000000000000000_u64); + + bits_var.set_sign(true); // 0 sign bits used + + 
EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000000000000000_u64); + EXPECT_EQ(bits_var.get_fraction(), 0x0000000000000000_u64); + + bits_var.set_integral(0xabcdef12); // 32 integral bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00000000abcdef12_u64); + EXPECT_EQ(bits_var.get_fraction(), 0x0000000000000000_u64); + + bits_var.set_fraction(0xfedcba98); // 32 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00000000abcdef12_u64); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000fedcba98_u64); +} + +TEST(LlvmLibcFxBitsTest, FXBits_LongFract) { + auto bits_var = FXBits(0b0'0000000000000000000000000000000_u32); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_sign(true); // 1 sign bit used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_integral(0xabcdef12); // 0 integral bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x00000000_u32); + + bits_var.set_fraction(0xfedcba98); // 31 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00000000_u32); + EXPECT_EQ(bits_var.get_fraction(), 0x7edcba98_u32); +} + +TEST(LlvmLibcFxBitsTest, FXBits_LongAccum) { + auto bits_var = FXBits( + 0b0'00000000000000000000000000000000'0000000000000000000000000000000_u64); + + EXPECT_EQ(bits_var.get_sign(), false); + EXPECT_EQ(bits_var.get_integral(), 0x0000000000000000_u64); + EXPECT_EQ(bits_var.get_fraction(), 0x0000000000000000_u64); + + bits_var.set_sign(true); // 1 sign bit used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x0000000000000000_u64); + 
EXPECT_EQ(bits_var.get_fraction(), 0x0000000000000000_u64); + + bits_var.set_integral(0xabcdef12); // 32 integral bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00000000abcdef12_u64); + EXPECT_EQ(bits_var.get_fraction(), 0x0000000000000000_u64); + + bits_var.set_fraction(0xfedcba98); // 31 fractional bits used + + EXPECT_EQ(bits_var.get_sign(), true); + EXPECT_EQ(bits_var.get_integral(), 0x00000000abcdef12_u64); + EXPECT_EQ(bits_var.get_fraction(), 0x000000007edcba98_u64); +} From 3a85594cb340aabe7ad993eb3912987f4246925e Mon Sep 17 00:00:00 2001 From: sethp Date: Thu, 22 Feb 2024 09:52:48 -0800 Subject: [PATCH 065/546] [NFC] Fix typo in ReleaseNotes.rst (#82655) Deletes the leading 7 from the textual issue number. --- clang/docs/ReleaseNotes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index bac166e6c3562..d8f8a2cb38442 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -270,7 +270,7 @@ Bug Fixes to C++ Support local variable, which is supported as a C11 extension in C++. Previously, it was only accepted at namespace scope but not at local function scope. - Clang no longer tries to call consteval constructors at runtime when they appear in a member initializer. - (`#782154 `_`) + (`#82154 `_`) - Fix crash when using an immediate-escalated function at global scope. (`#82258 `_) - Correctly immediate-escalate lambda conversion functions. 
From bc841bb0f8b55d18ed97440df878d0121701a317 Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Thu, 22 Feb 2024 09:27:02 -0800 Subject: [PATCH 066/546] [clang] Rename installapi tests, NFC * Reduces redundancy --- clang/test/InstallAPI/{installapi-basic.test => basic.test} | 0 ...pi-driver-invalid-options.test => driver-invalid-options.test} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename clang/test/InstallAPI/{installapi-basic.test => basic.test} (100%) rename clang/test/InstallAPI/{installapi-driver-invalid-options.test => driver-invalid-options.test} (100%) diff --git a/clang/test/InstallAPI/installapi-basic.test b/clang/test/InstallAPI/basic.test similarity index 100% rename from clang/test/InstallAPI/installapi-basic.test rename to clang/test/InstallAPI/basic.test diff --git a/clang/test/InstallAPI/installapi-driver-invalid-options.test b/clang/test/InstallAPI/driver-invalid-options.test similarity index 100% rename from clang/test/InstallAPI/installapi-driver-invalid-options.test rename to clang/test/InstallAPI/driver-invalid-options.test From e630a451b457e4d8d071a2b4f102b342bbea2d02 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 22 Feb 2024 18:58:36 +0100 Subject: [PATCH 067/546] [HCS] Fix unused variable warnings. NFCI. 
--- llvm/lib/Transforms/IPO/HotColdSplitting.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index 5f03bd59b8cd1..5aefcbf13182c 100644 --- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -716,10 +716,10 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { })) { ColdBlocks.insert(SubRegion.begin(), SubRegion.end()); - for (auto *Block : SubRegion) { - LLVM_DEBUG(dbgs() - << " contains cold block:" << Block->getName() << "\n"); - } + LLVM_DEBUG({ + for (auto *Block : SubRegion) + dbgs() << " contains cold block:" << Block->getName() << "\n"; + }); OutliningWorklist.emplace_back( std::make_pair(SubRegion[0], std::move(CE))); @@ -748,6 +748,7 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { Function *Outlined = extractColdRegion(*BCE.first, BCE.second, CEAC, BFI, TTI, ORE); assert(Outlined && "Should be outlined"); + (void)Outlined; } return true; From ea174c09342275d6c6fec48fb846eaf28fae5b51 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 12:01:52 -0600 Subject: [PATCH 068/546] [Libomptarget] Remove global ctor and use reference counting (#80499) Summary: Currently we rely on global constructors to initialize and shut down the OpenMP runtime library and plugin manager. This causes some issues because we do not have a defined lifetime that we can rely on to release and allocate resources. This patch instead adds some simple reference counted initialization and deinitialization function. A future patch will use the `deinit` interface to more intelligently handle plugin deinitilization. Right now we do nothing and rely on `atexit` inside of the plugins to tear them down. This isn't great because it limits our ability to control these things. 
Note that I made the `__tgt_register_lib` functions do the initialization instead of adding calls to the new runtime functions in the linker wrapper. The reason for this is because in the past it's been easier to not introduce a new function call, since sometimes the user's compiler will link against an older `libomptarget`. Maybe if we change the name with offloading in the future we can simplify this. Depends on https://github.com/llvm/llvm-project/pull/80460 --- openmp/libomptarget/include/PluginManager.h | 6 +++ openmp/libomptarget/include/omptarget.h | 6 +++ openmp/libomptarget/src/OffloadRTL.cpp | 38 +++++++++++++------ openmp/libomptarget/src/PluginManager.cpp | 2 +- openmp/libomptarget/src/exports | 2 + openmp/libomptarget/src/interface.cpp | 20 +++++++++- .../test/offloading/runtime_init.c | 30 +++++++++++++++ 7 files changed, 89 insertions(+), 15 deletions(-) create mode 100644 openmp/libomptarget/test/offloading/runtime_init.c diff --git a/openmp/libomptarget/include/PluginManager.h b/openmp/libomptarget/include/PluginManager.h index ec5d98dc8cd30..5e5306ac776f0 100644 --- a/openmp/libomptarget/include/PluginManager.h +++ b/openmp/libomptarget/include/PluginManager.h @@ -206,6 +206,12 @@ struct PluginManager { ProtectedObj Devices; }; +/// Initialize the plugin manager and OpenMP runtime. +void initRuntime(); + +/// Deinitialize the plugin and delete it. +void deinitRuntime(); + extern PluginManager *PM; #endif // OMPTARGET_PLUGIN_MANAGER_H diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h index c4faa23427f11..9a2bd1340e3b4 100644 --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -312,6 +312,12 @@ void *llvm_omp_target_dynamic_shared_alloc(); /// add the clauses of the requires directives in a given file void __tgt_register_requires(int64_t Flags); +/// Initializes the runtime library. +void __tgt_rtl_init(); + +/// Deinitializes the runtime library. 
+void __tgt_rtl_deinit(); + /// adds a target shared library to the target execution image void __tgt_register_lib(__tgt_bin_desc *Desc); diff --git a/openmp/libomptarget/src/OffloadRTL.cpp b/openmp/libomptarget/src/OffloadRTL.cpp index 86ef0d5bc91cf..dd75b1b181505 100644 --- a/openmp/libomptarget/src/OffloadRTL.cpp +++ b/openmp/libomptarget/src/OffloadRTL.cpp @@ -20,25 +20,39 @@ extern void llvm::omp::target::ompt::connectLibrary(); #endif -__attribute__((constructor(101))) void init() { +static std::mutex PluginMtx; +static uint32_t RefCount = 0; + +void initRuntime() { + std::scoped_lock Lock(PluginMtx); Profiler::get(); TIMESCOPE(); - DP("Init offload library!\n"); - - PM = new PluginManager(); + if (PM == nullptr) + PM = new PluginManager(); + RefCount++; + if (RefCount == 1) { + DP("Init offload library!\n"); #ifdef OMPT_SUPPORT - // Initialize OMPT first - llvm::omp::target::ompt::connectLibrary(); + // Initialize OMPT first + llvm::omp::target::ompt::connectLibrary(); #endif - PM->init(); - - PM->registerDelayedLibraries(); + PM->init(); + PM->registerDelayedLibraries(); + } } -__attribute__((destructor(101))) void deinit() { - DP("Deinit offload library!\n"); - delete PM; +void deinitRuntime() { + std::scoped_lock Lock(PluginMtx); + assert(PM && "Runtime not initialized"); + + if (RefCount == 1) { + DP("Deinit offload library!\n"); + delete PM; + PM = nullptr; + } + + RefCount--; } diff --git a/openmp/libomptarget/src/PluginManager.cpp b/openmp/libomptarget/src/PluginManager.cpp index 34f1f4969da30..09f9c6400569c 100644 --- a/openmp/libomptarget/src/PluginManager.cpp +++ b/openmp/libomptarget/src/PluginManager.cpp @@ -21,7 +21,7 @@ using namespace llvm; using namespace llvm::sys; -PluginManager *PM; +PluginManager *PM = nullptr; // List of all plugins that can support offloading. 
static const char *RTLNames[] = {ENABLED_OFFLOAD_PLUGINS}; diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports index af882a2642647..d5432a9eed380 100644 --- a/openmp/libomptarget/src/exports +++ b/openmp/libomptarget/src/exports @@ -1,5 +1,7 @@ VERS1.0 { global: + __tgt_rtl_init; + __tgt_rtl_deinit; __tgt_register_requires; __tgt_register_lib; __tgt_unregister_lib; diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index d2707f39a1aa3..8b89bc3ff7124 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -38,9 +38,13 @@ EXTERN void __tgt_register_requires(int64_t Flags) { __PRETTY_FUNCTION__); } +EXTERN void __tgt_rtl_init() { initRuntime(); } +EXTERN void __tgt_rtl_deinit() { deinitRuntime(); } + //////////////////////////////////////////////////////////////////////////////// /// adds a target shared library to the target execution image EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) { + initRuntime(); if (PM->delayRegisterLib(Desc)) return; @@ -49,12 +53,17 @@ EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) { //////////////////////////////////////////////////////////////////////////////// /// Initialize all available devices without registering any image -EXTERN void __tgt_init_all_rtls() { PM->initAllPlugins(); } +EXTERN void __tgt_init_all_rtls() { + assert(PM && "Runtime not initialized"); + PM->initAllPlugins(); +} //////////////////////////////////////////////////////////////////////////////// /// unloads a target shared library EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) { PM->unregisterLib(Desc); + + deinitRuntime(); } template @@ -64,6 +73,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, map_var_info_t *ArgNames, void **ArgMappers, TargetDataFuncPtrTy TargetDataFunction, const char *RegionTypeMsg, const char *RegionName) { + assert(PM && "Runtime not initialized"); 
static_assert(std::is_convertible_v, "TargetAsyncInfoTy must be convertible to AsyncInfoTy."); @@ -239,6 +249,7 @@ template static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, int32_t ThreadLimit, void *HostPtr, KernelArgsTy *KernelArgs) { + assert(PM && "Runtime not initialized"); static_assert(std::is_convertible_v, "Target AsyncInfoTy must be convertible to AsyncInfoTy."); DP("Entering target region for device %" PRId64 " with entry point " DPxMOD @@ -345,6 +356,7 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize, void *VAddr, bool IsRecord, bool SaveOutput, uint64_t &ReqPtrArgOffset) { + assert(PM && "Runtime not initialized"); OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); auto DeviceOrErr = PM->getDevice(DeviceId); if (!DeviceOrErr) @@ -380,7 +392,7 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit, uint64_t LoopTripCount) { - + assert(PM && "Runtime not initialized"); OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); if (checkDeviceAndCtors(DeviceId, Loc)) { DP("Not offloading to device %" PRId64 "\n", DeviceId); @@ -431,6 +443,7 @@ EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base, } EXTERN void __tgt_set_info_flag(uint32_t NewInfoLevel) { + assert(PM && "Runtime not initialized"); std::atomic &InfoLevel = getInfoLevelInternal(); InfoLevel.store(NewInfoLevel); for (auto &R : PM->pluginAdaptors()) { @@ -440,6 +453,7 @@ EXTERN void __tgt_set_info_flag(uint32_t NewInfoLevel) { } EXTERN int __tgt_print_device_info(int64_t DeviceId) { + assert(PM && "Runtime not initialized"); auto DeviceOrErr = PM->getDevice(DeviceId); if (!DeviceOrErr) FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); @@ -448,7 +462,9 @@ EXTERN int __tgt_print_device_info(int64_t DeviceId) { } EXTERN void __tgt_target_nowait_query(void 
**AsyncHandle) { + assert(PM && "Runtime not initialized"); OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0))); + if (!AsyncHandle || !*AsyncHandle) { FATAL_MESSAGE0( 1, "Receive an invalid async handle from the current OpenMP task. Is " diff --git a/openmp/libomptarget/test/offloading/runtime_init.c b/openmp/libomptarget/test/offloading/runtime_init.c new file mode 100644 index 0000000000000..96fd50f51da1e --- /dev/null +++ b/openmp/libomptarget/test/offloading/runtime_init.c @@ -0,0 +1,30 @@ +// RUN: %libomptarget-compile-generic +// RUN: env LIBOMPTARGET_DEBUG=1 %libomptarget-run-generic 2>&1 \ +// RUN: %fcheck-generic + +// REQUIRES: libomptarget-debug + +#include +#include + +extern void __tgt_rtl_init(void); +extern void __tgt_rtl_deinit(void); + +// Sanity checks to make sure that this works and is thread safe. +int main() { + // CHECK: Init offload library! + // CHECK: Deinit offload library! + __tgt_rtl_init(); +#pragma omp parallel num_threads(8) + { + __tgt_rtl_init(); + __tgt_rtl_deinit(); + } + __tgt_rtl_deinit(); + + __tgt_rtl_init(); + __tgt_rtl_deinit(); + + // CHECK: PASS + printf("PASS\n"); +} From ec24094b56793478909783c1156fd57ce5ec2006 Mon Sep 17 00:00:00 2001 From: Igor Kudrin Date: Fri, 23 Feb 2024 01:05:06 +0700 Subject: [PATCH 069/546] [LTO] Remove Config.UseDefaultPipeline (#82587) This option is not used. It was added in [D122133](https://reviews.llvm.org/D122133), 5856f30b, with the only usage in `ClangLinkerWrapper.cpp`, which was later updated in a1d57fc2, and then finally removed in [D142650](https://reviews.llvm.org/D142650), 6185246f. 
--- llvm/include/llvm/LTO/Config.h | 3 --- llvm/lib/LTO/LTOBackend.cpp | 2 -- 2 files changed, 5 deletions(-) diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h index 6fb55f1cf1686..482b6e55a19d3 100644 --- a/llvm/include/llvm/LTO/Config.h +++ b/llvm/include/llvm/LTO/Config.h @@ -60,9 +60,6 @@ struct Config { bool VerifyEach = false; bool DisableVerify = false; - /// Use the standard optimization pipeline. - bool UseDefaultPipeline = false; - /// Flag to indicate that the optimizer should not assume builtins are present /// on the target. bool Freestanding = false; diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index 7b3a7590dfa74..6cfe67779b1a7 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -330,8 +330,6 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM, report_fatal_error(Twine("unable to parse pass pipeline description '") + Conf.OptPipeline + "': " + toString(std::move(Err))); } - } else if (Conf.UseDefaultPipeline) { - MPM.addPass(PB.buildPerModuleDefaultPipeline(OL)); } else if (IsThinLTO) { MPM.addPass(PB.buildThinLTODefaultPipeline(OL, ImportSummary)); } else { From 54a6cf15069e7e88125477e0b3ce1ab063c893c6 Mon Sep 17 00:00:00 2001 From: "S. Bharadwaj Yadavalli" Date: Thu, 22 Feb 2024 13:10:58 -0500 Subject: [PATCH 070/546] [DirectX][NFC] Use LLVM Types in DXIL Operation specifications in DXIL.td (#81692) This change uniformly uses LLVM Types in the specification of parameter types and overload types of DXIL operation. Updated (a) parameter types accordingly in the specification of existing DXILOperations and (b) DXILEmitter. 
--- llvm/lib/Target/DirectX/DXIL.td | 80 ++++++++++++----------------- llvm/utils/TableGen/DXILEmitter.cpp | 79 +++++++++++++--------------- 2 files changed, 69 insertions(+), 90 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 52158139a2584..8a3454c89542c 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -35,30 +35,18 @@ def BinaryUintCategory : DXILOpCategory<"Binary uint">; def UnaryFloatCategory : DXILOpCategory<"Unary float">; def ComputeIDCategory : DXILOpCategory<"Compute/Mesh/Amplification shader">; -// Following are the scalar types supported by DXIL operations and are synonymous -// to llvm_*_ty defined for readability and ease of use in the context of this file. - -def voidTy : LLVMType; - -// Floating point types -def f16Ty : LLVMType; -def f32Ty : LLVMType; -def f64Ty : LLVMType; - -// Integer types -def i1Ty : LLVMType; -def i8Ty : LLVMType; -def i16Ty : LLVMType; -def i32Ty : LLVMType; -def i64Ty : LLVMType; +// Represent as any pointer type with an option to change to a qualified pointer +// type with address space specified. 
+def dxil_handle_ty : LLVMAnyPointerType; +def dxil_cbuffer_ty : LLVMAnyPointerType; +def dxil_resource_ty : LLVMAnyPointerType; // The parameter description for a DXIL operation -class DXILOpParameter { int Pos = pos; // Position in parameter list - string Type = type; // LLVM type name, $o for overload, $r for resource - // type, $cb for legacy cbuffer, $u4 for u4 struct + LLVMType ParamType = type; // Parameter type string Name = name; // Short, unique parameter name string Doc = doc; // Description of this parameter bit IsConstant = isConstant; // Whether this parameter requires a constant value in the IR @@ -108,55 +96,55 @@ class DXILOperation { Intrinsic llvm_intrinsic = llvm_intrinsic_; } def Sin : DXILOperation<"Sin", 13, UnaryClass, UnaryFloatCategory, "returns sine(theta) for theta in radians.", - [f16Ty,f32Ty], ReadNone, + [llvm_half_ty, llvm_float_ty], ReadNone, [ - DXILOpParameter<0, "$o", "", "operation result">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "$o", "value", "input value"> + DXILOpParameter<0, llvm_anyfloat_ty, "", "operation result">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_anyfloat_ty, "value", "input value"> ], ["floats"]>, LLVMIntrinsic; -def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? a : b", - [i16Ty,i32Ty,i64Ty], ReadNone, +def UMax : DXILOperation< "UMax", 39, BinaryClass, BinaryUintCategory, "unsigned integer maximum. UMax(a,b) = a > b ? 
a : b", + [llvm_i16_ty, llvm_i32_ty, llvm_i64_ty], ReadNone, [ - DXILOpParameter<0, "$o", "", "operation result">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "$o", "a", "input value">, - DXILOpParameter<3, "$o", "b", "input value"> + DXILOpParameter<0, llvm_anyint_ty, "", "operation result">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_anyint_ty, "a", "input value">, + DXILOpParameter<3, llvm_anyint_ty, "b", "input value"> ], ["uints"]>, LLVMIntrinsic; -def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", [i32Ty], ReadNone, +def ThreadId : DXILOperation< "ThreadId", 93, ThreadIdClass, ComputeIDCategory, "reads the thread ID", [llvm_i32_ty], ReadNone, [ - DXILOpParameter<0, "i32", "", "thread ID component">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "i32", "component", "component to read (x,y,z)"> + DXILOpParameter<0, llvm_i32_ty, "", "thread ID component">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_i32_ty, "component", "component to read (x,y,z)"> ]>, LLVMIntrinsic; -def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", [i32Ty], ReadNone, +def GroupId : DXILOperation< "GroupId", 94, GroupIdClass, ComputeIDCategory, "reads the group ID (SV_GroupID)", [llvm_i32_ty], ReadNone, [ - DXILOpParameter<0, "i32", "", "group ID component">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "i32", "component", "component to read"> + DXILOpParameter<0, llvm_i32_ty, "", "group ID component">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_i32_ty, "component", "component to read"> ]>, LLVMIntrinsic; -def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory, - "reads the thread ID within the group (SV_GroupThreadID)", 
[i32Ty], ReadNone, +def ThreadIdInGroup : DXILOperation< "ThreadIdInGroup", 95, ThreadIdInGroupClass, ComputeIDCategory, + "reads the thread ID within the group (SV_GroupThreadID)", [llvm_i32_ty], ReadNone, [ - DXILOpParameter<0, "i32", "", "thread ID in group component">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode">, - DXILOpParameter<2, "i32", "component", "component to read (x,y,z)"> + DXILOpParameter<0, llvm_i32_ty, "", "thread ID in group component">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode">, + DXILOpParameter<2, llvm_i32_ty, "component", "component to read (x,y,z)"> ]>, LLVMIntrinsic; -def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory, - "provides a flattened index for a given thread within a given group (SV_GroupIndex)", [i32Ty], ReadNone, +def FlattenedThreadIdInGroup : DXILOperation< "FlattenedThreadIdInGroup", 96, FlattenedThreadIdInGroupClass, ComputeIDCategory, + "provides a flattened index for a given thread within a given group (SV_GroupIndex)", [llvm_i32_ty], ReadNone, [ - DXILOpParameter<0, "i32", "", "result">, - DXILOpParameter<1, "i32", "opcode", "DXIL opcode"> + DXILOpParameter<0, llvm_i32_ty, "", "result">, + DXILOpParameter<1, llvm_i32_ty, "opcode", "DXIL opcode"> ]>, LLVMIntrinsic; diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp index 768e8052975b7..d47df597d53a3 100644 --- a/llvm/utils/TableGen/DXILEmitter.cpp +++ b/llvm/utils/TableGen/DXILEmitter.cpp @@ -74,44 +74,32 @@ struct DXILOperationDesc { }; } // end anonymous namespace -// Convert DXIL type name string to dxil::ParameterKind -// -// @param typeNameStr Type name string -// @return ParameterKind as defined in llvm/Support/DXILABI.h -static ParameterKind getDXILTypeNameToKind(StringRef typeNameStr) { - return StringSwitch(typeNameStr) - .Case("voidTy", ParameterKind::VOID) - .Case("f16Ty", ParameterKind::HALF) - .Case("f32Ty", 
ParameterKind::FLOAT) - .Case("f64Ty", ParameterKind::DOUBLE) - .Case("i1Ty", ParameterKind::I1) - .Case("i8Ty", ParameterKind::I8) - .Case("i16Ty", ParameterKind::I16) - .Case("i32Ty", ParameterKind::I32) - .Case("i64Ty", ParameterKind::I64) - .Case("overloadTy", ParameterKind::OVERLOAD) - .Case("handleTy", ParameterKind::DXIL_HANDLE) - .Case("cbufferRetTy", ParameterKind::CBUFFER_RET) - .Case("resourceRetTy", ParameterKind::RESOURCE_RET) - .Default(ParameterKind::INVALID); -} - -static ParameterKind parameterTypeNameToKind(StringRef Name) { - return StringSwitch(Name) - .Case("void", ParameterKind::VOID) - .Case("half", ParameterKind::HALF) - .Case("float", ParameterKind::FLOAT) - .Case("double", ParameterKind::DOUBLE) - .Case("i1", ParameterKind::I1) - .Case("i8", ParameterKind::I8) - .Case("i16", ParameterKind::I16) - .Case("i32", ParameterKind::I32) - .Case("i64", ParameterKind::I64) - .Case("$o", ParameterKind::OVERLOAD) - .Case("dx.types.Handle", ParameterKind::DXIL_HANDLE) - .Case("dx.types.CBufRet", ParameterKind::CBUFFER_RET) - .Case("dx.types.ResRet", ParameterKind::RESOURCE_RET) - .Default(ParameterKind::INVALID); +/*! 
+ Convert DXIL type name string to dxil::ParameterKind + + @param typeNameStr Type name string + @return ParameterKind As defined in llvm/Support/DXILABI.h +*/ +static ParameterKind lookupParameterKind(StringRef typeNameStr) { + auto paramKind = StringSwitch(typeNameStr) + .Case("llvm_void_ty", ParameterKind::VOID) + .Case("llvm_half_ty", ParameterKind::HALF) + .Case("llvm_float_ty", ParameterKind::FLOAT) + .Case("llvm_double_ty", ParameterKind::DOUBLE) + .Case("llvm_i1_ty", ParameterKind::I1) + .Case("llvm_i8_ty", ParameterKind::I8) + .Case("llvm_i16_ty", ParameterKind::I16) + .Case("llvm_i32_ty", ParameterKind::I32) + .Case("llvm_i64_ty", ParameterKind::I64) + .Case("llvm_anyfloat_ty", ParameterKind::OVERLOAD) + .Case("llvm_anyint_ty", ParameterKind::OVERLOAD) + .Case("dxil_handle_ty", ParameterKind::DXIL_HANDLE) + .Case("dxil_cbuffer_ty", ParameterKind::CBUFFER_RET) + .Case("dxil_resource_ty", ParameterKind::RESOURCE_RET) + .Default(ParameterKind::INVALID); + assert(paramKind != ParameterKind::INVALID && + "Unsupported DXIL Type specified"); + return paramKind; } DXILOperationDesc::DXILOperationDesc(const Record *R) { @@ -143,7 +131,7 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) { for (unsigned I = 0; I < OverloadTypeList->size(); ++I) { Record *R = OverloadTypeList->getElementAsRecord(I); - OverloadTypes.emplace_back(getDXILTypeNameToKind(R->getNameInitAsString())); + OverloadTypes.emplace_back(lookupParameterKind(R->getNameInitAsString())); } Attr = StringRef(R->getValue("Attribute")->getNameInitAsString()); } @@ -151,7 +139,8 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) { DXILParameter::DXILParameter(const Record *R) { Name = R->getValueAsString("Name"); Pos = R->getValueAsInt("Pos"); - Kind = parameterTypeNameToKind(R->getValueAsString("Type")); + Kind = + lookupParameterKind(R->getValue("ParamType")->getValue()->getAsString()); if (R->getValue("Doc")) Doc = R->getValueAsString("Doc"); IsConst = R->getValueAsBit("IsConstant"); @@ 
-296,10 +285,12 @@ static void emitDXILIntrinsicMap(std::vector &Ops, OS << "\n"; } -// Convert operation attribute string to Attribute enum -// -// @param Attr string reference -// @return std::string Attribute enum string +/*! + Convert operation attribute string to Attribute enum + + @param Attr string reference + @return std::string Attribute enum string + */ static std::string emitDXILOperationAttr(StringRef Attr) { return StringSwitch(Attr) .Case("ReadNone", "Attribute::ReadNone") From 2e7cacfced573283d5424830f20333e2a6731251 Mon Sep 17 00:00:00 2001 From: Emilia Kond Date: Thu, 22 Feb 2024 20:22:05 +0200 Subject: [PATCH 071/546] [clang-format] Fix crash in TokenAnnotator (#82349) The while loop on line 3814 can cause a segmentation fault getting the Next field on a nullptr. This is because further down, on line 3823, there is another for loop, which assigns Tok to Tok->Next in its initializer. This for loop has a condition to check if the result of that isn't null. If it is, the loop is skipped and we drop back out to the outer loop, except, now Tok is null, and we try to dereference it without checking first. This patch adds a defensive check that returns if Tok->Next is null before we make it to the second for loop. 
Fixes https://github.com/llvm/llvm-project/issues/82328 --------- Co-authored-by: Owen Pan --- clang/lib/Format/TokenAnnotator.cpp | 2 +- clang/unittests/Format/FormatTest.cpp | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index ec7b7f4dbe347..a60d6ae197a24 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -3817,7 +3817,7 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { do { Tok = Tok->Next; } while (Tok && Tok->isNot(TT_OverloadedOperatorLParen)); - if (!Tok) + if (!Tok || !Tok->MatchingParen) break; const auto *LeftParen = Tok; for (Tok = Tok->Next; Tok && Tok != LeftParen->MatchingParen; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 8282e75bd847f..b8dc01f55b4fa 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -13503,6 +13503,12 @@ TEST_F(FormatTest, IncorrectCodeUnbalancedBraces) { verifyFormat("{"); verifyFormat("#})"); verifyNoCrash("(/**/[:!] ?[)."); + verifyNoCrash("struct X {\n" + " operator iunt(\n" + "};"); + verifyNoCrash("struct Foo {\n" + " operator foo(bar\n" + "};"); } TEST_F(FormatTest, IncorrectUnbalancedBracesInMacrosWithUnicode) { From a23d4ceb8866df91334750627827a1724363e755 Mon Sep 17 00:00:00 2001 From: Greg Clayton Date: Thu, 22 Feb 2024 10:25:05 -0800 Subject: [PATCH 072/546] [lldb][llvm] Return an error instead of crashing when parsing a line table prologue. (#80769) We recently ran into some bad DWARF where the `DW_AT_stmt_list` of many compile units was randomly set to invalid values and was causing LLDB to crash due to an assertion about address sizes not matching. Instead of asserting, we should return an appropriate recoverable `llvm::Error`. 
--- llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp | 22 ++++++++++++++++--- .../DebugInfo/DWARF/DWARFDebugLineTest.cpp | 4 +++- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp index 28f05644a3aa1..572628f45fc23 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp @@ -389,9 +389,25 @@ Error DWARFDebugLine::Prologue::parse( if (getVersion() >= 5) { FormParams.AddrSize = DebugLineData.getU8(Cursor); - assert((!Cursor || DebugLineData.getAddressSize() == 0 || - DebugLineData.getAddressSize() == getAddressSize()) && - "Line table header and data extractor disagree"); + const uint8_t DataAddrSize = DebugLineData.getAddressSize(); + const uint8_t PrologueAddrSize = getAddressSize(); + if (Cursor) { + if (DataAddrSize == 0) { + if (PrologueAddrSize != 4 && PrologueAddrSize != 8) { + RecoverableErrorHandler(createStringError( + errc::not_supported, + "parsing line table prologue at offset 0x%8.8" PRIx64 + ": invalid address size %" PRIu8, + PrologueOffset, PrologueAddrSize)); + } + } else if (DataAddrSize != PrologueAddrSize) { + RecoverableErrorHandler(createStringError( + errc::not_supported, + "parsing line table prologue at offset 0x%8.8" PRIx64 ": address " + "size %" PRIu8 " doesn't match architecture address size %" PRIu8, + PrologueOffset, PrologueAddrSize, DataAddrSize)); + } + } SegSelectorSize = DebugLineData.getU8(Cursor); } diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp index d42a626fa9c1c..980b627625eef 100644 --- a/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp @@ -823,7 +823,9 @@ TEST_F(DebugLineBasicFixture, ErrorForUnsupportedAddressSizeDefinedInHeader) { nullptr, RecordRecoverable); EXPECT_THAT_ERROR( std::move(Recoverable), - FailedWithMessage("address size 
0x09 of DW_LNE_set_address opcode at " + FailedWithMessage("parsing line table prologue at offset 0x00000000: " + "invalid address size 9", + "address size 0x09 of DW_LNE_set_address opcode at " "offset 0x00000038 is unsupported")); ASSERT_THAT_EXPECTED(ExpectedLineTable, Succeeded()); ASSERT_EQ((*ExpectedLineTable)->Rows.size(), 3u); From da1880cc56060c9da91cbd04daa7f8aa3ea0e829 Mon Sep 17 00:00:00 2001 From: Kevin Frei Date: Thu, 22 Feb 2024 10:26:05 -0800 Subject: [PATCH 073/546] GSym aggregated output to JSON file (#81763) In order to make tooling around dwarf health easier, I've added an `--json-summary-file` option to `llvm-gsymutil` that will spit out error summary data with counts to a JSON file. I've added the same capability to `llvm-dwarfdump` in a [different PR.](https://github.com/llvm/llvm-project/pull/81762) The format of the json is: ```JSON { "error-categories": { "": {"count": 1234}, "": {"count":4321} }, "error-count": 5555 } ``` for a clean run: ```JSON { "error-categories": {}, "error-count": 0 } ``` --------- Co-authored-by: Kevin Frei --- llvm/tools/llvm-gsymutil/Opts.td | 3 +++ llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp | 29 ++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/llvm/tools/llvm-gsymutil/Opts.td b/llvm/tools/llvm-gsymutil/Opts.td index 740291479f932..3aabc8029ccbe 100644 --- a/llvm/tools/llvm-gsymutil/Opts.td +++ b/llvm/tools/llvm-gsymutil/Opts.td @@ -35,3 +35,6 @@ defm address : Eq<"address", "Lookup an address in a GSYM file">; def addresses_from_stdin : FF<"addresses-from-stdin", "Lookup addresses in a GSYM file that are read from stdin\nEach input line is expected to be of the following format: ">; +defm json_summary_file : + Eq<"json-summary-file", + "Output a categorized summary of errors into the JSON file specified.">; diff --git a/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp b/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp index 2de9c76fd68c0..00a24cdb33fe1 100644 --- 
a/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp +++ b/llvm/tools/llvm-gsymutil/llvm-gsymutil.cpp @@ -18,6 +18,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Format.h" +#include "llvm/Support/JSON.h" #include "llvm/Support/LLVMDriver.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" @@ -87,6 +88,7 @@ static std::vector InputFilenames; static std::string ConvertFilename; static std::vector ArchFilters; static std::string OutputFilename; +static std::string JsonSummaryFile; static bool Verify; static unsigned NumThreads; static uint64_t SegmentSize; @@ -138,6 +140,9 @@ static void parseArgs(int argc, char **argv) { if (const llvm::opt::Arg *A = Args.getLastArg(OPT_out_file_EQ)) OutputFilename = A->getValue(); + if (const llvm::opt::Arg *A = Args.getLastArg(OPT_json_summary_file_EQ)) + JsonSummaryFile = A->getValue(); + Verify = Args.hasArg(OPT_verify); if (const llvm::opt::Arg *A = Args.getLastArg(OPT_num_threads_EQ)) { @@ -515,10 +520,34 @@ int llvm_gsymutil_main(int argc, char **argv, const llvm::ToolContext &) { // Call error() if we have an error and it will exit with a status of 1 if (auto Err = convertFileToGSYM(Aggregation)) error("DWARF conversion failed: ", std::move(Err)); + // Report the errors from aggregator: Aggregation.EnumerateResults([&](StringRef category, unsigned count) { OS << category << " occurred " << count << " time(s)\n"; }); + if (!JsonSummaryFile.empty()) { + std::error_code EC; + raw_fd_ostream JsonStream(JsonSummaryFile, EC, sys::fs::OF_Text); + if (EC) { + OS << "error opening aggregate error json file '" << JsonSummaryFile + << "' for writing: " << EC.message() << '\n'; + return 1; + } + + llvm::json::Object Categories; + uint64_t ErrorCount = 0; + Aggregation.EnumerateResults([&](StringRef Category, unsigned Count) { + llvm::json::Object Val; + Val.try_emplace("count", Count); + Categories.try_emplace(Category, std::move(Val)); + ErrorCount += Count; + }); 
+ llvm::json::Object RootNode; + RootNode.try_emplace("error-categories", std::move(Categories)); + RootNode.try_emplace("error-count", ErrorCount); + + JsonStream << llvm::json::Value(std::move(RootNode)); + } return 0; } From 5c24c316496e221e1841418f0f39ccb7200c83c6 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Thu, 22 Feb 2024 22:30:31 +0400 Subject: [PATCH 074/546] [clang] Implement CWG2759 "`[[no_unique_address]` and common initial sequence" (#82607) This patch implements said defect report resolution by adding additional check to common initial sequence evaluation. Consequently, this fixes CWG2759. --- clang/docs/ReleaseNotes.rst | 6 +- clang/lib/Sema/SemaChecking.cpp | 3 + clang/test/CXX/drs/dr27xx.cpp | 97 +++++++++++++++++-- .../SemaCXX/cxx2a-ms-no-unique-address.cpp | 25 +++++ clang/test/SemaCXX/type-traits.cpp | 10 +- clang/www/cxx_dr_status.html | 2 +- 6 files changed, 128 insertions(+), 15 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index d8f8a2cb38442..74bb9a07f0b13 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -83,8 +83,6 @@ C++20 Feature Support - Implemented the `__is_layout_compatible` intrinsic to support `P0466R5: Layout-compatibility and Pointer-interconvertibility Traits `_. - Note: `CWG2759: [[no_unique_address] and common initial sequence `_ - is not yet implemented. C++23 Feature Support ^^^^^^^^^^^^^^^^^^^^^ @@ -108,6 +106,10 @@ Resolutions to C++ Defect Reports of two types. (`CWG1719: Layout compatibility and cv-qualification revisited `_). +- ``[[no_unique_address]]`` is now respected when evaluating layout + compatibility of two types. + (`CWG2759: [[no_unique_address] and common initial sequence `_). 
+ C Language Changes ------------------ diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 710437b354521..7fa295ebd9404 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -19036,6 +19036,9 @@ static bool isLayoutCompatible(ASTContext &C, FieldDecl *Field1, return false; } + if (Field1->hasAttr() || + Field2->hasAttr()) + return false; return true; } diff --git a/clang/test/CXX/drs/dr27xx.cpp b/clang/test/CXX/drs/dr27xx.cpp index dd3fd5a20163f..c956c4355abd3 100644 --- a/clang/test/CXX/drs/dr27xx.cpp +++ b/clang/test/CXX/drs/dr27xx.cpp @@ -1,15 +1,98 @@ -// RUN: %clang_cc1 -std=c++98 -verify=expected %s -// RUN: %clang_cc1 -std=c++11 -verify=expected %s -// RUN: %clang_cc1 -std=c++14 -verify=expected %s -// RUN: %clang_cc1 -std=c++17 -verify=expected %s -// RUN: %clang_cc1 -std=c++20 -verify=expected %s -// RUN: %clang_cc1 -std=c++23 -verify=expected,since-cxx23 %s -// RUN: %clang_cc1 -std=c++2c -verify=expected,since-cxx23,since-cxx26 %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++98 -verify=expected %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++11 -verify=expected %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++14 -verify=expected %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++17 -verify=expected %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++20 -verify=expected %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++23 -verify=expected,since-cxx23 %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++2c -verify=expected,since-cxx23,since-cxx26 %s #if __cplusplus <= 202002L // expected-no-diagnostics #endif +namespace dr2759 { // dr2759: 19 +#if __cplusplus >= 201103L + +struct CStruct { + int one; + int two; +}; + +struct CEmptyStruct {}; +struct CEmptyStruct2 {}; + +struct CStructNoUniqueAddress { + int one; + [[no_unique_address]] int two; +}; + +struct CStructNoUniqueAddress2 { + int one; + [[no_unique_address]] int two; +}; + +union UnionLayout 
{ + int a; + double b; + CStruct c; + [[no_unique_address]] CEmptyStruct d; + [[no_unique_address]] CEmptyStruct2 e; +}; + +union UnionLayout2 { + CStruct c; + int a; + CEmptyStruct2 e; + double b; + [[no_unique_address]] CEmptyStruct d; +}; + +union UnionLayout3 { + CStruct c; + int a; + double b; + [[no_unique_address]] CEmptyStruct d; +}; + +struct StructWithAnonUnion { + union { + int a; + double b; + CStruct c; + [[no_unique_address]] CEmptyStruct d; + [[no_unique_address]] CEmptyStruct2 e; + }; +}; + +struct StructWithAnonUnion2 { + union { + CStruct c; + int a; + CEmptyStruct2 e; + double b; + [[no_unique_address]] CEmptyStruct d; + }; +}; + +struct StructWithAnonUnion3 { + union { + CStruct c; + int a; + CEmptyStruct2 e; + double b; + [[no_unique_address]] CEmptyStruct d; + } u; +}; + +static_assert(__is_layout_compatible(CStruct, CStructNoUniqueAddress) != bool(__has_cpp_attribute(no_unique_address)), ""); +static_assert(__is_layout_compatible(CStructNoUniqueAddress, CStructNoUniqueAddress2) != bool(__has_cpp_attribute(no_unique_address)), ""); +static_assert(!__is_layout_compatible(UnionLayout, UnionLayout2), ""); +static_assert(!__is_layout_compatible(UnionLayout, UnionLayout3), ""); +static_assert(!__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion2), ""); +static_assert(!__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion3), ""); +#endif +} // namespace dr2759 + namespace dr2789 { // dr2789: 18 #if __cplusplus >= 202302L template diff --git a/clang/test/SemaCXX/cxx2a-ms-no-unique-address.cpp b/clang/test/SemaCXX/cxx2a-ms-no-unique-address.cpp index 42058559a087a..822ed752fa9c7 100644 --- a/clang/test/SemaCXX/cxx2a-ms-no-unique-address.cpp +++ b/clang/test/SemaCXX/cxx2a-ms-no-unique-address.cpp @@ -17,3 +17,28 @@ struct [[msvc::no_unique_address]] S { // expected-error {{only applies to non-b int [[msvc::no_unique_address]] c; // expected-error {{cannot be applied to types}} unsupported-error {{cannot be applied to types}} }; + 
+struct CStructNoUniqueAddress { + int one; + [[no_unique_address]] int two; + // expected-warning@-1 {{unknown attribute 'no_unique_address' ignored}} +}; + +struct CStructMSVCNoUniqueAddress { + int one; + [[msvc::no_unique_address]] int two; + // unsupported-warning@-1 {{unknown attribute 'no_unique_address' ignored}} +}; + +struct CStructMSVCNoUniqueAddress2 { + int one; + [[msvc::no_unique_address]] int two; + // unsupported-warning@-1 {{unknown attribute 'no_unique_address' ignored}} +}; + +static_assert(__has_cpp_attribute(no_unique_address) == 0); +// unsupported-error@-1 {{static assertion failed due to requirement '201803L == 0'}} +static_assert(!__is_layout_compatible(CStructNoUniqueAddress, CStructMSVCNoUniqueAddress), ""); +static_assert(__is_layout_compatible(CStructMSVCNoUniqueAddress, CStructMSVCNoUniqueAddress), ""); +static_assert(!__is_layout_compatible(CStructMSVCNoUniqueAddress, CStructMSVCNoUniqueAddress2), ""); +// unsupported-error@-1 {{static assertion failed due to requirement '!__is_layout_compatible(CStructMSVCNoUniqueAddress, CStructMSVCNoUniqueAddress2)':}} diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp index 2c35d5ee19a4c..23c339ebdf082 100644 --- a/clang/test/SemaCXX/type-traits.cpp +++ b/clang/test/SemaCXX/type-traits.cpp @@ -1768,8 +1768,8 @@ void is_layout_compatible(int n) static_assert(!__is_layout_compatible(CppStructNonStandardBySameBase, CppStructNonStandardBySameBase2), ""); static_assert(!__is_layout_compatible(CppStructNonStandardBy2ndVirtBase, CppStructNonStandardBy2ndVirtBase2), ""); static_assert(__is_layout_compatible(CStruct, CStructWithQualifiers), ""); - static_assert(__is_layout_compatible(CStruct, CStructNoUniqueAddress) == bool(__has_cpp_attribute(no_unique_address)), ""); // FIXME: this is CWG2759 - static_assert(__is_layout_compatible(CStructNoUniqueAddress, CStructNoUniqueAddress2) == bool(__has_cpp_attribute(no_unique_address)), ""); // FIXME: this is CWG2759 + 
static_assert(__is_layout_compatible(CStruct, CStructNoUniqueAddress) != bool(__has_cpp_attribute(no_unique_address)), ""); + static_assert(__is_layout_compatible(CStructNoUniqueAddress, CStructNoUniqueAddress2) != bool(__has_cpp_attribute(no_unique_address)), ""); static_assert(__is_layout_compatible(CStruct, CStructAlignment), ""); static_assert(__is_layout_compatible(CStruct, CStructAlignedMembers), ""); // FIXME: alignment of members impact common initial sequence static_assert(__is_layout_compatible(CStructWithBitfelds, CStructWithBitfelds), ""); @@ -1782,10 +1782,10 @@ void is_layout_compatible(int n) static_assert(!__is_layout_compatible(void(CStruct2::*)(int), void(CStruct2::*)(char)), ""); static_assert(__is_layout_compatible(CStructNested, CStructNested2), ""); static_assert(__is_layout_compatible(UnionLayout, UnionLayout), ""); - static_assert(__is_layout_compatible(UnionLayout, UnionLayout2), ""); + static_assert(!__is_layout_compatible(UnionLayout, UnionLayout2), ""); static_assert(!__is_layout_compatible(UnionLayout, UnionLayout3), ""); - static_assert(__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion2), ""); - static_assert(__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion3), ""); + static_assert(!__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion2), ""); + static_assert(!__is_layout_compatible(StructWithAnonUnion, StructWithAnonUnion3), ""); static_assert(__is_layout_compatible(EnumLayout, EnumClassLayout), ""); static_assert(__is_layout_compatible(EnumForward, EnumForward), ""); static_assert(__is_layout_compatible(EnumForward, EnumClassForward), ""); diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 38e2cb6314266..8b638e06f4aab 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -16362,7 +16362,7 @@

C++ defect report implementation status

2759 DR [[no_unique_address] and common initial sequence - Unknown + Clang 19 2760 From cc839275164a7768451531af868fa70eb9e71cbd Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 23 Feb 2024 02:42:49 +0800 Subject: [PATCH 075/546] [CVP] Canonicalize signed minmax into unsigned (#82478) This patch turns signed minmax to unsigned to match the behavior for signed icmps. Alive2: https://alive2.llvm.org/ce/z/UAAM42 --- .../Scalar/CorrelatedValuePropagation.cpp | 25 +++++-- .../CorrelatedValuePropagation/min-max.ll | 73 ++++++++++++++++++- 2 files changed, 86 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index c71870bc1b656..6ce9eb3656c93 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -47,11 +47,6 @@ using namespace llvm; #define DEBUG_TYPE "correlated-value-propagation" -static cl::opt CanonicalizeICmpPredicatesToUnsigned( - "canonicalize-icmp-predicates-to-unsigned", cl::init(true), cl::Hidden, - cl::desc("Enables canonicalization of signed relational predicates to " - "unsigned (e.g. 
sgt => ugt)")); - STATISTIC(NumPhis, "Number of phis propagated"); STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value"); STATISTIC(NumSelects, "Number of selects propagated"); @@ -90,6 +85,8 @@ STATISTIC(NumSaturating, "Number of saturating arithmetics converted to normal arithmetics"); STATISTIC(NumNonNull, "Number of function pointer arguments marked non-null"); STATISTIC(NumMinMax, "Number of llvm.[us]{min,max} intrinsics removed"); +STATISTIC(NumSMinMax, + "Number of llvm.s{min,max} intrinsics simplified to unsigned"); STATISTIC(NumUDivURemsNarrowedExpanded, "Number of bound udiv's/urem's expanded"); STATISTIC(NumZExt, "Number of non-negative deductions"); @@ -289,9 +286,6 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT, } static bool processICmp(ICmpInst *Cmp, LazyValueInfo *LVI) { - if (!CanonicalizeICmpPredicatesToUnsigned) - return false; - // Only for signed relational comparisons of scalar integers. if (Cmp->getType()->isVectorTy() || !Cmp->getOperand(0)->getType()->isIntegerTy()) @@ -528,6 +522,7 @@ static bool processAbsIntrinsic(IntrinsicInst *II, LazyValueInfo *LVI) { } // See if this min/max intrinsic always picks it's one specific operand. +// If not, check whether we can canonicalize signed minmax into unsigned version static bool processMinMaxIntrinsic(MinMaxIntrinsic *MM, LazyValueInfo *LVI) { CmpInst::Predicate Pred = CmpInst::getNonStrictPredicate(MM->getPredicate()); ConstantRange LHS_CR = LVI->getConstantRangeAtUse(MM->getOperandUse(0), @@ -546,6 +541,20 @@ static bool processMinMaxIntrinsic(MinMaxIntrinsic *MM, LazyValueInfo *LVI) { MM->eraseFromParent(); return true; } + + if (MM->isSigned() && + ConstantRange::areInsensitiveToSignednessOfICmpPredicate(LHS_CR, + RHS_CR)) { + ++NumSMinMax; + IRBuilder<> B(MM); + MM->replaceAllUsesWith(B.CreateBinaryIntrinsic( + MM->getIntrinsicID() == Intrinsic::smin ? 
Intrinsic::umin + : Intrinsic::umax, + MM->getLHS(), MM->getRHS())); + MM->eraseFromParent(); + return true; + } + return false; } diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll b/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll index d21b8f2418c2e..c9ee233b5a461 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/min-max.ll @@ -176,8 +176,8 @@ define i8 @test15(i8 %x) { ; CHECK-LABEL: @test15( ; CHECK-NEXT: [[LIM:%.*]] = icmp sge i8 [[X:%.*]], 41 ; CHECK-NEXT: call void @llvm.assume(i1 [[LIM]]) -; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.smin.i8(i8 [[X]], i8 42) -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umin.i8(i8 [[X]], i8 42) +; CHECK-NEXT: ret i8 [[TMP1]] ; %lim = icmp sge i8 %x, 41 call void @llvm.assume(i1 %lim) @@ -189,8 +189,8 @@ define i8 @test16(i8 %x) { ; CHECK-LABEL: @test16( ; CHECK-NEXT: [[LIM:%.*]] = icmp sge i8 [[X:%.*]], 41 ; CHECK-NEXT: call void @llvm.assume(i1 [[LIM]]) -; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.smax.i8(i8 [[X]], i8 42) -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X]], i8 42) +; CHECK-NEXT: ret i8 [[TMP1]] ; %lim = icmp sge i8 %x, 41 call void @llvm.assume(i1 %lim) @@ -290,3 +290,68 @@ if.end: %phi = phi i64 [%val, %bb1], [0, %entry] ret i64 %phi } + +define i8 @test_smax_to_umax_nneg(i8 %a, i8 %b) { +; CHECK-LABEL: @test_smax_to_umax_nneg( +; CHECK-NEXT: [[NNEG_A:%.*]] = and i8 [[A:%.*]], 127 +; CHECK-NEXT: [[NNEG_B:%.*]] = and i8 [[B:%.*]], 127 +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[NNEG_A]], i8 [[NNEG_B]]) +; CHECK-NEXT: ret i8 [[TMP1]] +; + %nneg_a = and i8 %a, 127 + %nneg_b = and i8 %b, 127 + %ret = call i8 @llvm.smax.i8(i8 %nneg_a, i8 %nneg_b) + ret i8 %ret +} + +define i8 @test_smax_to_umax_neg(i8 %a, i8 %b) { +; CHECK-LABEL: @test_smax_to_umax_neg( +; CHECK-NEXT: [[NEG_A:%.*]] = or i8 [[A:%.*]], -128 +; CHECK-NEXT: [[NEG_B:%.*]] = or 
i8 [[B:%.*]], -128 +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[NEG_A]], i8 [[NEG_B]]) +; CHECK-NEXT: ret i8 [[TMP1]] +; + %neg_a = or i8 %a, 128 + %neg_b = or i8 %b, 128 + %ret = call i8 @llvm.smax.i8(i8 %neg_a, i8 %neg_b) + ret i8 %ret +} + +define i8 @test_smin_to_umin_nneg(i8 %a, i8 %b) { +; CHECK-LABEL: @test_smin_to_umin_nneg( +; CHECK-NEXT: [[NNEG_A:%.*]] = and i8 [[A:%.*]], 127 +; CHECK-NEXT: [[NNEG_B:%.*]] = and i8 [[B:%.*]], 127 +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umin.i8(i8 [[NNEG_A]], i8 [[NNEG_B]]) +; CHECK-NEXT: ret i8 [[TMP1]] +; + %nneg_a = and i8 %a, 127 + %nneg_b = and i8 %b, 127 + %ret = call i8 @llvm.smin.i8(i8 %nneg_a, i8 %nneg_b) + ret i8 %ret +} + +define i8 @test_smin_to_umin_neg(i8 %a, i8 %b) { +; CHECK-LABEL: @test_smin_to_umin_neg( +; CHECK-NEXT: [[NEG_A:%.*]] = or i8 [[A:%.*]], -128 +; CHECK-NEXT: [[NEG_B:%.*]] = or i8 [[B:%.*]], -128 +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umin.i8(i8 [[NEG_A]], i8 [[NEG_B]]) +; CHECK-NEXT: ret i8 [[TMP1]] +; + %neg_a = or i8 %a, 128 + %neg_b = or i8 %b, 128 + %ret = call i8 @llvm.smin.i8(i8 %neg_a, i8 %neg_b) + ret i8 %ret +} + +define i8 @test_umax_nneg(i8 %a, i8 %b) { +; CHECK-LABEL: @test_umax_nneg( +; CHECK-NEXT: [[NNEG_A:%.*]] = and i8 [[A:%.*]], 127 +; CHECK-NEXT: [[NNEG_B:%.*]] = and i8 [[B:%.*]], 127 +; CHECK-NEXT: [[RET:%.*]] = call i8 @llvm.umax.i8(i8 [[NNEG_A]], i8 [[NNEG_B]]) +; CHECK-NEXT: ret i8 [[RET]] +; + %nneg_a = and i8 %a, 127 + %nneg_b = and i8 %b, 127 + %ret = call i8 @llvm.umax.i8(i8 %nneg_a, i8 %nneg_b) + ret i8 %ret +} From 33a6ce18373ffd1457ebd54e930b6f02fe4c39c1 Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Thu, 22 Feb 2024 13:51:31 -0500 Subject: [PATCH 076/546] [HIP] Allow partial linking for `-fgpu-rdc` (#81700) `-fgpu-rdc` mode allows device functions call device functions in different TU. However, currently all device objects have to be linked together since only one fat binary is supported. 
This is time consuming for AMDGPU backend since it only supports LTO. There are use cases that objects can be divided into groups in which device functions are self-contained but host functions are not. It is desirable to link/optimize/codegen the device code and generate a fatbin for each group, whereas partially link the host code with `ld -r` or generate a static library by using the `--emit-static-lib` option of clang. This avoids linking all device code together, therefore decreases the linking time for `-fgpu-rdc`. Previously, clang emits an external symbol `__hip_fatbin` for all objects for `-fgpu-rdc`. With this patch, clang emits an unique external symbol `__hip_fatbin_{cuid}` for the fat binary for each object. When a group of objects are linked together to generate a fatbin, the symbols are merged by alias and point to the same fat binary. Each group has its own fat binary. One executable or shared library can have multiple fat binaries. Device linking is done for undefined fab binary symbols only to avoid repeated linking. `__hip_gpubin_handle` is also uniquefied and merged to avoid repeated registering. Symbol `__hip_cuid_{cuid}` is introduced to facilitate debugging and tooling. 
Fixes: https://github.com/llvm/llvm-project/issues/77018 --- clang/lib/CodeGen/CGCUDANV.cpp | 22 +- clang/lib/CodeGen/CodeGenModule.cpp | 10 +- clang/lib/Driver/OffloadBundler.cpp | 40 ++- clang/lib/Driver/ToolChains/HIPUtility.cpp | 258 +++++++++++++++++- clang/test/CMakeLists.txt | 1 + clang/test/CodeGenCUDA/device-stub.cu | 10 +- .../test/CodeGenCUDA/host-used-device-var.cu | 5 +- clang/test/Driver/Inputs/hip.h | 25 ++ clang/test/Driver/clang-offload-bundler.c | 13 +- clang/test/Driver/hip-partial-link.hip | 97 +++++++ clang/test/Driver/hip-toolchain-rdc.hip | 38 ++- 11 files changed, 469 insertions(+), 50 deletions(-) create mode 100644 clang/test/Driver/Inputs/hip.h create mode 100644 clang/test/Driver/hip-partial-link.hip diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 5b43272bfa62f..49f93451db7bb 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -760,10 +760,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // to contain the fat binary but will be populated somewhere else, // e.g. by lld through link script. FatBinStr = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int8Ty, - /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, - "__hip_fatbin", nullptr, - llvm::GlobalVariable::NotThreadLocal); + CGM.getModule(), CGM.Int8Ty, + /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, + "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr, + llvm::GlobalVariable::NotThreadLocal); cast(FatBinStr)->setSection(FatbinConstantName); } @@ -816,8 +816,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // thread safety of the loaded program. Therefore we can assume sequential // execution of constructor functions here. if (IsHIP) { - auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage : - llvm::GlobalValue::LinkOnceAnyLinkage; + auto Linkage = CudaGpuBinary ? 
llvm::GlobalValue::InternalLinkage + : llvm::GlobalValue::ExternalLinkage; llvm::BasicBlock *IfBlock = llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); llvm::BasicBlock *ExitBlock = @@ -826,11 +826,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // of HIP ABI. GpuBinaryHandle = new llvm::GlobalVariable( TheModule, PtrTy, /*isConstant=*/false, Linkage, - /*Initializer=*/llvm::ConstantPointerNull::get(PtrTy), - "__hip_gpubin_handle"); - if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage) - GpuBinaryHandle->setComdat( - CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName())); + /*Initializer=*/ + CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr, + CudaGpuBinary + ? "__hip_gpubin_handle" + : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash()); GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); // Prevent the weak symbol in different shared libraries being merged. if (Linkage != llvm::GlobalValue::InternalLinkage) diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 77fb3a62b356e..95e457bef28ed 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -915,7 +915,15 @@ void CodeGenModule::Release() { llvm::ConstantArray::get(ATy, UsedArray), "__clang_gpu_used_external"); addCompilerUsedGlobal(GV); } - + if (LangOpts.HIP) { + // Emit a unique ID so that host and device binaries from the same + // compilation unit can be associated. 
+ auto *GV = new llvm::GlobalVariable( + getModule(), Int8Ty, false, llvm::GlobalValue::ExternalLinkage, + llvm::Constant::getNullValue(Int8Ty), + "__hip_cuid_" + getContext().getCUIDHash()); + addCompilerUsedGlobal(GV); + } emitLLVMUsed(); if (SanStats) SanStats->finish(); diff --git a/clang/lib/Driver/OffloadBundler.cpp b/clang/lib/Driver/OffloadBundler.cpp index b1091aca5616f..99a34d25cfcd5 100644 --- a/clang/lib/Driver/OffloadBundler.cpp +++ b/clang/lib/Driver/OffloadBundler.cpp @@ -588,8 +588,15 @@ class ObjectFileHandler final : public FileHandler { StringRef Content = *ContentOrErr; // Copy fat object contents to the output when extracting host bundle. - if (Content.size() == 1u && Content.front() == 0) - Content = StringRef(Input.getBufferStart(), Input.getBufferSize()); + std::string ModifiedContent; + if (Content.size() == 1u && Content.front() == 0) { + auto HostBundleOrErr = getHostBundle(); + if (!HostBundleOrErr) + return HostBundleOrErr.takeError(); + + ModifiedContent = std::move(*HostBundleOrErr); + Content = ModifiedContent; + } OS.write(Content.data(), Content.size()); return Error::success(); @@ -692,6 +699,35 @@ class ObjectFileHandler final : public FileHandler { } return Error::success(); } + + Expected getHostBundle() { + TempFileHandlerRAII TempFiles; + + auto ModifiedObjPathOrErr = TempFiles.Create(std::nullopt); + if (!ModifiedObjPathOrErr) + return ModifiedObjPathOrErr.takeError(); + StringRef ModifiedObjPath = *ModifiedObjPathOrErr; + + BumpPtrAllocator Alloc; + StringSaver SS{Alloc}; + SmallVector ObjcopyArgs{"llvm-objcopy"}; + + ObjcopyArgs.push_back("--regex"); + ObjcopyArgs.push_back("--remove-section=__CLANG_OFFLOAD_BUNDLE__.*"); + ObjcopyArgs.push_back("--"); + ObjcopyArgs.push_back(BundlerConfig.InputFileNames.front()); + ObjcopyArgs.push_back(ModifiedObjPath); + + if (Error Err = executeObjcopy(BundlerConfig.ObjcopyPath, ObjcopyArgs)) + return std::move(Err); + + auto BufOrErr = MemoryBuffer::getFile(ModifiedObjPath); + if 
(!BufOrErr) + return createStringError(BufOrErr.getError(), + "Failed to read back the modified object file"); + + return BufOrErr->get()->getBuffer().str(); + } }; /// Handler for text files. The bundled file will have the following format. diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp index f692458b775de..fcecf2e1313bb 100644 --- a/clang/lib/Driver/ToolChains/HIPUtility.cpp +++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp @@ -9,13 +9,24 @@ #include "HIPUtility.h" #include "CommonArgs.h" #include "clang/Driver/Compilation.h" +#include "clang/Driver/Options.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Object/Archive.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/MD5.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Triple.h" +#include +#include +using namespace clang; using namespace clang::driver; using namespace clang::driver::tools; using namespace llvm::opt; +using llvm::dyn_cast; #if defined(_WIN32) || defined(_WIN64) #define NULL_FILE "nul" @@ -36,6 +47,169 @@ static std::string normalizeForBundler(const llvm::Triple &T, : T.normalize(); } +// Collect undefined __hip_fatbin* and __hip_gpubin_handle* symbols from all +// input object or archive files. 
+class HIPUndefinedFatBinSymbols { +public: + HIPUndefinedFatBinSymbols(const Compilation &C) + : C(C), DiagID(C.getDriver().getDiags().getCustomDiagID( + DiagnosticsEngine::Error, + "Error collecting HIP undefined fatbin symbols: %0")), + Quiet(C.getArgs().hasArg(options::OPT__HASH_HASH_HASH)), + Verbose(C.getArgs().hasArg(options::OPT_v)) { + populateSymbols(); + if (Verbose) { + for (auto Name : FatBinSymbols) + llvm::errs() << "Found undefined HIP fatbin symbol: " << Name << "\n"; + for (auto Name : GPUBinHandleSymbols) + llvm::errs() << "Found undefined HIP gpubin handle symbol: " << Name + << "\n"; + } + } + + const std::set &getFatBinSymbols() const { + return FatBinSymbols; + } + + const std::set &getGPUBinHandleSymbols() const { + return GPUBinHandleSymbols; + } + +private: + const Compilation &C; + unsigned DiagID; + bool Quiet; + bool Verbose; + std::set FatBinSymbols; + std::set GPUBinHandleSymbols; + std::set DefinedFatBinSymbols; + std::set DefinedGPUBinHandleSymbols; + const std::string FatBinPrefix = "__hip_fatbin"; + const std::string GPUBinHandlePrefix = "__hip_gpubin_handle"; + + void populateSymbols() { + std::deque WorkList; + std::set Visited; + + for (const auto &Action : C.getActions()) + WorkList.push_back(Action); + + while (!WorkList.empty()) { + const Action *CurrentAction = WorkList.front(); + WorkList.pop_front(); + + if (!CurrentAction || !Visited.insert(CurrentAction).second) + continue; + + if (const auto *IA = dyn_cast(CurrentAction)) { + std::string ID = IA->getId().str(); + if (!ID.empty()) { + ID = llvm::utohexstr(llvm::MD5Hash(ID), /*LowerCase=*/true); + FatBinSymbols.insert(Twine(FatBinPrefix + "_" + ID).str()); + GPUBinHandleSymbols.insert( + Twine(GPUBinHandlePrefix + "_" + ID).str()); + continue; + } + if (IA->getInputArg().getNumValues() == 0) + continue; + const char *Filename = IA->getInputArg().getValue(); + if (!Filename) + continue; + auto BufferOrErr = llvm::MemoryBuffer::getFile(Filename); + // Input action could be 
options to linker, therefore, ignore it + // if cannot read it. If it turns out to be a file that cannot be read, + // the error will be caught by the linker. + if (!BufferOrErr) + continue; + + processInput(BufferOrErr.get()->getMemBufferRef()); + } else + WorkList.insert(WorkList.end(), CurrentAction->getInputs().begin(), + CurrentAction->getInputs().end()); + } + } + + void processInput(const llvm::MemoryBufferRef &Buffer) { + // Try processing as object file first. + auto ObjFileOrErr = llvm::object::ObjectFile::createObjectFile(Buffer); + if (ObjFileOrErr) { + processSymbols(**ObjFileOrErr); + return; + } + + // Then try processing as archive files. + llvm::consumeError(ObjFileOrErr.takeError()); + auto ArchiveOrErr = llvm::object::Archive::create(Buffer); + if (ArchiveOrErr) { + llvm::Error Err = llvm::Error::success(); + llvm::object::Archive &Archive = *ArchiveOrErr.get(); + for (auto &Child : Archive.children(Err)) { + auto ChildBufOrErr = Child.getMemoryBufferRef(); + if (ChildBufOrErr) + processInput(*ChildBufOrErr); + else + errorHandler(ChildBufOrErr.takeError()); + } + + if (Err) + errorHandler(std::move(Err)); + return; + } + + // Ignore other files. 
+ llvm::consumeError(ArchiveOrErr.takeError()); + } + + void processSymbols(const llvm::object::ObjectFile &Obj) { + for (const auto &Symbol : Obj.symbols()) { + auto FlagOrErr = Symbol.getFlags(); + if (!FlagOrErr) { + errorHandler(FlagOrErr.takeError()); + continue; + } + + auto NameOrErr = Symbol.getName(); + if (!NameOrErr) { + errorHandler(NameOrErr.takeError()); + continue; + } + llvm::StringRef Name = *NameOrErr; + + bool isUndefined = + FlagOrErr.get() & llvm::object::SymbolRef::SF_Undefined; + bool isFatBinSymbol = Name.starts_with(FatBinPrefix); + bool isGPUBinHandleSymbol = Name.starts_with(GPUBinHandlePrefix); + + // Handling for defined symbols + if (!isUndefined) { + if (isFatBinSymbol) { + DefinedFatBinSymbols.insert(Name.str()); + FatBinSymbols.erase(Name.str()); + } else if (isGPUBinHandleSymbol) { + DefinedGPUBinHandleSymbols.insert(Name.str()); + GPUBinHandleSymbols.erase(Name.str()); + } + continue; + } + + // Add undefined symbols if they are not in the defined sets + if (isFatBinSymbol && + DefinedFatBinSymbols.find(Name.str()) == DefinedFatBinSymbols.end()) + FatBinSymbols.insert(Name.str()); + else if (isGPUBinHandleSymbol && + DefinedGPUBinHandleSymbols.find(Name.str()) == + DefinedGPUBinHandleSymbols.end()) + GPUBinHandleSymbols.insert(Name.str()); + } + } + + void errorHandler(llvm::Error Err) { + if (Quiet) + return; + C.getDriver().Diag(DiagID) << llvm::toString(std::move(Err)); + } +}; + // Construct a clang-offload-bundler command to bundle code objects for // different devices into a HIP fat binary. 
void HIP::constructHIPFatbinCommand(Compilation &C, const JobAction &JA, @@ -130,26 +304,84 @@ void HIP::constructGenerateObjFileFromHIPFatBinary( auto HostTriple = C.getSingleOffloadToolChain()->getTriple(); + HIPUndefinedFatBinSymbols Symbols(C); + + std::string PrimaryHipFatbinSymbol; + std::string PrimaryGpuBinHandleSymbol; + bool FoundPrimaryHipFatbinSymbol = false; + bool FoundPrimaryGpuBinHandleSymbol = false; + + std::vector AliasHipFatbinSymbols; + std::vector AliasGpuBinHandleSymbols; + + // Iterate through symbols to find the primary ones and collect others for + // aliasing + for (const auto &Symbol : Symbols.getFatBinSymbols()) { + if (!FoundPrimaryHipFatbinSymbol) { + PrimaryHipFatbinSymbol = Symbol; + FoundPrimaryHipFatbinSymbol = true; + } else + AliasHipFatbinSymbols.push_back(Symbol); + } + + for (const auto &Symbol : Symbols.getGPUBinHandleSymbols()) { + if (!FoundPrimaryGpuBinHandleSymbol) { + PrimaryGpuBinHandleSymbol = Symbol; + FoundPrimaryGpuBinHandleSymbol = true; + } else + AliasGpuBinHandleSymbols.push_back(Symbol); + } + // Add MC directives to embed target binaries. We ensure that each // section and image is 16-byte aligned. This is not mandatory, but // increases the likelihood of data to be aligned with a cache block // in several main host machines. 
ObjStream << "# HIP Object Generator\n"; ObjStream << "# *** Automatically generated by Clang ***\n"; - if (HostTriple.isWindowsMSVCEnvironment()) { - ObjStream << " .section .hip_fatbin, \"dw\"\n"; - } else { - ObjStream << " .protected __hip_fatbin\n"; - ObjStream << " .type __hip_fatbin,@object\n"; - ObjStream << " .section .hip_fatbin,\"a\",@progbits\n"; + if (FoundPrimaryGpuBinHandleSymbol) { + // Define the first gpubin handle symbol + if (HostTriple.isWindowsMSVCEnvironment()) + ObjStream << " .section .hip_gpubin_handle,\"dw\"\n"; + else { + ObjStream << " .protected " << PrimaryGpuBinHandleSymbol << "\n"; + ObjStream << " .type " << PrimaryGpuBinHandleSymbol << ",@object\n"; + ObjStream << " .section .hip_gpubin_handle,\"aw\"\n"; + } + ObjStream << " .globl " << PrimaryGpuBinHandleSymbol << "\n"; + ObjStream << " .p2align 3\n"; // Align 8 + ObjStream << PrimaryGpuBinHandleSymbol << ":\n"; + ObjStream << " .zero 8\n"; // Size 8 + + // Generate alias directives for other gpubin handle symbols + for (const auto &AliasSymbol : AliasGpuBinHandleSymbols) { + ObjStream << " .globl " << AliasSymbol << "\n"; + ObjStream << " .set " << AliasSymbol << "," << PrimaryGpuBinHandleSymbol + << "\n"; + } + } + if (FoundPrimaryHipFatbinSymbol) { + // Define the first fatbin symbol + if (HostTriple.isWindowsMSVCEnvironment()) + ObjStream << " .section .hip_fatbin,\"dw\"\n"; + else { + ObjStream << " .protected " << PrimaryHipFatbinSymbol << "\n"; + ObjStream << " .type " << PrimaryHipFatbinSymbol << ",@object\n"; + ObjStream << " .section .hip_fatbin,\"a\",@progbits\n"; + } + ObjStream << " .globl " << PrimaryHipFatbinSymbol << "\n"; + ObjStream << " .p2align " << llvm::Log2(llvm::Align(HIPCodeObjectAlign)) + << "\n"; + // Generate alias directives for other fatbin symbols + for (const auto &AliasSymbol : AliasHipFatbinSymbols) { + ObjStream << " .globl " << AliasSymbol << "\n"; + ObjStream << " .set " << AliasSymbol << "," << PrimaryHipFatbinSymbol + << "\n"; + } + 
ObjStream << PrimaryHipFatbinSymbol << ":\n"; + ObjStream << " .incbin "; + llvm::sys::printArg(ObjStream, BundleFile, /*Quote=*/true); + ObjStream << "\n"; } - ObjStream << " .globl __hip_fatbin\n"; - ObjStream << " .p2align " << llvm::Log2(llvm::Align(HIPCodeObjectAlign)) - << "\n"; - ObjStream << "__hip_fatbin:\n"; - ObjStream << " .incbin "; - llvm::sys::printArg(ObjStream, BundleFile, /*Quote=*/true); - ObjStream << "\n"; if (HostTriple.isOSLinux() && HostTriple.isOSBinFormatELF()) ObjStream << " .section .note.GNU-stack, \"\", @progbits\n"; ObjStream.flush(); diff --git a/clang/test/CMakeLists.txt b/clang/test/CMakeLists.txt index 6b5cb0a18457b..fcfca354f4a75 100644 --- a/clang/test/CMakeLists.txt +++ b/clang/test/CMakeLists.txt @@ -136,6 +136,7 @@ if( NOT CLANG_BUILT_STANDALONE ) llvm-strip llvm-symbolizer llvm-windres + obj2yaml opt split-file yaml2obj diff --git a/clang/test/CodeGenCUDA/device-stub.cu b/clang/test/CodeGenCUDA/device-stub.cu index d7a7b1bb9fe95..60304647bd4c5 100644 --- a/clang/test/CodeGenCUDA/device-stub.cu +++ b/clang/test/CodeGenCUDA/device-stub.cu @@ -50,21 +50,19 @@ // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fgpu-rdc -fcuda-include-gpubinary %t -o - -x hip \ // RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,RDC,HIP,HIPEF -// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\ +// RUN: %clang_cc1 -cuid=123 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\ // RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=ALL,LNX,NORDC,HIP,HIPNEF // RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -aux-triple amdgcn -emit-llvm %s \ // RUN: -fcuda-include-gpubinary %t -o - -x hip\ // RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,WIN -// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -aux-triple amdgcn -emit-llvm %s \ +// RUN: %clang_cc1 -cuid=123 -triple x86_64-pc-windows-msvc -aux-triple amdgcn -emit-llvm %s \ // RUN: -o - -x hip\ // RUN: | 
FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,WIN,HIP,HIPNEF #include "Inputs/cuda.h" -// HIPNEF: $__hip_gpubin_handle = comdat any - #ifndef NOGLOBALS // NORDC-DAG: @device_var = internal global i32 // RDC-DAG: @device_var = global i32 @@ -161,7 +159,7 @@ __device__ void device_use() { // * constant unnamed string with GPU binary // CUDA: @[[FATBIN:.*]] = private constant{{.*}} c"GPU binary would be here.", // HIPEF: @[[FATBIN:.*]] = private constant{{.*}} c"GPU binary would be here.",{{.*}}align 4096 -// HIPNEF: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin" +// HIPNEF: @[[FATBIN:__hip_fatbin_[0-9a-f]+]] = external constant i8, section ".hip_fatbin" // CUDANORDC-SAME: section ".nv_fatbin", align 8 // CUDARDC-SAME: section "__nv_relfatbin", align 8 // * constant struct that wraps GPU binary @@ -177,7 +175,7 @@ __device__ void device_use() { // HIP-SAME: section ".hipFatBinSegment" // * variable to save GPU binary handle after initialization // CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global ptr null -// HIPNEF: @__[[PREFIX]]_gpubin_handle = linkonce hidden global ptr null +// HIPNEF: @__[[PREFIX]]_gpubin_handle_{{[0-9a-f]+}} = external hidden global ptr, align 8 // * constant unnamed string with NVModuleID // CUDARDC: [[MODULE_ID_GLOBAL:@.*]] = private constant // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32 diff --git a/clang/test/CodeGenCUDA/host-used-device-var.cu b/clang/test/CodeGenCUDA/host-used-device-var.cu index 7cb31aff84264..5328660c9dc9d 100644 --- a/clang/test/CodeGenCUDA/host-used-device-var.cu +++ b/clang/test/CodeGenCUDA/host-used-device-var.cu @@ -1,9 +1,9 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -x hip %s \ // RUN: -std=c++17 -O3 -mllvm -amdgpu-internalize-symbols -emit-llvm -o - \ -// RUN: | FileCheck -check-prefix=DEV %s +// RUN: -cuid=123 | FileCheck -check-prefix=DEV %s // RUN: %clang_cc1 -triple 
x86_64-unknown-linux-gnu -x hip %s \ -// RUN: -std=c++17 -O3 -emit-llvm -o - | FileCheck -check-prefix=HOST %s +// RUN: -std=c++17 -O3 -emit-llvm -o - -cuid=123 | FileCheck -check-prefix=HOST %s // Negative tests. @@ -187,6 +187,7 @@ public: // DEV-SAME: {{^[^@]*}} @_ZL2u3 // DEV-SAME: {{^[^@]*}} @_ZZ4fun1vE11static_var1 // DEV-SAME: {{^[^@]*}} @_ZZZN21TestStaticVarInLambda3funEvENKUlPcE_clES0_E4var2 +// DEV-SAME: {{^[^@]*}} @__hip_cuid_{{[0-9a-f]+}} // DEV-SAME: {{^[^@]*}} @constexpr_var2b // DEV-SAME: {{^[^@]*}} @inline_var // DEV-SAME: {{^[^@]*}} @u1 diff --git a/clang/test/Driver/Inputs/hip.h b/clang/test/Driver/Inputs/hip.h new file mode 100644 index 0000000000000..5be772a7b3413 --- /dev/null +++ b/clang/test/Driver/Inputs/hip.h @@ -0,0 +1,25 @@ +/* Minimal declarations for HIP support. Testing purposes only. */ + +#define __constant__ __attribute__((constant)) +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __host__ __attribute__((host)) +#define __shared__ __attribute__((shared)) +#define __managed__ __attribute__((managed)) + +struct dim3 { + unsigned x, y, z; + __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {} +}; + +typedef struct hipStream *hipStream_t; +typedef enum hipError {} hipError_t; +int hipConfigureCall(dim3 gridSize, dim3 blockSize, unsigned long long sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize, + unsigned long long sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim, + dim3 blockDim, void **args, + unsigned long long sharedMem, + hipStream_t stream); diff --git a/clang/test/Driver/clang-offload-bundler.c b/clang/test/Driver/clang-offload-bundler.c index 7d0b6b27a60ae..9d8b81ee9806e 100644 --- a/clang/test/Driver/clang-offload-bundler.c +++ b/clang/test/Driver/clang-offload-bundler.c @@ -10,6 +10,7 @@ // RUN: %clang 
-O0 -target %itanium_abi_triple %s -c -emit-llvm -o %t.bc // RUN: %clang -O0 -target %itanium_abi_triple %s -S -o %t.s // RUN: %clang -O0 -target %itanium_abi_triple %s -c -o %t.o +// RUN: obj2yaml %t.o > %t.o.yaml // RUN: %clang -O0 -target %itanium_abi_triple %s -emit-ast -o %t.ast // @@ -305,11 +306,13 @@ // RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -input=%t.o -input=%t.tgt1 -input=%t.tgt2 -output=%t.bundle3.o // RUN: clang-offload-bundler -type=o -input=%t.bundle3.o -list | FileCheck -check-prefix=CKLST %s // RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -output=%t.res.o -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.bundle3.o -unbundle -// RUN: diff %t.bundle3.o %t.res.o +// RUN: obj2yaml %t.res.o > %t.res.o.yaml +// RUN: diff %t.o.yaml %t.res.o.yaml // RUN: diff %t.tgt1 %t.res.tgt1 // RUN: diff %t.tgt2 %t.res.tgt2 // RUN: clang-offload-bundler -type=o -targets=openmp-powerpc64le-ibm-linux-gnu,host-%itanium_abi_triple,openmp-x86_64-pc-linux-gnu -output=%t.res.tgt1 -output=%t.res.o -output=%t.res.tgt2 -input=%t.bundle3.o -unbundle -// RUN: diff %t.bundle3.o %t.res.o +// RUN: obj2yaml %t.res.o > %t.res.o.yaml +// RUN: diff %t.o.yaml %t.res.o.yaml // RUN: diff %t.tgt1 %t.res.tgt1 // RUN: diff %t.tgt2 %t.res.tgt2 // RUN: clang-offload-bundler -type=o -targets=openmp-powerpc64le-ibm-linux-gnu -output=%t.res.tgt1 -input=%t.bundle3.o -unbundle @@ -318,11 +321,13 @@ // Check if we can unbundle a file with no magic strings. 
// RUN: clang-offload-bundler -type=o -input=%t.o -list | FileCheck -check-prefix=CKLST2 --allow-empty %s // RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -output=%t.res.o -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.o -unbundle -allow-missing-bundles -// RUN: diff %t.o %t.res.o +// RUN: obj2yaml %t.res.o > %t.res.o.yaml +// RUN: diff %t.o.yaml %t.res.o.yaml // RUN: diff %t.empty %t.res.tgt1 // RUN: diff %t.empty %t.res.tgt2 // RUN: clang-offload-bundler -type=o -targets=openmp-powerpc64le-ibm-linux-gnu,host-%itanium_abi_triple,openmp-x86_64-pc-linux-gnu -output=%t.res.tgt1 -output=%t.res.o -output=%t.res.tgt2 -input=%t.o -unbundle -allow-missing-bundles -// RUN: diff %t.o %t.res.o +// RUN: obj2yaml %t.res.o > %t.res.o.yaml +// RUN: diff %t.o.yaml %t.res.o.yaml // RUN: diff %t.empty %t.res.tgt1 // RUN: diff %t.empty %t.res.tgt2 diff --git a/clang/test/Driver/hip-partial-link.hip b/clang/test/Driver/hip-partial-link.hip new file mode 100644 index 0000000000000..a1d31f9a65195 --- /dev/null +++ b/clang/test/Driver/hip-partial-link.hip @@ -0,0 +1,97 @@ +// REQUIRES: x86-registered-target, amdgpu-registered-target, lld, system-linux + +// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu \ +// RUN: --offload-arch=gfx906 -c -nostdinc -nogpuinc -nohipwrapperinc \ +// RUN: -nogpulib -fgpu-rdc -I%S/Inputs %s -o %t.1.o + +// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -DLIB \ +// RUN: --offload-arch=gfx906 -c -nostdinc -nogpuinc -nohipwrapperinc \ +// RUN: -nogpulib -fgpu-rdc -I%S/Inputs %s -o %t.2.o + +// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -DMAIN \ +// RUN: --offload-arch=gfx906 -c -nostdinc -nogpuinc -nohipwrapperinc \ +// RUN: -nogpulib -fgpu-rdc -I%S/Inputs %s -o %t.main.o + +// RUN: llvm-nm %t.1.o | FileCheck -check-prefix=OBJ1 %s +// OBJ1: B __hip_cuid_[[ID:[0-9a-f]+]] +// OBJ1: U __hip_fatbin_[[ID]] +// OBJ1: U __hip_gpubin_handle_[[ID]] + +// RUN: 
llvm-nm %t.2.o | FileCheck -check-prefix=OBJ2 %s +// OBJ2: B __hip_cuid_[[ID:[0-9a-f]+]] +// OBJ2: U __hip_fatbin_[[ID]] +// OBJ2: U __hip_gpubin_handle_[[ID]] + +// Link %t.1.o and %t.2.o by -r and then link with %t.main.o + +// RUN: %clang -v --target=x86_64-unknown-linux-gnu \ +// RUN: --hip-link -fgpu-rdc --offload-arch=gfx906 \ +// RUN: -r -fuse-ld=lld -nostdlib %t.1.o %t.2.o -o %t.lib.o \ +// RUN: 2>&1 | FileCheck -check-prefix=LD-R %s +// LD-R: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID1:[0-9a-f]+]] +// LD-R: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID2:[0-9a-f]+]] +// LD-R: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID1]] +// LD-R: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID2]] +// LD-R: "{{.*}}/clang-offload-bundler" {{.*}}-unbundle +// LD-R: "{{.*}}/lld" -flavor gnu -m elf64_amdgpu +// LD-R: "{{.*}}/clang-offload-bundler" +// LD-R: "{{.*}}/llvm-mc" -triple x86_64-unknown-linux-gnu +// LD-R: "{{.*}}/ld.lld" {{.*}} -r + +// RUN: llvm-nm %t.lib.o | FileCheck -check-prefix=OBJ %s +// OBJ: B __hip_cuid_[[ID1:[0-9a-f]+]] +// OBJ: B __hip_cuid_[[ID2:[0-9a-f]+]] +// OBJ: R __hip_fatbin_[[ID1]] +// OBJ: R __hip_fatbin_[[ID2]] +// OBJ: D __hip_gpubin_handle_[[ID1]] +// OBJ: D __hip_gpubin_handle_[[ID2]] + +// RUN: %clang -v --target=x86_64-unknown-linux-gnu \ +// RUN: --hip-link -no-hip-rt -fgpu-rdc --offload-arch=gfx906 \ +// RUN: -fuse-ld=lld -nostdlib -r %t.main.o %t.lib.o -o %t.final.o \ +// RUN: 2>&1 | FileCheck -check-prefix=LINK-O %s +// LINK-O-NOT: Found undefined HIP {{.*}}symbol + +// Generate a static lib with %t.1.o and %t.2.o then link with %t.main.o + +// RUN: %clang -v --target=x86_64-unknown-linux-gnu \ +// RUN: --hip-link -fgpu-rdc --offload-arch=gfx906 \ +// RUN: --emit-static-lib -fuse-ld=lld -nostdlib %t.1.o %t.2.o -o %t.a \ +// RUN: 2>&1 | FileCheck -check-prefix=STATIC %s +// STATIC: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID1:[0-9a-f]+]] +// STATIC: Found undefined HIP 
fatbin symbol: __hip_fatbin_[[ID2:[0-9a-f]+]] +// STATIC: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID1]] +// STATIC: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID2]] +// STATIC: "{{.*}}/clang-offload-bundler" {{.*}}-unbundle +// STATIC: "{{.*}}/lld" -flavor gnu -m elf64_amdgpu +// STATIC: "{{.*}}/clang-offload-bundler" +// STATIC: "{{.*}}/llvm-mc" -triple x86_64-unknown-linux-gnu +// STATIC: "{{.*}}/llvm-ar" + +// RUN: %clang -v --target=x86_64-unknown-linux-gnu \ +// RUN: --hip-link -no-hip-rt -fgpu-rdc --offload-arch=gfx906 \ +// RUN: -fuse-ld=lld -nostdlib -r %t.main.o %t.a -o %t.final.o \ +// RUN: 2>&1 | FileCheck -check-prefix=LINK-A %s +// LINK-A-NOT: Found undefined HIP {{.*}}symbol + +#include "hip.h" + +#ifdef LIB +__device__ int x; +__device__ void libfun() { + x = 1; +} +#elif !defined(MAIN) +__device__ void libfun(); +__global__ void kern() { + libfun(); +} +void run() { + kern<<<1,1>>>(); +} +#else +extern void run(); +int main() { + run(); +} +#endif diff --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip index 1827531f9cab7..d19d8ccd6cb29 100644 --- a/clang/test/Driver/hip-toolchain-rdc.hip +++ b/clang/test/Driver/hip-toolchain-rdc.hip @@ -1,7 +1,7 @@ // REQUIRES: x86-registered-target // REQUIRES: amdgpu-registered-target -// RUN: %clang -### --target=x86_64-linux-gnu \ +// RUN: %clang -### --target=x86_64-linux-gnu -v \ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ // RUN: --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \ // RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ @@ -12,7 +12,7 @@ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LNX %s -// RUN: %clang -### --target=x86_64-pc-windows-msvc \ +// RUN: %clang -### --target=x86_64-pc-windows-msvc -v \ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ // RUN: --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \ // RUN: 
--hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ @@ -23,15 +23,31 @@ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck -check-prefixes=CHECK,MSVC %s -// check code object alignment in dumped llvm-mc input -// LNX: .protected __hip_fatbin -// LNX: .type __hip_fatbin,@object -// LNX: .section .hip_fatbin,"a",@progbits -// MSVC: .section .hip_fatbin, "dw" -// CHECK: .globl __hip_fatbin -// CHECK: .p2align 12 -// CHECK: __hip_fatbin: -// CHECK: .incbin "[[BUNDLE:.*hipfb]]" +// check HIP fatbin and gpubin handle symbols and code object alignment in dumped llvm-mc input +// CHECK: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID1:[0-9a-f]+]] +// CHECK: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID2:[0-9a-f]+]] +// CHECK: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID1]] +// CHECK: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID2]] +// LNX: .protected __hip_gpubin_handle_[[ID1]] +// LNX: .type __hip_gpubin_handle_[[ID1]] +// LNX-LABEL: .section .hip_gpubin_handle,"aw" +// MSVC-LABEL: .section .hip_gpubin_handle,"dw" +// CHECK: .globl __hip_gpubin_handle_[[ID1]] +// CHECK-NEXT: .p2align 3 +// CHECK-NEXT:__hip_gpubin_handle_[[ID1]]: +// CHECK-NEXT: .zero 8 +// CHECK-NEXT: .globl __hip_gpubin_handle_[[ID2]] +// CHECK-NEXT: .set __hip_gpubin_handle_[[ID2]],__hip_gpubin_handle_[[ID1]] +// LNX: .protected __hip_fatbin_[[ID1]] +// LNX: .type __hip_fatbin_[[ID1]],@object +// LNX-LABEL: .section .hip_fatbin,"a",@progbits +// MSVC-LABEL: .section .hip_fatbin,"dw" +// CHECK: .globl __hip_fatbin_[[ID1]] +// CHECK-NEXT: .p2align 12 +// CHECK-NEXT: .globl __hip_fatbin_[[ID2]] +// CHECK-NEXT: .set __hip_fatbin_[[ID2]],__hip_fatbin_[[ID1]] +// CHECK-NEXT: __hip_fatbin_[[ID1]]: +// CHECK-NEXT: .incbin "[[BUNDLE:.*hipfb]]" // LNX: .section .note.GNU-stack, "", @progbits // MSVC-NOT: .note.GNU-stack From 1069823ce7d154aa8ef87ae5a0fd34b527eca2a0 Mon Sep 17 00:00:00 2001 From: Alexander Shaposhnikov 
<6532716+alexander-shaposhnikov@users.noreply.github.com> Date: Thu, 22 Feb 2024 11:02:47 -0800 Subject: [PATCH 077/546] Enable JumpTableToSwitch pass by default (#82546) Enable JumpTableToSwitch pass by default. Test plan: ninja check-all --- llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +- llvm/test/Other/new-pm-defaults.ll | 6 +----- llvm/test/Other/new-pm-thinlto-postlink-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll | 1 + .../Other/new-pm-thinlto-postlink-samplepgo-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-prelink-defaults.ll | 1 + llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll | 1 + .../test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll | 1 + 8 files changed, 8 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 142bd50b3798e..17b55b63ac03c 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -247,7 +247,7 @@ static cl::opt static cl::opt EnableJumpTableToSwitch( "enable-jump-table-to-switch", - cl::desc("Enable JumpTableToSwitch pass (default = off)")); + cl::desc("Enable JumpTableToSwitch pass (default = on)"), cl::init(true)); // This option is used in simplifying testing SampleFDO optimizations for // profile loading. 
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index 51fb93daa4dfa..285077ff8e31a 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -71,10 +71,6 @@ ; RUN: -passes='default' -S %s 2>&1 \ ; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-OPTIMIZER-LAST,CHECK-O23SZ -; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ -; RUN: -passes='default' -enable-jump-table-to-switch -S %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-JUMP-TABLE-TO-SWITCH,CHECK-O23SZ,%llvmcheckext - ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -enable-matrix -S %s 2>&1 \ ; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-MATRIX @@ -155,7 +151,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis -; CHECK-JUMP-TABLE-TO-SWITCH-NEXT: Running pass: JumpTableToSwitchPass +; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll index 064362eabbf83..29a4d79037427 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll @@ -90,6 +90,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis +; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: 
Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll index 19a44867e434a..bf06782c86f86 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -78,6 +78,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis +; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index ac80a31d8fd4b..0cc61121de01c 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -86,6 +86,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis +; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll index 6486639e07b49..0e5839797afe9 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll @@ -121,6 +121,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: 
Invalidating analysis: LazyValueAnalysis +; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll index 09f9f0f48badd..68c2e58146300 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll @@ -118,6 +118,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis +; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll index 47bdbfd2d357d..8311a009711d1 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll @@ -90,6 +90,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis +; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass From 4f7ab789bf43b49914815bdf4e4c3703f92e781d Mon Sep 17 00:00:00 2001 From: Boian Petkantchin Date: Thu, 22 Feb 2024 11:06:14 -0800 Subject: [PATCH 078/546] [mlir][mesh] add support in spmdization for incomplete sharding annotations (#82442) Don't require that `mesh.shard` operations come in pairs. 
If there is only a single `mesh.shard` operation we assume that the producer result and consumer operand have the same sharding. --- .../Dialect/Mesh/Transforms/Spmdization.cpp | 45 ++++++++++++------- mlir/test/Dialect/Mesh/spmdization.mlir | 14 ++++++ 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp index 7cbe0de048769..c4d8b0b15e462 100644 --- a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp +++ b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp @@ -593,7 +593,6 @@ static SmallVector getOperandShardings(Operation &op) { Operation *definingOp = operand.getDefiningOp(); assert(definingOp); ShardOp shardOp = llvm::cast(definingOp); - assert(shardOp.getAnnotateForUsers()); return shardOp.getShard(); }); return res; @@ -615,34 +614,46 @@ static SmallVector getResultShardings(Operation &op) { assert(result.hasOneUse()); Operation *userOp = *result.getUsers().begin(); ShardOp shardOp = llvm::cast(userOp); - assert(!shardOp.getAnnotateForUsers()); return shardOp.getShard(); }); return res; } static LogicalResult -spmdizeOperation(Operation &op, IRMapping &spmdizationMap, +spmdizeOperation(ShardOp shardOp, IRMapping &spmdizationMap, SymbolTableCollection &symbolTableCollection, OpBuilder &builder) { - ShardOp shardOp = llvm::dyn_cast(op); - if (shardOp) { - if (!shardOp.getAnnotateForUsers()) { - return success(); - } - + Value targetSpmdValue; + + // Check if 2 shard ops are chained. If not there is no need for resharding + // as the source and target shared the same sharding. + ShardOp srcShardOp = + dyn_cast_or_null(shardOp.getOperand().getDefiningOp()); + if (!srcShardOp) { + targetSpmdValue = spmdizationMap.lookup(shardOp.getOperand()); + } else { // Insert resharding. 
- ShardOp srcShardOp = - llvm::cast(shardOp.getOperand().getDefiningOp()); - assert(!srcShardOp.getAnnotateForUsers()); + assert(!srcShardOp.getAnnotateForUsers() && shardOp.getAnnotateForUsers()); TypedValue srcSpmdValue = spmdizationMap.lookup(srcShardOp.getOperand()) .cast>(); - Value targetSpmdValue = reshard(builder, srcShardOp, shardOp, srcSpmdValue, - symbolTableCollection); - assert(!spmdizationMap.contains(shardOp.getResult())); - spmdizationMap.map(shardOp.getResult(), targetSpmdValue); - return success(); + targetSpmdValue = reshard(builder, srcShardOp, shardOp, srcSpmdValue, + symbolTableCollection); + } + + assert(!spmdizationMap.contains(shardOp.getResult())); + spmdizationMap.map(shardOp.getResult(), targetSpmdValue); + return success(); +} + +static LogicalResult +spmdizeOperation(Operation &op, IRMapping &spmdizationMap, + SymbolTableCollection &symbolTableCollection, + OpBuilder &builder) { + ShardOp shardOp = llvm::dyn_cast(op); + if (shardOp) { + return spmdizeOperation(shardOp, spmdizationMap, symbolTableCollection, + builder); } SmallVector spmdizedOperands; diff --git a/mlir/test/Dialect/Mesh/spmdization.mlir b/mlir/test/Dialect/Mesh/spmdization.mlir index 2fb8029dfe64a..572d3eb55eaaa 100644 --- a/mlir/test/Dialect/Mesh/spmdization.mlir +++ b/mlir/test/Dialect/Mesh/spmdization.mlir @@ -127,3 +127,17 @@ func.func @multiple_chained_ops( // CHECK: return %[[RESHARD3]] : tensor<1xi8> return %7 : tensor<2xi8> } + +// CHECK-LABEL: func @incomplete_sharding +func.func @incomplete_sharding( + // CHECK-SAME: %[[ARG:.*]]: tensor<4x16xf32> + %arg0: tensor<8x16xf32> +// CHECK-SAME: -> tensor<4x16xf32> { +) -> tensor<8x16xf32> { + %0 = mesh.shard %arg0 to <@mesh_1d, [[0]]> annotate_for_users : tensor<8x16xf32> + // CHECK: %[[RES:.*]] = tosa.sigmoid %[[ARG]] : (tensor<4x16xf32>) -> tensor<4x16xf32> + %1 = tosa.sigmoid %0 : (tensor<8x16xf32>) -> tensor<8x16xf32> + %2 = mesh.shard %1 to <@mesh_1d, [[0]]> : tensor<8x16xf32> + // CHECK: return %[[RES]] : 
tensor<4x16xf32> + return %2 : tensor<8x16xf32> +} From 744c0057e7dc0d1d046a4867cece2f31fee9bb23 Mon Sep 17 00:00:00 2001 From: Nashe Mncube Date: Thu, 22 Feb 2024 19:15:52 +0000 Subject: [PATCH 079/546] [AArch64][CodeGen] Fix crash when fptrunc returns fp16 with +nofp attr (#81724) When performing lowering of the fptrunc opcode returning fp16 with the +nofp flag enabled we could trigger a compiler crash. This is because we had no custom lowering implemented. This patch the case in which we need to promote an fp16 return type for fptrunc when the +nofp attr is enabled. --- .../Target/AArch64/AArch64ISelLowering.cpp | 14 ++- .../16bit-float-promotion-with-nofp.ll | 31 +++++ .../AArch64/strictfp_f16_abi_promote.ll | 115 +++++++++++++++--- 3 files changed, 138 insertions(+), 22 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 184ebc19bc9ed..3b92e95d7c287 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -541,10 +541,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); - setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); + if (Subtarget->hasFPARMv8()) + setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + if (Subtarget->hasFPARMv8()) + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); @@ -947,9 +949,11 @@ 
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::f128, MVT::f32, Expand); setTruncStoreAction(MVT::f128, MVT::f16, Expand); - setOperationAction(ISD::BITCAST, MVT::i16, Custom); - setOperationAction(ISD::BITCAST, MVT::f16, Custom); - setOperationAction(ISD::BITCAST, MVT::bf16, Custom); + if (Subtarget->hasFPARMv8()) { + setOperationAction(ISD::BITCAST, MVT::i16, Custom); + setOperationAction(ISD::BITCAST, MVT::f16, Custom); + setOperationAction(ISD::BITCAST, MVT::bf16, Custom); + } // Indexed loads and stores are supported. for (unsigned im = (unsigned)ISD::PRE_INC; diff --git a/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll new file mode 100644 index 0000000000000..bfe9ab8424bb0 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64 -mattr=-fp-armv8 -o - %s | FileCheck %s + +define half @f2h(float %a) { +; CHECK-LABEL: f2h: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = fptrunc float %a to half + ret half %0 +} + +define bfloat @f2bfloat(float %a) { +; CHECK-LABEL: f2bfloat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl __truncsfbf2 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = fptrunc float %a to bfloat + ret bfloat %0 +} + diff --git a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll index a34f7abcc22a3..9fa5208cc8db6 100644 --- a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll @@ -131,26 +131,107 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ret void } -; FIXME: -; define half @f16_return(float %arg) #0 { -; %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") -; ret half %fptrunc -; } + define half @f16_return(float %arg) #0 { +; NOFP16-LABEL: f16_return: +; NOFP16: // %bb.0: +; NOFP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 16 +; NOFP16-NEXT: .cfi_offset w30, -16 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; NOFP16-NEXT: ret + %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret half %fptrunc + } -; define <2 x half> @v2f16_return(<2 x float> %arg) #0 { -; %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") -; ret <2 x half> %fptrunc -; } + define <2 x half> @v2f16_return(<2 x float> %arg) #0 { +; NOFP16-LABEL: v2f16_return: +; NOFP16: // %bb.0: +; NOFP16-NEXT: str x30, [sp, #-32]! 
// 8-byte Folded Spill +; NOFP16-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 32 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w20, -16 +; NOFP16-NEXT: .cfi_offset w30, -32 +; NOFP16-NEXT: mov w19, w0 +; NOFP16-NEXT: mov w0, w1 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w20, w0 +; NOFP16-NEXT: mov w0, w19 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w1, w20 +; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; NOFP16-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; NOFP16-NEXT: ret + %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret <2 x half> %fptrunc + } -; define <3 x half> @v3f16_return(<3 x float> %arg) #0 { -; %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") -; ret <3 x half> %fptrunc -; } + define <3 x half> @v3f16_return(<3 x float> %arg) #0 { +; NOFP16-LABEL: v3f16_return: +; NOFP16: // %bb.0: +; NOFP16-NEXT: stp x30, x21, [sp, #-32]! 
// 16-byte Folded Spill +; NOFP16-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 32 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w20, -16 +; NOFP16-NEXT: .cfi_offset w21, -24 +; NOFP16-NEXT: .cfi_offset w30, -32 +; NOFP16-NEXT: mov w20, w0 +; NOFP16-NEXT: mov w0, w2 +; NOFP16-NEXT: mov w19, w1 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: mov w0, w19 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w19, w0 +; NOFP16-NEXT: mov w0, w20 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w1, w19 +; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; NOFP16-NEXT: mov w2, w21 +; NOFP16-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; NOFP16-NEXT: ret + %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret <3 x half> %fptrunc + } -; define <4 x half> @v4f16_return(<4 x float> %arg) #0 { -; %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") -; ret <4 x half> %fptrunc -; } + define <4 x half> @v4f16_return(<4 x float> %arg) #0 { +; NOFP16-LABEL: v4f16_return: +; NOFP16: // %bb.0: +; NOFP16-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill +; NOFP16-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 48 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w20, -16 +; NOFP16-NEXT: .cfi_offset w21, -24 +; NOFP16-NEXT: .cfi_offset w22, -32 +; NOFP16-NEXT: .cfi_offset w30, -48 +; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: mov w0, w3 +; NOFP16-NEXT: mov w19, w2 +; NOFP16-NEXT: mov w20, w1 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w22, w0 +; NOFP16-NEXT: mov w0, w19 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w19, w0 +; NOFP16-NEXT: mov w0, w20 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w20, w0 +; NOFP16-NEXT: mov w0, w21 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: mov w1, w20 +; NOFP16-NEXT: mov w2, w19 +; NOFP16-NEXT: mov w3, w22 +; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; NOFP16-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; NOFP16-NEXT: ret + %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret <4 x half> %fptrunc + } ; FIXME: ; define void @outgoing_f16_arg(ptr %ptr) #0 { From 6ddb25ed9ca2cb0f4ad8f402d7411ac3328f598d Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Thu, 22 Feb 2024 11:19:02 -0800 Subject: [PATCH 080/546] [scudo] increase frames per stack to 16 for stack depot (#82427) 8 was very low and it is likely that in real workloads we have more than an average of 8 frames per stack given on Android we have 3 at the bottom: __start_main, __libc_init, main, and three at the top: malloc, scudo_malloc and Allocator::allocate. That leaves 2 frames for application code, which is clearly unreasonable. 
--- compiler-rt/lib/scudo/standalone/combined.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index f3c3d757c9f12..f13cf9498a793 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -1522,7 +1522,12 @@ class Allocator { constexpr u32 kStacksPerRingBufferEntry = 2; constexpr u32 kMaxU32Pow2 = ~(UINT32_MAX >> 1); static_assert(isPowerOfTwo(kMaxU32Pow2)); - constexpr u32 kFramesPerStack = 8; + // On Android we always have 3 frames at the bottom: __start_main, + // __libc_init, main, and 3 at the top: malloc, scudo_malloc and + // Allocator::allocate. This leaves 10 frames for the user app. The next + // smallest power of two (8) would only leave 2, which is clearly too + // little. + constexpr u32 kFramesPerStack = 16; static_assert(isPowerOfTwo(kFramesPerStack)); // We need StackDepot to be aligned to 8-bytes so the ring we store after From 242f98c7ab7c100d76cac29b555db20205619b38 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 22 Feb 2024 20:21:09 +0100 Subject: [PATCH 081/546] [Clang][SME] Skip writing output files to the source directory --- clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c index 7eb74f28a1c85..25aebeced9379 100644 --- a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c +++ b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_NONE %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_COMPATIBLE %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_STREAMING %s -// RUN: %clang_cc1 
-triple aarch64-none-linux-gnu -S -target-feature +sme -verify -DTEST_LOCALLY %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_NONE %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_COMPATIBLE %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_STREAMING %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_LOCALLY %s #define __ai __attribute__((always_inline)) __ai void inlined_fn(void) {} From 3168af56bcb827360c26957ef579b7871dad8e17 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 22 Feb 2024 20:25:58 +0100 Subject: [PATCH 082/546] LoopVectorize: Mark crash test as requiring assertions --- llvm/test/Transforms/LoopVectorize/X86/pr72969.ll | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll index a54bd39f3ff60..40633c6c8383b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll @@ -1,5 +1,6 @@ ; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -S < %s ; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -force-vector-width=4 -S < %s +; REQUIRES: asserts @h = global i64 0 From 32994cc0d63513f77223c64148faeeb50aebb702 Mon Sep 17 00:00:00 2001 From: Alexey Bataev <5361294+alexey-bataev@users.noreply.github.com> Date: Thu, 22 Feb 2024 14:32:15 -0500 Subject: [PATCH 083/546] [SLP]Improve findReusedOrderedScalars and graph rotation. Patch syncs the code in findReusedOrderedScalars with cost estimation/codegen. It tries to use similar logic to better determine best order. 
Before, it just tried to find previously vectorized node without checking if it is possible to use the vectorized value in the shuffle. Now it relies on the more generalized version. If it determines, that a single vector must be reordered (using same mechanism, as codegen and cost estimation), it generates better order. The comparison between new/ref ordering: Metric: SLP.NumVectorInstructions Program SLP.NumVectorInstructions results results0 diff test-suite :: MultiSource/Benchmarks/nbench/nbench.test 139.00 140.00 0.7% test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test 344.00 346.00 0.6% test-suite :: MultiSource/Benchmarks/FreeBench/pifft/pifft.test 1293.00 1292.00 -0.1% test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test 5176.00 5170.00 -0.1% test-suite :: External/SPEC/CFP2006/453.povray/453.povray.test 5173.00 5167.00 -0.1% test-suite :: External/SPEC/CFP2017rate/510.parest_r/510.parest_r.test 11692.00 11660.00 -0.3% test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test 1621.00 1615.00 -0.4% test-suite :: External/SPEC/CINT2006/403.gcc/403.gcc.test 795.00 792.00 -0.4% test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 26499.00 26338.00 -0.6% test-suite :: MultiSource/Benchmarks/Bullet/bullet.test 7343.00 7281.00 -0.8% test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test 1104.00 1094.00 -0.9% test-suite :: MultiSource/Applications/JM/lencod/lencod.test 2216.00 2180.00 -1.6% test-suite :: External/SPEC/CFP2006/433.milc/433.milc.test 787.00 637.00 -19.1% Less 0% is better. Most of the benchmarks see more vectorized code. The first ones just have shuffles removed. The ordering analysis still may require some improvements (e.g. for alternate nodes), but this one should be produce better results. 
Reviewers: RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/77529 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 446 ++++++++++++++---- .../AArch64/extractelements-to-shuffle.ll | 16 +- .../AArch64/reorder-fmuladd-crash.ll | 7 +- .../SLPVectorizer/AArch64/tsc-s116.ll | 22 +- .../AArch64/vec3-reorder-reshuffle.ll | 34 +- .../Transforms/SLPVectorizer/X86/pr35497.ll | 16 +- .../SLPVectorizer/X86/reduction-transpose.ll | 16 +- .../X86/reorder-clustered-node.ll | 11 +- .../X86/reorder-reused-masked-gather.ll | 7 +- .../SLPVectorizer/X86/reorder-vf-to-resize.ll | 2 +- .../X86/scatter-vectorize-reorder.ll | 17 +- .../X86/shrink_after_reorder2.ll | 11 +- .../X86/vec3-reorder-reshuffle.ll | 17 +- 13 files changed, 447 insertions(+), 175 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4e334748c9593..de4e56ff80659 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2422,18 +2422,25 @@ class BoUpSLP { /// \param TE Tree entry checked for permutation. /// \param VL List of scalars (a subset of the TE scalar), checked for /// permutations. Must form single-register vector. + /// \param ForOrder Tries to fetch the best candidates for ordering info. Also + /// commands to build the mask using the original vector value, without + /// relying on the potential reordering. /// \returns ShuffleKind, if gathered values can be represented as shuffles of /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. std::optional isGatherShuffledSingleRegisterEntry( const TreeEntry *TE, ArrayRef VL, MutableArrayRef Mask, - SmallVectorImpl &Entries, unsigned Part); + SmallVectorImpl &Entries, unsigned Part, + bool ForOrder); /// Checks if the gathered \p VL can be represented as multi-register /// shuffle(s) of previous tree entries. /// \param TE Tree entry checked for permutation. 
/// \param VL List of scalars (a subset of the TE scalar), checked for /// permutations. + /// \param ForOrder Tries to fetch the best candidates for ordering info. Also + /// commands to build the mask using the original vector value, without + /// relying on the potential reordering. /// \returns per-register series of ShuffleKind, if gathered values can be /// represented as shuffles of previous tree entries. \p Mask is filled with /// the shuffle mask (also on per-register base). @@ -2441,7 +2448,7 @@ class BoUpSLP { isGatherShuffledEntry( const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, SmallVectorImpl> &Entries, - unsigned NumParts); + unsigned NumParts, bool ForOrder = false); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -3788,65 +3795,163 @@ static void reorderOrder(SmallVectorImpl &Order, ArrayRef Mask, std::optional BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only."); - unsigned NumScalars = TE.Scalars.size(); + // Try to find subvector extract/insert patterns and reorder only such + // patterns. + SmallVector GatheredScalars(TE.Scalars.begin(), TE.Scalars.end()); + Type *ScalarTy = GatheredScalars.front()->getType(); + int NumScalars = GatheredScalars.size(); + if (!isValidElementType(ScalarTy)) + return std::nullopt; + auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars); + int NumParts = TTI->getNumberOfParts(VecTy); + if (NumParts == 0 || NumParts >= NumScalars) + NumParts = 1; + SmallVector ExtractMask; + SmallVector Mask; + SmallVector> Entries; + SmallVector> ExtractShuffles = + tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts); + SmallVector> GatherShuffles = + isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts, + /*ForOrder=*/true); + // No shuffled operands - ignore. 
+ if (GatherShuffles.empty() && ExtractShuffles.empty()) + return std::nullopt; OrdersType CurrentOrder(NumScalars, NumScalars); - SmallVector Positions; - SmallBitVector UsedPositions(NumScalars); - const TreeEntry *STE = nullptr; - // Try to find all gathered scalars that are gets vectorized in other - // vectorize node. Here we can have only one single tree vector node to - // correctly identify order of the gathered scalars. - for (unsigned I = 0; I < NumScalars; ++I) { - Value *V = TE.Scalars[I]; - if (!isa(V)) - continue; - if (const auto *LocalSTE = getTreeEntry(V)) { - if (!STE) - STE = LocalSTE; - else if (STE != LocalSTE) - // Take the order only from the single vector node. - return std::nullopt; - unsigned Lane = - std::distance(STE->Scalars.begin(), find(STE->Scalars, V)); - if (Lane >= NumScalars) - return std::nullopt; - if (CurrentOrder[Lane] != NumScalars) { - if (Lane != I) + if (GatherShuffles.size() == 1 && + *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && + Entries.front().front()->isSame(TE.Scalars)) { + // Perfect match in the graph, will reuse the previously vectorized + // node. Cost is 0. + std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0); + return CurrentOrder; + } + auto IsSplatMask = [](ArrayRef Mask) { + int SingleElt = PoisonMaskElem; + return all_of(Mask, [&](int I) { + if (SingleElt == PoisonMaskElem && I != PoisonMaskElem) + SingleElt = I; + return I == PoisonMaskElem || I == SingleElt; + }); + }; + // Exclusive broadcast mask - ignore. 
+ if ((ExtractShuffles.empty() && IsSplatMask(Mask) && + (Entries.size() != 1 || + Entries.front().front()->ReorderIndices.empty())) || + (GatherShuffles.empty() && IsSplatMask(ExtractMask))) + return std::nullopt; + SmallBitVector ShuffledSubMasks(NumParts); + auto TransformMaskToOrder = [&](MutableArrayRef CurrentOrder, + ArrayRef Mask, int PartSz, int NumParts, + function_ref GetVF) { + for (int I : seq(0, NumParts)) { + if (ShuffledSubMasks.test(I)) + continue; + const int VF = GetVF(I); + if (VF == 0) + continue; + MutableArrayRef Slice = CurrentOrder.slice(I * PartSz, PartSz); + // Shuffle of at least 2 vectors - ignore. + if (any_of(Slice, [&](int I) { return I != NumScalars; })) { + std::fill(Slice.begin(), Slice.end(), NumScalars); + ShuffledSubMasks.set(I); + continue; + } + // Try to include as much elements from the mask as possible. + int FirstMin = INT_MAX; + int SecondVecFound = false; + for (int K : seq(0, PartSz)) { + int Idx = Mask[I * PartSz + K]; + if (Idx == PoisonMaskElem) { + Value *V = GatheredScalars[I * PartSz + K]; + if (isConstant(V) && !isa(V)) { + SecondVecFound = true; + break; + } continue; - UsedPositions.reset(CurrentOrder[Lane]); + } + if (Idx < VF) { + if (FirstMin > Idx) + FirstMin = Idx; + } else { + SecondVecFound = true; + break; + } } - // The partial identity (where only some elements of the gather node are - // in the identity order) is good. - CurrentOrder[Lane] = I; - UsedPositions.set(I); - } - } - // Need to keep the order if we have a vector entry and at least 2 scalars or - // the vectorized entry has just 2 scalars. 
- if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) { - auto &&IsIdentityOrder = [NumScalars](ArrayRef CurrentOrder) { - for (unsigned I = 0; I < NumScalars; ++I) - if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars) - return false; - return true; - }; - if (IsIdentityOrder(CurrentOrder)) - return OrdersType(); - auto *It = CurrentOrder.begin(); - for (unsigned I = 0; I < NumScalars;) { - if (UsedPositions.test(I)) { - ++I; + FirstMin = (FirstMin / PartSz) * PartSz; + // Shuffle of at least 2 vectors - ignore. + if (SecondVecFound) { + std::fill(Slice.begin(), Slice.end(), NumScalars); + ShuffledSubMasks.set(I); continue; } - if (*It == NumScalars) { - *It = I; - ++I; + for (int K : seq(0, PartSz)) { + int Idx = Mask[I * PartSz + K]; + if (Idx == PoisonMaskElem) + continue; + Idx -= FirstMin; + if (Idx >= PartSz) { + SecondVecFound = true; + break; + } + if (CurrentOrder[I * PartSz + Idx] > + static_cast(I * PartSz + K) && + CurrentOrder[I * PartSz + Idx] != + static_cast(I * PartSz + Idx)) + CurrentOrder[I * PartSz + Idx] = I * PartSz + K; + } + // Shuffle of at least 2 vectors - ignore. 
+ if (SecondVecFound) { + std::fill(Slice.begin(), Slice.end(), NumScalars); + ShuffledSubMasks.set(I); + continue; } - ++It; } - return std::move(CurrentOrder); + }; + int PartSz = NumScalars / NumParts; + if (!ExtractShuffles.empty()) + TransformMaskToOrder( + CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) { + if (!ExtractShuffles[I]) + return 0U; + unsigned VF = 0; + for (unsigned Idx : seq(0, PartSz)) { + int K = I * PartSz + Idx; + if (ExtractMask[K] == PoisonMaskElem) + continue; + if (!TE.ReuseShuffleIndices.empty()) + K = TE.ReuseShuffleIndices[K]; + if (!TE.ReorderIndices.empty()) + K = std::distance(TE.ReorderIndices.begin(), + find(TE.ReorderIndices, K)); + auto *EI = dyn_cast(TE.Scalars[K]); + if (!EI) + continue; + VF = std::max(VF, cast(EI->getVectorOperandType()) + ->getElementCount() + .getKnownMinValue()); + } + return VF; + }); + // Check special corner case - single shuffle of the same entry. + if (GatherShuffles.size() == 1 && NumParts != 1) { + if (ShuffledSubMasks.any()) + return std::nullopt; + PartSz = NumScalars; + NumParts = 1; } - return std::nullopt; + if (!Entries.empty()) + TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) { + if (!GatherShuffles[I]) + return 0U; + return std::max(Entries[I].front()->getVectorFactor(), + Entries[I].back()->getVectorFactor()); + }); + int NumUndefs = + count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; }); + if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2)) + return std::nullopt; + return std::move(CurrentOrder); } namespace { @@ -4168,9 +4273,59 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because // element 3 is used twice in the second submask. 
unsigned Sz = TE.Scalars.size(); - if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, - Sz)) + if (TE.State == TreeEntry::NeedToGather) { + if (std::optional CurrentOrder = + findReusedOrderedScalars(TE)) { + SmallVector Mask; + fixupOrderingIndices(*CurrentOrder); + inversePermutation(*CurrentOrder, Mask); + ::addMask(Mask, TE.ReuseShuffleIndices); + OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor()); + unsigned Sz = TE.Scalars.size(); + for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) { + for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz))) + if (Idx != PoisonMaskElem) + Res[Idx + K * Sz] = I + K * Sz; + } + return std::move(Res); + } + } + if (Sz == 2 && TE.getVectorFactor() == 4 && + TTI->getNumberOfParts(FixedVectorType::get( + TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1) return std::nullopt; + if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices, + Sz)) { + SmallVector ReorderMask(Sz, PoisonMaskElem); + if (TE.ReorderIndices.empty()) + std::iota(ReorderMask.begin(), ReorderMask.end(), 0); + else + inversePermutation(TE.ReorderIndices, ReorderMask); + ::addMask(ReorderMask, TE.ReuseShuffleIndices); + unsigned VF = ReorderMask.size(); + OrdersType ResOrder(VF, VF); + unsigned NumParts = VF / Sz; + SmallBitVector UsedVals(NumParts); + for (unsigned I = 0; I < VF; I += Sz) { + int Val = PoisonMaskElem; + unsigned UndefCnt = 0; + if (any_of(ArrayRef(ReorderMask).slice(I, Sz), + [&](int Idx) { + if (Val == PoisonMaskElem && Idx != PoisonMaskElem) + Val = Idx; + if (Idx == PoisonMaskElem) + ++UndefCnt; + return Idx != PoisonMaskElem && Idx != Val; + }) || + Val >= static_cast(NumParts) || UsedVals.test(Val) || + UndefCnt > Sz / 2) + return std::nullopt; + UsedVals.set(Val); + for (unsigned K = 0; K < NumParts; ++K) + ResOrder[Val + Sz * K] = I + K; + } + return std::move(ResOrder); + } unsigned VF = TE.getVectorFactor(); // Try build correct order for extractelement 
instructions. SmallVector ReusedMask(TE.ReuseShuffleIndices.begin(), @@ -4208,7 +4363,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; }); std::advance(It, Sz); } - if (all_of(enumerate(ResOrder), + if (TE.State == TreeEntry::NeedToGather && + all_of(enumerate(ResOrder), [](const auto &Data) { return Data.index() == Data.value(); })) return std::nullopt; // No need to reorder. return std::move(ResOrder); @@ -4298,11 +4454,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { OrdersType CurrentOrder; bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder, /*ResizeAllowed=*/true); - if (Reuse || !CurrentOrder.empty()) { - if (!CurrentOrder.empty()) - fixupOrderingIndices(CurrentOrder); + if (Reuse || !CurrentOrder.empty()) return std::move(CurrentOrder); - } } // If the gather node is and // insertelement poison, v, 0 [+ permute] @@ -4335,8 +4488,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { InstructionCost InsertIdxCost = TTI->getVectorInstrCost( Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx, PoisonValue::get(Ty), *It); - if (InsertFirstCost + PermuteCost < InsertIdxCost) + if (InsertFirstCost + PermuteCost < InsertIdxCost) { + OrdersType Order(Sz, Sz); + Order[Idx] = 0; return std::move(Order); + } } } if (isSplat(TE.Scalars)) @@ -4392,6 +4548,28 @@ void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef Mask) const { std::iota(It, std::next(It, Sz), 0); } +static void combineOrders(MutableArrayRef Order, + ArrayRef SecondaryOrder) { + assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) && + "Expected same size of orders"); + unsigned Sz = Order.size(); + SmallBitVector UsedIndices(Sz); + for (unsigned Idx : seq(0, Sz)) { + if (Order[Idx] != Sz) + UsedIndices.set(Order[Idx]); + } + if (SecondaryOrder.empty()) { + for (unsigned Idx : seq(0, Sz)) + if (Order[Idx] == Sz && 
!UsedIndices.test(Idx)) + Order[Idx] = Idx; + } else { + for (unsigned Idx : seq(0, Sz)) + if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz && + !UsedIndices.test(SecondaryOrder[Idx])) + Order[Idx] = SecondaryOrder[Idx]; + } +} + void BoUpSLP::reorderTopToBottom() { // Maps VF to the graph nodes. DenseMap> VFToOrderedEntries; @@ -4560,18 +4738,46 @@ void BoUpSLP::reorderTopToBottom() { } if (OrdersUses.empty()) continue; + auto IsIdentityOrder = [](ArrayRef Order) { + const unsigned Sz = Order.size(); + for (unsigned Idx : seq(0, Sz)) + if (Idx != Order[Idx] && Order[Idx] != Sz) + return false; + return true; + }; // Choose the most used order. - ArrayRef BestOrder = OrdersUses.front().first; - unsigned Cnt = OrdersUses.front().second; - for (const auto &Pair : drop_begin(OrdersUses)) { - if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { + unsigned IdentityCnt = 0; + unsigned FilledIdentityCnt = 0; + OrdersType IdentityOrder(VF, VF); + for (auto &Pair : OrdersUses) { + if (Pair.first.empty() || IsIdentityOrder(Pair.first)) { + if (!Pair.first.empty()) + FilledIdentityCnt += Pair.second; + IdentityCnt += Pair.second; + combineOrders(IdentityOrder, Pair.first); + } + } + MutableArrayRef BestOrder = IdentityOrder; + unsigned Cnt = IdentityCnt; + for (auto &Pair : OrdersUses) { + // Prefer identity order. But, if filled identity found (non-empty order) + // with same number of uses, as the new candidate order, we can choose + // this candidate order. + if (Cnt < Pair.second || + (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt && + Cnt == Pair.second && !BestOrder.empty() && + IsIdentityOrder(BestOrder))) { + combineOrders(Pair.first, BestOrder); BestOrder = Pair.first; Cnt = Pair.second; + } else { + combineOrders(BestOrder, Pair.first); } } // Set order of the user node. 
- if (BestOrder.empty()) + if (IsIdentityOrder(BestOrder)) continue; + fixupOrderingIndices(BestOrder); SmallVector Mask; inversePermutation(BestOrder, Mask); SmallVector MaskOrder(BestOrder.size(), PoisonMaskElem); @@ -4685,7 +4891,7 @@ bool BoUpSLP::canReorderOperands( void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { SetVector OrderedEntries; - DenseMap GathersToOrders; + DenseSet GathersToOrders; // Find all reorderable leaf nodes with the given VF. // Currently the are vectorized loads,extracts without alternate operands + // some gathering of extracts. @@ -4700,7 +4906,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize) || !TE->ReuseShuffleIndices.empty()) - GathersToOrders.try_emplace(TE.get(), *CurrentOrder); + GathersToOrders.insert(TE.get()); } } @@ -4718,7 +4924,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize || (TE->State == TreeEntry::NeedToGather && - GathersToOrders.count(TE))) || + GathersToOrders.contains(TE))) || TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() || !all_of(drop_begin(TE->UserTreeIndices), [TE](const EdgeInfo &EI) { @@ -4775,9 +4981,14 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { const auto Order = [&]() -> const OrdersType { if (OpTE->State == TreeEntry::NeedToGather || !OpTE->ReuseShuffleIndices.empty()) - return GathersToOrders.find(OpTE)->second; + return getReorderingData(*OpTE, /*TopToBottom=*/false) + .value_or(OrdersType(1)); return OpTE->ReorderIndices; }(); + // The order is partially ordered, skip it in favor of fully non-ordered + // orders. 
+ if (Order.size() == 1) + continue; unsigned NumOps = count_if( Data.second, [OpTE](const std::pair &P) { return P.second == OpTE; @@ -4805,9 +5016,10 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { (IgnoreReorder && TE->Idx == 0)) return true; if (TE->State == TreeEntry::NeedToGather) { - auto It = GathersToOrders.find(TE); - if (It != GathersToOrders.end()) - return !It->second.empty(); + if (GathersToOrders.contains(TE)) + return !getReorderingData(*TE, /*TopToBottom=*/false) + .value_or(OrdersType(1)) + .empty(); return true; } return false; @@ -4839,21 +5051,49 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { ++Res.first->second; } } - // Choose the best order. - ArrayRef BestOrder = OrdersUses.front().first; - unsigned Cnt = OrdersUses.front().second; - for (const auto &Pair : drop_begin(OrdersUses)) { - if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) { + if (OrdersUses.empty()) { + for (const std::pair &Op : Data.second) + OrderedEntries.remove(Op.second); + continue; + } + auto IsIdentityOrder = [](ArrayRef Order) { + const unsigned Sz = Order.size(); + for (unsigned Idx : seq(0, Sz)) + if (Idx != Order[Idx] && Order[Idx] != Sz) + return false; + return true; + }; + // Choose the most used order. + unsigned IdentityCnt = 0; + unsigned VF = Data.second.front().second->getVectorFactor(); + OrdersType IdentityOrder(VF, VF); + for (auto &Pair : OrdersUses) { + if (Pair.first.empty() || IsIdentityOrder(Pair.first)) { + IdentityCnt += Pair.second; + combineOrders(IdentityOrder, Pair.first); + } + } + MutableArrayRef BestOrder = IdentityOrder; + unsigned Cnt = IdentityCnt; + for (auto &Pair : OrdersUses) { + // Prefer identity order. But, if filled identity found (non-empty + // order) with same number of uses, as the new candidate order, we can + // choose this candidate order. 
+ if (Cnt < Pair.second) { + combineOrders(Pair.first, BestOrder); BestOrder = Pair.first; Cnt = Pair.second; + } else { + combineOrders(BestOrder, Pair.first); } } - // Set order of the user node (reordering of operands and user nodes). - if (BestOrder.empty()) { + // Set order of the user node. + if (IsIdentityOrder(BestOrder)) { for (const std::pair &Op : Data.second) OrderedEntries.remove(Op.second); continue; } + fixupOrderingIndices(BestOrder); // Erase operands from OrderedEntries list and adjust their orders. VisitedOps.clear(); SmallVector Mask; @@ -7472,6 +7712,20 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } V1 = Constant::getNullValue( FixedVectorType::get(E->Scalars.front()->getType(), CommonVF)); + // Not identity/broadcast? Try to see if the original vector is better. + if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() && + CommonVF == CommonMask.size() && + any_of(enumerate(CommonMask), + [](const auto &&P) { + return P.value() != PoisonMaskElem && + static_cast(P.value()) != P.index(); + }) && + any_of(CommonMask, + [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) { + SmallVector ReorderMask; + inversePermutation(E->ReorderIndices, ReorderMask); + ::addMask(CommonMask, ReorderMask); + } } else if (V1 && P2.isNull()) { // Shuffle single vector. CommonVF = cast(V1->getType())->getNumElements(); @@ -9433,7 +9687,7 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl &VL, std::optional BoUpSLP::isGatherShuffledSingleRegisterEntry( const TreeEntry *TE, ArrayRef VL, MutableArrayRef Mask, - SmallVectorImpl &Entries, unsigned Part) { + SmallVectorImpl &Entries, unsigned Part, bool ForOrder) { Entries.clear(); // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. 
@@ -9532,6 +9786,21 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( VToTEs.insert(TEPtr); } if (const TreeEntry *VTE = getTreeEntry(V)) { + if (ForOrder) { + if (VTE->State != TreeEntry::Vectorize) { + auto It = MultiNodeScalars.find(V); + if (It == MultiNodeScalars.end()) + continue; + VTE = *It->getSecond().begin(); + // Iterate through all vectorized nodes. + auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) { + return MTE->State == TreeEntry::Vectorize; + }); + if (MIt == It->getSecond().end()) + continue; + VTE = *MIt; + } + } Instruction &LastBundleInst = getLastInstructionInBundle(VTE); if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) continue; @@ -9765,8 +10034,12 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // scalar in the list. for (const std::pair &Pair : EntryLanes) { unsigned Idx = Part * VL.size() + Pair.second; - Mask[Idx] = Pair.first * VF + - Entries[Pair.first]->findLaneForValue(VL[Pair.second]); + Mask[Idx] = + Pair.first * VF + + (ForOrder ? 
std::distance( + Entries[Pair.first]->Scalars.begin(), + find(Entries[Pair.first]->Scalars, VL[Pair.second])) + : Entries[Pair.first]->findLaneForValue(VL[Pair.second])); IsIdentity &= Mask[Idx] == Pair.second; } switch (Entries.size()) { @@ -9791,8 +10064,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( SmallVector> BoUpSLP::isGatherShuffledEntry( const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, - SmallVectorImpl> &Entries, - unsigned NumParts) { + SmallVectorImpl> &Entries, unsigned NumParts, + bool ForOrder) { assert(NumParts > 0 && NumParts < VL.size() && "Expected positive number of registers."); Entries.clear(); @@ -9810,7 +10083,8 @@ BoUpSLP::isGatherShuffledEntry( ArrayRef SubVL = VL.slice(Part * SliceSize, SliceSize); SmallVectorImpl &SubEntries = Entries.emplace_back(); std::optional SubRes = - isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part); + isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part, + ForOrder); if (!SubRes) SubEntries.clear(); Res.push_back(SubRes); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll index 8f76b2e54e6c2..44542f32bf145 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll @@ -76,10 +76,10 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef ; CHECK-NEXT: [[PB_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PB]], [[ENTRY]] ], [ [[SCEVGEP311]], [[WHILE_END_LOOPEXIT]] ] ; CHECK-NEXT: [[PA_ADDR_0_LCSSA:%.*]] = phi ptr [ [[PA]], [[ENTRY]] ], [ [[SCEVGEP]], [[WHILE_END_LOOPEXIT]] ] ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector 
<2 x i64> [[TMP4FF_0_LCSSA]], <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <4 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP4FT_0_LCSSA]], <2 x i64> [[TMP4TF_0_LCSSA]], <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP4FF_0_LCSSA]], <2 x i64> [[TMP4TT_0_LCSSA]], <2 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <4 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i64> [[TMP12]], [[TMP15]] ; CHECK-NEXT: [[TMP17:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i32> @@ -107,12 +107,12 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq <2 x i32> [[TMP23]], zeroinitializer ; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <2 x i32> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> [[TMP25]], <4 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x i32> poison, i32 [[AND95]], i32 0 ; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <2 x i32> [[TMP28]], zeroinitializer ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq <2 x i32> [[TMP28]], zeroinitializer -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x i1> [[TMP29]], <2 x i1> [[TMP30]], <4 x i32> ; CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP26]], <4 x i1> [[TMP31]], <4 x i1> zeroinitializer ; CHECK-NEXT: [[TMP33:%.*]] = zext <4 x i1> [[TMP32]] to <4 x i32> ; 
CHECK-NEXT: [[TMP34]] = add <4 x i32> [[TMP21]], [[TMP33]] @@ -152,12 +152,12 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef ; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <2 x i32> [[TMP39]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <2 x i32> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <2 x i32> [[TMP40]], zeroinitializer -; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> +; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <2 x i1> [[TMP41]], <2 x i1> [[TMP42]], <4 x i32> ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x i32> poison, i32 [[AND134]], i32 0 ; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <2 x i32> [[TMP45]], zeroinitializer ; CHECK-NEXT: [[TMP47:%.*]] = icmp eq <2 x i32> [[TMP45]], zeroinitializer -; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <2 x i1> [[TMP46]], <2 x i1> [[TMP47]], <4 x i32> ; CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP43]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer ; CHECK-NEXT: [[TMP50:%.*]] = zext <4 x i1> [[TMP49]] to <4 x i32> ; CHECK-NEXT: [[TMP51]] = add <4 x i32> [[TMP38]], [[TMP50]] @@ -166,9 +166,9 @@ define void @dist_vec(ptr nocapture noundef readonly %pA, ptr nocapture noundef ; CHECK-NEXT: br i1 [[CMP130_NOT]], label [[WHILE_END166]], label [[WHILE_BODY132]] ; CHECK: while.end166: ; CHECK-NEXT: [[TMP52:%.*]] = phi <4 x i32> [ [[TMP35]], [[WHILE_END122]] ], [ [[TMP51]], [[WHILE_BODY132]] ] -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP52]], i32 2 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i32> [[TMP52]], i32 3 ; CHECK-NEXT: store i32 [[TMP53]], ptr [[CTT:%.*]], align 4 -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i32> [[TMP52]], i32 3 +; CHECK-NEXT: [[TMP54:%.*]] = 
extractelement <4 x i32> [[TMP52]], i32 2 ; CHECK-NEXT: store i32 [[TMP54]], ptr [[CFF:%.*]], align 4 ; CHECK-NEXT: [[TMP55:%.*]] = extractelement <4 x i32> [[TMP52]], i32 1 ; CHECK-NEXT: store i32 [[TMP55]], ptr [[CTF:%.*]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll index 0a68996410448..dc05967af1529 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reorder-fmuladd-crash.ll @@ -6,7 +6,7 @@ define i32 @foo(i32 %v1, double %v2) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[V1:%.*]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = sitofp <2 x i32> [[TMP0]] to <2 x double> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: br label [[FOR_COND15_PREHEADER:%.*]] ; CHECK: for.cond15.preheader: ; CHECK-NEXT: br label [[IF_END:%.*]] @@ -26,14 +26,15 @@ define i32 @foo(i32 %v1, double %v2) { ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x double>, ptr [[ARRAYIDX43]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP7]]) +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> undef, <4 x double> [[TMP2]], <4 x double> [[TMP8]]) ; CHECK-NEXT: br label [[SW_EPILOG:%.*]] ; CHECK: sw.bb195: ; CHECK-NEXT: br label [[SW_EPILOG]] ; CHECK: do.body: ; CHECK-NEXT: unreachable ; CHECK: sw.epilog: -; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x double> [ undef, 
[[SW_BB195]] ], [ [[TMP8]], [[SW_BB]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x double> [ undef, [[SW_BB195]] ], [ [[TMP9]], [[SW_BB]] ] ; CHECK-NEXT: ret i32 undef ; CHECK: if.end.1: ; CHECK-NEXT: br label [[FOR_COND15_1:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index 28af0de171231..95aa40f664c0c 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -20,17 +20,17 @@ define void @s116_modified(ptr %a) { ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1 ; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 3 ; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> [[TMP9]], [[TMP11]] -; CHECK-NEXT: store <4 x float> [[TMP12]], ptr [[A]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = 
shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP7]], [[TMP9]] +; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[A]], align 4 ; CHECK-NEXT: ret void ; %gep1 = getelementptr inbounds float, ptr %a, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll index 5707e143ad551..89ea15d3ab3bc 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll @@ -143,16 +143,17 @@ define void @gather_2(ptr %mat1, float %0, float %1) { ; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00) -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> zeroinitializer) -; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP2]], 0.000000e+00 +; CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP0]], float 0.000000e+00) +; CHECK-NEXT: 
[[TMP4:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00) +; CHECK-NEXT: [[TMP5:%.*]] = fmul float [[TMP2]], 0.000000e+00 +; CHECK-NEXT: [[TMP6:%.*]] = fmul float [[TMP3]], 0.000000e+00 +; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP4]], 0.000000e+00 ; CHECK-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1 ; CHECK-NEXT: [[ARRAYIDX2_I_I_I278:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 1 -; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX163]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[ARRAYIDX2_I_I_I278]], align 4 +; CHECK-NEXT: [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2 +; CHECK-NEXT: store float [[TMP5]], ptr [[ARRAYIDX163]], align 4 +; CHECK-NEXT: store float [[TMP6]], ptr [[ARRAYIDX2_I_I_I278]], align 4 +; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -183,19 +184,18 @@ define i32 @reorder_indices_1(float %0) { ; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] ; CHECK-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]]) -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: 
[[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer) -; CHECK-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) -; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00 -; CHECK-NEXT: store <2 x float> [[TMP16]], ptr [[NOR1]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP12]], <2 x float> zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) +; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x float> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP14]], 0.000000e+00 +; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[NOR1]], align 4 ; CHECK-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4 ; CHECK-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll index 9c7e8f66c6c6c..cb24a9cefffa2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -68,10 +68,10 @@ define void @pr35497() local_unnamed_addr #0 { ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> ; SSE-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i64> [[TMP4]], zeroinitializer ; SSE-NEXT: store <2 x i64> 
[[TMP5]], ptr undef, align 1 -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], -; SSE-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], -; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <2 x i32> +; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <2 x i32> +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1 +; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], +; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], ; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP5]], ; SSE-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]] ; SSE-NEXT: store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1 @@ -88,10 +88,10 @@ define void @pr35497() local_unnamed_addr #0 { ; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], ; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer ; AVX-NEXT: store <2 x i64> [[TMP4]], ptr undef, align 1 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[ADD]], i32 0 -; AVX-NEXT: [[TMP6:%.*]] = shl <2 x i64> [[TMP5]], -; AVX-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], -; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> +; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 1 +; AVX-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], +; AVX-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], ; AVX-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP4]], ; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP8]], [[TMP9]] ; AVX-NEXT: store <2 x i64> [[TMP10]], ptr [[ARRAYIDX2_2]], align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll index c051d909f752e..ec90ca9bc674d 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll @@ -18,9 +18,9 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) { ; SSE2-LABEL: @reduce_and4( ; SSE2-NEXT: entry: -; SSE2-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> ; SSE2-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) -; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; SSE2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) ; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]] ; SSE2-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] @@ -28,9 +28,9 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, < ; ; SSE42-LABEL: @reduce_and4( ; SSE42-NEXT: entry: -; SSE42-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; SSE42-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> ; SSE42-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) -; SSE42-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; SSE42-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; SSE42-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) ; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]] ; SSE42-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] @@ -92,18 +92,18 @@ entry: define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) { ; SSE2-LABEL: @reduce_and4_transpose( -; 
SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> ; SSE2-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; SSE2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) ; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]] ; SSE2-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] ; SSE2-NEXT: ret i32 [[OP_RDX1]] ; ; SSE42-LABEL: @reduce_and4_transpose( -; SSE42-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; SSE42-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> ; SSE42-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) -; SSE42-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; SSE42-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> ; SSE42-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) ; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]] ; SSE42-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll index b5533463c3930..1a6ff2385905b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll @@ -17,13 +17,12 @@ define i1 @test(ptr %arg, ptr %i233, i64 %i241, ptr %i235, ptr %i237, ptr %i227) ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = 
insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> , <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> , <8 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP4]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP12]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP13]], false +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP11]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP12]], false ; CHECK-NEXT: ret i1 [[OP_RDX]] ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll index f65f61975a61f..cd7ad210ca567 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll @@ -8,12 +8,11 @@ define void @test(ptr noalias %0, ptr %p) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x ptr> [[TMP3]], <8 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 2 ; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP4]], i32 4, <8 x i1> , <8 x float> poison) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> 
poison, <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> , <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> , <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: store <16 x float> [[TMP11]], ptr [[TMP5]], align 4 +; CHECK-NEXT: store <16 x float> [[TMP10]], ptr [[TMP5]], align 4 ; CHECK-NEXT: ret void ; %2 = getelementptr inbounds float, ptr %p, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-vf-to-resize.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-vf-to-resize.ll index af606fc3a738b..d3c978412cdde 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-vf-to-resize.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-vf-to-resize.ll @@ -6,7 +6,7 @@ define void @main(ptr %0) { ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[TMP0:%.*]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> zeroinitializer, [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> zeroinitializer, [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = fcmp oeq <4 x double> [[TMP7]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll index c79e9b94278cd..fb2b653aefc87 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll @@ -12,10 +12,10 @@ define void @test() { ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> , <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> , <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> , <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> , <2 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]]) ; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: @@ -23,12 +23,11 @@ define void @test() { ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[TMP9]], [[BB2]] ], [ zeroinitializer, [[BB1]] ] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[TMP1]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x float> [[TMP1]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = fsub <2 x float> [[TMP12]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fsub <2 x float> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = fsub <2 x 
float> [[TMP14]], zeroinitializer -; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[ARRAYIDX21_I]], align 16 +; CHECK-NEXT: store <2 x float> [[TMP14]], ptr [[ARRAYIDX21_I]], align 16 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll index 8d1d257820f0c..9e3ba05f88da8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll @@ -9,10 +9,10 @@ define void @foo(ptr %this, ptr %p, i32 %add7) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[ADD7:%.*]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = sdiv <2 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: switch i32 undef, label [[SW_EPILOG:%.*]] [ -; CHECK-NEXT: i32 0, label [[SW_BB:%.*]] -; CHECK-NEXT: i32 2, label [[SW_BB]] +; CHECK-NEXT: i32 0, label [[SW_BB:%.*]] +; CHECK-NEXT: i32 2, label [[SW_BB]] ; CHECK-NEXT: ] ; CHECK: sw.bb: ; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[TMP1]], @@ -21,10 +21,11 @@ define void @foo(ptr %this, ptr %p, i32 %add7) { ; CHECK-NEXT: br label [[SW_EPILOG]] ; CHECK: sw.epilog: ; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP5]], [[SW_BB]] ] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = sub <4 x i32> undef, [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[P:%.*]], align 4 ; 
CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll index 9584a663b2d48..46cca9b078ac6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll @@ -182,19 +182,18 @@ define i32 @reorder_indices_1(float %0) { ; CHECK-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] ; CHECK-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]]) -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP13]], <2 x float> zeroinitializer) -; CHECK-NEXT: [[TMP15:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) -; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP15]], 0.000000e+00 -; CHECK-NEXT: store <2 
x float> [[TMP16]], ptr [[NOR1]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> zeroinitializer, <2 x float> [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP10]], <2 x float> [[TMP12]], <2 x float> zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP5]], float 0.000000e+00) +; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x float> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[MUL6_I_I_I:%.*]] = fmul float [[TMP14]], 0.000000e+00 +; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[NOR1]], align 4 ; CHECK-NEXT: store float [[MUL6_I_I_I]], ptr [[ARRAYIDX2_I265]], align 4 ; CHECK-NEXT: ret i32 0 ; From 2685e7eadce08125672f0f6013145ae45b7a5ac3 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Thu, 22 Feb 2024 13:34:00 -0600 Subject: [PATCH 084/546] [lldb][docs] Remove/update docs pointing to unittest2 (#82672) --- lldb/docs/resources/test.rst | 8 ++++---- lldb/docs/testsuite/a-detailed-walkthrough.txt | 9 ++++----- .../Python/lldbsuite/test/README-TestSuite | 14 -------------- 3 files changed, 8 insertions(+), 23 deletions(-) diff --git a/lldb/docs/resources/test.rst b/lldb/docs/resources/test.rst index 52757864539ea..2b0e9010fe280 100644 --- a/lldb/docs/resources/test.rst +++ b/lldb/docs/resources/test.rst @@ -17,8 +17,8 @@ The LLDB test suite consists of three different kinds of test: the output. * **API tests**: Integration tests that interact with the debugger through the SB API. These are written in Python and use LLDB's ``dotest.py`` testing - framework on top of Python's `unittest2 - `_. + framework on top of Python's `unittest + `_. All three test suites use ``lit`` (`LLVM Integrated Tester `_ ) as the test driver. The test @@ -94,7 +94,7 @@ programs from source, run them, and debug the processes. As mentioned before, ``dotest.py`` is LLDB's testing framework. 
The implementation is located under ``lldb/packages/Python/lldbsuite``. We have several extensions and custom test primitives on top of what's offered by -`unittest2 `_. Those can be +`unittest `_. Those can be found in `lldbtest.py `_. @@ -146,7 +146,7 @@ the test should be run or not. :: - @expectedFailure(checking_function_name) + @skipTestIfFn(checking_function_name) In addition to providing a lot more flexibility when it comes to writing the test, the API test also allow for much more complex scenarios when it comes to diff --git a/lldb/docs/testsuite/a-detailed-walkthrough.txt b/lldb/docs/testsuite/a-detailed-walkthrough.txt index 57c9dbce3d0ab..8a7043786c190 100644 --- a/lldb/docs/testsuite/a-detailed-walkthrough.txt +++ b/lldb/docs/testsuite/a-detailed-walkthrough.txt @@ -58,16 +58,15 @@ display their output. For brevity, the '-t' output is not included here. Notice the 'expected failures=1' message at the end of the run. This is because of a bug currently in lldb such that setting target.process.output-path to 'stdout.txt' does not have any effect on the redirection of the standard output -of the subsequent launched process. We are using unittest2 (a backport of new -unittest features for Python 2.4-2.6) to decorate (mark) the particular test -method as such: +of the subsequent launched process. We are using unittest to decorate (mark) +the particular test method as such: - @unittest2.expectedFailure + @unittest.expectedFailure # rdar://problem/8435794 # settings set target.process.output-path does not seem to work def test_set_output_path(self): -See http://pypi.python.org/pypi/unittest2 for more details. +See http://docs.python.org/library/unittest.html for more details. 
Now let's look inside the test method: diff --git a/lldb/packages/Python/lldbsuite/test/README-TestSuite b/lldb/packages/Python/lldbsuite/test/README-TestSuite index f76e836ab777c..388f94da0c409 100644 --- a/lldb/packages/Python/lldbsuite/test/README-TestSuite +++ b/lldb/packages/Python/lldbsuite/test/README-TestSuite @@ -91,20 +91,6 @@ to the Python test suite under the current 'test' directory. Contains platform specific plugin to build binaries with dsym/dwarf debugging info. Other platform specific functionalities may be added in the future. -- unittest2 directory - - Many new features were added to unittest in Python 2.7, including test - discovery. unittest2 allows you to use these features with earlier versions of - Python. - - It currently has unittest2 0.5.1 from http://pypi.python.org/pypi/unittest2. - Version 0.5.1 of unittest2 has feature parity with unittest in Python 2.7 - final. If you want to ensure that your tests run identically under unittest2 - and unittest in Python 2.7 you should use unittest2 0.5.1. - - Later versions of unittest2 include changes in unittest made in Python 3.2 and - onwards after the release of Python 2.7. - - Profiling dotest.py runs I used the following command line thingy to do the profiling on a SnowLeopard From e88c255313872185b8c9738d9fa0e624de1e1bea Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 23 Feb 2024 03:40:39 +0800 Subject: [PATCH 085/546] [InstCombine] Add support for cast instructions in `getFreelyInvertedImpl` (#82451) This patch adds support for cast instructions in `getFreelyInvertedImpl` to enable more optimizations. 
Alive2: https://alive2.llvm.org/ce/z/F6maEE --- .../InstCombine/InstructionCombining.cpp | 14 +++ llvm/test/Transforms/InstCombine/not.ll | 89 +++++++++++++++++++ 2 files changed, 103 insertions(+) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 4af455c37c788..87c8dca7efed8 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2387,6 +2387,20 @@ Value *InstCombiner::getFreelyInvertedImpl(Value *V, bool WillInvertAllUses, return NonNull; } + if (match(V, m_SExtLike(m_Value(A)))) { + if (auto *AV = getFreelyInvertedImpl(A, A->hasOneUse(), Builder, + DoesConsume, Depth)) + return Builder ? Builder->CreateSExt(AV, V->getType()) : NonNull; + return nullptr; + } + + if (match(V, m_Trunc(m_Value(A)))) { + if (auto *AV = getFreelyInvertedImpl(A, A->hasOneUse(), Builder, + DoesConsume, Depth)) + return Builder ? Builder->CreateTrunc(AV, V->getType()) : NonNull; + return nullptr; + } + return nullptr; } diff --git a/llvm/test/Transforms/InstCombine/not.ll b/llvm/test/Transforms/InstCombine/not.ll index 3b0e5b4412fbe..f277d13eee930 100644 --- a/llvm/test/Transforms/InstCombine/not.ll +++ b/llvm/test/Transforms/InstCombine/not.ll @@ -769,3 +769,92 @@ entry: %cmp = icmp sle i32 %select, %not.c ret i1 %cmp } + +define i32 @test_sext(i32 %a, i32 %b){ +; CHECK-LABEL: @test_sext( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[A:%.*]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = sext i1 [[TMP1]] to i32 +; CHECK-NEXT: [[NOT:%.*]] = sub i32 [[TMP2]], [[B:%.*]] +; CHECK-NEXT: ret i32 [[NOT]] +; + %cmp = icmp eq i32 %a, 0 + %sext = sext i1 %cmp to i32 + %add = add i32 %b, %sext + %not = xor i32 %add, -1 + ret i32 %not +} + +define <2 x i32> @test_sext_vec(<2 x i32> %a, <2 x i32> %b){ +; CHECK-LABEL: @test_sext_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[A:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i1> 
[[TMP1]] to <2 x i32> +; CHECK-NEXT: [[NOT:%.*]] = sub <2 x i32> [[TMP2]], [[B:%.*]] +; CHECK-NEXT: ret <2 x i32> [[NOT]] +; + %cmp = icmp eq <2 x i32> %a, zeroinitializer + %sext = sext <2 x i1> %cmp to <2 x i32> + %add = add <2 x i32> %b, %sext + %not = xor <2 x i32> %add, + ret <2 x i32> %not +} + +define i64 @test_zext_nneg(i32 %c1, i64 %c2, i64 %c3){ +; CHECK-LABEL: @test_zext_nneg( +; CHECK-NEXT: [[DOTNEG:%.*]] = add i64 [[C2:%.*]], -4 +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[C1:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[C3:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = add i64 [[DOTNEG]], [[TMP2]] +; CHECK-NEXT: ret i64 [[SUB]] +; + %not = xor i32 %c1, -1 + %conv = zext nneg i32 %not to i64 + %add1 = add i64 %c2, -5 + %add2 = add i64 %conv, %c3 + %sub = sub i64 %add1, %add2 + ret i64 %sub +} + +define i8 @test_trunc(i8 %a){ +; CHECK-LABEL: @test_trunc( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i8 [[A:%.*]], 0 +; CHECK-NEXT: [[NOT:%.*]] = sext i1 [[TMP1]] to i8 +; CHECK-NEXT: ret i8 [[NOT]] +; + %zext = zext i8 %a to i32 + %sub = add nsw i32 %zext, -1 + %shr = ashr i32 %sub, 31 + %conv = trunc i32 %shr to i8 + %not = xor i8 %conv, -1 + ret i8 %not +} + +define <2 x i8> @test_trunc_vec(<2 x i8> %a){ +; CHECK-LABEL: @test_trunc_vec( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i8> [[A:%.*]], zeroinitializer +; CHECK-NEXT: [[NOT:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i8> +; CHECK-NEXT: ret <2 x i8> [[NOT]] +; + %zext = zext <2 x i8> %a to <2 x i32> + %sub = add nsw <2 x i32> %zext, + %shr = ashr <2 x i32> %sub, + %conv = trunc <2 x i32> %shr to <2 x i8> + %not = xor <2 x i8> %conv, + ret <2 x i8> %not +} + +; Negative tests + +define i32 @test_zext(i32 %a, i32 %b){ +; CHECK-LABEL: @test_zext( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[SEXT:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEXT]], [[B:%.*]] +; CHECK-NEXT: [[NOT:%.*]] = xor i32 [[ADD]], -1 +; CHECK-NEXT: ret i32 [[NOT]] +; + %cmp = icmp eq i32 %a, 
0 + %sext = zext i1 %cmp to i32 + %add = add i32 %b, %sext + %not = xor i32 %add, -1 + ret i32 %not +} From 3b20fb336d1191e7b969c30825ca8b9423550902 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Thu, 22 Feb 2024 11:43:11 -0800 Subject: [PATCH 086/546] [bazel] add missing dep after 5b079af169cd04b457465fd7ca31714efeefe6d9 --- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 8d11fb9be188f..09c53c9e8a131 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -613,14 +613,15 @@ libc_support_library( libc_support_library( name = "__support_fixed_point", hdrs = [ - "src/__support/fixed_point/fx_rep.h", "src/__support/fixed_point/fx_bits.h", + "src/__support/fixed_point/fx_rep.h", ], deps = [ ":__support_cpp_bit", ":__support_cpp_type_traits", ":__support_macros_attributes", ":__support_macros_optimization", + ":__support_math_extras", ":llvm_libc_macros_stdfix_macros", ], ) From f5c8e9e53130a628c2c3d25c2cbc308e62d2f3e0 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 22 Feb 2024 19:55:18 +0000 Subject: [PATCH 087/546] LoopVectorize/test: guard pr72969 with asserts (#82653) Follow up on 695a9d8 (LoopVectorize: add test for crash in #72969) to guard pr72969.ll with REQUIRES: asserts, in order to be reasonably confident that it will crash reliably. 
--- llvm/test/Transforms/LoopVectorize/X86/pr72969.ll | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll index 40633c6c8383b..738f5cbaebea5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll @@ -1,3 +1,4 @@ +; REQUIRES: asserts ; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -S < %s ; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -force-vector-width=4 -S < %s ; REQUIRES: asserts From c1e9883a813db76c1b108ad715895928bb93f4c2 Mon Sep 17 00:00:00 2001 From: Matthias Gehre <93204396+mgehre-amd@users.noreply.github.com> Date: Thu, 22 Feb 2024 21:16:33 +0100 Subject: [PATCH 088/546] [TOSA] TosaToLinalg: fix int64_t min/max lowering of clamp (#82641) tosa.clamp takes `min`/`max` attributes as i64, so ensure that the lowering to linalg works for the whole range. 
Co-authored-by: Tiago Trevisan Jost --- .../Conversion/TosaToLinalg/TosaToLinalg.cpp | 24 +++++++++---------- .../TosaToLinalg/tosa-to-linalg.mlir | 15 ++++++++++++ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index 7eb32ebe3228f..7c477f2e1412b 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -384,23 +384,23 @@ createLinalgBodyCalculationForElementwiseOp(Operation *op, ValueRange args, if (isa(op) && isa(elementTy)) { auto intTy = cast(elementTy); - int32_t min = static_cast( - cast(op->getAttr("min_int")).getValue().getSExtValue()); - int32_t max = static_cast( - cast(op->getAttr("max_int")).getValue().getSExtValue()); + int64_t min = + cast(op->getAttr("min_int")).getValue().getSExtValue(); + int64_t max = + cast(op->getAttr("max_int")).getValue().getSExtValue(); if (intTy.isUnsignedInteger()) { - min = std::max(min, 0); - max = std::min( + min = std::max(min, (int64_t)0); + max = std::min( max, APInt::getMaxValue(intTy.getIntOrFloatBitWidth()).getSExtValue()); } else { - min = std::max( - min, APInt::getSignedMinValue(intTy.getIntOrFloatBitWidth()) - .getSExtValue()); - max = std::min( - max, APInt::getSignedMaxValue(intTy.getIntOrFloatBitWidth()) - .getSExtValue()); + min = + std::max(min, APInt::getSignedMinValue(intTy.getIntOrFloatBitWidth()) + .getSExtValue()); + max = + std::min(max, APInt::getSignedMaxValue(intTy.getIntOrFloatBitWidth()) + .getSExtValue()); } auto minVal = rewriter.create( diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir index febe74e876746..1fa783f05f04e 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -759,6 +759,21 @@ func.func @test_i8(%arg0: tensor<1xi8>) -> () { // ----- +// CHECK-LABEL: 
@test_i64 +func.func @test_i64(%arg0: tensor<1xi64>) -> () { + // CHECK: linalg.generic + // CHECK: ^bb0(%[[ARG1:.+]]: i64, + // CHECK-DAG: %[[C127:.+]] = arith.constant -9223372036854775808 + // CHECK-DAG: %[[C126:.+]] = arith.constant 9223372036854775807 + // CHECK-DAG: %[[LOWER:.+]] = arith.maxsi %[[C127]], %[[ARG1]] + // CHECK-DAG: %[[CLAMPED:.+]] = arith.minsi %[[C126]], %[[LOWER]] + %0 = tosa.clamp %arg0 {min_int = -9223372036854775808 : i64, max_int = 9223372036854775807 : i64, min_fp = 0.0 : f32, max_fp = 0.0 : f32} : (tensor<1xi64>) -> tensor<1xi64> + + return +} + +// ----- + + // CHECK-LABEL: @test_clamp_f16 func.func @test_clamp_f16(%arg0: tensor<1xf16>) -> () { // CHECK: linalg.generic From 66f6929fec3ae4770368b60aa1920623ab835f9d Mon Sep 17 00:00:00 2001 From: Chris B Date: Thu, 22 Feb 2024 14:32:24 -0600 Subject: [PATCH 089/546] [HLSL][Doc] Add doc about expected differences (#82395) This document covers expected differences between Clang and the HLSL reference compiler implementations (FXC & DXC). The document is not intended to be exhaustive, but it should be a best effort to cover known cases. This document should document both the behavioral difference and the explanation of why Clang differs. The initial document covers known overload resolution differences. --------- Co-authored-by: S. Bharadwaj Yadavalli --- clang/docs/HLSL/ExpectedDifferences.rst | 110 ++++++++++++++++++++++++ clang/docs/HLSL/HLSLDocs.rst | 1 + 2 files changed, 111 insertions(+) create mode 100644 clang/docs/HLSL/ExpectedDifferences.rst diff --git a/clang/docs/HLSL/ExpectedDifferences.rst b/clang/docs/HLSL/ExpectedDifferences.rst new file mode 100644 index 0000000000000..60001b22dc792 --- /dev/null +++ b/clang/docs/HLSL/ExpectedDifferences.rst @@ -0,0 +1,110 @@ + +Expected Differences vs DXC and FXC +=================================== + ..
contents:: + :local: + +Introduction +============ + +HLSL currently has two reference compilers, the `DirectX Shader Compiler (DXC) +`_ and the +`Effect-Compiler (FXC) `_. +The two reference compilers do not fully agree. Some known disagreements in the +references are tracked on +`DXC's GitHub +`_, +but many more are known to exist. + +HLSL as implemented by Clang will also not fully match either of the reference +implementations, it is instead being written to match the `draft language +specification `_. + +This document is a non-exhaustive collection the known differences between +Clang's implementation of HLSL and the existing reference compilers. + +General Principles +------------------ + +Most of the intended differences between Clang and the earlier reference +compilers are focused on increased consistency and correctness. Both reference +compilers do not always apply language rules the same in all contexts. + +Clang also deviates from the reference compilers by providing different +diagnostics, both in terms of the textual messages and the contexts in which +diagnostics are produced. While striving for a high level of source +compatibility with conforming HLSL code, Clang may produce earlier and more +robust diagnostics for incorrect code or reject code that a reference compiler +incorrectly accepted. + +Language Version +================ + +Clang targets language compatibility for HLSL 2021 as implemented by DXC. +Language features that were removed in earlier versions of HLSL may be added on +a case-by-case basis, but are not planned for the initial implementation. + +Overload Resolution +=================== + +Clang's HLSL implementation adopts C++ overload resolution rules as proposed for +HLSL 202x based on proposal +`0007 `_ +and +`0008 `_. + +Clang's implementation extends standard overload resolution rules to HLSL +library functionality. This causes subtle changes in overload resolution +behavior between Clang and DXC. Some examples include: + +.. 
code-block:: c++ + + void halfOrInt16(half H); + void halfOrInt16(uint16_t U); + void halfOrInt16(int16_t I); + + void takesDoubles(double, double, double); + + cbuffer CB { + uint U; + int I; + float X, Y, Z; + double3 A, B; + } + + export void call() { + halfOrInt16(U); // DXC: Fails with call ambiguous between int16_t and uint16_t overloads + // Clang: Resolves to halfOrInt16(uint16_t). + halfOrInt16(I); // All: Resolves to halfOrInt16(int16_t). + half H; + #ifndef IGNORE_ERRORS + // asfloat16 is a builtin with overloads for half, int16_t, and uint16_t. + H = asfloat16(I); // DXC: Fails to resolve overload for int. + // Clang: Resolves to asfloat16(int16_t). + H = asfloat16(U); // DXC: Fails to resolve overload for int. + // Clang: Resolves to asfloat16(uint16_t). + #endif + H = asfloat16(0x01); // DXC: Resolves to asfloat16(half). + // Clang: Resolves to asfloat16(uint16_t). + + takesDoubles(X, Y, Z); // Works on all compilers + #ifndef IGNORE_ERRORS + fma(X, Y, Z); // DXC: Fails to resolve no known conversion from float to double. + // Clang: Resolves to fma(double,double,double). + #endif + + double D = dot(A, B); // DXC: Resolves to dot(double3, double3), fails DXIL Validation. + // FXC: Expands to compute double dot product with fmul/fadd + // Clang: Resolves to dot(float3, float3), emits conversion warnings. + + } + +.. note:: + + In Clang, a conscious decision was made to exclude the ``dot(vector, vector)`` + overload and allow overload resolution to resolve the + ``vector`` overload. This approach provides ``-Wconversion`` + diagnostic notifying the user of the conversion rather than silently altering + precision relative to the other overloads (as FXC does) or generating code + that will fail validation (as DXC does). diff --git a/clang/docs/HLSL/HLSLDocs.rst b/clang/docs/HLSL/HLSLDocs.rst index 1f232129548d0..97b2425f013b3 100644 --- a/clang/docs/HLSL/HLSLDocs.rst +++ b/clang/docs/HLSL/HLSLDocs.rst @@ -11,6 +11,7 @@ HLSL Design and Implementation .. 
toctree:: :maxdepth: 1 + ExpectedDifferences HLSLIRReference ResourceTypes EntryFunctions From 847048f497bcdfcfe52f36cba49f07bdbd63cd24 Mon Sep 17 00:00:00 2001 From: Diego Caballero Date: Thu, 22 Feb 2024 12:37:32 -0800 Subject: [PATCH 090/546] [mlir][Vector] Fix bug in vector xfer op flattening transformation (#81964) It looks like the affine map generated to compute the indices of the collapsed dimensions used the wrong dim size. For indices `[idx0][idx1]` we computed the collapsed index as `idx0*size0 + idx1` instead of `idx0*size1 + idx1`. This led to correctness issues in convolution tests when enabling this transformation internally. --- .../mlir/Dialect/Utils/IndexingUtils.h | 3 ++ mlir/lib/Dialect/Utils/IndexingUtils.cpp | 11 ++++- .../Transforms/VectorTransferOpTransforms.cpp | 41 +++++++++++-------- .../Vector/vector-transfer-flatten.mlir | 32 ++++++++++++++- 4 files changed, 65 insertions(+), 22 deletions(-) diff --git a/mlir/include/mlir/Dialect/Utils/IndexingUtils.h b/mlir/include/mlir/Dialect/Utils/IndexingUtils.h index 2453d841f633e..9892253df2bff 100644 --- a/mlir/include/mlir/Dialect/Utils/IndexingUtils.h +++ b/mlir/include/mlir/Dialect/Utils/IndexingUtils.h @@ -257,6 +257,9 @@ SmallVector getI64SubArray(ArrayAttr arrayAttr, unsigned dropFront = 0, std::pair> computeLinearIndex(OpFoldResult sourceOffset, ArrayRef strides, ArrayRef indices); +std::pair> +computeLinearIndex(OpFoldResult sourceOffset, ArrayRef strides, + ArrayRef indices); //===----------------------------------------------------------------------===// // Utilities for decomposing larger shapes diff --git a/mlir/lib/Dialect/Utils/IndexingUtils.cpp b/mlir/lib/Dialect/Utils/IndexingUtils.cpp index baaa581ab6f22..4c960659d80cb 100644 --- a/mlir/lib/Dialect/Utils/IndexingUtils.cpp +++ b/mlir/lib/Dialect/Utils/IndexingUtils.cpp @@ -7,13 +7,12 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Utils/IndexingUtils.h" - +#include 
"mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/MLIRContext.h" #include "llvm/ADT/STLExtras.h" - #include #include @@ -306,6 +305,14 @@ mlir::computeLinearIndex(OpFoldResult sourceOffset, return {expr, values}; } +std::pair> +mlir::computeLinearIndex(OpFoldResult sourceOffset, ArrayRef strides, + ArrayRef indices) { + return computeLinearIndex( + sourceOffset, getAsIndexOpFoldResult(sourceOffset.getContext(), strides), + getAsOpFoldResult(ValueRange(indices))); +} + //===----------------------------------------------------------------------===// // TileOffsetRange //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp index 04e5a816dd91e..0ffef6aabccc1 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp @@ -15,6 +15,7 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" @@ -577,7 +578,6 @@ class FlattenContiguousRowMajorTransferReadPattern if (transferReadOp.getMask()) return failure(); - SmallVector collapsedIndices; int64_t firstDimToCollapse = sourceType.getRank() - vectorType.getRank(); // 1. Collapse the source memref @@ -599,12 +599,14 @@ class FlattenContiguousRowMajorTransferReadPattern // 2.2 New indices // If all the collapsed indices are zero then no extra logic is needed. // Otherwise, a new offset/index has to be computed. 
+ SmallVector collapsedIndices; if (failed(checkAndCollapseInnerZeroIndices(transferReadOp.getIndices(), firstDimToCollapse, collapsedIndices))) { - // Copy all the leading indices - collapsedIndices = transferReadOp.getIndices(); - collapsedIndices.resize(firstDimToCollapse); + // Copy all the leading indices. + SmallVector indices = transferReadOp.getIndices(); + collapsedIndices.append(indices.begin(), + indices.begin() + firstDimToCollapse); // Compute the remaining trailing index/offset required for reading from // the collapsed memref: @@ -621,24 +623,26 @@ class FlattenContiguousRowMajorTransferReadPattern // memref<1x86xi32>, vector<2xi32> // one would get the following offset: // %offset = %arg0 * 43 - AffineExpr offsetExpr, idxExpr; - bindSymbols(rewriter.getContext(), offsetExpr, idxExpr); - - int64_t outputRank = transferReadOp.getIndices().size(); - OpFoldResult offset = + OpFoldResult collapsedOffset = rewriter.create(loc, 0).getResult(); - for (int64_t i = firstDimToCollapse; i < outputRank; ++i) { - int64_t dim = dyn_cast(source.getType()).getDimSize(i); - offset = affine::makeComposedFoldedAffineApply( - rewriter, loc, offsetExpr + dim * idxExpr, - {offset, transferReadOp.getIndices()[i]}); - } - if (offset.is()) { - collapsedIndices.push_back(offset.get()); + auto sourceShape = sourceType.getShape(); + auto collapsedStrides = computeSuffixProduct(ArrayRef( + sourceShape.begin() + firstDimToCollapse, sourceShape.end())); + + // Compute the collapsed offset. 
+ ArrayRef indicesToCollapse(indices.begin() + firstDimToCollapse, + indices.end()); + auto &&[collapsedExpr, collapsedVals] = computeLinearIndex( + collapsedOffset, collapsedStrides, indicesToCollapse); + collapsedOffset = affine::makeComposedFoldedAffineApply( + rewriter, loc, collapsedExpr, collapsedVals); + + if (collapsedOffset.is()) { + collapsedIndices.push_back(collapsedOffset.get()); } else { collapsedIndices.push_back(rewriter.create( - loc, *getConstantIntValue(offset))); + loc, *getConstantIntValue(collapsedOffset))); } } @@ -710,6 +714,7 @@ class FlattenContiguousRowMajorTransferWritePattern firstContiguousInnerDim, collapsedIndices))) return failure(); + Value collapsedSource = collapseInnerDims(rewriter, loc, source, firstContiguousInnerDim); MemRefType collapsedSourceType = diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir index 1775b5fa4a346..3b6441d0c9560 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir @@ -83,7 +83,7 @@ func.func @transfer_read_dims_mismatch_non_zero_indices( return } -// CHECK: #[[$ATTR_0:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 * 43)> +// CHECK: #[[$ATTR_0:.+]] = affine_map<()[s0, s1] -> (s0 * 24 + s1 * 6)> // CHECK-LABEL: func.func @transfer_read_dims_mismatch_non_zero_indices( // CHECK-SAME: %[[IDX_1:.*]]: index, %[[IDX_2:.*]]: index, @@ -92,7 +92,7 @@ func.func @transfer_read_dims_mismatch_non_zero_indices( // CHECK: %[[C_0:.*]] = arith.constant 0 : i32 // CHECK: %[[C_0_IDX:.*]] = arith.constant 0 : index // CHECK: %[[COLLAPSED_IN:.*]] = memref.collapse_shape %[[M_IN]] {{\[}}[0], [1, 2, 3]] : memref<1x43x4x6xi32> into memref<1x1032xi32> -// CHECK: %[[COLLAPSED_IDX:.*]] = affine.apply #[[$ATTR_0]]()[%[[IDX_2]], %[[IDX_1]]] +// CHECK: %[[COLLAPSED_IDX:.*]] = affine.apply #[[$ATTR_0]]()[%[[IDX_1]], %[[IDX_2]]] // CHECK: %[[READ:.*]] = vector.transfer_read 
%[[COLLAPSED_IN]][%[[C_0_IDX]], %[[COLLAPSED_IDX]]], %[[C_0]] {in_bounds = [true]} : memref<1x1032xi32>, vector<12xi32> // CHECK: %[[COLLAPSED_OUT:.*]] = memref.collapse_shape %[[M_OUT]] {{\[}}[0, 1, 2]] : memref<1x2x6xi32> into memref<12xi32> // CHECK: vector.transfer_write %[[READ]], %[[COLLAPSED_OUT]][%[[C_0_IDX]]] {in_bounds = [true]} : vector<12xi32>, memref<12xi32> @@ -459,3 +459,31 @@ func.func @fold_unit_dims_entirely(%arg0 : vector<8xi32>, // CHECK-128B-LABEL: func @fold_unit_dims_entirely( // CHECK-128B-NOT: memref.collapse_shape + +// ----- + +func.func @regression_non_contiguous_dim_read(%subview : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>, + %idx0 : index, %idx1 : index) -> vector<2x2xf32> { + %c0 = arith.constant 0 : index + %cst_1 = arith.constant 0.000000e+00 : f32 + %8 = vector.transfer_read %subview[%c0, %idx0, %idx1, %c0], %cst_1 {in_bounds = [true, true]} : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>, vector<2x2xf32> + return %8 : vector<2x2xf32> +} + +// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 2)> +// CHECK-LABEL: func.func @regression_non_contiguous_dim_read( +// CHECK: %[[COLLAPSE:.+]] = memref.collapse_shape %{{.*}} {{\[}}[0], [1], [2, 3]] : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>> into memref<1x3x6xf32, strided<[40, 10, 1], offset: ?>> +// CHECK: %[[APPLY:.*]] = affine.apply #[[$MAP]]() + +// ----- + +func.func @unsupported_non_contiguous_dim_write(%value : vector<2x2xf32>, + %subview : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>, + %idx0 : index, %idx1 : index) { + %c0 = arith.constant 0 : index + vector.transfer_write %value, %subview[%c0, %idx0, %idx1, %c0] {in_bounds = [true, true]} : vector<2x2xf32>, memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>> + return +} + +// CHECK-LABEL: func.func @unsupported_non_contiguous_dim_write( +// CHECK-NOT: memref.collapse_shape From 91e9e3175268c85f4d0e8828d0d392191c250543 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 22 
Feb 2024 13:47:36 -0700 Subject: [PATCH 091/546] [NewPM/CodeGen] Rewrite pass manager nesting (#81068) Currently the new PM infra for codegen puts everything into a MachineFunctionPassManager. The MachineFunctionPassManager owns both Module passes and MachineFunction passes, and batches adjacent MachineFunction passes like a typical PassManager. The current MachineFunctionAnalysisManager also directly references a module and function analysis manager to get results. The initial argument was that the codegen pipeline is relatively "flat", meaning it's mostly machine function passes with a couple of module passes here and there. However, there are a couple of issues with this as compared to a more structured nesting like that of the optimization pipeline. For example, it doesn't allow running function passes then machine function passes on a function and its machine function all at once. It also currently requires the caller to split out the IR passes into one pass manager and the MIR passes into another pass manager. This patch rewrites the new pass manager infra for the codegen pipeline to be more similar to the nesting in the optimization pipeline. Basically, a Function contains a MachineFunction. So we can have Module -> Function -> MachineFunction adaptors. It also rewrites the analysis managers to have inner/outer proxies like the ones in the optimization pipeline. The new pass managers/adaptors/analysis managers can be seen in use in PassManagerTest.cpp. This allows us to consolidate to just having to add to one ModulePassManager when using the codegen pipeline. I haven't added the Function -> MachineFunction adaptor in this patch, but it should be added when we merge AddIRPass/AddMachinePass so that we can run IR and MIR passes on a function before proceeding to the next function. The MachineFunctionProperties infra for MIR verification is still WIP.
--- .../include/llvm/CodeGen/MachinePassManager.h | 405 ++++++++++-------- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 108 +++-- llvm/include/llvm/Passes/PassBuilder.h | 15 +- llvm/include/llvm/Target/TargetMachine.h | 10 +- llvm/lib/CodeGen/MachinePassManager.cpp | 183 ++++---- llvm/lib/Passes/PassBuilder.cpp | 48 ++- llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp | 9 +- llvm/lib/Target/X86/X86TargetMachine.h | 7 +- llvm/test/tools/llc/new-pm/pipeline.mir | 3 +- llvm/test/tools/llc/new-pm/start-stop.ll | 7 +- llvm/tools/llc/NewPMDriver.cpp | 85 +--- llvm/unittests/CodeGen/PassManagerTest.cpp | 213 +++------ .../MIR/PassBuilderCallbacksTest.cpp | 216 ++++++---- 13 files changed, 681 insertions(+), 628 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachinePassManager.h b/llvm/include/llvm/CodeGen/MachinePassManager.h index a0ad7d7a95a28..7713c55661ccc 100644 --- a/llvm/include/llvm/CodeGen/MachinePassManager.h +++ b/llvm/include/llvm/CodeGen/MachinePassManager.h @@ -25,17 +25,18 @@ #include "llvm/ADT/FunctionExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/PassManagerInternal.h" #include "llvm/Support/Error.h" -#include - namespace llvm { class Module; class Function; class MachineFunction; extern template class AnalysisManager; +using MachineFunctionAnalysisManager = AnalysisManager; /// A CRTP mix-in that provides informational APIs needed for machine passes. /// @@ -46,217 +47,247 @@ struct MachinePassInfoMixin : public PassInfoMixin { // TODO: Add MachineFunctionProperties support. }; -/// An AnalysisManager that also exposes IR analysis results. 
-class MachineFunctionAnalysisManager : public AnalysisManager { -public: - using Base = AnalysisManager; +namespace detail { +struct MachinePassConcept + : PassConcept { + virtual MachineFunctionProperties getRequiredProperties() const = 0; + virtual MachineFunctionProperties getSetProperties() const = 0; + virtual MachineFunctionProperties getClearedProperties() const = 0; +}; - MachineFunctionAnalysisManager() : FAM(nullptr), MAM(nullptr) {} - MachineFunctionAnalysisManager(FunctionAnalysisManager &FAM, - ModuleAnalysisManager &MAM) - : FAM(&FAM), MAM(&MAM) {} - MachineFunctionAnalysisManager(MachineFunctionAnalysisManager &&) = default; - MachineFunctionAnalysisManager & - operator=(MachineFunctionAnalysisManager &&) = default; +template struct MachinePassModel : MachinePassConcept { + explicit MachinePassModel(PassT Pass) : Pass(std::move(Pass)) {} + // We have to explicitly define all the special member functions because MSVC + // refuses to generate them. + MachinePassModel(const MachinePassModel &Arg) : Pass(Arg.Pass) {} + MachinePassModel(MachinePassModel &&Arg) : Pass(std::move(Arg.Pass)) {} - /// Get the result of an analysis pass for a Function. - /// - /// Runs the analysis if a cached result is not available. - template typename PassT::Result &getResult(Function &F) { - return FAM->getResult(F); + friend void swap(MachinePassModel &LHS, MachinePassModel &RHS) { + using std::swap; + swap(LHS.Pass, RHS.Pass); } - /// Get the cached result of an analysis pass for a Function. - /// - /// This method never runs the analysis. - /// - /// \returns null if there is no cached result. - template - typename PassT::Result *getCachedResult(Function &F) { - return FAM->getCachedResult(F); + MachinePassModel &operator=(MachinePassModel RHS) { + swap(*this, RHS); + return *this; } - /// Get the result of an analysis pass for a Module. - /// - /// Runs the analysis if a cached result is not available. 
- template typename PassT::Result &getResult(Module &M) { - return MAM->getResult(M); + PreservedAnalyses run(MachineFunction &IR, + MachineFunctionAnalysisManager &AM) override { + return Pass.run(IR, AM); } - /// Get the cached result of an analysis pass for a Module. - /// - /// This method never runs the analysis. - /// - /// \returns null if there is no cached result. - template typename PassT::Result *getCachedResult(Module &M) { - return MAM->getCachedResult(M); + void printPipeline( + raw_ostream &OS, + function_ref MapClassName2PassName) override { + Pass.printPipeline(OS, MapClassName2PassName); } - /// Get the result of an analysis pass for a MachineFunction. - /// - /// Runs the analysis if a cached result is not available. - using Base::getResult; + StringRef name() const override { return PassT::name(); } - /// Get the cached result of an analysis pass for a MachineFunction. - /// - /// This method never runs the analysis. - /// - /// returns null if there is no cached result. - using Base::getCachedResult; - - // FIXME: Add LoopAnalysisManager or CGSCCAnalysisManager if needed. 
- FunctionAnalysisManager *FAM; - ModuleAnalysisManager *MAM; -}; + template + using has_required_t = decltype(std::declval().isRequired()); + template + static std::enable_if_t::value, bool> + passIsRequiredImpl() { + return T::isRequired(); + } + template + static std::enable_if_t::value, bool> + passIsRequiredImpl() { + return false; + } + bool isRequired() const override { return passIsRequiredImpl(); } + + template + using has_get_required_properties_t = + decltype(std::declval().getRequiredProperties()); + template + static std::enable_if_t::value, + MachineFunctionProperties> + getRequiredPropertiesImpl() { + return PassT::getRequiredProperties(); + } + template + static std::enable_if_t::value, + MachineFunctionProperties> + getRequiredPropertiesImpl() { + return MachineFunctionProperties(); + } + MachineFunctionProperties getRequiredProperties() const override { + return getRequiredPropertiesImpl(); + } -extern template class PassManager; + template + using has_get_set_properties_t = + decltype(std::declval().getSetProperties()); + template + static std::enable_if_t::value, + MachineFunctionProperties> + getSetPropertiesImpl() { + return PassT::getSetProperties(); + } + template + static std::enable_if_t::value, + MachineFunctionProperties> + getSetPropertiesImpl() { + return MachineFunctionProperties(); + } + MachineFunctionProperties getSetProperties() const override { + return getSetPropertiesImpl(); + } -/// MachineFunctionPassManager adds/removes below features to/from the base -/// PassManager template instantiation. -/// -/// - Support passes that implement doInitialization/doFinalization. This is for -/// machine function passes to work on module level constructs. One such pass -/// is AsmPrinter. -/// -/// - Support machine module pass which runs over the module (for example, -/// MachineOutliner). 
A machine module pass needs to define the method: -/// -/// ```Error run(Module &, MachineFunctionAnalysisManager &)``` -/// -/// FIXME: machine module passes still need to define the usual machine -/// function pass interface, namely, -/// `PreservedAnalyses run(MachineFunction &, -/// MachineFunctionAnalysisManager &)` -/// But this interface wouldn't be executed. It is just a placeholder -/// to satisfy the pass manager type-erased inteface. This -/// special-casing of machine module pass is due to its limited use -/// cases and the unnecessary complexity it may bring to the machine -/// pass manager. -/// -/// - The base class `run` method is replaced by an alternative `run` method. -/// See details below. -/// -/// - Support codegening in the SCC order. Users include interprocedural -/// register allocation (IPRA). -class MachineFunctionPassManager - : public PassManager { - using Base = PassManager; + template + using has_get_cleared_properties_t = + decltype(std::declval().getClearedProperties()); + template + static std::enable_if_t::value, + MachineFunctionProperties> + getClearedPropertiesImpl() { + return PassT::getClearedProperties(); + } + template + static std::enable_if_t::value, + MachineFunctionProperties> + getClearedPropertiesImpl() { + return MachineFunctionProperties(); + } + MachineFunctionProperties getClearedProperties() const override { + return getClearedPropertiesImpl(); + } + PassT Pass; +}; +} // namespace detail + +using MachineFunctionAnalysisManagerModuleProxy = + InnerAnalysisManagerProxy; + +template <> +bool MachineFunctionAnalysisManagerModuleProxy::Result::invalidate( + Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &Inv); +extern template class InnerAnalysisManagerProxy; + +extern template class OuterAnalysisManagerProxy; +/// Provide the \c ModuleAnalysisManager to \c Function proxy. 
+using ModuleAnalysisManagerMachineFunctionProxy = + OuterAnalysisManagerProxy; + +class FunctionAnalysisManagerMachineFunctionProxy + : public AnalysisInfoMixin { public: - MachineFunctionPassManager(bool RequireCodeGenSCCOrder = false, - bool VerifyMachineFunction = false) - : RequireCodeGenSCCOrder(RequireCodeGenSCCOrder), - VerifyMachineFunction(VerifyMachineFunction) {} - MachineFunctionPassManager(MachineFunctionPassManager &&) = default; - MachineFunctionPassManager & - operator=(MachineFunctionPassManager &&) = default; - - /// Run machine passes for a Module. + class Result { + public: + explicit Result(FunctionAnalysisManager &FAM) : FAM(&FAM) {} + + Result(Result &&Arg) : FAM(std::move(Arg.FAM)) { + // We have to null out the analysis manager in the moved-from state + // because we are taking ownership of the responsibilty to clear the + // analysis state. + Arg.FAM = nullptr; + } + + ~Result() { + // FAM is cleared in a moved from state where there is nothing to do. + if (!FAM) + return; + + // Clear out the analysis manager if we're being destroyed -- it means we + // didn't even see an invalidate call when we got invalidated. + FAM->clear(); + } + + Result &operator=(Result &&RHS) { + FAM = RHS.FAM; + // We have to null out the analysis manager in the moved-from state + // because we are taking ownership of the responsibilty to clear the + // analysis state. + RHS.FAM = nullptr; + return *this; + } + + /// Accessor for the analysis manager. + FunctionAnalysisManager &getManager() { return *FAM; } + + /// Handler for invalidation of the outer IR unit, \c IRUnitT. + /// + /// If the proxy analysis itself is not preserved, we assume that the set of + /// inner IR objects contained in IRUnit may have changed. In this case, + /// we have to call \c clear() on the inner analysis manager, as it may now + /// have stale pointers to its inner IR objects. 
+ /// + /// Regardless of whether the proxy analysis is marked as preserved, all of + /// the analyses in the inner analysis manager are potentially invalidated + /// based on the set of preserved analyses. + bool invalidate(MachineFunction &IR, const PreservedAnalyses &PA, + MachineFunctionAnalysisManager::Invalidator &Inv); + + private: + FunctionAnalysisManager *FAM; + }; + + explicit FunctionAnalysisManagerMachineFunctionProxy( + FunctionAnalysisManager &FAM) + : FAM(&FAM) {} + + /// Run the analysis pass and create our proxy result object. /// - /// The intended use is to start the codegen pipeline for a Module. The base - /// class's `run` method is deliberately hidden by this due to the observation - /// that we don't yet have the use cases of compositing two instances of - /// machine pass managers, or compositing machine pass managers with other - /// types of pass managers. - Error run(Module &M, MachineFunctionAnalysisManager &MFAM); - - template void addPass(PassT &&Pass) { - Base::addPass(std::forward(Pass)); - PassConceptT *P = Passes.back().get(); - addDoInitialization(P); - addDoFinalization(P); - - // Add machine module pass. - addRunOnModule(P); + /// This doesn't do any interesting work; it is primarily used to insert our + /// proxy result object into the outer analysis cache so that we can proxy + /// invalidation to the inner analysis manager. 
+ Result run(MachineFunction &, MachineFunctionAnalysisManager &) { + return Result(*FAM); } -private: - template - using has_init_t = decltype(std::declval().doInitialization( - std::declval(), - std::declval())); - - template - std::enable_if_t::value> - addDoInitialization(PassConceptT *Pass) {} - - template - std::enable_if_t::value> - addDoInitialization(PassConceptT *Pass) { - using PassModelT = detail::PassModel; - auto *P = static_cast(Pass); - InitializationFuncs.emplace_back( - [=](Module &M, MachineFunctionAnalysisManager &MFAM) { - return P->Pass.doInitialization(M, MFAM); - }); - } + static AnalysisKey Key; - template - using has_fini_t = decltype(std::declval().doFinalization( - std::declval(), - std::declval())); - - template - std::enable_if_t::value> - addDoFinalization(PassConceptT *Pass) {} - - template - std::enable_if_t::value> - addDoFinalization(PassConceptT *Pass) { - using PassModelT = detail::PassModel; - auto *P = static_cast(Pass); - FinalizationFuncs.emplace_back( - [=](Module &M, MachineFunctionAnalysisManager &MFAM) { - return P->Pass.doFinalization(M, MFAM); - }); - } +private: + FunctionAnalysisManager *FAM; +}; - template - using is_machine_module_pass_t = decltype(std::declval().run( - std::declval(), - std::declval())); - - template - using is_machine_function_pass_t = decltype(std::declval().run( - std::declval(), - std::declval())); - - template - std::enable_if_t::value> - addRunOnModule(PassConceptT *Pass) {} - - template - std::enable_if_t::value> - addRunOnModule(PassConceptT *Pass) { - static_assert(is_detected::value, - "machine module pass needs to define machine function pass " - "api. 
sorry."); - - using PassModelT = detail::PassModel; - auto *P = static_cast(Pass); - MachineModulePasses.emplace( - Passes.size() - 1, - [=](Module &M, MachineFunctionAnalysisManager &MFAM) { - return P->Pass.run(M, MFAM); - }); - } +class ModuleToMachineFunctionPassAdaptor + : public PassInfoMixin { + using MachinePassConcept = detail::MachinePassConcept; - using FuncTy = Error(Module &, MachineFunctionAnalysisManager &); - SmallVector, 4> InitializationFuncs; - SmallVector, 4> FinalizationFuncs; +public: + explicit ModuleToMachineFunctionPassAdaptor( + std::unique_ptr Pass) + : Pass(std::move(Pass)) {} - using PassIndex = decltype(Passes)::size_type; - std::map> MachineModulePasses; + /// Runs the function pass across every function in the module. + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + void printPipeline(raw_ostream &OS, + function_ref MapClassName2PassName); - // Run codegen in the SCC order. - bool RequireCodeGenSCCOrder; + static bool isRequired() { return true; } - bool VerifyMachineFunction; +private: + std::unique_ptr Pass; }; +template +ModuleToMachineFunctionPassAdaptor +createModuleToMachineFunctionPassAdaptor(MachineFunctionPassT &&Pass) { + using PassModelT = detail::MachinePassModel; + // Do not use make_unique, it causes too many template instantiations, + // causing terrible compile times. + return ModuleToMachineFunctionPassAdaptor( + std::unique_ptr( + new PassModelT(std::forward(Pass)))); +} + +template <> +PreservedAnalyses +PassManager::run(MachineFunction &, + AnalysisManager &); +extern template class PassManager; + +/// Convenience typedef for a pass manager over functions. 
+using MachineFunctionPassManager = PassManager; + } // end namespace llvm #endif // LLVM_CODEGEN_MACHINEPASSMANAGER_H diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 80bbfb75185a9..dc60727729f73 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -37,6 +37,7 @@ #include "llvm/CodeGen/InterleavedLoadCombine.h" #include "llvm/CodeGen/JMCInstrumenter.h" #include "llvm/CodeGen/LowerEmuTLS.h" +#include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" #include "llvm/CodeGen/ReplaceWithVeclib.h" @@ -88,12 +89,8 @@ namespace llvm { #define DUMMY_MACHINE_MODULE_PASS(NAME, PASS_NAME) \ struct PASS_NAME : public MachinePassInfoMixin { \ template PASS_NAME(Ts &&...) {} \ - Error run(Module &, MachineFunctionAnalysisManager &) { \ - return Error::success(); \ - } \ - PreservedAnalyses run(MachineFunction &, \ - MachineFunctionAnalysisManager &) { \ - llvm_unreachable("this api is to make new PM api happy"); \ + PreservedAnalyses run(Module &, ModuleAnalysisManager &) { \ + return PreservedAnalyses::all(); \ } \ }; #define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME) \ @@ -132,8 +129,8 @@ template class CodeGenPassBuilder { Opt.OptimizeRegAlloc = getOptLevel() != CodeGenOptLevel::None; } - Error buildPipeline(ModulePassManager &MPM, MachineFunctionPassManager &MFPM, - raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, + Error buildPipeline(ModulePassManager &MPM, raw_pwrite_stream &Out, + raw_pwrite_stream *DwoOut, CodeGenFileType FileType) const; PassInstrumentationCallbacks *getPassInstrumentationCallbacks() const { @@ -149,7 +146,15 @@ template class CodeGenPassBuilder { using is_function_pass_t = decltype(std::declval().run( std::declval(), std::declval())); + template + using is_machine_function_pass_t = decltype(std::declval().run( + std::declval(), + std::declval())); + // 
Function object to maintain state while adding codegen IR passes. + // TODO: add a Function -> MachineFunction adaptor and merge + // AddIRPass/AddMachinePass so we can have a function pipeline that runs both + // function passes and machine function passes. class AddIRPass { public: AddIRPass(ModulePassManager &MPM, const DerivedT &PB) : MPM(MPM), PB(PB) {} @@ -196,31 +201,47 @@ template class CodeGenPassBuilder { // Function object to maintain state while adding codegen machine passes. class AddMachinePass { public: - AddMachinePass(MachineFunctionPassManager &PM, const DerivedT &PB) - : PM(PM), PB(PB) {} + AddMachinePass(ModulePassManager &MPM, const DerivedT &PB) + : MPM(MPM), PB(PB) {} + ~AddMachinePass() { + if (!MFPM.isEmpty()) + MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM))); + } + + template + void operator()(PassT &&Pass, bool Force = false, + StringRef Name = PassT::name()) { + static_assert((is_detected::value || + is_detected::value) && + "Only module pass and function pass are supported."); - template void operator()(PassT &&Pass) { - if (!PB.runBeforeAdding(PassT::name())) + if (!Force && !PB.runBeforeAdding(Name)) return; - PM.addPass(std::forward(Pass)); + // Add Function Pass + if constexpr (is_detected::value) { + MFPM.addPass(std::forward(Pass)); - for (auto &C : PB.AfterCallbacks) - C(PassT::name()); - } + for (auto &C : PB.AfterCallbacks) + C(Name); + } else { + // Add Module Pass + if (!MFPM.isEmpty()) { + MPM.addPass( + createModuleToMachineFunctionPassAdaptor(std::move(MFPM))); + MFPM = MachineFunctionPassManager(); + } - template void insertPass(StringRef PassName, PassT Pass) { - PB.AfterCallbacks.emplace_back( - [this, PassName, Pass = std::move(Pass)](StringRef Name) { - if (PassName == Name) - this->PM.addPass(std::move(Pass)); - }); - } + MPM.addPass(std::forward(Pass)); - MachineFunctionPassManager releasePM() { return std::move(PM); } + for (auto &C : PB.AfterCallbacks) + C(Name); + } + } private: - 
MachineFunctionPassManager &PM; + ModulePassManager &MPM; + MachineFunctionPassManager MFPM; const DerivedT &PB; }; @@ -467,30 +488,43 @@ template class CodeGenPassBuilder { template Error CodeGenPassBuilder::buildPipeline( - ModulePassManager &MPM, MachineFunctionPassManager &MFPM, - raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, + ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, CodeGenFileType FileType) const { auto StartStopInfo = TargetPassConfig::getStartStopInfo(*PIC); if (!StartStopInfo) return StartStopInfo.takeError(); setStartStopPasses(*StartStopInfo); - AddIRPass addIRPass(MPM, derived()); - // `ProfileSummaryInfo` is always valid. - addIRPass(RequireAnalysisPass()); - addIRPass(RequireAnalysisPass()); - addISelPasses(addIRPass); - AddMachinePass addPass(MFPM, derived()); + bool PrintAsm = TargetPassConfig::willCompleteCodeGenPipeline(); + bool PrintMIR = !PrintAsm && FileType != CodeGenFileType::Null; + + { + AddIRPass addIRPass(MPM, derived()); + addIRPass(RequireAnalysisPass()); + addIRPass(RequireAnalysisPass()); + addISelPasses(addIRPass); + } + + AddMachinePass addPass(MPM, derived()); + + if (PrintMIR) + addPass(PrintMIRPreparePass(Out), /*Force=*/true); + if (auto Err = addCoreISelPasses(addPass)) return std::move(Err); if (auto Err = derived().addMachinePasses(addPass)) return std::move(Err); - derived().addAsmPrinter( - addPass, [this, &Out, DwoOut, FileType](MCContext &Ctx) { - return this->TM.createMCStreamer(Out, DwoOut, FileType, Ctx); - }); + if (PrintAsm) { + derived().addAsmPrinter( + addPass, [this, &Out, DwoOut, FileType](MCContext &Ctx) { + return this->TM.createMCStreamer(Out, DwoOut, FileType, Ctx); + }); + } + + if (PrintMIR) + addPass(PrintMIRPass(Out), /*Force=*/true); addPass(FreeMachineFunctionPass()); return verifyStartStop(*StartStopInfo); diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h index 10c5b7c00bae3..6822cfdb4957b 100644 --- 
a/llvm/include/llvm/Passes/PassBuilder.h +++ b/llvm/include/llvm/Passes/PassBuilder.h @@ -133,7 +133,8 @@ class PassBuilder { void crossRegisterProxies(LoopAnalysisManager &LAM, FunctionAnalysisManager &FAM, CGSCCAnalysisManager &CGAM, - ModuleAnalysisManager &MAM); + ModuleAnalysisManager &MAM, + MachineFunctionAnalysisManager *MFAM = nullptr); /// Registers all available module analysis passes. /// @@ -569,9 +570,9 @@ class PassBuilder { ModulePipelineParsingCallbacks.push_back(C); } void registerPipelineParsingCallback( - const std::function - &C) { - MachinePipelineParsingCallbacks.push_back(C); + const std::function)> &C) { + MachineFunctionPipelineParsingCallbacks.push_back(C); } /// @}} @@ -733,8 +734,10 @@ class PassBuilder { // Machine pass callbackcs SmallVector, 2> MachineFunctionAnalysisRegistrationCallbacks; - SmallVector, 2> - MachinePipelineParsingCallbacks; + SmallVector)>, + 2> + MachineFunctionPipelineParsingCallbacks; }; /// This utility template takes care of adding require<> and invalidate<> diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index 7462f61d32b56..d7ce088cad49f 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -34,8 +34,6 @@ using ModulePassManager = PassManager; class Function; class GlobalValue; -class MachineFunctionPassManager; -class MachineFunctionAnalysisManager; class MachineModuleInfoWrapperPass; class Mangler; class MCAsmInfo; @@ -455,11 +453,9 @@ class LLVMTargetMachine : public TargetMachine { bool DisableVerify = true, MachineModuleInfoWrapperPass *MMIWP = nullptr) override; - virtual Error buildCodeGenPipeline(ModulePassManager &, - MachineFunctionPassManager &, - MachineFunctionAnalysisManager &, - raw_pwrite_stream &, raw_pwrite_stream *, - CodeGenFileType, CGPassBuilderOption, + virtual Error buildCodeGenPipeline(ModulePassManager &, raw_pwrite_stream &, + raw_pwrite_stream *, CodeGenFileType, + 
CGPassBuilderOption, PassInstrumentationCallbacks *) { return make_error("buildCodeGenPipeline is not overridden", inconvertibleErrorCode()); diff --git a/llvm/lib/CodeGen/MachinePassManager.cpp b/llvm/lib/CodeGen/MachinePassManager.cpp index d42bbe239830f..9a750b5bed433 100644 --- a/llvm/lib/CodeGen/MachinePassManager.cpp +++ b/llvm/lib/CodeGen/MachinePassManager.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachinePassManager.h" -#include "llvm/CodeGen/FreeMachineFunction.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/IR/PassManagerImpl.h" @@ -19,99 +18,121 @@ using namespace llvm; namespace llvm { -template class AllAnalysesOn; + +AnalysisKey FunctionAnalysisManagerMachineFunctionProxy::Key; + template class AnalysisManager; template class PassManager; +template class InnerAnalysisManagerProxy; +template class OuterAnalysisManagerProxy; + +bool FunctionAnalysisManagerMachineFunctionProxy::Result::invalidate( + MachineFunction &IR, const PreservedAnalyses &PA, + MachineFunctionAnalysisManager::Invalidator &Inv) { + // MachineFunction passes should not invalidate Function analyses. + // TODO: verify that PA doesn't invalidate Function analyses. + return false; +} -Error MachineFunctionPassManager::run(Module &M, - MachineFunctionAnalysisManager &MFAM) { - // MachineModuleAnalysis is a module analysis pass that is never invalidated - // because we don't run any module pass in codegen pipeline. This is very - // important because the codegen state is stored in MMI which is the analysis - // result of MachineModuleAnalysis. MMI should not be recomputed. - auto &MMI = MFAM.getResult(M).getMMI(); - - (void)RequireCodeGenSCCOrder; - assert(!RequireCodeGenSCCOrder && "not implemented"); - - // M is unused here - PassInstrumentation PI = MFAM.getResult(M); - - // Add a PIC to verify machine functions. 
- if (VerifyMachineFunction) { - // No need to pop this callback later since MIR pipeline is flat which means - // current pipeline is the top-level pipeline. Callbacks are not used after - // current pipeline. - PI.pushBeforeNonSkippedPassCallback([](StringRef PassID, Any IR) { - assert(llvm::any_cast(&IR)); - const MachineFunction *MF = llvm::any_cast(IR); - assert(MF && "Machine function should be valid for printing"); - std::string Banner = std::string("After ") + std::string(PassID); - verifyMachineFunction(Banner, *MF); - }); +template <> +bool MachineFunctionAnalysisManagerModuleProxy::Result::invalidate( + Module &M, const PreservedAnalyses &PA, + ModuleAnalysisManager::Invalidator &Inv) { + // If literally everything is preserved, we're done. + if (PA.areAllPreserved()) + return false; // This is still a valid proxy. + + // If this proxy isn't marked as preserved, then even if the result remains + // valid, the key itself may no longer be valid, so we clear everything. + // + // Note that in order to preserve this proxy, a module pass must ensure that + // the MFAM has been completely updated to handle the deletion of functions. + // Specifically, any MFAM-cached results for those functions need to have been + // forcibly cleared. When preserved, this proxy will only invalidate results + // cached on functions *still in the module* at the end of the module pass. + auto PAC = PA.getChecker(); + if (!PAC.preserved() && !PAC.preservedSet>()) { + InnerAM->clear(); + return true; } - for (auto &F : InitializationFuncs) { - if (auto Err = F(M, MFAM)) - return Err; + // FIXME: be more precise, see + // FunctionAnalysisManagerModuleProxy::Result::invalidate. 
+ if (!PA.allAnalysesInSetPreserved>()) { + InnerAM->clear(); + return true; } - unsigned Idx = 0; - size_t Size = Passes.size(); - do { - // Run machine module passes - for (; MachineModulePasses.count(Idx) && Idx != Size; ++Idx) { - if (!PI.runBeforePass(*Passes[Idx], M)) - continue; - if (auto Err = MachineModulePasses.at(Idx)(M, MFAM)) - return Err; - PI.runAfterPass(*Passes[Idx], M, PreservedAnalyses::all()); - } - - // Finish running all passes. - if (Idx == Size) - break; - - // Run machine function passes - - // Get index range of machine function passes. - unsigned Begin = Idx; - for (; !MachineModulePasses.count(Idx) && Idx != Size; ++Idx) - ; - - for (Function &F : M) { - // Do not codegen any 'available_externally' functions at all, they have - // definitions outside the translation unit. - if (F.hasAvailableExternallyLinkage()) - continue; - - MachineFunction &MF = MMI.getOrCreateMachineFunction(F); - - for (unsigned I = Begin, E = Idx; I != E; ++I) { - auto *P = Passes[I].get(); + // Return false to indicate that this result is still a valid proxy. + return false; +} - if (!PI.runBeforePass(*P, MF)) - continue; +PreservedAnalyses +ModuleToMachineFunctionPassAdaptor::run(Module &M, ModuleAnalysisManager &AM) { + auto &MMI = AM.getResult(M).getMMI(); + MachineFunctionAnalysisManager &MFAM = + AM.getResult(M).getManager(); + PassInstrumentation PI = AM.getResult(M); + PreservedAnalyses PA = PreservedAnalyses::all(); + for (Function &F : M) { + // Do not codegen any 'available_externally' functions at all, they have + // definitions outside the translation unit. 
+ if (F.hasAvailableExternallyLinkage()) + continue; + + MachineFunction &MF = MMI.getOrCreateMachineFunction(F); + + if (!PI.runBeforePass(*Pass, MF)) + continue; + PreservedAnalyses PassPA = Pass->run(MF, MFAM); + if (MMI.getMachineFunction(F)) { + MFAM.invalidate(MF, PassPA); + PI.runAfterPass(*Pass, MF, PassPA); + } else { + MFAM.clear(MF, F.getName()); + PI.runAfterPassInvalidated(*Pass, PassPA); + } + PA.intersect(std::move(PassPA)); + } - // TODO: EmitSizeRemarks - PreservedAnalyses PassPA = P->run(MF, MFAM); + return PA; +} - // MF is dangling after FreeMachineFunctionPass - if (P->name() != FreeMachineFunctionPass::name()) { - MFAM.invalidate(MF, PassPA); +void ModuleToMachineFunctionPassAdaptor::printPipeline( + raw_ostream &OS, function_ref MapClassName2PassName) { + OS << "machine-function("; + Pass->printPipeline(OS, MapClassName2PassName); + OS << ')'; +} - PI.runAfterPass(*P, MF, PassPA); - } - } +template <> +PreservedAnalyses +PassManager::run(MachineFunction &MF, + AnalysisManager &MFAM) { + PassInstrumentation PI = MFAM.getResult(MF); + Function &F = MF.getFunction(); + MachineModuleInfo &MMI = + MFAM.getResult(MF) + .getCachedResult(*F.getParent()) + ->getMMI(); + PreservedAnalyses PA = PreservedAnalyses::all(); + for (auto &Pass : Passes) { + if (!PI.runBeforePass(*Pass, MF)) + continue; + + PreservedAnalyses PassPA = Pass->run(MF, MFAM); + if (MMI.getMachineFunction(F)) { + MFAM.invalidate(MF, PassPA); + PI.runAfterPass(*Pass, MF, PassPA); + } else { + MFAM.clear(MF, F.getName()); + PI.runAfterPassInvalidated(*Pass, PassPA); } - } while (true); - - for (auto &F : FinalizationFuncs) { - if (auto Err = F(M, MFAM)) - return Err; + PA.intersect(std::move(PassPA)); } - - return Error::success(); + return PA; } } // namespace llvm diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index f26d95ab1e479..fed7a14c8a2e3 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -91,6 +91,7 @@ 
#include "llvm/CodeGen/JMCInstrumenter.h" #include "llvm/CodeGen/LowerEmuTLS.h" #include "llvm/CodeGen/MIRPrinter.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/SafeStack.h" #include "llvm/CodeGen/SelectOptimize.h" #include "llvm/CodeGen/ShadowStackGCLowering.h" @@ -1259,6 +1260,28 @@ static bool isFunctionPassName(StringRef Name, CallbacksT &Callbacks) { return callbacksAcceptPassName(Name, Callbacks); } +template +static bool isMachineFunctionPassName(StringRef Name, CallbacksT &Callbacks) { + // Explicitly handle pass manager names. + if (Name == "machine-function") + return true; + + // Explicitly handle custom-parsed pass names. + if (parseRepeatPassName(Name)) + return true; + +#define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) \ + if (Name == NAME) \ + return true; +#define MACHINE_FUNCTION_ANALYSIS(NAME, CREATE_PASS) \ + if (Name == "require<" NAME ">" || Name == "invalidate<" NAME ">") \ + return true; + +#include "llvm/Passes/MachinePassRegistry.def" + + return callbacksAcceptPassName(Name, Callbacks); +} + template static bool isLoopNestPassName(StringRef Name, CallbacksT &Callbacks, bool &UseMemorySSA) { @@ -1394,6 +1417,13 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM, MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); return Error::success(); } + if (Name == "machine-function") { + MachineFunctionPassManager MFPM; + if (auto Err = parseMachinePassPipeline(MFPM, InnerPipeline)) + return Err; + MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM))); + return Error::success(); + } if (auto Params = parseFunctionPipelineName(Name)) { if (Params->second) return make_error( @@ -1874,8 +1904,8 @@ Error PassBuilder::parseMachinePass(MachineFunctionPassManager &MFPM, } #include "llvm/Passes/MachinePassRegistry.def" - for (auto &C : MachinePipelineParsingCallbacks) - if (C(Name, MFPM)) + for (auto &C : MachineFunctionPipelineParsingCallbacks) + if (C(Name, MFPM, E.InnerPipeline)) return 
Error::success(); return make_error( formatv("unknown machine pass '{0}'", Name).str(), @@ -1942,7 +1972,8 @@ Error PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM, void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM, FunctionAnalysisManager &FAM, CGSCCAnalysisManager &CGAM, - ModuleAnalysisManager &MAM) { + ModuleAnalysisManager &MAM, + MachineFunctionAnalysisManager *MFAM) { MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); }); MAM.registerPass([&] { return CGSCCAnalysisManagerModuleProxy(CGAM); }); CGAM.registerPass([&] { return ModuleAnalysisManagerCGSCCProxy(MAM); }); @@ -1950,6 +1981,14 @@ void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM, FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); }); FAM.registerPass([&] { return LoopAnalysisManagerFunctionProxy(LAM); }); LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); }); + if (MFAM) { + MAM.registerPass( + [&] { return MachineFunctionAnalysisManagerModuleProxy(*MFAM); }); + MFAM->registerPass( + [&] { return ModuleAnalysisManagerMachineFunctionProxy(MAM); }); + MFAM->registerPass( + [&] { return FunctionAnalysisManagerMachineFunctionProxy(FAM); }); + } } Error PassBuilder::parseModulePassPipeline(ModulePassManager &MPM, @@ -1991,6 +2030,9 @@ Error PassBuilder::parsePassPipeline(ModulePassManager &MPM, UseMemorySSA)) { Pipeline = {{"function", {{UseMemorySSA ? 
"loop-mssa" : "loop", std::move(*Pipeline)}}}}; + } else if (isMachineFunctionPassName( + FirstName, MachineFunctionPipelineParsingCallbacks)) { + Pipeline = {{"machine-function", std::move(*Pipeline)}}; } else { for (auto &C : TopLevelPipelineParsingCallbacks) if (C(MPM, *Pipeline)) diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp index 4a11dd2e31acd..a620ba911ec61 100644 --- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp +++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp @@ -47,10 +47,9 @@ Error X86CodeGenPassBuilder::addInstSelector(AddMachinePass &) const { } // namespace Error X86TargetMachine::buildCodeGenPipeline( - ModulePassManager &MPM, MachineFunctionPassManager &MFPM, - MachineFunctionAnalysisManager &, raw_pwrite_stream &Out, - raw_pwrite_stream *DwoOut, CodeGenFileType FileType, - CGPassBuilderOption Opt, PassInstrumentationCallbacks *PIC) { + ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, + CodeGenFileType FileType, CGPassBuilderOption Opt, + PassInstrumentationCallbacks *PIC) { auto CGPB = X86CodeGenPassBuilder(*this, Opt, PIC); - return CGPB.buildPipeline(MPM, MFPM, Out, DwoOut, FileType); + return CGPB.buildPipeline(MPM, Out, DwoOut, FileType); } diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h index f31c971df9584..0fd3e47aaefe7 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.h +++ b/llvm/lib/Target/X86/X86TargetMachine.h @@ -58,10 +58,9 @@ class X86TargetMachine final : public LLVMTargetMachine { createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override; - Error buildCodeGenPipeline(ModulePassManager &, MachineFunctionPassManager &, - MachineFunctionAnalysisManager &, - raw_pwrite_stream &, raw_pwrite_stream *, - CodeGenFileType, CGPassBuilderOption, + Error buildCodeGenPipeline(ModulePassManager &, raw_pwrite_stream &, + raw_pwrite_stream *, 
CodeGenFileType, + CGPassBuilderOption, PassInstrumentationCallbacks *) override; bool isJIT() const { return IsJIT; } diff --git a/llvm/test/tools/llc/new-pm/pipeline.mir b/llvm/test/tools/llc/new-pm/pipeline.mir index c7dda4b6d1356..fcc7d4f8f02e3 100644 --- a/llvm/test/tools/llc/new-pm/pipeline.mir +++ b/llvm/test/tools/llc/new-pm/pipeline.mir @@ -1,7 +1,6 @@ # RUN: llc -mtriple=x86_64-pc-linux-gnu -x mir -passes=no-op-machine-function --print-pipeline-passes -filetype=null < %s | FileCheck %s --match-full-lines -# CHECK: IR pipeline: PrintMIRPreparePass -# CHECK: MIR pipeline: no-op-machine-function,print,FreeMachineFunctionPass +# CHECK: machine-function(no-op-machine-function),PrintMIRPreparePass,machine-function(print,FreeMachineFunctionPass) --- name: f diff --git a/llvm/test/tools/llc/new-pm/start-stop.ll b/llvm/test/tools/llc/new-pm/start-stop.ll index c25e45d1f7ab9..8c795a7a70f81 100644 --- a/llvm/test/tools/llc/new-pm/start-stop.ll +++ b/llvm/test/tools/llc/new-pm/start-stop.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -filetype=null %s | FileCheck --match-full-lines %s - -; CHECK: IR pipeline: function(mergeicmps,expand-memcmp,gc-lowering) +; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -filetype=null %s | FileCheck --match-full-lines %s --check-prefix=NULL +; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -o /dev/null %s | FileCheck --match-full-lines %s --check-prefix=OBJ +; NULL: function(mergeicmps,expand-memcmp,gc-lowering) +; OBJ: function(mergeicmps,expand-memcmp,gc-lowering),PrintMIRPreparePass,machine-function(print) diff --git a/llvm/tools/llc/NewPMDriver.cpp b/llvm/tools/llc/NewPMDriver.cpp index c3288ef9d0808..6ae1b8db5e115 100644 --- a/llvm/tools/llc/NewPMDriver.cpp +++ 
b/llvm/tools/llc/NewPMDriver.cpp @@ -89,30 +89,6 @@ bool LLCDiagnosticHandler::handleDiagnostics(const DiagnosticInfo &DI) { static llvm::ExitOnError ExitOnErr; -static void RunPasses(bool BOS, ToolOutputFile *Out, Module *M, - LLVMContext &Context, SmallString<0> &Buffer, - ModulePassManager *MPM, ModuleAnalysisManager *MAM, - MachineFunctionPassManager &MFPM, - MachineFunctionAnalysisManager &MFAM) { - assert(M && "invalid input module!"); - - // Before executing passes, print the final values of the LLVM options. - cl::PrintOptionValues(); - - if (MPM) { - assert(MAM && "expect a ModuleAnalysisManager!"); - MPM->run(*M, *MAM); - } - - ExitOnErr(MFPM.run(*M, MFAM)); - - if (Context.getDiagHandlerPtr()->HasErrors) - exit(1); - - if (BOS) - Out->os() << Buffer; -} - int llvm::compileModuleWithNewPM( StringRef Arg0, std::unique_ptr M, std::unique_ptr MIR, std::unique_ptr Target, std::unique_ptr Out, @@ -131,16 +107,6 @@ int llvm::compileModuleWithNewPM( raw_pwrite_stream *OS = &Out->os(); - // Manually do the buffering rather than using buffer_ostream, - // so we can memcmp the contents in CompileTwice mode in future. 
- SmallString<0> Buffer; - std::unique_ptr BOS; - if ((codegen::getFileType() != CodeGenFileType::AssemblyFile && - !Out->os().supportsSeeking())) { - BOS = std::make_unique(Buffer); - OS = BOS.get(); - } - // Fetch options from TargetPassConfig CGPassBuilderOption Opt = getCGPassBuilderOption(); Opt.DisableVerify = NoVerify; @@ -158,20 +124,19 @@ int llvm::compileModuleWithNewPM( FunctionAnalysisManager FAM; CGSCCAnalysisManager CGAM; ModuleAnalysisManager MAM; + MachineFunctionAnalysisManager MFAM; PassBuilder PB(Target.get(), PipelineTuningOptions(), std::nullopt, &PIC); PB.registerModuleAnalyses(MAM); PB.registerCGSCCAnalyses(CGAM); PB.registerFunctionAnalyses(FAM); PB.registerLoopAnalyses(LAM); - PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + PB.registerMachineFunctionAnalyses(MFAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM, &MFAM); FAM.registerPass([&] { return TargetLibraryAnalysis(TLII); }); MAM.registerPass([&] { return MachineModuleAnalysis(MMI); }); - MachineFunctionAnalysisManager MFAM(FAM, MAM); - ModulePassManager MPM; - MachineFunctionPassManager MFPM; if (!PassPipeline.empty()) { // Construct a custom pass pipeline that starts after instruction @@ -182,49 +147,39 @@ int llvm::compileModuleWithNewPM( return 1; } - ExitOnErr(PB.parsePassPipeline(MFPM, PassPipeline)); + // FIXME: verify that there are no IR passes. + ExitOnErr(PB.parsePassPipeline(MPM, PassPipeline)); MPM.addPass(PrintMIRPreparePass(*OS)); + MachineFunctionPassManager MFPM; MFPM.addPass(PrintMIRPass(*OS)); MFPM.addPass(FreeMachineFunctionPass()); + MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM))); - auto &MMI = MFAM.getResult(*M).getMMI(); if (MIR->parseMachineFunctions(*M, MMI)) return 1; } else { - ExitOnErr(LLVMTM.buildCodeGenPipeline(MPM, MFPM, MFAM, *OS, - DwoOut ? 
&DwoOut->os() : nullptr, - FileType, Opt, &PIC)); - - auto StartStopInfo = TargetPassConfig::getStartStopInfo(PIC); - assert(StartStopInfo && "Expect StartStopInfo!"); - - if (auto StopPassName = StartStopInfo->StopPass; !StopPassName.empty()) { - MFPM.addPass(PrintMIRPass(*OS)); - MFPM.addPass(FreeMachineFunctionPass()); - } + ExitOnErr(LLVMTM.buildCodeGenPipeline( + MPM, *OS, DwoOut ? &DwoOut->os() : nullptr, FileType, Opt, &PIC)); } if (PrintPipelinePasses) { - std::string IRPipeline; - raw_string_ostream IRSOS(IRPipeline); - MPM.printPipeline(IRSOS, [&PIC](StringRef ClassName) { - auto PassName = PIC.getPassNameForClassName(ClassName); - return PassName.empty() ? ClassName : PassName; - }); - outs() << "IR pipeline: " << IRPipeline << '\n'; - - std::string MIRPipeline; - raw_string_ostream MIRSOS(MIRPipeline); - MFPM.printPipeline(MIRSOS, [&PIC](StringRef ClassName) { + std::string PipelineStr; + raw_string_ostream OS(PipelineStr); + MPM.printPipeline(OS, [&PIC](StringRef ClassName) { auto PassName = PIC.getPassNameForClassName(ClassName); return PassName.empty() ? ClassName : PassName; }); - outs() << "MIR pipeline: " << MIRPipeline << '\n'; + outs() << PipelineStr << '\n'; return 0; } - RunPasses(BOS.get(), Out.get(), M.get(), Context, Buffer, &MPM, &MAM, MFPM, - MFAM); + // Before executing passes, print the final values of the LLVM options. + cl::PrintOptionValues(); + + MPM.run(*M, MAM); + + if (Context.getDiagHandlerPtr()->HasErrors) + exit(1); // Declare success. 
Out->keep(); diff --git a/llvm/unittests/CodeGen/PassManagerTest.cpp b/llvm/unittests/CodeGen/PassManagerTest.cpp index 28003c2f4b3f1..4283eb01a9c8f 100644 --- a/llvm/unittests/CodeGen/PassManagerTest.cpp +++ b/llvm/unittests/CodeGen/PassManagerTest.cpp @@ -5,13 +5,18 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// Test that the various MachineFunction pass managers, adaptors, analyses, and +// analysis managers work. +//===----------------------------------------------------------------------===// +#include "llvm/IR/PassManager.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/AsmParser/Parser.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachinePassManager.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/MC/TargetRegistry.h" @@ -34,14 +39,9 @@ class TestFunctionAnalysis : public AnalysisInfoMixin { int InstructionCount; }; - /// Run the analysis pass over the function and return a result. + /// The number of instructions in the Function. Result run(Function &F, FunctionAnalysisManager &AM) { - int Count = 0; - for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE; ++BBI) - for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE; - ++II) - ++Count; - return Result(Count); + return Result(F.getInstructionCount()); } private: @@ -59,13 +59,12 @@ class TestMachineFunctionAnalysis int InstructionCount; }; - /// Run the analysis pass over the machine function and return a result. - Result run(MachineFunction &MF, MachineFunctionAnalysisManager::Base &AM) { - auto &MFAM = static_cast(AM); - // Query function analysis result. 
+ Result run(MachineFunction &MF, MachineFunctionAnalysisManager &AM) { + FunctionAnalysisManager &FAM = + AM.getResult(MF) + .getManager(); TestFunctionAnalysis::Result &FAR = - MFAM.getResult(MF.getFunction()); - // + 5 + FAM.getResult(MF.getFunction()); return FAR.InstructionCount; } @@ -76,90 +75,54 @@ class TestMachineFunctionAnalysis AnalysisKey TestMachineFunctionAnalysis::Key; -const std::string DoInitErrMsg = "doInitialization failed"; -const std::string DoFinalErrMsg = "doFinalization failed"; - struct TestMachineFunctionPass : public PassInfoMixin { - TestMachineFunctionPass(int &Count, std::vector &BeforeInitialization, - std::vector &BeforeFinalization, - std::vector &MachineFunctionPassCount) - : Count(Count), BeforeInitialization(BeforeInitialization), - BeforeFinalization(BeforeFinalization), - MachineFunctionPassCount(MachineFunctionPassCount) {} - - Error doInitialization(Module &M, MachineFunctionAnalysisManager &MFAM) { - // Force doInitialization fail by starting with big `Count`. - if (Count > 10000) - return make_error(DoInitErrMsg, inconvertibleErrorCode()); - - // + 1 - ++Count; - BeforeInitialization.push_back(Count); - return Error::success(); - } - Error doFinalization(Module &M, MachineFunctionAnalysisManager &MFAM) { - // Force doFinalization fail by starting with big `Count`. - if (Count > 1000) - return make_error(DoFinalErrMsg, inconvertibleErrorCode()); - - // + 1 - ++Count; - BeforeFinalization.push_back(Count); - return Error::success(); - } + TestMachineFunctionPass(int &Count, std::vector &Counts) + : Count(Count), Counts(Counts) {} PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { - // Query function analysis result. + FunctionAnalysisManager &FAM = + MFAM.getResult(MF) + .getManager(); TestFunctionAnalysis::Result &FAR = - MFAM.getResult(MF.getFunction()); - // 3 + 1 + 1 = 5 + FAM.getResult(MF.getFunction()); Count += FAR.InstructionCount; - // Query module analysis result. 
- MachineModuleInfo &MMI = - MFAM.getResult(*MF.getFunction().getParent()) - .getMMI(); - // 1 + 1 + 1 = 3 - Count += (MMI.getModule() == MF.getFunction().getParent()); - - // Query machine function analysis result. TestMachineFunctionAnalysis::Result &MFAR = MFAM.getResult(MF); - // 3 + 1 + 1 = 5 Count += MFAR.InstructionCount; - MachineFunctionPassCount.push_back(Count); + Counts.push_back(Count); return PreservedAnalyses::none(); } int &Count; - std::vector &BeforeInitialization; - std::vector &BeforeFinalization; - std::vector &MachineFunctionPassCount; + std::vector &Counts; }; struct TestMachineModulePass : public PassInfoMixin { - TestMachineModulePass(int &Count, std::vector &MachineModulePassCount) - : Count(Count), MachineModulePassCount(MachineModulePassCount) {} - - Error run(Module &M, MachineFunctionAnalysisManager &MFAM) { - MachineModuleInfo &MMI = MFAM.getResult(M).getMMI(); - // + 1 - Count += (MMI.getModule() == &M); - MachineModulePassCount.push_back(Count); - return Error::success(); - } - - PreservedAnalyses run(MachineFunction &MF, - MachineFunctionAnalysisManager &AM) { - llvm_unreachable( - "This should never be reached because this is machine module pass"); + TestMachineModulePass(int &Count, std::vector &Counts) + : Count(Count), Counts(Counts) {} + + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM) { + MachineModuleInfo &MMI = MAM.getResult(M).getMMI(); + FunctionAnalysisManager &FAM = + MAM.getResult(M).getManager(); + MachineFunctionAnalysisManager &MFAM = + MAM.getResult(M) + .getManager(); + for (Function &F : M) { + MachineFunction &MF = MMI.getOrCreateMachineFunction(F); + Count += FAM.getResult(F).InstructionCount; + Count += MFAM.getResult(MF).InstructionCount; + } + Counts.push_back(Count); + return PreservedAnalyses::all(); } int &Count; - std::vector &MachineModulePassCount; + std::vector &Counts; }; std::unique_ptr parseIR(LLVMContext &Context, const char *IR) { @@ -211,102 +174,40 @@ TEST_F(PassManagerTest, 
Basic) { M->setDataLayout(TM->createDataLayout()); MachineModuleInfo MMI(LLVMTM); + LoopAnalysisManager LAM; FunctionAnalysisManager FAM; CGSCCAnalysisManager CGAM; ModuleAnalysisManager MAM; + MachineFunctionAnalysisManager MFAM; PassBuilder PB(TM.get()); PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); PB.registerFunctionAnalyses(FAM); - PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + PB.registerLoopAnalyses(LAM); + PB.registerMachineFunctionAnalyses(MFAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM, &MFAM); FAM.registerPass([&] { return TestFunctionAnalysis(); }); - FAM.registerPass([&] { return PassInstrumentationAnalysis(); }); MAM.registerPass([&] { return MachineModuleAnalysis(MMI); }); - MAM.registerPass([&] { return PassInstrumentationAnalysis(); }); - - MachineFunctionAnalysisManager MFAM; - { - // Test move assignment. - MachineFunctionAnalysisManager NestedMFAM(FAM, MAM); - NestedMFAM.registerPass([&] { return PassInstrumentationAnalysis(); }); - NestedMFAM.registerPass([&] { return TestMachineFunctionAnalysis(); }); - MFAM = std::move(NestedMFAM); - } + MFAM.registerPass([&] { return TestMachineFunctionAnalysis(); }); int Count = 0; - std::vector BeforeInitialization[2]; - std::vector BeforeFinalization[2]; - std::vector TestMachineFunctionCount[2]; - std::vector TestMachineModuleCount[2]; + std::vector Counts; + ModulePassManager MPM; MachineFunctionPassManager MFPM; - { - // Test move assignment. 
- MachineFunctionPassManager NestedMFPM; - NestedMFPM.addPass(TestMachineModulePass(Count, TestMachineModuleCount[0])); - NestedMFPM.addPass(TestMachineFunctionPass(Count, BeforeInitialization[0], - BeforeFinalization[0], - TestMachineFunctionCount[0])); - NestedMFPM.addPass(TestMachineModulePass(Count, TestMachineModuleCount[1])); - NestedMFPM.addPass(TestMachineFunctionPass(Count, BeforeInitialization[1], - BeforeFinalization[1], - TestMachineFunctionCount[1])); - MFPM = std::move(NestedMFPM); - } + MPM.addPass(TestMachineModulePass(Count, Counts)); + MPM.addPass(createModuleToMachineFunctionPassAdaptor( + TestMachineFunctionPass(Count, Counts))); + MPM.addPass(TestMachineModulePass(Count, Counts)); + MFPM.addPass(TestMachineFunctionPass(Count, Counts)); + MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM))); + + MPM.run(*M, MAM); - ASSERT_FALSE(errorToBool(MFPM.run(*M, MFAM))); - - // Check first machine module pass - EXPECT_EQ(1u, TestMachineModuleCount[0].size()); - EXPECT_EQ(3, TestMachineModuleCount[0][0]); - - // Check first machine function pass - EXPECT_EQ(1u, BeforeInitialization[0].size()); - EXPECT_EQ(1, BeforeInitialization[0][0]); - EXPECT_EQ(3u, TestMachineFunctionCount[0].size()); - EXPECT_EQ(10, TestMachineFunctionCount[0][0]); - EXPECT_EQ(13, TestMachineFunctionCount[0][1]); - EXPECT_EQ(16, TestMachineFunctionCount[0][2]); - EXPECT_EQ(1u, BeforeFinalization[0].size()); - EXPECT_EQ(31, BeforeFinalization[0][0]); - - // Check second machine module pass - EXPECT_EQ(1u, TestMachineModuleCount[1].size()); - EXPECT_EQ(17, TestMachineModuleCount[1][0]); - - // Check second machine function pass - EXPECT_EQ(1u, BeforeInitialization[1].size()); - EXPECT_EQ(2, BeforeInitialization[1][0]); - EXPECT_EQ(3u, TestMachineFunctionCount[1].size()); - EXPECT_EQ(24, TestMachineFunctionCount[1][0]); - EXPECT_EQ(27, TestMachineFunctionCount[1][1]); - EXPECT_EQ(30, TestMachineFunctionCount[1][2]); - EXPECT_EQ(1u, BeforeFinalization[1].size()); - 
EXPECT_EQ(32, BeforeFinalization[1][0]); - - EXPECT_EQ(32, Count); - - // doInitialization returns error - Count = 10000; - MFPM.addPass(TestMachineFunctionPass(Count, BeforeInitialization[1], - BeforeFinalization[1], - TestMachineFunctionCount[1])); - std::string Message; - llvm::handleAllErrors(MFPM.run(*M, MFAM), [&](llvm::StringError &Error) { - Message = Error.getMessage(); - }); - EXPECT_EQ(Message, DoInitErrMsg); - - // doFinalization returns error - Count = 1000; - MFPM.addPass(TestMachineFunctionPass(Count, BeforeInitialization[1], - BeforeFinalization[1], - TestMachineFunctionCount[1])); - llvm::handleAllErrors(MFPM.run(*M, MFAM), [&](llvm::StringError &Error) { - Message = Error.getMessage(); - }); - EXPECT_EQ(Message, DoFinalErrMsg); + EXPECT_EQ((std::vector{10, 16, 18, 20, 30, 36, 38, 40}), Counts); + EXPECT_EQ(40, Count); } } // namespace diff --git a/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp b/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp index 8ecde223cb510..4b7d7846b0336 100644 --- a/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp +++ b/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp @@ -6,6 +6,9 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/CGSCCPassManager.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/CodeGen/FreeMachineFunction.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Testing/Support/Error.h" @@ -96,8 +99,6 @@ MATCHER_P(HasNameRegex, Name, "") { } struct MockPassInstrumentationCallbacks { - PassInstrumentationCallbacks Callbacks; - MockPassInstrumentationCallbacks() { ON_CALL(*this, runBeforePass(_, _)).WillByDefault(Return(true)); } @@ -111,7 +112,7 @@ struct MockPassInstrumentationCallbacks { MOCK_METHOD2(runBeforeAnalysis, void(StringRef PassID, llvm::Any)); MOCK_METHOD2(runAfterAnalysis, void(StringRef PassID, llvm::Any)); - void registerPassInstrumentation() { + void 
registerPassInstrumentation(PassInstrumentationCallbacks &Callbacks) { Callbacks.registerShouldRunOptionalPassCallback( [this](StringRef P, llvm::Any IR) { return this->runBeforePass(P, IR); @@ -147,7 +148,8 @@ struct MockPassInstrumentationCallbacks { // to check these explicitly. EXPECT_CALL(*this, runBeforePass(Not(HasNameRegex("Mock")), HasName(IRName))) - .Times(AnyNumber()); + .Times(AnyNumber()) + .WillRepeatedly(Return(false)); EXPECT_CALL( *this, runBeforeSkippedPass(Not(HasNameRegex("Mock")), HasName(IRName))) .Times(AnyNumber()); @@ -157,15 +159,9 @@ struct MockPassInstrumentationCallbacks { EXPECT_CALL(*this, runAfterPass(Not(HasNameRegex("Mock")), HasName(IRName), _)) .Times(AnyNumber()); - EXPECT_CALL(*this, runBeforeAnalysis(HasNameRegex("MachineModuleAnalysis"), - HasName(IRName))) - .Times(AnyNumber()); EXPECT_CALL(*this, runBeforeAnalysis(Not(HasNameRegex("Mock")), HasName(IRName))) .Times(AnyNumber()); - EXPECT_CALL(*this, runAfterAnalysis(HasNameRegex("MachineModuleAnalysis"), - HasName(IRName))) - .Times(AnyNumber()); EXPECT_CALL(*this, runAfterAnalysis(Not(HasNameRegex("Mock")), HasName(IRName))) .Times(AnyNumber()); @@ -202,7 +198,7 @@ template class MockAnalysisHandleBase { } }; - Result run(MachineFunction &IR, MachineFunctionAnalysisManager::Base &AM) { + Result run(MachineFunction &IR, MachineFunctionAnalysisManager &AM) { return Handle->run(IR, AM); } }; @@ -249,7 +245,7 @@ template class MockPassHandleBase { public: PreservedAnalyses run(MachineFunction &IR, - MachineFunctionAnalysisManager::Base &AM) { + MachineFunctionAnalysisManager &AM) { return Handle->run(IR, AM); } }; @@ -270,7 +266,7 @@ template class MockPassHandleBase { struct MockAnalysisHandle : public MockAnalysisHandleBase { MOCK_METHOD2(run, Analysis::Result(MachineFunction &, - MachineFunctionAnalysisManager::Base &)); + MachineFunctionAnalysisManager &)); MOCK_METHOD3(invalidate, bool(MachineFunction &, const PreservedAnalyses &, 
MachineFunctionAnalysisManager::Invalidator &)); @@ -284,7 +280,7 @@ AnalysisKey MockAnalysisHandleBase::Analysis::Key; class MockPassHandle : public MockPassHandleBase { public: MOCK_METHOD2(run, PreservedAnalyses(MachineFunction &, - MachineFunctionAnalysisManager::Base &)); + MachineFunctionAnalysisManager &)); MockPassHandle() { setDefaults(); } }; @@ -297,50 +293,51 @@ class MachineFunctionCallbacksTest : public testing::Test { InitializeAllTargetMCs(); } + LLVMContext Context; + std::unique_ptr TM; std::unique_ptr MMI; - LLVMContext Context; std::unique_ptr M; - std::unique_ptr MIR; - - MockPassInstrumentationCallbacks CallbacksHandle; - PassBuilder PB; - ModulePassManager PM; - MachineFunctionPassManager MFPM; - FunctionAnalysisManager FAM; - ModuleAnalysisManager AM; + PassInstrumentationCallbacks PIC; + std::unique_ptr PB; + ModulePassManager MPM; MachineFunctionAnalysisManager MFAM; + LoopAnalysisManager LAM; + FunctionAnalysisManager FAM; + CGSCCAnalysisManager CGAM; + ModuleAnalysisManager MAM; + MockPassInstrumentationCallbacks CallbacksHandle; MockPassHandle PassHandle; MockAnalysisHandle AnalysisHandle; - std::unique_ptr parseMIR(const TargetMachine &TM, StringRef MIRCode, - MachineModuleInfo &MMI) { + static std::unique_ptr parseMIR(StringRef MIRCode, + LLVMContext &Context, + TargetMachine &TM, + MachineModuleInfo &MMI) { SMDiagnostic Diagnostic; std::unique_ptr MBuffer = MemoryBuffer::getMemBuffer(MIRCode); - MIR = createMIRParser(std::move(MBuffer), Context); - if (!MIR) - return nullptr; + std::unique_ptr MIR = + createMIRParser(std::move(MBuffer), Context); + assert(MIR); std::unique_ptr Mod = MIR->parseIRModule(); - if (!Mod) - return nullptr; + assert(Mod); + // Module identifier is used in tests below. 
+ Mod->setModuleIdentifier("module"); Mod->setDataLayout(TM.createDataLayout()); - if (MIR->parseMachineFunctions(*Mod, MMI)) { - M.reset(); - return nullptr; - } + bool Ret = MIR->parseMachineFunctions(*Mod, MMI); + assert(!Ret); + return Mod; } static PreservedAnalyses - getAnalysisResult(MachineFunction &U, - MachineFunctionAnalysisManager::Base &AM) { - auto &MFAM = static_cast(AM); + getAnalysisResult(MachineFunction &U, MachineFunctionAnalysisManager &MFAM) { MFAM.getResult(U); return PreservedAnalyses::all(); } @@ -356,25 +353,18 @@ class MachineFunctionCallbacksTest : public testing::Test { TripleName, "", "", TargetOptions(), std::nullopt))); if (!TM) GTEST_SKIP(); - MMI = std::make_unique(TM.get()); - M = parseMIR(*TM, MIRString, *MMI); - AM.registerPass([&] { return MachineModuleAnalysis(*MMI); }); - } - MachineFunctionCallbacksTest() - : CallbacksHandle(), PB(nullptr, PipelineTuningOptions(), std::nullopt, - &CallbacksHandle.Callbacks), - PM(), FAM(), AM(), MFAM(FAM, AM) { - - EXPECT_TRUE(&CallbacksHandle.Callbacks == - PB.getPassInstrumentationCallbacks()); + MMI = std::make_unique(TM.get()); + M = parseMIR(MIRString, Context, *TM, *MMI); + PB = std::make_unique(TM.get(), PipelineTuningOptions(), + std::nullopt, &PIC); /// Register a callback for analysis registration. /// /// The callback is a function taking a reference to an AnalyisManager /// object. When called, the callee gets to register its own analyses with /// this PassBuilder instance. - PB.registerAnalysisRegistrationCallback( + PB->registerAnalysisRegistrationCallback( [this](MachineFunctionAnalysisManager &AM) { // Register our mock analysis AM.registerPass([this] { return AnalysisHandle.getAnalysis(); }); @@ -386,24 +376,29 @@ class MachineFunctionCallbacksTest : public testing::Test { /// callbacks for each encountered pass name that it does not know. This /// includes both simple pass names as well as names of sub-pipelines. In /// the latter case, the InnerPipeline is not empty. 
- PB.registerPipelineParsingCallback( - [this](StringRef Name, MachineFunctionPassManager &PM) { + PB->registerPipelineParsingCallback( + [this](StringRef Name, MachineFunctionPassManager &PM, + ArrayRef InnerPipeline) { if (parseAnalysisUtilityPasses( "test-analysis", Name, PM)) return true; /// Parse the name of our pass mock handle if (Name == "test-transform") { - MFPM.addPass(PassHandle.getPass()); + PM.addPass(PassHandle.getPass()); return true; } return false; }); /// Register builtin analyses and cross-register the analysis proxies - PB.registerModuleAnalyses(AM); - PB.registerFunctionAnalyses(FAM); - PB.registerMachineFunctionAnalyses(MFAM); + PB->registerModuleAnalyses(MAM); + PB->registerCGSCCAnalyses(CGAM); + PB->registerFunctionAnalyses(FAM); + PB->registerLoopAnalyses(LAM); + PB->registerMachineFunctionAnalyses(MFAM); + PB->crossRegisterProxies(LAM, FAM, CGAM, MAM, &MFAM); + MAM.registerPass([&] { return MachineModuleAnalysis(*MMI); }); } }; @@ -412,53 +407,58 @@ TEST_F(MachineFunctionCallbacksTest, Passes) { EXPECT_CALL(PassHandle, run(HasName("test"), _)).WillOnce(&getAnalysisResult); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(MFPM, PipelineText), Succeeded()) + ASSERT_THAT_ERROR(PB->parsePassPipeline(MPM, PipelineText), Succeeded()) << "Pipeline was: " << PipelineText; - ASSERT_THAT_ERROR(MFPM.run(*M, MFAM), Succeeded()); + MPM.run(*M, MAM); } TEST_F(MachineFunctionCallbacksTest, InstrumentedPasses) { - CallbacksHandle.registerPassInstrumentation(); + CallbacksHandle.registerPassInstrumentation(PIC); // Non-mock instrumentation not specifically mentioned below can be ignored. 
- CallbacksHandle.ignoreNonMockPassInstrumentation(""); CallbacksHandle.ignoreNonMockPassInstrumentation("test"); - CallbacksHandle.ignoreNonMockPassInstrumentation(""); + CallbacksHandle.ignoreNonMockPassInstrumentation("module"); // PassInstrumentation calls should happen in-sequence, in the same order // as passes/analyses are scheduled. ::testing::Sequence PISequence; EXPECT_CALL(CallbacksHandle, runBeforePass(HasNameRegex("MockPassHandle"), HasName("test"))) - .InSequence(PISequence); + .InSequence(PISequence) + .WillOnce(Return(true)); EXPECT_CALL( CallbacksHandle, runBeforeNonSkippedPass(HasNameRegex("MockPassHandle"), HasName("test"))) .InSequence(PISequence); - EXPECT_CALL(CallbacksHandle, - runBeforeAnalysis(HasNameRegex("MockAnalysisHandle"), _)) + EXPECT_CALL( + CallbacksHandle, + runBeforeAnalysis(HasNameRegex("MockAnalysisHandle"), HasName("test"))) .InSequence(PISequence); - EXPECT_CALL(CallbacksHandle, - runAfterAnalysis(HasNameRegex("MockAnalysisHandle"), _)) + EXPECT_CALL( + CallbacksHandle, + runAfterAnalysis(HasNameRegex("MockAnalysisHandle"), HasName("test"))) .InSequence(PISequence); EXPECT_CALL(CallbacksHandle, runAfterPass(HasNameRegex("MockPassHandle"), HasName("test"), _)) .InSequence(PISequence); + EXPECT_CALL( + CallbacksHandle, + runBeforeSkippedPass(HasNameRegex("MockPassHandle"), HasName("test"))) + .Times(0); EXPECT_CALL(AnalysisHandle, run(HasName("test"), _)); EXPECT_CALL(PassHandle, run(HasName("test"), _)).WillOnce(&getAnalysisResult); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(MFPM, PipelineText), Succeeded()) + ASSERT_THAT_ERROR(PB->parsePassPipeline(MPM, PipelineText), Succeeded()) << "Pipeline was: " << PipelineText; - ASSERT_THAT_ERROR(MFPM.run(*M, MFAM), Succeeded()); + MPM.run(*M, MAM); } TEST_F(MachineFunctionCallbacksTest, InstrumentedSkippedPasses) { - CallbacksHandle.registerPassInstrumentation(); + CallbacksHandle.registerPassInstrumentation(PIC); // Non-mock instrumentation 
run here can safely be ignored. - CallbacksHandle.ignoreNonMockPassInstrumentation(""); CallbacksHandle.ignoreNonMockPassInstrumentation("test"); - CallbacksHandle.ignoreNonMockPassInstrumentation(""); + CallbacksHandle.ignoreNonMockPassInstrumentation("module"); // Skip the pass by returning false. EXPECT_CALL(CallbacksHandle, @@ -495,9 +495,81 @@ TEST_F(MachineFunctionCallbacksTest, InstrumentedSkippedPasses) { .Times(0); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(MFPM, PipelineText), Succeeded()) + ASSERT_THAT_ERROR(PB->parsePassPipeline(MPM, PipelineText), Succeeded()) << "Pipeline was: " << PipelineText; - ASSERT_THAT_ERROR(MFPM.run(*M, MFAM), Succeeded()); + MPM.run(*M, MAM); +} + +// Check that the Module -> MachineFunction adaptor properly calls +// runAfterPassInvalidated. +TEST_F(MachineFunctionCallbacksTest, InstrumentedFreeMFPass) { + CallbacksHandle.registerPassInstrumentation(PIC); + // Non-mock instrumentation run here can safely be ignored. + CallbacksHandle.ignoreNonMockPassInstrumentation("test"); + CallbacksHandle.ignoreNonMockPassInstrumentation("module"); + + ::testing::Sequence PISequence; + EXPECT_CALL( + CallbacksHandle, + runBeforePass(HasNameRegex("FreeMachineFunctionPass"), HasName("test"))) + .InSequence(PISequence) + .WillOnce(Return(true)); + EXPECT_CALL(CallbacksHandle, + runBeforeNonSkippedPass(HasNameRegex("FreeMachineFunctionPass"), + HasName("test"))) + .InSequence(PISequence); + EXPECT_CALL(CallbacksHandle, runAfterPassInvalidated( + HasNameRegex("FreeMachineFunctionPass"), _)) + .InSequence(PISequence); + + // runAfterPass should not be called since the MachineFunction is no longer + // valid after FreeMachineFunctionPass. 
+ EXPECT_CALL(CallbacksHandle, + runAfterPass(HasNameRegex("FreeMachineFunctionPass"), _, _)) + .Times(0); + + MPM.addPass( + createModuleToMachineFunctionPassAdaptor(FreeMachineFunctionPass())); + MPM.run(*M, MAM); +} + +// Check that the Module -> MachineFunction adaptor and MachineFunction pass +// manager properly call runAfterPassInvalidated. +TEST_F(MachineFunctionCallbacksTest, InstrumentedFreeMFPass2) { + CallbacksHandle.registerPassInstrumentation(PIC); + // Non-mock instrumentation run here can safely be ignored. + CallbacksHandle.ignoreNonMockPassInstrumentation("test"); + CallbacksHandle.ignoreNonMockPassInstrumentation("module"); + + ::testing::Sequence PISequence; + EXPECT_CALL( + CallbacksHandle, + runBeforePass(HasNameRegex("FreeMachineFunctionPass"), HasName("test"))) + .InSequence(PISequence) + .WillOnce(Return(true)); + EXPECT_CALL(CallbacksHandle, + runBeforeNonSkippedPass(HasNameRegex("FreeMachineFunctionPass"), + HasName("test"))) + .InSequence(PISequence); + EXPECT_CALL(CallbacksHandle, runAfterPassInvalidated( + HasNameRegex("FreeMachineFunctionPass"), _)) + .InSequence(PISequence); + EXPECT_CALL(CallbacksHandle, + runAfterPassInvalidated(HasNameRegex("PassManager"), _)) + .InSequence(PISequence); + + // runAfterPass should not be called since the MachineFunction is no longer + // valid after FreeMachineFunctionPass. 
+ EXPECT_CALL(CallbacksHandle, + runAfterPass(HasNameRegex("FreeMachineFunctionPass"), _, _)) + .Times(0); + EXPECT_CALL(CallbacksHandle, runAfterPass(HasNameRegex("PassManager"), _, _)) + .Times(0); + + MachineFunctionPassManager MFPM; + MFPM.addPass(FreeMachineFunctionPass()); + MPM.addPass(createModuleToMachineFunctionPassAdaptor(std::move(MFPM))); + MPM.run(*M, MAM); } } // end anonymous namespace From 7f71fa909a10be182b82b9dfaf0fade6eb84796c Mon Sep 17 00:00:00 2001 From: Thomas Preud'homme Date: Thu, 22 Feb 2024 21:01:05 +0000 Subject: [PATCH 092/546] Extend GCC workaround to GCC < 8.4 for llvm::iterator_range ctor (#82643) GCC SFINAE error with decltype was fixed in commit ac5e28911abdfb8d9bf6bea980223e199bbcf28d which made it into GCC 8.4. Therefore adjust GCC version test accordingly. --- llvm/include/llvm/ADT/iterator_range.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/ADT/iterator_range.h b/llvm/include/llvm/ADT/iterator_range.h index 2dc227935984b..7d288ea4506ba 100644 --- a/llvm/include/llvm/ADT/iterator_range.h +++ b/llvm/include/llvm/ADT/iterator_range.h @@ -43,8 +43,8 @@ class iterator_range { IteratorT begin_iterator, end_iterator; public: -#if __GNUC__ == 7 - // Be careful no to break gcc-7 on the mlir target. +#if __GNUC__ == 7 || (__GNUC__ == 8 && __GNUC_MINOR__ < 4) + // Be careful no to break gcc-7 and gcc-8 < 8.4 on the mlir target. // See https://github.com/llvm/llvm-project/issues/63843 template #else From df6f756a19277d936ec83f7cebc2501327ac3add Mon Sep 17 00:00:00 2001 From: Zequan Wu Date: Thu, 22 Feb 2024 16:11:40 -0500 Subject: [PATCH 093/546] Re-land [lldb-dap] Add support for data breakpoint. (#81909) This implements functionality to handle DataBreakpointInfo request and SetDataBreakpoints request. Previous commit https://github.com/llvm/llvm-project/commit/8c56e78ec531f0e2460213c20fff869b6b7add99 was reverted because setting 1 byte watchpoint failed in the new test on ARM64. 
So, I changed the test to setting 4 byte watchpoint instead, and hope this won't break it again. It also adds the fixes from https://github.com/llvm/llvm-project/pull/81680. --- .../test/tools/lldb-dap/dap_server.py | 47 +++ .../tools/lldb-dap/databreakpoint/Makefile | 3 + .../TestDAP_setDataBreakpoints.py | 131 +++++++ .../tools/lldb-dap/databreakpoint/main.cpp | 17 + lldb/tools/lldb-dap/CMakeLists.txt | 1 + lldb/tools/lldb-dap/DAPForward.h | 2 + lldb/tools/lldb-dap/Watchpoint.cpp | 48 +++ lldb/tools/lldb-dap/Watchpoint.h | 34 ++ lldb/tools/lldb-dap/lldb-dap.cpp | 341 ++++++++++++++++-- 9 files changed, 590 insertions(+), 34 deletions(-) create mode 100644 lldb/test/API/tools/lldb-dap/databreakpoint/Makefile create mode 100644 lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py create mode 100644 lldb/test/API/tools/lldb-dap/databreakpoint/main.cpp create mode 100644 lldb/tools/lldb-dap/Watchpoint.cpp create mode 100644 lldb/tools/lldb-dap/Watchpoint.h diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index bb863bb871917..27a76a652f406 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -501,6 +501,18 @@ def get_local_variable_value(self, name, frameIndex=0, threadId=None): return variable["value"] return None + def get_local_variable_child(self, name, child_name, frameIndex=0, threadId=None): + local = self.get_local_variable(name, frameIndex, threadId) + if local["variablesReference"] == 0: + return None + children = self.request_variables(local["variablesReference"])["body"][ + "variables" + ] + for child in children: + if child["name"] == child_name: + return child + return None + def replay_packets(self, replay_file_path): f = open(replay_file_path, "r") mode = "invalid" @@ -895,6 +907,41 @@ def request_setFunctionBreakpoints(self, names, 
condition=None, hitCondition=Non } return self.send_recv(command_dict) + def request_dataBreakpointInfo( + self, variablesReference, name, frameIndex=0, threadId=None + ): + stackFrame = self.get_stackFrame(frameIndex=frameIndex, threadId=threadId) + if stackFrame is None: + return [] + args_dict = { + "variablesReference": variablesReference, + "name": name, + "frameId": stackFrame["id"], + } + command_dict = { + "command": "dataBreakpointInfo", + "type": "request", + "arguments": args_dict, + } + return self.send_recv(command_dict) + + def request_setDataBreakpoint(self, dataBreakpoints): + """dataBreakpoints is a list of dictionary with following fields: + { + dataId: (address in hex)/(size in bytes) + accessType: read/write/readWrite + [condition]: string + [hitCondition]: string + } + """ + args_dict = {"breakpoints": dataBreakpoints} + command_dict = { + "command": "setDataBreakpoints", + "type": "request", + "arguments": args_dict, + } + return self.send_recv(command_dict) + def request_compileUnits(self, moduleId): args_dict = {"moduleId": moduleId} command_dict = { diff --git a/lldb/test/API/tools/lldb-dap/databreakpoint/Makefile b/lldb/test/API/tools/lldb-dap/databreakpoint/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/databreakpoint/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py b/lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py new file mode 100644 index 0000000000000..17cdad89aa6d1 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py @@ -0,0 +1,131 @@ +""" +Test lldb-dap dataBreakpointInfo and setDataBreakpoints requests +""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +import lldbdap_testcase + + +class TestDAP_setDataBreakpoints(lldbdap_testcase.DAPTestCaseBase): + 
def setUp(self): + lldbdap_testcase.DAPTestCaseBase.setUp(self) + self.accessTypes = ["read", "write", "readWrite"] + + @skipIfWindows + @skipIfRemote + def test_expression(self): + """Tests setting data breakpoints on expression.""" + program = self.getBuildArtifact("a.out") + self.build_and_launch(program) + source = "main.cpp" + first_loop_break_line = line_number(source, "// first loop breakpoint") + self.set_source_breakpoints(source, [first_loop_break_line]) + self.continue_to_next_stop() + self.dap_server.get_stackFrame() + # Test setting write watchpoint using expressions: &x, arr+2 + response_x = self.dap_server.request_dataBreakpointInfo(0, "&x") + response_arr_2 = self.dap_server.request_dataBreakpointInfo(0, "arr+2") + # Test response from dataBreakpointInfo request. + self.assertEquals(response_x["body"]["dataId"].split("/")[1], "4") + self.assertEquals(response_x["body"]["accessTypes"], self.accessTypes) + self.assertEquals(response_arr_2["body"]["dataId"].split("/")[1], "4") + self.assertEquals(response_arr_2["body"]["accessTypes"], self.accessTypes) + dataBreakpoints = [ + {"dataId": response_x["body"]["dataId"], "accessType": "write"}, + {"dataId": response_arr_2["body"]["dataId"], "accessType": "write"}, + ] + set_response = self.dap_server.request_setDataBreakpoint(dataBreakpoints) + self.assertEquals( + set_response["body"]["breakpoints"], + [{"verified": True}, {"verified": True}], + ) + + self.continue_to_next_stop() + x_val = self.dap_server.get_local_variable_value("x") + i_val = self.dap_server.get_local_variable_value("i") + self.assertEquals(x_val, "2") + self.assertEquals(i_val, "1") + + self.continue_to_next_stop() + arr_2 = self.dap_server.get_local_variable_child("arr", "[2]") + i_val = self.dap_server.get_local_variable_value("i") + self.assertEquals(arr_2["value"], "42") + self.assertEquals(i_val, "2") + + @skipIfWindows + @skipIfRemote + def test_functionality(self): + """Tests setting data breakpoints on variable.""" + program = 
self.getBuildArtifact("a.out") + self.build_and_launch(program) + source = "main.cpp" + first_loop_break_line = line_number(source, "// first loop breakpoint") + self.set_source_breakpoints(source, [first_loop_break_line]) + self.continue_to_next_stop() + self.dap_server.get_local_variables() + # Test write watchpoints on x, arr[2] + response_x = self.dap_server.request_dataBreakpointInfo(1, "x") + arr = self.dap_server.get_local_variable("arr") + response_arr_2 = self.dap_server.request_dataBreakpointInfo( + arr["variablesReference"], "[2]" + ) + + # Test response from dataBreakpointInfo request. + self.assertEquals(response_x["body"]["dataId"].split("/")[1], "4") + self.assertEquals(response_x["body"]["accessTypes"], self.accessTypes) + self.assertEquals(response_arr_2["body"]["dataId"].split("/")[1], "4") + self.assertEquals(response_arr_2["body"]["accessTypes"], self.accessTypes) + dataBreakpoints = [ + {"dataId": response_x["body"]["dataId"], "accessType": "write"}, + {"dataId": response_arr_2["body"]["dataId"], "accessType": "write"}, + ] + set_response = self.dap_server.request_setDataBreakpoint(dataBreakpoints) + self.assertEquals( + set_response["body"]["breakpoints"], + [{"verified": True}, {"verified": True}], + ) + + self.continue_to_next_stop() + x_val = self.dap_server.get_local_variable_value("x") + i_val = self.dap_server.get_local_variable_value("i") + self.assertEquals(x_val, "2") + self.assertEquals(i_val, "1") + + self.continue_to_next_stop() + arr_2 = self.dap_server.get_local_variable_child("arr", "[2]") + i_val = self.dap_server.get_local_variable_value("i") + self.assertEquals(arr_2["value"], "42") + self.assertEquals(i_val, "2") + self.dap_server.request_setDataBreakpoint([]) + + # Test hit condition + second_loop_break_line = line_number(source, "// second loop breakpoint") + breakpoint_ids = self.set_source_breakpoints(source, [second_loop_break_line]) + self.continue_to_breakpoints(breakpoint_ids) + dataBreakpoints = [ + { + "dataId": 
response_x["body"]["dataId"], + "accessType": "write", + "hitCondition": "2", + } + ] + set_response = self.dap_server.request_setDataBreakpoint(dataBreakpoints) + self.assertEquals(set_response["body"]["breakpoints"], [{"verified": True}]) + self.continue_to_next_stop() + x_val = self.dap_server.get_local_variable_value("x") + self.assertEquals(x_val, "3") + + # Test condition + dataBreakpoints = [ + { + "dataId": response_x["body"]["dataId"], + "accessType": "write", + "condition": "x==10", + } + ] + set_response = self.dap_server.request_setDataBreakpoint(dataBreakpoints) + self.assertEquals(set_response["body"]["breakpoints"], [{"verified": True}]) + self.continue_to_next_stop() + x_val = self.dap_server.get_local_variable_value("x") + self.assertEquals(x_val, "10") diff --git a/lldb/test/API/tools/lldb-dap/databreakpoint/main.cpp b/lldb/test/API/tools/lldb-dap/databreakpoint/main.cpp new file mode 100644 index 0000000000000..bef09c203845e --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/databreakpoint/main.cpp @@ -0,0 +1,17 @@ +int main(int argc, char const *argv[]) { + // Test for data breakpoint + int x = 0; + int arr[4] = {1, 2, 3, 4}; + for (int i = 0; i < 5; ++i) { // first loop breakpoint + if (i == 1) { + x = i + 1; + } else if (i == 2) { + arr[i] = 42; + } + } + + x = 1; + for (int i = 0; i < 10; ++i) { // second loop breakpoint + ++x; + } +} diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt index f8c0e4ecf36c2..f8f0d86453f58 100644 --- a/lldb/tools/lldb-dap/CMakeLists.txt +++ b/lldb/tools/lldb-dap/CMakeLists.txt @@ -37,6 +37,7 @@ add_lldb_tool(lldb-dap RunInTerminal.cpp SourceBreakpoint.cpp DAP.cpp + Watchpoint.cpp LINK_LIBS liblldb diff --git a/lldb/tools/lldb-dap/DAPForward.h b/lldb/tools/lldb-dap/DAPForward.h index fffff1e3f7902..8c79488fae8db 100644 --- a/lldb/tools/lldb-dap/DAPForward.h +++ b/lldb/tools/lldb-dap/DAPForward.h @@ -14,6 +14,7 @@ struct BreakpointBase; struct ExceptionBreakpoint; struct 
FunctionBreakpoint; struct SourceBreakpoint; +struct Watchpoint; } // namespace lldb_dap namespace lldb { @@ -39,6 +40,7 @@ class SBStringList; class SBTarget; class SBThread; class SBValue; +class SBWatchpoint; } // namespace lldb #endif diff --git a/lldb/tools/lldb-dap/Watchpoint.cpp b/lldb/tools/lldb-dap/Watchpoint.cpp new file mode 100644 index 0000000000000..2f176e0da84f1 --- /dev/null +++ b/lldb/tools/lldb-dap/Watchpoint.cpp @@ -0,0 +1,48 @@ +//===-- Watchpoint.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Watchpoint.h" +#include "DAP.h" +#include "JSONUtils.h" +#include "llvm/ADT/StringExtras.h" + +namespace lldb_dap { +Watchpoint::Watchpoint(const llvm::json::Object &obj) : BreakpointBase(obj) { + llvm::StringRef dataId = GetString(obj, "dataId"); + std::string accessType = GetString(obj, "accessType").str(); + auto [addr_str, size_str] = dataId.split('/'); + lldb::addr_t addr; + size_t size; + llvm::to_integer(addr_str, addr, 16); + llvm::to_integer(size_str, size); + lldb::SBWatchpointOptions options; + options.SetWatchpointTypeRead(accessType != "write"); + if (accessType != "read") + options.SetWatchpointTypeWrite(lldb::eWatchpointWriteTypeOnModify); + wp = g_dap.target.WatchpointCreateByAddress(addr, size, options, error); + SetCondition(); + SetHitCondition(); +} + +void Watchpoint::SetCondition() { wp.SetCondition(condition.c_str()); } + +void Watchpoint::SetHitCondition() { + uint64_t hitCount = 0; + if (llvm::to_integer(hitCondition, hitCount)) + wp.SetIgnoreCount(hitCount - 1); +} + +void Watchpoint::CreateJsonObject(llvm::json::Object &object) { + if (error.Success()) { + object.try_emplace("verified", true); + } else { + 
object.try_emplace("verified", false); + EmplaceSafeString(object, "message", error.GetCString()); + } +} +} // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/Watchpoint.h b/lldb/tools/lldb-dap/Watchpoint.h new file mode 100644 index 0000000000000..026b07d67241c --- /dev/null +++ b/lldb/tools/lldb-dap/Watchpoint.h @@ -0,0 +1,34 @@ +//===-- Watchpoint.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_TOOLS_LLDB_DAP_WATCHPOINT_H +#define LLDB_TOOLS_LLDB_DAP_WATCHPOINT_H + +#include "BreakpointBase.h" +#include "lldb/API/SBError.h" +#include "lldb/API/SBWatchpoint.h" +#include "lldb/API/SBWatchpointOptions.h" + +namespace lldb_dap { + +struct Watchpoint : public BreakpointBase { + // The LLDB breakpoint associated wit this watchpoint. 
+ lldb::SBWatchpoint wp; + lldb::SBError error; + + Watchpoint() = default; + Watchpoint(const llvm::json::Object &obj); + Watchpoint(lldb::SBWatchpoint wp) : wp(wp) {} + + void SetCondition() override; + void SetHitCondition() override; + void CreateJsonObject(llvm::json::Object &object) override; +}; +} // namespace lldb_dap + +#endif diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index 78b0b4078706a..c6a275bcf8140 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "DAP.h" +#include "Watchpoint.h" +#include "lldb/API/SBMemoryRegionInfo.h" #include #include @@ -560,6 +562,46 @@ void EventThreadFunction() { } } +lldb::SBValue FindVariable(uint64_t variablesReference, llvm::StringRef name) { + lldb::SBValue variable; + if (lldb::SBValueList *top_scope = GetTopLevelScope(variablesReference)) { + bool is_duplicated_variable_name = name.contains(" @"); + // variablesReference is one of our scopes, not an actual variable it is + // asking for a variable in locals or globals or registers + int64_t end_idx = top_scope->GetSize(); + // Searching backward so that we choose the variable in closest scope + // among variables of the same name. + for (int64_t i = end_idx - 1; i >= 0; --i) { + lldb::SBValue curr_variable = top_scope->GetValueAtIndex(i); + std::string variable_name = CreateUniqueVariableNameForDisplay( + curr_variable, is_duplicated_variable_name); + if (variable_name == name) { + variable = curr_variable; + break; + } + } + } else { + // This is not under the globals or locals scope, so there are no duplicated + // names. + + // We have a named item within an actual variable so we need to find it + // withing the container variable by name. 
+ lldb::SBValue container = g_dap.variables.GetVariable(variablesReference); + variable = container.GetChildMemberWithName(name.data()); + if (!variable.IsValid()) { + if (name.starts_with("[")) { + llvm::StringRef index_str(name.drop_front(1)); + uint64_t index = 0; + if (!index_str.consumeInteger(0, index)) { + if (index_str == "]") + variable = container.GetChildAtIndex(index); + } + } + } + } + return variable; +} + // Both attach and launch take a either a sourcePath or sourceMap // argument (or neither), from which we need to set the target.source-map. void SetSourceMapFromArguments(const llvm::json::Object &arguments) { @@ -1647,6 +1689,8 @@ void request_initialize(const llvm::json::Object &request) { body.try_emplace("supportsProgressReporting", true); // The debug adapter supports 'logMessage' in breakpoint. body.try_emplace("supportsLogPoints", true); + // The debug adapter supports data watchpoints. + body.try_emplace("supportsDataBreakpoints", true); response.try_emplace("body", std::move(body)); g_dap.SendJSON(llvm::json::Value(std::move(response))); @@ -2593,6 +2637,264 @@ void request_setFunctionBreakpoints(const llvm::json::Object &request) { g_dap.SendJSON(llvm::json::Value(std::move(response))); } +// "DataBreakpointInfoRequest": { +// "allOf": [ { "$ref": "#/definitions/Request" }, { +// "type": "object", +// "description": "Obtains information on a possible data breakpoint that +// could be set on an expression or variable.\nClients should only call this +// request if the corresponding capability `supportsDataBreakpoints` is +// true.", "properties": { +// "command": { +// "type": "string", +// "enum": [ "dataBreakpointInfo" ] +// }, +// "arguments": { +// "$ref": "#/definitions/DataBreakpointInfoArguments" +// } +// }, +// "required": [ "command", "arguments" ] +// }] +// }, +// "DataBreakpointInfoArguments": { +// "type": "object", +// "description": "Arguments for `dataBreakpointInfo` request.", +// "properties": { +// "variablesReference": 
{ +// "type": "integer", +// "description": "Reference to the variable container if the data +// breakpoint is requested for a child of the container. The +// `variablesReference` must have been obtained in the current suspended +// state. See 'Lifetime of Object References' in the Overview section for +// details." +// }, +// "name": { +// "type": "string", +// "description": "The name of the variable's child to obtain data +// breakpoint information for.\nIf `variablesReference` isn't specified, +// this can be an expression." +// }, +// "frameId": { +// "type": "integer", +// "description": "When `name` is an expression, evaluate it in the scope +// of this stack frame. If not specified, the expression is evaluated in +// the global scope. When `variablesReference` is specified, this property +// has no effect." +// } +// }, +// "required": [ "name" ] +// }, +// "DataBreakpointInfoResponse": { +// "allOf": [ { "$ref": "#/definitions/Response" }, { +// "type": "object", +// "description": "Response to `dataBreakpointInfo` request.", +// "properties": { +// "body": { +// "type": "object", +// "properties": { +// "dataId": { +// "type": [ "string", "null" ], +// "description": "An identifier for the data on which a data +// breakpoint can be registered with the `setDataBreakpoints` +// request or null if no data breakpoint is available. If a +// `variablesReference` or `frameId` is passed, the `dataId` is +// valid in the current suspended state, otherwise it's valid +// indefinitely. See 'Lifetime of Object References' in the Overview +// section for details. Breakpoints set using the `dataId` in the +// `setDataBreakpoints` request may outlive the lifetime of the +// associated `dataId`." +// }, +// "description": { +// "type": "string", +// "description": "UI string that describes on what data the +// breakpoint is set on or why a data breakpoint is not available." 
+// }, +// "accessTypes": { +// "type": "array", +// "items": { +// "$ref": "#/definitions/DataBreakpointAccessType" +// }, +// "description": "Attribute lists the available access types for a +// potential data breakpoint. A UI client could surface this +// information." +// }, +// "canPersist": { +// "type": "boolean", +// "description": "Attribute indicates that a potential data +// breakpoint could be persisted across sessions." +// } +// }, +// "required": [ "dataId", "description" ] +// } +// }, +// "required": [ "body" ] +// }] +// } +void request_dataBreakpointInfo(const llvm::json::Object &request) { + llvm::json::Object response; + FillResponse(request, response); + llvm::json::Object body; + lldb::SBError error; + llvm::json::Array accessTypes{"read", "write", "readWrite"}; + const auto *arguments = request.getObject("arguments"); + const auto variablesReference = + GetUnsigned(arguments, "variablesReference", 0); + llvm::StringRef name = GetString(arguments, "name"); + lldb::SBFrame frame = g_dap.GetLLDBFrame(*arguments); + lldb::SBValue variable = FindVariable(variablesReference, name); + std::string addr, size; + + if (variable.IsValid()) { + lldb::addr_t load_addr = variable.GetLoadAddress(); + size_t byte_size = variable.GetByteSize(); + if (load_addr == LLDB_INVALID_ADDRESS) { + body.try_emplace("dataId", nullptr); + body.try_emplace("description", + "does not exist in memory, its location is " + + std::string(variable.GetLocation())); + } else if (byte_size == 0) { + body.try_emplace("dataId", nullptr); + body.try_emplace("description", "variable size is 0"); + } else { + addr = llvm::utohexstr(load_addr); + size = llvm::utostr(byte_size); + } + } else if (variablesReference == 0 && frame.IsValid()) { + lldb::SBValue value = frame.EvaluateExpression(name.data()); + if (value.GetError().Fail()) { + lldb::SBError error = value.GetError(); + const char *error_cstr = error.GetCString(); + body.try_emplace("dataId", nullptr); + 
body.try_emplace("description", error_cstr && error_cstr[0] + ? std::string(error_cstr) + : "evaluation failed"); + } else { + uint64_t load_addr = value.GetValueAsUnsigned(); + addr = llvm::utohexstr(load_addr); + lldb::SBMemoryRegionInfo region; + lldb::SBError err = + g_dap.target.GetProcess().GetMemoryRegionInfo(load_addr, region); + if (err.Success()) { + if (!(region.IsReadable() || region.IsWritable())) { + body.try_emplace("dataId", nullptr); + body.try_emplace("description", + "memory region for address " + addr + + " has no read or write permissions"); + } else { + lldb::SBData data = value.GetPointeeData(); + if (data.IsValid()) + size = llvm::utostr(data.GetByteSize()); + else { + body.try_emplace("dataId", nullptr); + body.try_emplace("description", + "unable to get byte size for expression: " + + name.str()); + } + } + } else { + body.try_emplace("dataId", nullptr); + body.try_emplace("description", + "unable to get memory region info for address " + + addr); + } + } + } else { + body.try_emplace("dataId", nullptr); + body.try_emplace("description", "variable not found: " + name.str()); + } + + if (!body.getObject("dataId")) { + body.try_emplace("dataId", addr + "/" + size); + body.try_emplace("accessTypes", std::move(accessTypes)); + body.try_emplace("description", + size + " bytes at " + addr + " " + name.str()); + } + response.try_emplace("body", std::move(body)); + g_dap.SendJSON(llvm::json::Value(std::move(response))); +} + +// "SetDataBreakpointsRequest": { +// "allOf": [ { "$ref": "#/definitions/Request" }, { +// "type": "object", +// "description": "Replaces all existing data breakpoints with new data +// breakpoints.\nTo clear all data breakpoints, specify an empty +// array.\nWhen a data breakpoint is hit, a `stopped` event (with reason +// `data breakpoint`) is generated.\nClients should only call this request +// if the corresponding capability `supportsDataBreakpoints` is true.", +// "properties": { +// "command": { +// "type": "string", 
+// "enum": [ "setDataBreakpoints" ] +// }, +// "arguments": { +// "$ref": "#/definitions/SetDataBreakpointsArguments" +// } +// }, +// "required": [ "command", "arguments" ] +// }] +// }, +// "SetDataBreakpointsArguments": { +// "type": "object", +// "description": "Arguments for `setDataBreakpoints` request.", +// "properties": { +// "breakpoints": { +// "type": "array", +// "items": { +// "$ref": "#/definitions/DataBreakpoint" +// }, +// "description": "The contents of this array replaces all existing data +// breakpoints. An empty array clears all data breakpoints." +// } +// }, +// "required": [ "breakpoints" ] +// }, +// "SetDataBreakpointsResponse": { +// "allOf": [ { "$ref": "#/definitions/Response" }, { +// "type": "object", +// "description": "Response to `setDataBreakpoints` request.\nReturned is +// information about each breakpoint created by this request.", +// "properties": { +// "body": { +// "type": "object", +// "properties": { +// "breakpoints": { +// "type": "array", +// "items": { +// "$ref": "#/definitions/Breakpoint" +// }, +// "description": "Information about the data breakpoints. The array +// elements correspond to the elements of the input argument +// `breakpoints` array." 
+// } +// }, +// "required": [ "breakpoints" ] +// } +// }, +// "required": [ "body" ] +// }] +// } +void request_setDataBreakpoints(const llvm::json::Object &request) { + llvm::json::Object response; + lldb::SBError error; + FillResponse(request, response); + const auto *arguments = request.getObject("arguments"); + const auto *breakpoints = arguments->getArray("breakpoints"); + llvm::json::Array response_breakpoints; + g_dap.target.DeleteAllWatchpoints(); + if (breakpoints) { + for (const auto &bp : *breakpoints) { + const auto *bp_obj = bp.getAsObject(); + if (bp_obj) { + Watchpoint wp(*bp_obj); + AppendBreakpoint(&wp, response_breakpoints); + } + } + } + llvm::json::Object body; + body.try_emplace("breakpoints", std::move(response_breakpoints)); + response.try_emplace("body", std::move(body)); + g_dap.SendJSON(llvm::json::Value(std::move(response))); +} + // "SourceRequest": { // "allOf": [ { "$ref": "#/definitions/Request" }, { // "type": "object", @@ -3076,7 +3378,6 @@ void request_setVariable(const llvm::json::Object &request) { const auto variablesReference = GetUnsigned(arguments, "variablesReference", 0); llvm::StringRef name = GetString(arguments, "name"); - bool is_duplicated_variable_name = name.contains(" @"); const auto value = GetString(arguments, "value"); // Set success to false just in case we don't find the variable by name @@ -3097,40 +3398,8 @@ void request_setVariable(const llvm::json::Object &request) { const auto id_value = GetUnsigned(arguments, "id", UINT64_MAX); if (id_value != UINT64_MAX) { variable = g_dap.variables.GetVariable(id_value); - } else if (lldb::SBValueList *top_scope = - GetTopLevelScope(variablesReference)) { - // variablesReference is one of our scopes, not an actual variable it is - // asking for a variable in locals or globals or registers - int64_t end_idx = top_scope->GetSize(); - // Searching backward so that we choose the variable in closest scope - // among variables of the same name. 
- for (int64_t i = end_idx - 1; i >= 0; --i) { - lldb::SBValue curr_variable = top_scope->GetValueAtIndex(i); - std::string variable_name = CreateUniqueVariableNameForDisplay( - curr_variable, is_duplicated_variable_name); - if (variable_name == name) { - variable = curr_variable; - break; - } - } } else { - // This is not under the globals or locals scope, so there are no duplicated - // names. - - // We have a named item within an actual variable so we need to find it - // withing the container variable by name. - lldb::SBValue container = g_dap.variables.GetVariable(variablesReference); - variable = container.GetChildMemberWithName(name.data()); - if (!variable.IsValid()) { - if (name.starts_with("[")) { - llvm::StringRef index_str(name.drop_front(1)); - uint64_t index = 0; - if (!index_str.consumeInteger(0, index)) { - if (index_str == "]") - variable = container.GetChildAtIndex(index); - } - } - } + variable = FindVariable(variablesReference, name); } if (variable.IsValid()) { @@ -3613,6 +3882,10 @@ void RegisterRequestCallbacks() { request_setExceptionBreakpoints); g_dap.RegisterRequestCallback("setFunctionBreakpoints", request_setFunctionBreakpoints); + g_dap.RegisterRequestCallback("dataBreakpointInfo", + request_dataBreakpointInfo); + g_dap.RegisterRequestCallback("setDataBreakpoints", + request_setDataBreakpoints); g_dap.RegisterRequestCallback("setVariable", request_setVariable); g_dap.RegisterRequestCallback("source", request_source); g_dap.RegisterRequestCallback("stackTrace", request_stackTrace); From 0eb64eebdecb3c138b4adfea1cbcdd03aa7d455c Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 22 Feb 2024 21:12:51 +0000 Subject: [PATCH 094/546] [gn build] Port df6f756a1927 --- llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn index 98c2068f6da29..8cb60fd81840f 100644 --- 
a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn @@ -51,6 +51,7 @@ executable("lldb-dap") { "ProgressEvent.cpp", "RunInTerminal.cpp", "SourceBreakpoint.cpp", + "Watchpoint.cpp", "lldb-dap.cpp", ] } From 45fe67dd61a6ac7df84d3a586e41c36a4767757f Mon Sep 17 00:00:00 2001 From: Daniel Martinez Date: Thu, 22 Feb 2024 21:14:27 +0000 Subject: [PATCH 095/546] Fix build on musl by including stdint.h (#81434) openmp fails to build on musl since it lacks the defines for int32_t Co-authored-by: Daniel Martinez --- openmp/libomptarget/include/Shared/SourceInfo.h | 1 + 1 file changed, 1 insertion(+) diff --git a/openmp/libomptarget/include/Shared/SourceInfo.h b/openmp/libomptarget/include/Shared/SourceInfo.h index 7ce5fd43efc07..711f06a04d017 100644 --- a/openmp/libomptarget/include/Shared/SourceInfo.h +++ b/openmp/libomptarget/include/Shared/SourceInfo.h @@ -13,6 +13,7 @@ #ifndef OMPTARGET_SHARED_SOURCE_INFO_H #define OMPTARGET_SHARED_SOURCE_INFO_H +#include #include #ifdef _WIN32 From 47b7c91abe7af3133a591aa2e73fffa30826f986 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 15:29:29 -0600 Subject: [PATCH 096/546] [libc] Rework the GPU build to be a regular target (#81921) Summary: This is a massive patch because it reworks the entire build and everything that depends on it. This is not split up because various bots would fail otherwise. I will attempt to describe the necessary changes here. This patch completely reworks how the GPU build is built and targeted. Previously, we used a standard runtimes build and handled both NVPTX and AMDGPU in a single build via multi-targeting. This added a lot of divergence in the build system and prevented us from doing various things like building for the CPU / GPU at the same time, or exporting the startup libraries or running tests without a full rebuild. The new appraoch is to handle the GPU builds as strict cross-compiling runtimes. 
The first step required https://github.com/llvm/llvm-project/pull/81557 to allow the `LIBC` target to build for the GPU without touching the other targets. This means that the GPU uses all the same handling as the other builds in `libc`. The new expected way to build the GPU libc is with `LLVM_LIBC_RUNTIME_TARGETS=amdgcn-amd-amdhsa;nvptx64-nvidia-cuda`. The second step was reworking how we generated the embedded GPU library by moving it into the library install step. Where we previously had one `libcgpu.a` we now have `libcgpu-amdgpu.a` and `libcgpu-nvptx.a`. This patch includes the necessary clang / OpenMP changes to make that not break the bots when this lands. We unfortunately still require that the NVPTX target has an `internal` target for tests. This is because the NVPTX target needs to do LTO for the provided version (The offloading toolchain can handle it) but cannot use it for the native toolchain which is used for making tests. This approach is vastly superior in every way, allowing us to treat the GPU as a standard cross-compiling target. We can now install the GPU utilities to do things like use the offload tests and other fun things. Some certain utilities need to be built with `--target=${LLVM_HOST_TRIPLE}` as well. I think this is a fine workaround as we will always assume that the GPU `libc` is a cross-build with a functioning host. 
Depends on https://github.com/llvm/llvm-project/pull/81557 --- clang/lib/Driver/ToolChains/CommonArgs.cpp | 37 +- clang/test/Driver/openmp-offload-gpu.c | 20 +- libc/CMakeLists.txt | 20 +- .../cmake/modules/LLVMLibCArchitectures.cmake | 28 +- libc/cmake/modules/LLVMLibCCheckMPFR.cmake | 2 +- .../modules/LLVMLibCCompileOptionRules.cmake | 76 +--- libc/cmake/modules/LLVMLibCHeaderRules.cmake | 2 +- libc/cmake/modules/LLVMLibCLibraryRules.cmake | 141 +++++-- libc/cmake/modules/LLVMLibCObjectRules.cmake | 348 ++++-------------- libc/cmake/modules/LLVMLibCTestRules.cmake | 47 ++- .../modules/prepare_libc_gpu_build.cmake | 108 ++---- libc/docs/gpu/using.rst | 33 +- libc/include/CMakeLists.txt | 6 +- libc/lib/CMakeLists.txt | 35 +- libc/src/__support/File/CMakeLists.txt | 2 +- libc/src/__support/GPU/CMakeLists.txt | 2 +- libc/src/__support/OSUtil/CMakeLists.txt | 2 +- libc/src/__support/RPC/CMakeLists.txt | 2 +- libc/src/math/CMakeLists.txt | 16 +- libc/src/math/gpu/vendor/CMakeLists.txt | 1 - libc/src/stdio/CMakeLists.txt | 2 +- libc/src/stdlib/CMakeLists.txt | 4 +- libc/src/string/CMakeLists.txt | 12 +- libc/startup/gpu/CMakeLists.txt | 35 +- libc/startup/gpu/amdgpu/CMakeLists.txt | 13 - libc/startup/gpu/nvptx/CMakeLists.txt | 9 - libc/test/CMakeLists.txt | 6 +- libc/test/IntegrationTest/CMakeLists.txt | 16 - libc/test/UnitTest/CMakeLists.txt | 2 +- libc/test/src/__support/CMakeLists.txt | 49 +-- libc/test/src/__support/CPP/CMakeLists.txt | 2 +- libc/test/src/__support/File/CMakeLists.txt | 2 +- libc/test/src/errno/CMakeLists.txt | 2 +- libc/test/src/math/CMakeLists.txt | 20 +- libc/test/src/math/smoke/CMakeLists.txt | 8 +- libc/test/src/stdio/CMakeLists.txt | 2 +- libc/test/src/stdlib/CMakeLists.txt | 6 +- libc/test/utils/UnitTest/CMakeLists.txt | 2 +- libc/utils/CMakeLists.txt | 2 +- libc/utils/MPFRWrapper/CMakeLists.txt | 2 +- libc/utils/gpu/CMakeLists.txt | 4 +- libc/utils/gpu/loader/CMakeLists.txt | 48 ++- libc/utils/gpu/loader/amdgpu/CMakeLists.txt | 6 +- 
libc/utils/gpu/loader/nvptx/CMakeLists.txt | 10 +- libc/utils/gpu/server/CMakeLists.txt | 9 + llvm/CMakeLists.txt | 4 +- llvm/cmake/modules/HandleLLVMOptions.cmake | 7 + llvm/runtimes/CMakeLists.txt | 11 +- openmp/libomptarget/CMakeLists.txt | 9 +- .../plugins-nextgen/common/CMakeLists.txt | 6 +- .../plugins-nextgen/common/src/RPC.cpp | 3 +- openmp/libomptarget/test/lit.cfg | 8 +- 52 files changed, 585 insertions(+), 664 deletions(-) diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index e5196bd8b5ae9..347b250260c4c 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1087,10 +1087,41 @@ static void addOpenMPDeviceLibC(const ToolChain &TC, const ArgList &Args, "llvm-libc-decls"); bool HasLibC = llvm::sys::fs::exists(LibCDecls) && llvm::sys::fs::is_directory(LibCDecls); - if (Args.hasFlag(options::OPT_gpulibc, options::OPT_nogpulibc, HasLibC)) { - CmdArgs.push_back("-lcgpu"); - CmdArgs.push_back("-lmgpu"); + if (!Args.hasFlag(options::OPT_gpulibc, options::OPT_nogpulibc, HasLibC)) + return; + + // We don't have access to the offloading toolchains here, so determine from + // the arguments if we have any active NVPTX or AMDGPU toolchains. 
+ llvm::DenseSet Libraries; + if (const Arg *Targets = Args.getLastArg(options::OPT_fopenmp_targets_EQ)) { + if (llvm::any_of(Targets->getValues(), + [](auto S) { return llvm::Triple(S).isAMDGPU(); })) { + Libraries.insert("-lcgpu-amdgpu"); + Libraries.insert("-lmgpu-amdgpu"); + } + if (llvm::any_of(Targets->getValues(), + [](auto S) { return llvm::Triple(S).isNVPTX(); })) { + Libraries.insert("-lcgpu-nvptx"); + Libraries.insert("-lmgpu-nvptx"); + } } + + for (StringRef Arch : Args.getAllArgValues(options::OPT_offload_arch_EQ)) { + if (llvm::any_of(llvm::split(Arch, ","), [](StringRef Str) { + return IsAMDGpuArch(StringToCudaArch(Str)); + })) { + Libraries.insert("-lcgpu-amdgpu"); + Libraries.insert("-lmgpu-amdgpu"); + } + if (llvm::any_of(llvm::split(Arch, ","), [](StringRef Str) { + return IsNVIDIAGpuArch(StringToCudaArch(Str)); + })) { + Libraries.insert("-lcgpu-nvptx"); + Libraries.insert("-lmgpu-nvptx"); + } + } + + llvm::append_range(CmdArgs, Libraries); } void tools::addOpenMPRuntimeLibraryPath(const ToolChain &TC, diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c index bccc5fd9483ac..5da74a35d87ad 100644 --- a/clang/test/Driver/openmp-offload-gpu.c +++ b/clang/test/Driver/openmp-offload-gpu.c @@ -393,14 +393,28 @@ // // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \ // RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \ +// RUN: --libomptarget-amdgpu-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgpu-gfx803.bc \ // RUN: --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \ -// RUN: --offload-arch=sm_52 -gpulibc -nogpuinc %s 2>&1 \ +// RUN: --rocm-path=%S/Inputs/rocm \ +// RUN: --offload-arch=sm_52,gfx803 -gpulibc -nogpuinc %s 2>&1 \ // RUN: | FileCheck --check-prefix=LIBC-GPU %s -// LIBC-GPU: "-lcgpu"{{.*}}"-lmgpu" +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \ +// RUN: 
--libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \ +// RUN: --libomptarget-amdgpu-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgpu-gfx803.bc \ +// RUN: --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \ +// RUN: --rocm-path=%S/Inputs/rocm \ +// RUN: -Xopenmp-target=nvptx64-nvidia-cuda -march=sm_52 \ +// RUN: -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 \ +// RUN: -fopenmp-targets=nvptx64-nvidia-cuda,amdgcn-amd-amdhsa -gpulibc -nogpuinc %s 2>&1 \ +// RUN: | FileCheck --check-prefix=LIBC-GPU %s +// LIBC-GPU-DAG: "-lcgpu-amdgpu" +// LIBC-GPU-DAG: "-lmgpu-amdgpu" +// LIBC-GPU-DAG: "-lcgpu-nvptx" +// LIBC-GPU-DAG: "-lmgpu-nvptx" // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp=libomp \ // RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \ // RUN: --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \ // RUN: --offload-arch=sm_52 -nogpulibc -nogpuinc %s 2>&1 \ // RUN: | FileCheck --check-prefix=NO-LIBC-GPU %s -// NO-LIBC-GPU-NOT: "-lcgpu"{{.*}}"-lmgpu" +// NO-LIBC-GPU-NOT: -lmgpu{{.*}}-lcgpu diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index 9f9839423499e..6a57fcec26e47 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -43,7 +43,7 @@ set(LIBC_NAMESPACE "__llvm_libc_${LLVM_VERSION_MAJOR}_${LLVM_VERSION_MINOR}_${LL CACHE STRING "The namespace to use to enclose internal implementations. Must start with '__llvm_libc'." ) -if(LLVM_LIBC_FULL_BUILD OR LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES) +if(LLVM_LIBC_FULL_BUILD OR LLVM_LIBC_GPU_BUILD) if(NOT LIBC_HDRGEN_EXE) # We need to set up hdrgen first since other targets depend on it. add_subdirectory(utils/LibcTableGenUtil) @@ -77,7 +77,7 @@ if(LIBC_HDRGEN_ONLY OR NEED_LIBC_HDRGEN) # to build libc-hdrgen and return. # Always make the RPC server availible to other projects for GPU mode. 
- if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES) + if(LLVM_LIBC_GPU_BUILD) add_subdirectory(utils/gpu/server) endif() return() @@ -118,7 +118,7 @@ if(COMMAND_RETURN_CODE EQUAL 0) message(STATUS "Set COMPILER_RESOURCE_DIR to " "${COMPILER_RESOURCE_DIR} using --print-resource-dir") else() - if (LIBC_TARGET_ARCHITECTURE_IS_GPU) + if (LIBC_TARGET_OS_IS_GPU) message(FATAL_ERROR "COMPILER_RESOURCE_DIR must be set for GPU builds") else() set(COMPILER_RESOURCE_DIR OFF) @@ -216,11 +216,7 @@ foreach(config_path IN LISTS LIBC_CONFIG_JSON_FILE_LIST) load_libc_config(${config_path}/config.json ${cmd_line_conf}) endforeach() -if(LIBC_TARGET_ARCHITECTURE_IS_GPU) - set(LIBC_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include) - set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/gpu-none-llvm) - set(LIBC_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}) -elseif(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND LIBC_ENABLE_USE_BY_CLANG) +if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND LIBC_ENABLE_USE_BY_CLANG) set(LIBC_INCLUDE_DIR ${LLVM_BINARY_DIR}/include/${LLVM_DEFAULT_TARGET_TRIPLE}) set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}) set(LIBC_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}) @@ -235,7 +231,11 @@ else() set(LIBC_INCLUDE_DIR ${CMAKE_BINARY_DIR}/include) set(LIBC_LIBRARY_DIR ${CMAKE_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}) endif() - set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}) + if(LIBC_TARGET_OS_IS_GPU) + set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}) + else() + set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}) + endif() endif() if(LIBC_TARGET_TRIPLE) @@ -247,7 +247,7 @@ else() set(LIBC_INSTALL_LIBRARY_DIR lib${LLVM_LIBDIR_SUFFIX}) endif() -if(LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(LIBC_TARGET_OS_IS_GPU) include(prepare_libc_gpu_build) set(LIBC_ENABLE_UNITTESTS OFF) endif() diff --git a/libc/cmake/modules/LLVMLibCArchitectures.cmake 
b/libc/cmake/modules/LLVMLibCArchitectures.cmake index 623ed774be727..0dbc59ad643ac 100644 --- a/libc/cmake/modules/LLVMLibCArchitectures.cmake +++ b/libc/cmake/modules/LLVMLibCArchitectures.cmake @@ -6,18 +6,6 @@ # platform. # ------------------------------------------------------------------------------ -if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES) - # We set the generic target and OS to "gpu" here. More specific defintions - # for the exact target GPU are set up in prepare_libc_gpu_build.cmake. - set(LIBC_TARGET_OS "gpu") - set(LIBC_TARGET_ARCHITECTURE_IS_GPU TRUE) - set(LIBC_TARGET_ARCHITECTURE "gpu") - if(LIBC_TARGET_TRIPLE) - message(WARNING "LIBC_TARGET_TRIPLE is ignored as LIBC_GPU_BUILD is on. ") - endif() - return() -endif() - if(MSVC) # If the compiler is visual c++ or equivalent, we will assume a host build. set(LIBC_TARGET_OS ${CMAKE_HOST_SYSTEM_NAME}) @@ -59,6 +47,10 @@ function(get_arch_and_system_from_triple triple arch_var sys_var) set(target_arch "riscv32") elseif(target_arch MATCHES "^riscv64") set(target_arch "riscv64") + elseif(target_arch MATCHES "^amdgcn") + set(target_arch "amdgpu") + elseif(target_arch MATCHES "^nvptx64") + set(target_arch "nvptx") else() return() endif() @@ -75,6 +67,12 @@ function(get_arch_and_system_from_triple triple arch_var sys_var) set(target_sys "darwin") endif() + # Setting OS name for GPU architectures. 
+ list(GET triple_comps -1 gpu_target_sys) + if(gpu_target_sys MATCHES "^amdhsa" OR gpu_target_sys MATCHES "^cuda") + set(target_sys "gpu") + endif() + set(${sys_var} ${target_sys} PARENT_SCOPE) endfunction(get_arch_and_system_from_triple) @@ -156,6 +154,10 @@ elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "riscv64") elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "riscv32") set(LIBC_TARGET_ARCHITECTURE_IS_RISCV32 TRUE) set(LIBC_TARGET_ARCHITECTURE "riscv") +elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "amdgpu") + set(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU TRUE) +elseif(LIBC_TARGET_ARCHITECTURE STREQUAL "nvptx") + set(LIBC_TARGET_ARCHITECTURE_IS_NVPTX TRUE) else() message(FATAL_ERROR "Unsupported libc target architecture ${LIBC_TARGET_ARCHITECTURE}") @@ -178,6 +180,8 @@ elseif(LIBC_TARGET_OS STREQUAL "darwin") set(LIBC_TARGET_OS_IS_DARWIN TRUE) elseif(LIBC_TARGET_OS STREQUAL "windows") set(LIBC_TARGET_OS_IS_WINDOWS TRUE) +elseif(LIBC_TARGET_OS STREQUAL "gpu") + set(LIBC_TARGET_OS_IS_GPU TRUE) else() message(FATAL_ERROR "Unsupported libc target operating system ${LIBC_TARGET_OS}") diff --git a/libc/cmake/modules/LLVMLibCCheckMPFR.cmake b/libc/cmake/modules/LLVMLibCCheckMPFR.cmake index 9e361f5fd8112..bbaeb9f0dc053 100644 --- a/libc/cmake/modules/LLVMLibCCheckMPFR.cmake +++ b/libc/cmake/modules/LLVMLibCCheckMPFR.cmake @@ -2,7 +2,7 @@ set(LLVM_LIBC_MPFR_INSTALL_PATH "" CACHE PATH "Path to where MPFR is installed ( if(LLVM_LIBC_MPFR_INSTALL_PATH) set(LIBC_TESTS_CAN_USE_MPFR TRUE) -elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU) +elseif(LIBC_TARGET_OS_IS_GPU) set(LIBC_TESTS_CAN_USE_MPFR FALSE) else() try_compile( diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index 140e4d51a9c2e..33ba5da4f8d57 100644 --- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -82,10 +82,22 @@ function(_get_common_compile_options output_var flags) list(APPEND compile_options "/EHs-c-") 
list(APPEND compile_options "/GR-") endif() - if (LIBC_TARGET_ARCHITECTURE_IS_GPU) + if (LIBC_TARGET_OS_IS_GPU) list(APPEND compile_options "-nogpulib") list(APPEND compile_options "-fvisibility=hidden") list(APPEND compile_options "-fconvergent-functions") + list(APPEND compile_options "-flto") + + if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + list(APPEND compile_options "-Wno-unknown-cuda-version") + list(APPEND compile_options "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false") + list(APPEND compile_options "--cuda-feature=+ptx63") + if(LIBC_CUDA_ROOT) + list(APPEND compile_options "--cuda-path=${LIBC_CUDA_ROOT}") + endif() + elseif(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) + list(APPEND compile_options "SHELL:-Xclang -mcode-object-version=none") + endif() # Manually disable all standard include paths and include the resource # directory to prevent system headers from being included. @@ -138,73 +150,21 @@ function(_get_common_test_compile_options output_var flags) set(${output_var} ${compile_options} PARENT_SCOPE) endfunction() -# Obtains NVPTX specific arguments for compilation. -# The PTX feature is primarily based on the CUDA toolchain version. We want to -# be able to target NVPTX without an existing CUDA installation, so we need to -# set this manually. This simply sets the PTX feature to the minimum required -# for the features we wish to use on that target. The minimum PTX features used -# here roughly corresponds to the CUDA 9.0 release. -# Adjust as needed for desired PTX features. 
-function(get_nvptx_compile_options output_var gpu_arch) - set(nvptx_options "") - list(APPEND nvptx_options "-march=${gpu_arch}") - list(APPEND nvptx_options "-Wno-unknown-cuda-version") - list(APPEND nvptx_options "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false") - if(${gpu_arch} STREQUAL "sm_35") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_37") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_50") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_52") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_53") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_60") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_61") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_62") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_70") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_72") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_75") - list(APPEND nvptx_options "--cuda-feature=+ptx63") - elseif(${gpu_arch} STREQUAL "sm_80") - list(APPEND nvptx_options "--cuda-feature=+ptx72") - elseif(${gpu_arch} STREQUAL "sm_86") - list(APPEND nvptx_options "--cuda-feature=+ptx72") - elseif(${gpu_arch} STREQUAL "sm_89") - list(APPEND nvptx_options "--cuda-feature=+ptx72") - elseif(${gpu_arch} STREQUAL "sm_90") - list(APPEND nvptx_options "--cuda-feature=+ptx72") - else() - message(FATAL_ERROR "Unknown Nvidia GPU architecture '${gpu_arch}'") - endif() - - if(LIBC_CUDA_ROOT) - list(APPEND nvptx_options "--cuda-path=${LIBC_CUDA_ROOT}") - endif() - set(${output_var} ${nvptx_options} PARENT_SCOPE) -endfunction() - function(_get_hermetic_test_compile_options output_var flags) 
_get_compile_options_from_flags(compile_flags ${flags}) list(APPEND compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT} ${compile_flags} ${flags} -fpie -ffreestanding -fno-exceptions -fno-rtti) # The GPU build requires overriding the default CMake triple and architecture. - if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU) + if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) list(APPEND compile_options -nogpulib -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto - --target=${LIBC_GPU_TARGET_TRIPLE} -mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}) - elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) - get_nvptx_compile_options(nvptx_options ${LIBC_GPU_TARGET_ARCHITECTURE}) + elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) list(APPEND compile_options - -nogpulib ${nvptx_options} -fno-use-cxa-atexit --target=${LIBC_GPU_TARGET_TRIPLE}) + "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false" + --cuda-path=${LIBC_CUDA_ROOT} + -nogpulib -march=${LIBC_GPU_TARGET_ARCHITECTURE} -fno-use-cxa-atexit) endif() set(${output_var} ${compile_options} PARENT_SCOPE) endfunction() diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake index 9e9b598721ab3..19515b1cbcc18 100644 --- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake +++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake @@ -139,7 +139,7 @@ function(add_gen_header target_name) ${hdrgen_deps} ) - if(LIBC_TARGET_ARCHITECTURE_IS_GPU) + if(LIBC_TARGET_OS_IS_GPU) file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/llvm-libc-decls) set(decl_out_file ${LIBC_INCLUDE_DIR}/llvm-libc-decls/${relative_path}) add_custom_command( diff --git a/libc/cmake/modules/LLVMLibCLibraryRules.cmake b/libc/cmake/modules/LLVMLibCLibraryRules.cmake index 81c207ec23176..f15ffd5f9c218 100644 --- a/libc/cmake/modules/LLVMLibCLibraryRules.cmake +++ b/libc/cmake/modules/LLVMLibCLibraryRules.cmake @@ -50,31 +50,9 @@ function(collect_object_file_deps target result) endif() endfunction(collect_object_file_deps) -# A rule to build a library from a collection of 
entrypoint objects. -# Usage: -# add_entrypoint_library( -# DEPENDS -# ) -# -# NOTE: If one wants an entrypoint to be available in a library, then they will -# have to list the entrypoint target explicitly in the DEPENDS list. Implicit -# entrypoint dependencies will not be added to the library. -function(add_entrypoint_library target_name) - cmake_parse_arguments( - "ENTRYPOINT_LIBRARY" - "" # No optional arguments - "" # No single value arguments - "DEPENDS" # Multi-value arguments - ${ARGN} - ) - if(NOT ENTRYPOINT_LIBRARY_DEPENDS) - message(FATAL_ERROR "'add_entrypoint_library' target requires a DEPENDS list " - "of 'add_entrypoint_object' targets.") - endif() - - get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS}) +function(get_all_object_file_deps result fq_deps_list) set(all_deps "") - foreach(dep IN LISTS fq_deps_list) + foreach(dep ${fq_deps_list}) get_target_property(dep_type ${dep} "TARGET_TYPE") if(NOT ((${dep_type} STREQUAL ${ENTRYPOINT_OBJ_TARGET_TYPE}) OR (${dep_type} STREQUAL ${ENTRYPOINT_EXT_TARGET_TYPE}) OR @@ -102,6 +80,121 @@ function(add_entrypoint_library target_name) list(APPEND all_deps ${entrypoint_target}) endforeach(dep) list(REMOVE_DUPLICATES all_deps) + set(${result} ${all_deps} PARENT_SCOPE) +endfunction() + +# A rule to build a library from a collection of entrypoint objects and bundle +# it into a GPU fatbinary. Usage is the same as 'add_entrypoint_library'. 
+# Usage: +# add_gpu_entrypoint_library( +# DEPENDS +# ) +function(add_gpu_entrypoint_library target_name) + cmake_parse_arguments( + "ENTRYPOINT_LIBRARY" + "" # No optional arguments + "" # No single value arguments + "DEPENDS" # Multi-value arguments + ${ARGN} + ) + if(NOT ENTRYPOINT_LIBRARY_DEPENDS) + message(FATAL_ERROR "'add_entrypoint_library' target requires a DEPENDS list " + "of 'add_entrypoint_object' targets.") + endif() + + get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS}) + get_all_object_file_deps(all_deps "${fq_deps_list}") + + # The GPU 'libc' needs to be exported in a format that can be linked with + # offloading languages like OpenMP or CUDA. This wraps every GPU object into a + # fat binary and adds them to a static library. + set(objects "") + foreach(dep IN LISTS all_deps) + set(object $<$,${dep}>:$>) + string(FIND ${dep} "." last_dot_loc REVERSE) + math(EXPR name_loc "${last_dot_loc} + 1") + string(SUBSTRING ${dep} ${name_loc} -1 name) + if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + set(prefix --image=arch=generic,triple=nvptx64-nvidia-cuda,feature=+ptx63) + elseif(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) + set(prefix --image=arch=generic,triple=amdgcn-amd-amdhsa) + endif() + + # Use the 'clang-offload-packager' to merge these files into a binary blob. + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin" + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/binary + COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER} + "${prefix},file=$" -o + ${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin + DEPENDS ${dep} + COMMENT "Packaging LLVM offloading binary for '${object}'" + ) + add_custom_target(${dep}.__gpubin__ DEPENDS ${dep} + "${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin") + + # CMake does not permit setting the name on object files. In order to have + # human readable names we create an empty stub file with the entrypoint + # name. This empty file will then have the created binary blob embedded.
+ add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp" + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/stubs + COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp + DEPENDS ${dep} ${dep}.__gpubin__ + ) + add_custom_target(${dep}.__stub__ + DEPENDS ${dep}.__gpubin__ "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp") + + add_library(${dep}.__fatbin__ + EXCLUDE_FROM_ALL OBJECT + "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp" + ) + + # This is always compiled for the LLVM host triple instead of the native GPU + # triple that is used by default in the build. + target_compile_options(${dep}.__fatbin__ BEFORE PRIVATE -nostdlib) + target_compile_options(${dep}.__fatbin__ PRIVATE + --target=${LLVM_HOST_TRIPLE} + "SHELL:-Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin") + add_dependencies(${dep}.__fatbin__ ${dep} ${dep}.__stub__ ${dep}.__gpubin__) + + # Set the list of newly created fat binaries containing embedded device code. + list(APPEND objects $) + endforeach() + + add_library( + ${target_name} + STATIC + ${objects} + ) + set_target_properties(${target_name} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${LIBC_LIBRARY_DIR}) +endfunction(add_gpu_entrypoint_library) + +# A rule to build a library from a collection of entrypoint objects. +# Usage: +# add_entrypoint_library( +# DEPENDS +# ) +# +# NOTE: If one wants an entrypoint to be available in a library, then they will +# have to list the entrypoint target explicitly in the DEPENDS list. Implicit +# entrypoint dependencies will not be added to the library.
+function(add_entrypoint_library target_name) + cmake_parse_arguments( + "ENTRYPOINT_LIBRARY" + "" # No optional arguments + "" # No single value arguments + "DEPENDS" # Multi-value arguments + ${ARGN} + ) + if(NOT ENTRYPOINT_LIBRARY_DEPENDS) + message(FATAL_ERROR "'add_entrypoint_library' target requires a DEPENDS list " + "of 'add_entrypoint_object' targets.") + endif() + + get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS}) + get_all_object_file_deps(all_deps "${fq_deps_list}") + set(objects "") foreach(dep IN LISTS all_deps) list(APPEND objects $<$,${dep}>:$>) diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake index 308ba7d0d5dd7..78536f4eec55a 100644 --- a/libc/cmake/modules/LLVMLibCObjectRules.cmake +++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake @@ -1,175 +1,5 @@ set(OBJECT_LIBRARY_TARGET_TYPE "OBJECT_LIBRARY") -# Build the object target for a single GPU arch. -# Usage: -# _build_gpu_object_for_single_arch( -# -# -# SRCS -# HDRS -# DEPENDS -# COMPILE_OPTIONS -# FLAGS -# ) -function(_build_gpu_object_for_single_arch fq_target_name gpu_arch) - cmake_parse_arguments( - "ADD_GPU_OBJ" - "" # No optional arguments - "NAME;CXX_STANDARD" # Single value arguments - "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;FLAGS" # Multi value arguments - ${ARGN} - ) - - if(NOT ADD_GPU_OBJ_CXX_STANDARD) - set(ADD_GPU_OBJ_CXX_STANDARD ${CMAKE_CXX_STANDARD}) - endif() - - set(compile_options ${ADD_GPU_OBJ_COMPILE_OPTIONS}) - # Derive the triple from the specified architecture. 
- if("${gpu_arch}" IN_LIST all_amdgpu_architectures) - set(gpu_target_triple ${AMDGPU_TARGET_TRIPLE}) - list(APPEND compile_options "-mcpu=${gpu_arch}") - list(APPEND compile_options "SHELL:-Xclang -mcode-object-version=none") - list(APPEND compile_options "-emit-llvm") - elseif("${gpu_arch}" IN_LIST all_nvptx_architectures) - set(gpu_target_triple ${NVPTX_TARGET_TRIPLE}) - get_nvptx_compile_options(nvptx_options ${gpu_arch}) - list(APPEND compile_options "${nvptx_options}") - else() - message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'") - endif() - list(APPEND compile_options "--target=${gpu_target_triple}") - - # Build the library for this target architecture. We always emit LLVM-IR for - # packaged GPU binaries. - add_library(${fq_target_name} - EXCLUDE_FROM_ALL - OBJECT - ${ADD_GPU_OBJ_SRCS} - ${ADD_GPU_OBJ_HDRS} - ) - - target_compile_options(${fq_target_name} PRIVATE ${compile_options}) - target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) - set_target_properties(${fq_target_name} PROPERTIES CXX_STANDARD ${ADD_GPU_OBJ_CXX_STANDARD}) - if(ADD_GPU_OBJ_DEPENDS) - add_dependencies(${fq_target_name} ${ADD_GPU_OBJ_DEPENDS}) - set_target_properties(${fq_target_name} PROPERTIES DEPS "${ADD_GPU_OBJ_DEPENDS}") - endif() -endfunction(_build_gpu_object_for_single_arch) - -# Build the object target for the GPU. -# This compiles the target for all supported architectures and embeds it into -# host binary for installing. 
-# Usage: -# _build_gpu_object_bundle( -# -# SRCS -# HDRS -# DEPENDS -# COMPILE_OPTIONS -# FLAGS -# ) -function(_build_gpu_object_bundle fq_target_name) - cmake_parse_arguments( - "ADD_GPU_OBJ" - "" # No optional arguments - "NAME;CXX_STANDARD" # Single value arguments - "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;FLAGS" # Multi value arguments - ${ARGN} - ) - - if(NOT ADD_GPU_OBJ_CXX_STANDARD) - set(ADD_GPU_OBJ_CXX_STANDARD ${CMAKE_CXX_STANDARD}) - endif() - - foreach(add_gpu_obj_src ${ADD_GPU_OBJ_SRCS}) - # The packaged version will be built for every target GPU architecture. We do - # this so we can support multiple accelerators on the same machine. - foreach(gpu_arch ${LIBC_GPU_ARCHITECTURES}) - get_filename_component(src_name ${add_gpu_obj_src} NAME) - set(gpu_target_name ${fq_target_name}.${src_name}.${gpu_arch}) - - _build_gpu_object_for_single_arch( - ${gpu_target_name} - ${gpu_arch} - CXX_STANDARD ${ADD_GPU_OBJ_CXX_STANDARD} - HDRS ${ADD_GPU_OBJ_HDRS} - SRCS ${add_gpu_obj_src} - COMPILE_OPTIONS - ${ADD_GPU_OBJ_COMPILE_OPTIONS} - "-emit-llvm" - DEPENDS ${ADD_GPU_OBJ_DEPENDS} - ) - # Append this target to a list of images to package into a single binary. - set(input_file $) - if("${gpu_arch}" IN_LIST all_nvptx_architectures) - get_nvptx_compile_options(nvptx_options ${gpu_arch}) - string(REGEX MATCH "\\+ptx[0-9]+" nvptx_ptx_feature ${nvptx_options}) - list(APPEND packager_images - --image=file=${input_file},arch=${gpu_arch},triple=${NVPTX_TARGET_TRIPLE},feature=${nvptx_ptx_feature}) - else() - list(APPEND packager_images - --image=file=${input_file},arch=${gpu_arch},triple=${AMDGPU_TARGET_TRIPLE}) - endif() - list(APPEND gpu_target_objects ${input_file}) - endforeach() - - # After building the target for the desired GPUs we must package the output - # into a fatbinary, see https://clang.llvm.org/docs/OffloadingDesign.html for - # more information. 
- set(packaged_target_name ${fq_target_name}.${src_name}.__gpu__) - set(packaged_output_name ${CMAKE_CURRENT_BINARY_DIR}/${fq_target_name}.${src_name}.gpubin) - - add_custom_command(OUTPUT ${packaged_output_name} - COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER} - ${packager_images} -o ${packaged_output_name} - DEPENDS ${gpu_target_objects} ${add_gpu_obj_src} ${ADD_GPU_OBJ_HDRS} - COMMENT "Packaging LLVM offloading binary") - add_custom_target(${packaged_target_name} DEPENDS ${packaged_output_name}) - list(APPEND packaged_gpu_names ${packaged_target_name}) - list(APPEND packaged_gpu_binaries ${packaged_output_name}) - endforeach() - - # We create an empty 'stub' file for the host to contain the embedded device - # code. This will be packaged into 'libcgpu.a'. - # TODO: In the future we will want to combine every architecture for a target - # into a single bitcode file and use that. For now we simply build for - # every single one and let the offloading linker handle it. - string(FIND ${fq_target_name} "." last_dot_loc REVERSE) - math(EXPR name_loc "${last_dot_loc} + 1") - string(SUBSTRING ${fq_target_name} ${name_loc} -1 target_name) - set(stub_filename "${target_name}.cpp") - add_custom_command( - OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/stubs/${stub_filename}" - COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/stubs/ - COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/stubs/${stub_filename} - DEPENDS ${gpu_target_objects} ${ADD_GPU_OBJ_SRCS} ${ADD_GPU_OBJ_HDRS} - ) - set(stub_target_name ${fq_target_name}.__stub__) - add_custom_target(${stub_target_name} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/stubs/${stub_filename}) - - add_library( - ${fq_target_name} - # We want an object library as the objects will eventually get packaged into - # an archive (like libcgpu.a). 
- EXCLUDE_FROM_ALL - OBJECT - ${CMAKE_CURRENT_BINARY_DIR}/stubs/${stub_filename} - ) - target_compile_options(${fq_target_name} BEFORE PRIVATE - ${ADD_GPU_OBJ_COMPILE_OPTIONS} -nostdlib) - foreach(packaged_gpu_binary ${packaged_gpu_binaries}) - target_compile_options(${fq_target_name} PRIVATE - "SHELL:-Xclang -fembed-offload-object=${packaged_gpu_binary}") - endforeach() - target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) - add_dependencies(${fq_target_name} - ${full_deps_list} ${packaged_gpu_names} ${stub_target_name}) -endfunction() - # Rule which is essentially a wrapper over add_library to compile a set of # sources to object files. # Usage: @@ -214,53 +44,37 @@ function(create_object_library fq_target_name) message(FATAL_ERROR "'add_object_library' rule requires SRCS to be specified.") endif() - # The GPU build uses a separate internal file. - if(LIBC_TARGET_ARCHITECTURE_IS_GPU AND NOT ${ADD_OBJECT_NO_GPU_BUNDLE}) - set(internal_target_name ${fq_target_name}.__internal__) - set(public_packaging_for_internal "") - else() - set(internal_target_name ${fq_target_name}) - set(public_packaging_for_internal "-DLIBC_COPT_PUBLIC_PACKAGING") - endif() + set(internal_target_name ${fq_target_name}.__internal__) + set(public_packaging_for_internal "-DLIBC_COPT_PUBLIC_PACKAGING") _get_common_compile_options(compile_options "${ADD_OBJECT_FLAGS}") list(APPEND compile_options ${ADD_OBJECT_COMPILE_OPTIONS}) - # GPU builds require special handling for the objects because we want to - # export several different targets at once, e.g. for both Nvidia and AMD. 
- if(LIBC_TARGET_ARCHITECTURE_IS_GPU) - if(NOT ${ADD_OBJECT_NO_GPU_BUNDLE}) - _build_gpu_object_bundle( - ${fq_target_name} - SRCS ${ADD_OBJECT_SRCS} - HDRS ${ADD_OBJECT_HDRS} - CXX_STANDARD ${ADD_OBJECT_CXX_STANDARD} - COMPILE_OPTIONS ${compile_options} "-DLIBC_COPT_PUBLIC_PACKAGING" - DEPENDS ${fq_deps_list} - ) - endif() - # When the target for GPU is not bundled, internal_target_name is the same - # as fq_targetname - _build_gpu_object_for_single_arch( - ${internal_target_name} - ${LIBC_GPU_TARGET_ARCHITECTURE} - SRCS ${ADD_OBJECT_SRCS} - HDRS ${ADD_OBJECT_HDRS} - CXX_STANDARD ${ADD_OBJECT_CXX_STANDARD} - COMPILE_OPTIONS ${compile_options} ${public_packaging_for_internal} - DEPENDS ${fq_deps_list} - ) - else() + add_library( + ${fq_target_name} + EXCLUDE_FROM_ALL + OBJECT + ${ADD_OBJECT_SRCS} + ${ADD_OBJECT_HDRS} + ) + target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) + target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_compile_options(${fq_target_name} PRIVATE ${compile_options}) + + # The NVPTX target is installed as LLVM-IR but the internal testing toolchain + # cannot handle it natively. Make a separate internal target for testing. 
+ if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX AND NOT LIBC_GPU_TESTS_DISABLED) add_library( - ${fq_target_name} + ${internal_target_name} EXCLUDE_FROM_ALL OBJECT ${ADD_OBJECT_SRCS} ${ADD_OBJECT_HDRS} ) - target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) - target_compile_options(${fq_target_name} PRIVATE ${compile_options}) + target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) + target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_compile_options(${internal_target_name} PRIVATE ${compile_options} + -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE}) endif() if(SHOW_INTERMEDIATE_OBJECTS) @@ -290,13 +104,18 @@ function(create_object_library fq_target_name) FLAGS "${ADD_OBJECT_FLAGS}" ) + # If we built a separate internal target we want to use those target objects + # for testing instead of the exported target. + set(target_objects ${fq_target_name}) if(TARGET ${internal_target_name}) - set_target_properties( - ${fq_target_name} - PROPERTIES - OBJECT_FILES "$" - ) + set(target_objects ${internal_target_name}) endif() + + set_target_properties( + ${fq_target_name} + PROPERTIES + OBJECT_FILES "$" + ) endfunction(create_object_library) function(add_object_library target_name) @@ -389,12 +208,19 @@ function(create_entrypoint_object fq_target_name) get_target_property(object_file ${fq_dep_name} "OBJECT_FILE") get_target_property(object_file_raw ${fq_dep_name} "OBJECT_FILE_RAW") - add_library( - ${internal_target_name} - EXCLUDE_FROM_ALL - OBJECT - ${object_file_raw} - ) + + # If the system cannot build the GPU tests we simply make a dummy target. 
+ if(LIBC_TARGET_OS_IS_GPU AND LIBC_GPU_TESTS_DISABLED) + add_custom_target(${internal_target_name}) + else() + add_library( + ${internal_target_name} + EXCLUDE_FROM_ALL + OBJECT + ${object_file_raw} + ) + endif() + add_dependencies(${internal_target_name} ${fq_dep_name}) add_library( ${fq_target_name} @@ -441,60 +267,42 @@ function(create_entrypoint_object fq_target_name) endif() endif() - # GPU builds require special handling for the objects because we want to - # export several different targets at once, e.g. for both Nvidia and AMD. - if(LIBC_TARGET_ARCHITECTURE_IS_GPU) - _build_gpu_object_bundle( - ${fq_target_name} - SRCS ${ADD_ENTRYPOINT_OBJ_SRCS} - HDRS ${ADD_ENTRYPOINT_OBJ_HDRS} - COMPILE_OPTIONS ${common_compile_options} "-DLIBC_COPT_PUBLIC_PACKAGING" - CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD} - DEPENDS ${full_deps_list} - FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}" - ) - _build_gpu_object_for_single_arch( - ${internal_target_name} - ${LIBC_GPU_TARGET_ARCHITECTURE} - SRCS ${ADD_ENTRYPOINT_OBJ_SRCS} - HDRS ${ADD_ENTRYPOINT_OBJ_HDRS} - COMPILE_OPTIONS ${common_compile_options} - CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD} - DEPENDS ${full_deps_list} - FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}" - ) - else() - add_library( - ${internal_target_name} - # TODO: We don't need an object library for internal consumption. - # A future change should switch this to a normal static library. 
- EXCLUDE_FROM_ALL - OBJECT - ${ADD_ENTRYPOINT_OBJ_SRCS} - ${ADD_ENTRYPOINT_OBJ_HDRS} - ) - target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options}) - target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}) - add_dependencies(${internal_target_name} ${full_deps_list}) - target_link_libraries(${internal_target_name} ${full_deps_list}) - - add_library( - ${fq_target_name} - # We want an object library as the objects will eventually get packaged into - # an archive (like libc.a). - EXCLUDE_FROM_ALL - OBJECT - ${ADD_ENTRYPOINT_OBJ_SRCS} - ${ADD_ENTRYPOINT_OBJ_HDRS} - ) - target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLIBC_COPT_PUBLIC_PACKAGING) - target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) - add_dependencies(${fq_target_name} ${full_deps_list}) - target_link_libraries(${fq_target_name} ${full_deps_list}) + add_library( + ${internal_target_name} + # TODO: We don't need an object library for internal consumption. + # A future change should switch this to a normal static library. + EXCLUDE_FROM_ALL + OBJECT + ${ADD_ENTRYPOINT_OBJ_SRCS} + ${ADD_ENTRYPOINT_OBJ_HDRS} + ) + target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options}) + target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) + target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + add_dependencies(${internal_target_name} ${full_deps_list}) + target_link_libraries(${internal_target_name} ${full_deps_list}) + + # The NVPTX target cannot use LTO for the internal targets used for testing. 
+ if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + target_compile_options(${internal_target_name} PRIVATE + -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE}) endif() + add_library( + ${fq_target_name} + # We want an object library as the objects will eventually get packaged into + # an archive (like libc.a). + EXCLUDE_FROM_ALL + OBJECT + ${ADD_ENTRYPOINT_OBJ_SRCS} + ${ADD_ENTRYPOINT_OBJ_HDRS} + ) + target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLIBC_COPT_PUBLIC_PACKAGING) + target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) + target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + add_dependencies(${fq_target_name} ${full_deps_list}) + target_link_libraries(${fq_target_name} ${full_deps_list}) + set_target_properties( ${fq_target_name} PROPERTIES diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 6ca9516ff7a0e..373cbd6853859 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -449,7 +449,7 @@ function(add_integration_test test_name) ${fq_build_target_name} EXCLUDE_FROM_ALL # The NVIDIA 'nvlink' linker does not currently support static libraries. 
- $<$:${link_object_files}> + $<$:${link_object_files}> ${INTEGRATION_TEST_SRCS} ${INTEGRATION_TEST_HDRS} ) @@ -461,8 +461,17 @@ function(add_integration_test test_name) _get_hermetic_test_compile_options(compile_options "${INTEGRATION_TEST_COMPILE_OPTIONS}") target_compile_options(${fq_build_target_name} PRIVATE ${compile_options}) - if(LIBC_TARGET_ARCHITECTURE_IS_GPU) - target_link_options(${fq_build_target_name} PRIVATE -nostdlib -static) + if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) + target_link_options(${fq_build_target_name} PRIVATE + -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto + "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static + "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}") + elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + # We need to use the internal object versions for NVPTX. + set(internal_suffix ".__internal__") + target_link_options(${fq_build_target_name} PRIVATE + -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static + "--cuda-path=${LIBC_CUDA_ROOT}") elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP) target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static) else() @@ -474,9 +483,10 @@ function(add_integration_test test_name) target_link_libraries( ${fq_build_target_name} # The NVIDIA 'nvlink' linker does not currently support static libraries. - $<$>:${fq_target_name}.__libc__> - libc.startup.${LIBC_TARGET_OS}.crt1 - libc.test.IntegrationTest.test) + $<$>:${fq_target_name}.__libc__> + libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix} + libc.test.IntegrationTest.test${internal_suffix} + ) add_dependencies(${fq_build_target_name} libc.test.IntegrationTest.test ${INTEGRATION_TEST_DEPENDS}) @@ -495,7 +505,7 @@ function(add_integration_test test_name) # makes `add_custom_target` construct the correct command and execute it. 
set(test_cmd ${INTEGRATION_TEST_ENV} - $<$:${gpu_loader_exe}> + $<$:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${INTEGRATION_TEST_LOADER_ARGS} $ ${INTEGRATION_TEST_ARGS}) @@ -606,7 +616,7 @@ function(add_libc_hermetic_test test_name) ${fq_build_target_name} EXCLUDE_FROM_ALL # The NVIDIA 'nvlink' linker does not currently support static libraries. - $<$:${link_object_files}> + $<$:${link_object_files}> ${HERMETIC_TEST_SRCS} ${HERMETIC_TEST_HDRS} ) @@ -615,6 +625,8 @@ function(add_libc_hermetic_test test_name) RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} #OUTPUT_NAME ${fq_target_name} ) + + _get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}") target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}) _get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}") @@ -629,8 +641,17 @@ function(add_libc_hermetic_test test_name) endif() endforeach() - if(LIBC_TARGET_ARCHITECTURE_IS_GPU) - target_link_options(${fq_build_target_name} PRIVATE -nostdlib -static) + if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) + target_link_options(${fq_build_target_name} PRIVATE + -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto + "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static + "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}") + elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + # We need to use the internal object versions for NVPTX. 
+ set(internal_suffix ".__internal__") + target_link_options(${fq_build_target_name} PRIVATE + -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static + "--cuda-path=${LIBC_CUDA_ROOT}") elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP) target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static) else() @@ -642,12 +663,12 @@ function(add_libc_hermetic_test test_name) target_link_libraries( ${fq_build_target_name} PRIVATE - libc.startup.${LIBC_TARGET_OS}.crt1 + libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix} ${link_libraries} LibcTest.hermetic LibcHermeticTestSupport.hermetic # The NVIDIA 'nvlink' linker does not currently support static libraries. - $<$>:${fq_target_name}.__libc__>) + $<$>:${fq_target_name}.__libc__>) add_dependencies(${fq_build_target_name} LibcTest.hermetic libc.test.UnitTest.ErrnoSetterMatcher @@ -660,7 +681,7 @@ function(add_libc_hermetic_test test_name) endif() set(test_cmd ${HERMETIC_TEST_ENV} - $<$:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS} + $<$:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS} $ ${HERMETIC_TEST_ARGS}) add_custom_target( ${fq_target_name} diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake index 2086175bae6c7..75beef86760c8 100644 --- a/libc/cmake/modules/prepare_libc_gpu_build.cmake +++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake @@ -1,23 +1,8 @@ -if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) message(FATAL_ERROR "libc build: Invalid attempt to set up GPU architectures.") endif() -# Set up the target architectures to build the GPU libc for. 
-set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906" - "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942" - "gfx1010;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034" - "gfx1035;gfx1036" - "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151") -set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62" - "sm_70;sm_72;sm_75;sm_80;sm_86;sm_89;sm_90") -set(all_gpu_architectures - "${all_amdgpu_architectures};${all_nvptx_architectures}") -set(LIBC_GPU_ARCHITECTURES "all" CACHE STRING - "List of GPU architectures to build the libc for.") -set(AMDGPU_TARGET_TRIPLE "amdgcn-amd-amdhsa") -set(NVPTX_TARGET_TRIPLE "nvptx64-nvidia-cuda") - # Ensure the compiler is a valid clang when building the GPU target. set(req_ver "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}") if(NOT (CMAKE_CXX_COMPILER_ID MATCHES "[Cc]lang" AND @@ -31,40 +16,6 @@ if(NOT LLVM_LIBC_FULL_BUILD) "GPU.") endif() -# Identify any locally installed AMD GPUs on the system using 'amdgpu-arch'. -find_program(LIBC_AMDGPU_ARCH - NAMES amdgpu-arch NO_DEFAULT_PATH - PATHS ${LLVM_BINARY_DIR}/bin /opt/rocm/llvm/bin/) - -# Identify any locally installed NVIDIA GPUs on the system using 'nvptx-arch'. -find_program(LIBC_NVPTX_ARCH - NAMES nvptx-arch NO_DEFAULT_PATH - PATHS ${LLVM_BINARY_DIR}/bin) - -# Get the list of all natively supported GPU architectures. 
-set(detected_gpu_architectures "") -foreach(arch_tool ${LIBC_NVPTX_ARCH} ${LIBC_AMDGPU_ARCH}) - if(arch_tool) - execute_process(COMMAND ${arch_tool} - OUTPUT_VARIABLE arch_tool_output - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - string(REPLACE "\n" ";" arch_list "${arch_tool_output}") - list(APPEND detected_gpu_architectures "${arch_list}") - endif() -endforeach() -list(REMOVE_DUPLICATES detected_gpu_architectures) - -if(LIBC_GPU_ARCHITECTURES STREQUAL "all") - set(LIBC_GPU_ARCHITECTURES ${all_gpu_architectures}) -elseif(LIBC_GPU_ARCHITECTURES STREQUAL "native") - if(NOT detected_gpu_architectures) - message(FATAL_ERROR "No GPUs found on the system when using 'native'") - endif() - set(LIBC_GPU_ARCHITECTURES ${detected_gpu_architectures}) -endif() -message(STATUS "Building libc for the following GPU architecture(s): " - "${LIBC_GPU_ARCHITECTURES}") - # Identify the program used to package multiple images into a single binary. find_program(LIBC_CLANG_OFFLOAD_PACKAGER NAMES clang-offload-packager NO_DEFAULT_PATH @@ -87,49 +38,54 @@ else() endif() set(LIBC_GPU_TEST_ARCHITECTURE "" CACHE STRING "Architecture for the GPU tests") +if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) + check_cxx_compiler_flag("-nogpulib -mcpu=native" PLATFORM_HAS_GPU) +elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + # Identify any locally installed NVIDIA GPUs on the system using 'nvptx-arch'. + # Using 'check_cxx_compiler_flag' does not work currently due to the link job. 
+ find_program(LIBC_NVPTX_ARCH + NAMES nvptx-arch NO_DEFAULT_PATH + PATHS ${LLVM_BINARY_DIR}/bin) + if(LIBC_NVPTX_ARCH) + execute_process(COMMAND ${LIBC_NVPTX_ARCH} + OUTPUT_VARIABLE arch_tool_output + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if(arch_tool_output MATCHES "^sm_[0-9]+") + set(PLATFORM_HAS_GPU TRUE) + endif() + endif() +endif() set(gpu_test_architecture "") if(LIBC_GPU_TEST_ARCHITECTURE) + set(LIBC_GPU_TESTS_DISABLED FALSE) set(gpu_test_architecture ${LIBC_GPU_TEST_ARCHITECTURE}) message(STATUS "Using user-specified GPU architecture for testing: " "'${gpu_test_architecture}'") -elseif(detected_gpu_architectures) - list(GET detected_gpu_architectures 0 gpu_test_architecture) +elseif(PLATFORM_HAS_GPU) + set(LIBC_GPU_TESTS_DISABLED FALSE) + set(gpu_test_architecture "native") message(STATUS "Using GPU architecture detected on the system for testing: " - "'${gpu_test_architecture}'") + "'native'") else() - list(LENGTH LIBC_GPU_ARCHITECTURES n_gpu_archs) - if (${n_gpu_archs} EQUAL 1) - set(gpu_test_architecture ${LIBC_GPU_ARCHITECTURES}) - message(STATUS "Using user-specified GPU architecture for testing: " - "'${gpu_test_architecture}'") - else() - message(STATUS "No GPU architecture set for testing. GPU tests will not be " - "availibe. 
Set 'LIBC_GPU_TEST_ARCHITECTURE' to override.") - return() - endif() + set(LIBC_GPU_TESTS_DISABLED TRUE) + message(STATUS "No GPU architecture detected or provided, tests will not be " + "built") endif() +set(LIBC_GPU_TARGET_ARCHITECTURE "${gpu_test_architecture}") -if("${gpu_test_architecture}" IN_LIST all_amdgpu_architectures) - set(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU TRUE) - set(LIBC_GPU_TARGET_TRIPLE ${AMDGPU_TARGET_TRIPLE}) - set(LIBC_GPU_TARGET_ARCHITECTURE "${gpu_test_architecture}") -elseif("${gpu_test_architecture}" IN_LIST all_nvptx_architectures) - set(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX TRUE) - set(LIBC_GPU_TARGET_TRIPLE ${NVPTX_TARGET_TRIPLE}) - set(LIBC_GPU_TARGET_ARCHITECTURE "${gpu_test_architecture}") -else() - message(FATAL_ERROR "Unknown GPU architecture '${gpu_test_architecture}'") -endif() +if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + # FIXME: This is a hack required to keep the CUDA package from trying to find + # pthreads. We only link the CUDA driver, so this is unneeded. + add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) -if(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) find_package(CUDAToolkit QUIET) if(CUDAToolkit_FOUND) get_filename_component(LIBC_CUDA_ROOT "${CUDAToolkit_BIN_DIR}" DIRECTORY ABSOLUTE) endif() endif() -if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU) +if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) # The AMDGPU environment uses different code objects to encode the ABI for # kernel calls and intrinsic functions. We want to specify this manually to # conform to whatever the test suite was built to handle. diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst index 71f5e7ba20393..79b9116c38ed2 100644 --- a/libc/docs/gpu/using.rst +++ b/libc/docs/gpu/using.rst @@ -14,25 +14,25 @@ Building the GPU library LLVM's libc GPU support *must* be built with an up-to-date ``clang`` compiler due to heavy reliance on ``clang``'s GPU support. This can be done automatically -using the ``LLVM_ENABLE_RUNTIMES=libc`` option. 
To enable libc for the GPU, -enable the ``LIBC_GPU_BUILD`` option. By default, ``libcgpu.a`` will be built -using every supported GPU architecture. To restrict the number of architectures -build, either set ``LIBC_GPU_ARCHITECTURES`` to the list of desired -architectures manually or use ``native`` to detect the GPUs on your system. A -typical ``cmake`` configuration will look like this: +using the LLVM runtimes support. The GPU build is done using cross-compilation +to the GPU architecture. This project currently supports AMD and NVIDIA GPUs +which can be targeted using the appropriate target name. The following +invocation will enable a cross-compiling build for the GPU architecture and +enable the ``libc`` project only for them. .. code-block:: sh $> cd llvm-project # The llvm-project checkout $> mkdir build $> cd build - $> cmake ../llvm -G Ninja \ - -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" \ - -DLLVM_ENABLE_RUNTIMES="libc;openmp" \ + $> cmake ../llvm -G Ninja \ + -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" \ + -DLLVM_ENABLE_RUNTIMES="openmp" \ -DCMAKE_BUILD_TYPE= \ # Select build type - -DLIBC_GPU_BUILD=ON \ # Build in GPU mode - -DLIBC_GPU_ARCHITECTURES=all \ # Build all supported architectures - -DCMAKE_INSTALL_PREFIX= \ # Where 'libcgpu.a' will live + -DCMAKE_INSTALL_PREFIX= \ # Where 'libcgpu.a' will live + -DRUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES=libc \ + -DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES=libc \ + -DLLVM_RUNTIME_TARGETS=default;amdgcn-amd-amdhsa;nvptx64-nvidia-cuda $> ninja install Since we want to include ``clang``, ``lld`` and ``compiler-rt`` in our @@ -40,13 +40,14 @@ toolchain, we list them in ``LLVM_ENABLE_PROJECTS``. To ensure ``libc`` is built using a compatible compiler and to support ``openmp`` offloading, we list them in ``LLVM_ENABLE_RUNTIMES`` to build them after the enabled projects using the newly built compiler. 
``CMAKE_INSTALL_PREFIX`` specifies the installation -directory in which to install the ``libcgpu.a`` library and headers along with -LLVM. The generated headers will be placed in ``include/gpu-none-llvm``. +directory in which to install the ``libcgpu-nvptx.a`` and ``libcgpu-amdgpu.a`` +libraries and headers along with LLVM. The generated headers will be placed in +``include/``. Usage ===== -Once the ``libcgpu.a`` static archive has been built it can be linked directly +Once the static archive has been built it can be linked directly with offloading applications as a standard library. This process is described in the `clang documentation `_. This linking mode is used by the OpenMP toolchain, but is currently opt-in for @@ -68,7 +69,7 @@ supported target device. The supported architectures can be seen using LLVM's OFFLOADING IMAGE [0]: kind llvm ir - arch gfx90a + arch generic triple amdgcn-amd-amdhsa producer none diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index dc3c9b8e6328a..9090b3bca01e0 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -4,7 +4,7 @@ set(LIBC_INCLUDE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) include(LLVMLibCHeaderRules) # The GPU build wants to install files in the compiler's resource directory. -if(LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(LIBC_TARGET_OS_IS_GPU) include(GetClangResourceDir) endif() @@ -586,7 +586,7 @@ add_gen_header( .llvm-libc-types.wchar_t ) -if(LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(LIBC_TARGET_OS_IS_GPU) file(MAKE_DIRECTORY ${LIBC_INCLUDE_DIR}/gpu) add_gen_header( @@ -638,7 +638,7 @@ foreach(target IN LISTS all_install_header_targets) # The GPU optionally provides the supported declarations externally so # offloading languages like CUDA and OpenMP know what is supported by libc. We # install these in the compiler's resource directory at a preset location. 
- if(LIBC_TARGET_ARCHITECTURE_IS_GPU AND PACKAGE_VERSION) + if(LIBC_TARGET_OS_IS_GPU AND PACKAGE_VERSION) get_target_property(decls_file ${target} DECLS_FILE_PATH) if(NOT decls_file) continue() diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt index c1a804232c1f5..615f4270646fb 100644 --- a/libc/lib/CMakeLists.txt +++ b/libc/lib/CMakeLists.txt @@ -2,11 +2,7 @@ set(libc_archive_targets "") set(libc_archive_names "") set(libc_archive_entrypoint_lists "") if(LLVM_LIBC_FULL_BUILD) - if(LIBC_TARGET_ARCHITECTURE_IS_GPU) - list(APPEND libc_archive_names cgpu mgpu) - else() - list(APPEND libc_archive_names c m) - endif() + list(APPEND libc_archive_names c m) list(APPEND libc_archive_targets libc libm) list(APPEND libc_archive_entrypoint_lists TARGET_LIBC_ENTRYPOINTS TARGET_LIBM_ENTRYPOINTS) @@ -40,6 +36,27 @@ foreach(archive IN ZIP_LISTS endif() endif() list(APPEND added_archive_targets ${archive_1}) + + # Add the offloading version of the library for offloading languages. These + # are installed in the standard search path separate from the other libraries. + if(LIBC_TARGET_OS_IS_GPU) + set(libc_gpu_archive_target ${archive_1}gpu) + set(libc_gpu_archive_name ${archive_0}gpu-${LIBC_TARGET_ARCHITECTURE}) + + add_gpu_entrypoint_library( + ${libc_gpu_archive_target} + DEPENDS + ${${archive_2}} + ) + set_target_properties( + ${libc_gpu_archive_target} + PROPERTIES + ARCHIVE_OUTPUT_NAME ${libc_gpu_archive_name} + ) + set_target_properties(${libc_gpu_archive_target} PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY ${LLVM_LIBRARY_OUTPUT_INTDIR}) + list(APPEND added_gpu_archive_targets ${libc_gpu_archive_target}) + endif() endforeach() install( @@ -48,6 +65,14 @@ install( COMPONENT libc ) +if(LIBC_TARGET_OS_IS_GPU) + install( + TARGETS ${added_gpu_archive_targets} + ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX} + COMPONENT libc + ) +endif() + if(NOT LIBC_TARGET_OS_IS_BAREMETAL) # For now we will disable libc-startup installation for baremetal. 
The # correct way to do it would be to make a hookable startup for baremetal diff --git a/libc/src/__support/File/CMakeLists.txt b/libc/src/__support/File/CMakeLists.txt index b3e4cc4b02779..b7c0612096aa9 100644 --- a/libc/src/__support/File/CMakeLists.txt +++ b/libc/src/__support/File/CMakeLists.txt @@ -1,5 +1,5 @@ if(NOT (TARGET libc.src.__support.threads.mutex) - OR LIBC_TARGET_ARCHITECTURE_IS_GPU) + OR LIBC_TARGET_OS_IS_GPU) # Not all platforms have a mutex implementation. If mutex is unvailable, # we just skip everything about files. return() diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt index 5a899215f4b6e..d7ebd3cab7abe 100644 --- a/libc/src/__support/GPU/CMakeLists.txt +++ b/libc/src/__support/GPU/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) return() endif() diff --git a/libc/src/__support/OSUtil/CMakeLists.txt b/libc/src/__support/OSUtil/CMakeLists.txt index c19677582643e..ca3b3bf1263e0 100644 --- a/libc/src/__support/OSUtil/CMakeLists.txt +++ b/libc/src/__support/OSUtil/CMakeLists.txt @@ -9,7 +9,7 @@ if(NOT TARGET ${target_os_util}) endif() # The OSUtil is an object library in GPU mode. 
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) add_header_library( osutil HDRS diff --git a/libc/src/__support/RPC/CMakeLists.txt b/libc/src/__support/RPC/CMakeLists.txt index b44a65b3732e9..183fc6f8683e0 100644 --- a/libc/src/__support/RPC/CMakeLists.txt +++ b/libc/src/__support/RPC/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) return() endif() diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 05ce51e8fc650..33dc1fc97c568 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -1,6 +1,9 @@ add_subdirectory(generic) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE}) add_subdirectory(${LIBC_TARGET_ARCHITECTURE}) +elseif(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) + # TODO: We should split this into 'nvptx' and 'amdgpu' for the GPU build. + add_subdirectory(${LIBC_TARGET_OS}) endif() function(add_math_entrypoint_object name) @@ -8,6 +11,7 @@ function(add_math_entrypoint_object name) # that first and return early if we are able to add an alias target for the # machine specific implementation. get_fq_target_name("${LIBC_TARGET_ARCHITECTURE}.${name}" fq_machine_specific_target_name) + get_fq_target_name("${LIBC_TARGET_OS}.${name}" fq_os_specific_target_name) if(TARGET ${fq_machine_specific_target_name}) add_entrypoint_object( ${name} @@ -16,17 +20,25 @@ function(add_math_entrypoint_object name) .${LIBC_TARGET_ARCHITECTURE}.${name} ) return() + elseif(TARGET ${fq_os_specific_target_name}) + add_entrypoint_object( + ${name} + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.${name} + ) + return() endif() # The GPU optionally depends on vendor libraries. If we emitted one of these # entrypoints it means the user requested it and we should use it instead. 
- get_fq_target_name("${LIBC_TARGET_ARCHITECTURE}.vendor.${name}" fq_vendor_specific_target_name) + get_fq_target_name("${LIBC_TARGET_OS}.vendor.${name}" fq_vendor_specific_target_name) if(TARGET ${fq_vendor_specific_target_name}) add_entrypoint_object( ${name} ALIAS DEPENDS - .${LIBC_TARGET_ARCHITECTURE}.vendor.${name} + .${LIBC_TARGET_OS}.vendor.${name} VENDOR ) return() diff --git a/libc/src/math/gpu/vendor/CMakeLists.txt b/libc/src/math/gpu/vendor/CMakeLists.txt index f699ca103b5f8..36087ade63bfc 100644 --- a/libc/src/math/gpu/vendor/CMakeLists.txt +++ b/libc/src/math/gpu/vendor/CMakeLists.txt @@ -10,7 +10,6 @@ else() "functions will be an external reference to the vendor libraries.") endif() -find_package(CUDAToolkit QUIET) if(CUDAToolkit_FOUND) set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc) if (EXISTS ${libdevice_path}) diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt index 380474ce27118..bb8e41606c5df 100644 --- a/libc/src/stdio/CMakeLists.txt +++ b/libc/src/stdio/CMakeLists.txt @@ -22,7 +22,7 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) endif() -if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/generic) endif() diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index a4d51fb9a11ee..ce08635df3145 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -316,7 +316,7 @@ if(LLVM_LIBC_INCLUDE_SCUDO) DEPENDS ${SCUDO_DEPS} ) -elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU) +elseif(LIBC_TARGET_OS_IS_GPU) add_entrypoint_external( calloc ) @@ -397,7 +397,7 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.abort ) -if(LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(LIBC_TARGET_OS_IS_GPU) add_entrypoint_object( malloc ALIAS diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt index 6daaf1998ea7b..1c893280e8a3c 100644 
--- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -501,7 +501,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) add_bcmp(bcmp_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512BW) add_bcmp(bcmp_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) add_bcmp(bcmp) -elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU) +elseif(LIBC_TARGET_OS_IS_GPU) add_bcmp(bcmp) else() add_bcmp(bcmp_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) @@ -530,7 +530,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) add_bzero(bzero_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F) add_bzero(bzero_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) add_bzero(bzero) -elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU) +elseif(LIBC_TARGET_OS_IS_GPU) add_bzero(bzero) else() add_bzero(bzero_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) @@ -562,7 +562,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) add_memcmp(memcmp_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) add_memcmp(memcmp) -elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU) +elseif(LIBC_TARGET_OS_IS_GPU) add_memcmp(memcmp) else() add_memcmp(memcmp_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) @@ -598,7 +598,7 @@ elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE} MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0") add_memcpy(memcpy MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0") -elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU) +elseif(LIBC_TARGET_OS_IS_GPU) add_memcpy(memcpy) else() add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) @@ -632,7 +632,7 @@ elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) add_memmove(memmove_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE} MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0") add_memmove(memmove MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0") -elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU) 
+elseif(LIBC_TARGET_OS_IS_GPU) add_memmove(memmove) else() add_memmove(memmove_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) @@ -667,7 +667,7 @@ elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64}) add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE} MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0") add_memset(memset MLLVM_COMPILE_OPTIONS "-tail-merge-threshold=0") -elseif(LIBC_TARGET_ARCHITECTURE_IS_GPU) +elseif(LIBC_TARGET_OS_IS_GPU) add_memset(memset) else() add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) diff --git a/libc/startup/gpu/CMakeLists.txt b/libc/startup/gpu/CMakeLists.txt index fa7f69f19520c..6f67fa9ff44f7 100644 --- a/libc/startup/gpu/CMakeLists.txt +++ b/libc/startup/gpu/CMakeLists.txt @@ -28,33 +28,24 @@ function(add_startup_object name) ) endfunction() -if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU) - add_subdirectory(amdgpu) - - add_startup_object( - crt1 - ALIAS - DEPENDS - .amdgpu.crt1 - ) -elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) - add_subdirectory(nvptx) - - add_startup_object( - crt1 - ALIAS - DEPENDS - .nvptx.crt1 - ) -else() - # Skip building the startup code if there are no supported GPUs. 
- message(STATUS "Skipping startup for gpu target, no GPUs were detected") - return() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE}) + add_subdirectory(${LIBC_TARGET_ARCHITECTURE}) endif() +add_startup_object( + crt1 + ALIAS + DEPENDS + .${LIBC_TARGET_ARCHITECTURE}.crt1 +) + add_custom_target(libc-startup) set(startup_components crt1) foreach(target IN LISTS startup_components) set(fq_target_name libc.startup.gpu.${target}) add_dependencies(libc-startup ${fq_target_name}) + install(FILES $ + DESTINATION ${LIBC_INSTALL_LIBRARY_DIR} + RENAME $ + COMPONENT libc) endforeach() diff --git a/libc/startup/gpu/amdgpu/CMakeLists.txt b/libc/startup/gpu/amdgpu/CMakeLists.txt index c9d0ee2fd0e9a..3ac104ee8ba94 100644 --- a/libc/startup/gpu/amdgpu/CMakeLists.txt +++ b/libc/startup/gpu/amdgpu/CMakeLists.txt @@ -1,6 +1,5 @@ add_startup_object( crt1 - NO_GPU_BUNDLE # Compile this file directly without special GPU handling. SRC start.cpp DEPENDS @@ -11,17 +10,5 @@ add_startup_object( COMPILE_OPTIONS -ffreestanding # To avoid compiler warnings about calling the main function. -fno-builtin - -mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION} # Manually set the ABI. ) get_fq_target_name(crt1 fq_name) - -# Ensure that clang uses the correct linker for this object type. -target_link_libraries( - ${fq_name} - PUBLIC - "-mcpu=${LIBC_GPU_TARGET_ARCHITECTURE}" - "--target=${LIBC_GPU_TARGET_TRIPLE}" - "-flto" - "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" - "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}" -) diff --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt index 23a54516cc982..3ac104ee8ba94 100644 --- a/libc/startup/gpu/nvptx/CMakeLists.txt +++ b/libc/startup/gpu/nvptx/CMakeLists.txt @@ -1,6 +1,5 @@ add_startup_object( crt1 - NO_GPU_BUNDLE # Compile this file directly without special GPU handling. 
SRC start.cpp DEPENDS @@ -13,11 +12,3 @@ add_startup_object( -fno-builtin ) get_fq_target_name(crt1 fq_name) - -# Ensure that clang uses the correct linker for this object type. -target_link_libraries(${fq_name} - PUBLIC - "-march=${LIBC_GPU_TARGET_ARCHITECTURE}" - "--target=${LIBC_GPU_TARGET_TRIPLE}" - "--cuda-path=${LIBC_CUDA_ROOT}" -) diff --git a/libc/test/CMakeLists.txt b/libc/test/CMakeLists.txt index f22f2b183aca9..745a9a04b4af8 100644 --- a/libc/test/CMakeLists.txt +++ b/libc/test/CMakeLists.txt @@ -8,9 +8,9 @@ add_custom_target(libc-long-running-tests) add_subdirectory(UnitTest) -if(LIBC_TARGET_ARCHITECTURE_IS_GPU AND - (NOT TARGET libc.utils.gpu.loader OR NOT TARGET libc.startup.gpu.crt1)) - message(WARNING "Cannot build libc GPU tests, missing loader implementation") +if(LIBC_TARGET_OS_IS_GPU AND + (NOT TARGET libc.utils.gpu.loader OR LIBC_GPU_TESTS_DISABLED)) + message(WARNING "Cannot build libc GPU tests, missing loader or architecture") return() endif() diff --git a/libc/test/IntegrationTest/CMakeLists.txt b/libc/test/IntegrationTest/CMakeLists.txt index dca4c5a6f1b14..4f31f10b29f0b 100644 --- a/libc/test/IntegrationTest/CMakeLists.txt +++ b/libc/test/IntegrationTest/CMakeLists.txt @@ -1,21 +1,5 @@ -if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU) - set(TEST_COMPILE_FLAGS - -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} - -emit-llvm # AMDGPU's intermediate object file format is bitcode. - --target=${LIBC_GPU_TARGET_TRIPLE} - -mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION} # Manually set the ABI. - ) -elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) - set(TEST_COMPILE_FLAGS - -march=${LIBC_GPU_TARGET_ARCHITECTURE} - --target=${LIBC_GPU_TARGET_TRIPLE} - --cuda-path=${LIBC_CUDA_ROOT} - ) -endif() - add_object_library( test - NO_GPU_BUNDLE # Compile this file directly without special GPU handling. 
SRCS test.cpp COMPILE_OPTIONS diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt index 4a615d4bd5e1c..4668f0061975f 100644 --- a/libc/test/UnitTest/CMakeLists.txt +++ b/libc/test/UnitTest/CMakeLists.txt @@ -12,7 +12,7 @@ function(add_unittest_framework_library name) endif() # The Nvidia 'nvlink' linker does not support static libraries. - if(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) + if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) set(library_type OBJECT) else() set(library_type STATIC) diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index 9801621e6b399..53fa1323d18b7 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -1,7 +1,7 @@ add_custom_target(libc-support-tests) # FIXME: These tests are currently broken on the GPU. -if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) add_libc_test( blockstore_test SUITE @@ -76,7 +76,7 @@ add_libc_test( ) # The GPU does not support varargs currently. -if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) add_libc_test( arg_list_test SUITE @@ -88,8 +88,7 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) ) endif() -# FIXME: Crash in NVPTX target lowering for calls -if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_libc_test( uint_test SUITE @@ -159,29 +158,33 @@ add_libc_test( libc.src.__support.memory_size ) -add_executable( - libc_str_to_float_comparison_test - str_to_float_comparison_test.cpp -) +# FIXME: We shouldn't have regular executables created because we could be +# cross-compiling the tests and running through an emulator. 
+if(NOT LIBC_TARGET_OS_IS_GPU) + add_executable( + libc_str_to_float_comparison_test + str_to_float_comparison_test.cpp + ) -target_link_libraries(libc_str_to_float_comparison_test - PRIVATE - "${LIBC_TARGET}" -) + target_link_libraries(libc_str_to_float_comparison_test + PRIVATE + "${LIBC_TARGET}" + ) -add_executable( - libc_system_str_to_float_comparison_test - str_to_float_comparison_test.cpp -) + add_executable( + libc_system_str_to_float_comparison_test + str_to_float_comparison_test.cpp + ) -set(float_test_file ${CMAKE_CURRENT_SOURCE_DIR}/str_to_float_comparison_data.txt) + set(float_test_file ${CMAKE_CURRENT_SOURCE_DIR}/str_to_float_comparison_data.txt) -add_custom_command(TARGET libc_str_to_float_comparison_test - POST_BUILD - COMMAND $ ${float_test_file} - DEPENDS ${float_test_file} - COMMENT "Test the strtof and strtod implementations against precomputed results." - VERBATIM) + add_custom_command(TARGET libc_str_to_float_comparison_test + POST_BUILD + COMMAND $ ${float_test_file} + DEPENDS ${float_test_file} + COMMENT "Test the strtof and strtod implementations against precomputed results." 
+ VERBATIM) +endif() add_subdirectory(CPP) add_subdirectory(File) diff --git a/libc/test/src/__support/CPP/CMakeLists.txt b/libc/test/src/__support/CPP/CMakeLists.txt index 6927579289bc2..d7f332f5b0fbd 100644 --- a/libc/test/src/__support/CPP/CMakeLists.txt +++ b/libc/test/src/__support/CPP/CMakeLists.txt @@ -64,7 +64,7 @@ add_libc_test( # This test fails with invalid address space operations on sm_60 -if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_libc_test( atomic_test SUITE diff --git a/libc/test/src/__support/File/CMakeLists.txt b/libc/test/src/__support/File/CMakeLists.txt index f193480c60c2b..9191469b4927c 100644 --- a/libc/test/src/__support/File/CMakeLists.txt +++ b/libc/test/src/__support/File/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT (TARGET libc.src.__support.threads.mutex) OR LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT (TARGET libc.src.__support.threads.mutex) OR LIBC_TARGET_OS_IS_GPU) # Not all platforms have a mutex implementation. If mutex is unvailable, # we just skip everything about files. The GPU does not currently support # files as well. diff --git a/libc/test/src/errno/CMakeLists.txt b/libc/test/src/errno/CMakeLists.txt index 633d46a1f5f88..b73962fb4de4d 100644 --- a/libc/test/src/errno/CMakeLists.txt +++ b/libc/test/src/errno/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LLVM_LIBC_FULL_BUILD OR LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LLVM_LIBC_FULL_BUILD OR LIBC_TARGET_OS_IS_GPU) return() endif() diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 8c105515e3525..81d2e1e55b552 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -1,10 +1,14 @@ add_custom_target(libc-math-unittests) -add_library( - libc_math_test_utils - RandUtils.cpp - RandUtils.h -) +# FIXME: We shouldn't have regular libraries created because we could be +# cross-compiling the tests and running through an emulator. 
+if(NOT LIBC_TARGET_OS_IS_GPU) + add_library( + libc_math_test_utils + RandUtils.cpp + RandUtils.h + ) +endif() add_fp_unittest( cosf_test @@ -755,7 +759,7 @@ add_fp_unittest( ) # FIXME: These tests are currently broken for NVPTX. -if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_fp_unittest( ilogb_test SUITE @@ -986,7 +990,7 @@ add_fp_unittest( ) # FIXME: These tests are currently broken on the GPU. -if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) add_fp_unittest( fminf_test SUITE @@ -1231,7 +1235,7 @@ add_fp_unittest( ) # FIXME: These tests are currently spurious for NVPTX. -if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_fp_unittest( nextafter_test SUITE diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 1824c672cb974..2d24b5a76b013 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -819,7 +819,7 @@ add_fp_unittest( ) # FIXME: These tests are currently broken for NVPTX. -if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_fp_unittest( ilogb_test SUITE @@ -1073,7 +1073,7 @@ add_fp_unittest( ) # FIXME: These tests are currently broken on the GPU. -if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) add_fp_unittest( fminf_test SUITE @@ -1417,7 +1417,7 @@ add_fp_unittest( ) # FIXME: These tests are currently spurious for NVPTX. -if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_fp_unittest( nextafter_test SUITE @@ -1465,7 +1465,7 @@ add_fp_unittest( ) # FIXME: These tests are currently spurious for the GPU. 
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(NOT LIBC_TARGET_OS_IS_GPU) add_fp_unittest( nexttoward_test SUITE diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index 8db2293ab74a9..93c21aa994ef4 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -430,7 +430,7 @@ add_libc_test( # Create an output directory for any temporary test files. file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/testdata) -if(LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(LIBC_TARGET_OS_IS_GPU) return() endif() diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index da07dbbe79772..5826cfe8d4ca3 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -55,7 +55,7 @@ add_libc_test( ) # This fails on NVPTX where the output value is one-off of the expected value. -if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_fp_unittest( strtod_test SUITE @@ -127,7 +127,7 @@ add_libc_test( ) # This fails on NVPTX where the output value is one-off of the expected value. -if(NOT LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_libc_test( strtold_test SUITE @@ -339,7 +339,7 @@ if(LLVM_LIBC_FULL_BUILD) ) # Only the GPU has an in-tree 'malloc' implementation. 
- if(LIBC_TARGET_ARCHITECTURE_IS_GPU) + if(LIBC_TARGET_OS_IS_GPU) add_libc_test( malloc_test HERMETIC_TEST_ONLY diff --git a/libc/test/utils/UnitTest/CMakeLists.txt b/libc/test/utils/UnitTest/CMakeLists.txt index 6f61e0ffefb00..3b917e06cde21 100644 --- a/libc/test/utils/UnitTest/CMakeLists.txt +++ b/libc/test/utils/UnitTest/CMakeLists.txt @@ -1,4 +1,4 @@ -if(LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(LIBC_TARGET_OS_IS_GPU) return() endif() diff --git a/libc/utils/CMakeLists.txt b/libc/utils/CMakeLists.txt index 9754dcf3854aa..7bf02a4af7dea 100644 --- a/libc/utils/CMakeLists.txt +++ b/libc/utils/CMakeLists.txt @@ -1,6 +1,6 @@ if(LLVM_INCLUDE_TESTS) add_subdirectory(MPFRWrapper) endif() -if(LIBC_TARGET_ARCHITECTURE_IS_GPU) +if(LIBC_TARGET_OS_IS_GPU) add_subdirectory(gpu) endif() diff --git a/libc/utils/MPFRWrapper/CMakeLists.txt b/libc/utils/MPFRWrapper/CMakeLists.txt index adc073c9a91f5..6f44ca0d786c8 100644 --- a/libc/utils/MPFRWrapper/CMakeLists.txt +++ b/libc/utils/MPFRWrapper/CMakeLists.txt @@ -24,6 +24,6 @@ if(LIBC_TESTS_CAN_USE_MPFR) target_link_directories(libcMPFRWrapper PUBLIC ${LLVM_LIBC_MPFR_INSTALL_PATH}/lib) endif() target_link_libraries(libcMPFRWrapper PUBLIC LibcFPTestHelpers.unit LibcTest.unit mpfr gmp) -elseif(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU) +elseif(NOT LIBC_TARGET_OS_IS_GPU) message(WARNING "Math tests using MPFR will be skipped.") endif() diff --git a/libc/utils/gpu/CMakeLists.txt b/libc/utils/gpu/CMakeLists.txt index 7c15f36052cf3..4d1ebcfb9f8e6 100644 --- a/libc/utils/gpu/CMakeLists.txt +++ b/libc/utils/gpu/CMakeLists.txt @@ -1,2 +1,4 @@ add_subdirectory(server) -add_subdirectory(loader) +if(LIBC_TARGET_OS_IS_GPU) + add_subdirectory(loader) +endif() diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt index f195b887c9af6..189460bb02e6e 100644 --- a/libc/utils/gpu/loader/CMakeLists.txt +++ b/libc/utils/gpu/loader/CMakeLists.txt @@ -1,31 +1,30 @@ add_library(gpu_loader OBJECT Main.cpp) + 
target_include_directories(gpu_loader PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include ${LIBC_SOURCE_DIR} ) +# This utility needs to be compiled for the host system when cross compiling. +if(LLVM_RUNTIMES_TARGET OR LIBC_TARGET_TRIPLE) + target_compile_options(gpu_loader PUBLIC --target=${LLVM_HOST_TRIPLE}) + target_link_libraries(gpu_loader PUBLIC "--target=${LLVM_HOST_TRIPLE}") +endif() + find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm) -if(hsa-runtime64_FOUND) +if(hsa-runtime64_FOUND AND LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) add_subdirectory(amdgpu) -else() +elseif(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) message(STATUS "Skipping HSA loader for gpu target, no HSA was detected") endif() -find_package(CUDAToolkit QUIET) # The CUDA loader requires LLVM to traverse the ELF image for symbols. find_package(LLVM QUIET) -if(CUDAToolkit_FOUND AND LLVM_FOUND AND - "${CUDAToolkit_VERSION}" VERSION_GREATER_EQUAL "11.2") +if(CUDAToolkit_FOUND AND LLVM_FOUND AND LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_subdirectory(nvptx) -else() - if("${CUDAToolkit_VERSION}" VERSION_LESS "11.2") - message(WARNING - "Skipping CUDA loader for gpu target, CUDA must be version 11.2 or later. - Found CUDA Version ${CUDAToolkit_VERSION}") - else() - message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected") - endif() +elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected") endif() # Add a custom target to be used for testing. 
@@ -37,20 +36,31 @@ if(LIBC_GPU_LOADER_EXECUTABLE) PROPERTIES EXECUTABLE "${LIBC_GPU_LOADER_EXECUTABLE}" ) -elseif(TARGET amdhsa_loader AND LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU) +elseif(TARGET amdhsa-loader AND LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) add_custom_target(libc.utils.gpu.loader) - add_dependencies(libc.utils.gpu.loader amdhsa_loader) + add_dependencies(libc.utils.gpu.loader amdhsa-loader) set_target_properties( libc.utils.gpu.loader PROPERTIES - EXECUTABLE "$" + TARGET amdhsa-loader + EXECUTABLE "$" ) -elseif(TARGET nvptx_loader AND LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX) +elseif(TARGET nvptx-loader AND LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_custom_target(libc.utils.gpu.loader) - add_dependencies(libc.utils.gpu.loader nvptx_loader) + add_dependencies(libc.utils.gpu.loader nvptx-loader) set_target_properties( libc.utils.gpu.loader PROPERTIES - EXECUTABLE "$" + TARGET nvptx-loader + EXECUTABLE "$" ) endif() + +if(TARGET libc.utils.gpu.loader) + get_target_property(gpu_loader_tgt libc.utils.gpu.loader "TARGET") + if(gpu_loader_tgt) + install(TARGETS ${gpu_loader_tgt} + DESTINATION ${CMAKE_INSTALL_BINDIR} + COMPONENT libc) + endif() +endif() diff --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt index 8e9c9a2bdc7d2..b99319f504011 100644 --- a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt +++ b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt @@ -1,7 +1,7 @@ -add_executable(amdhsa_loader Loader.cpp) -add_dependencies(amdhsa_loader libc.src.__support.RPC.rpc) +add_executable(amdhsa-loader Loader.cpp) +add_dependencies(amdhsa-loader libc.src.__support.RPC.rpc) -target_link_libraries(amdhsa_loader +target_link_libraries(amdhsa-loader PRIVATE hsa-runtime64::hsa-runtime64 gpu_loader diff --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt index 0c76c49fa3098..e76362a1e8cca 100644 --- a/libc/utils/gpu/loader/nvptx/CMakeLists.txt +++ b/libc/utils/gpu/loader/nvptx/CMakeLists.txt @@ 
-1,11 +1,11 @@ -add_executable(nvptx_loader Loader.cpp) -add_dependencies(nvptx_loader libc.src.__support.RPC.rpc) +add_executable(nvptx-loader Loader.cpp) +add_dependencies(nvptx-loader libc.src.__support.RPC.rpc) if(NOT LLVM_ENABLE_RTTI) - target_compile_options(nvptx_loader PRIVATE -fno-rtti) + target_compile_options(nvptx-loader PRIVATE -fno-rtti) endif() -target_include_directories(nvptx_loader PRIVATE ${LLVM_INCLUDE_DIRS}) -target_link_libraries(nvptx_loader +target_include_directories(nvptx-loader PRIVATE ${LLVM_INCLUDE_DIRS}) +target_link_libraries(nvptx-loader PRIVATE gpu_loader llvmlibc_rpc_server diff --git a/libc/utils/gpu/server/CMakeLists.txt b/libc/utils/gpu/server/CMakeLists.txt index 3d9b2bcab4dbc..94cdfe5bf6521 100644 --- a/libc/utils/gpu/server/CMakeLists.txt +++ b/libc/utils/gpu/server/CMakeLists.txt @@ -5,12 +5,21 @@ target_include_directories(llvmlibc_rpc_server PRIVATE ${LIBC_SOURCE_DIR}) target_include_directories(llvmlibc_rpc_server PUBLIC ${LIBC_SOURCE_DIR}/include) target_include_directories(llvmlibc_rpc_server PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + # Ignore unsupported clang attributes if we're using GCC. target_compile_options(llvmlibc_rpc_server PUBLIC $<$:-Wno-attributes>) target_compile_definitions(llvmlibc_rpc_server PUBLIC LIBC_NAMESPACE=${LIBC_NAMESPACE}) +# This utility needs to be compiled for the host system when cross compiling. +if(LLVM_RUNTIMES_TARGET OR LIBC_TARGET_TRIPLE) + target_compile_options(llvmlibc_rpc_server PUBLIC + --target=${LLVM_HOST_TRIPLE}) + target_link_libraries(llvmlibc_rpc_server PUBLIC + "--target=${LLVM_HOST_TRIPLE}") +endif() + # Install the server and associated header. 
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/rpc_server.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gpu-none-llvm/ diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index dbd5fbf226bd5..f5f7d3f3253fd 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -175,7 +175,9 @@ else() foreach(_name ${LLVM_RUNTIME_TARGETS}) if("libc" IN_LIST RUNTIMES_${_name}_LLVM_ENABLE_RUNTIMES) set(NEED_LIBC_HDRGEN TRUE) - break() + if("${_name}" STREQUAL "amdgcn-amd-amdhsa" OR "${_name}" STREQUAL "nvptx64-nvidia-cuda") + set(LLVM_LIBC_GPU_BUILD ON) + endif() endif() endforeach() endif() diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 486df22c2c1bb..4257083e53ad4 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -120,6 +120,13 @@ if( LLVM_ENABLE_ASSERTIONS ) endif() endif() +# If we are targeting a GPU architecture we want to ignore all the standard +# flag handling. +if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn" OR + "${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^nvptx64") + return() +endif() + if(LLVM_ENABLE_EXPENSIVE_CHECKS) add_compile_definitions(EXPENSIVE_CHECKS) diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index 8c48d85a4346f..9b5e758b6ede5 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -199,7 +199,7 @@ foreach(entry ${runtimes}) list(APPEND prefixes "LLVM_LIBC") list(APPEND prefixes "LIBC_") # The `libc` project may require '-DCUDAToolkit_ROOT' in GPU mode. 
- if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES) + if(LLVM_LIBC_GPU_BUILD) list(APPEND prefixes "CUDA") endif() endif() @@ -424,7 +424,7 @@ if(runtimes) endforeach() endif() if("libc" IN_LIST LLVM_ENABLE_PROJECTS AND - (LLVM_LIBC_FULL_BUILD OR LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES)) + (LLVM_LIBC_FULL_BUILD OR LLVM_LIBC_GPU_BUILD)) if(LIBC_HDRGEN_EXE) set(hdrgen_exe ${LIBC_HDRGEN_EXE}) else() @@ -441,7 +441,12 @@ if(runtimes) set(libc_cmake_args "-DLIBC_HDRGEN_EXE=${hdrgen_exe}" "-DLLVM_LIBC_FULL_BUILD=ON") list(APPEND extra_deps ${hdrgen_deps}) - if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES) + if(LLVM_LIBC_GPU_BUILD) + list(APPEND libc_cmake_args "-DLLVM_LIBC_GPU_BUILD=ON") + # The `libc` project may require '-DCUDAToolkit_ROOT' in GPU mode. + if(CUDAToolkit_ROOT) + list(APPEND libc_cmake_args "-DCUDAToolkit_ROOT=${CUDAToolkit_ROOT}") + endif() foreach(dep clang-offload-packager nvptx-arch amdgpu-arch) if(TARGET ${dep}) list(APPEND extra_deps ${dep}) diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt index 17e61d0bc47dc..a74eff0c0bebf 100644 --- a/openmp/libomptarget/CMakeLists.txt +++ b/openmp/libomptarget/CMakeLists.txt @@ -119,14 +119,7 @@ endif() pythonize_bool(LIBOMPTARGET_OMPT_SUPPORT) -# Check if this build supports the GPU libc. 
-set(LIBC_GPU_SUPPORT FALSE) -if("libc" IN_LIST LLVM_ENABLE_RUNTIMES AND (LIBC_GPU_BUILD OR - LIBC_GPU_ARCHITECTURES)) - set(LIBC_GPU_SUPPORT TRUE) -endif() - -set(LIBOMPTARGET_GPU_LIBC_SUPPORT ${LIBC_GPU_SUPPORT} CACHE BOOL +set(LIBOMPTARGET_GPU_LIBC_SUPPORT ${LLVM_LIBC_GPU_BUILD} CACHE BOOL "Libomptarget support for the GPU libc") pythonize_bool(LIBOMPTARGET_GPU_LIBC_SUPPORT) diff --git a/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt index 8ae3ff2a6d291..085d443071650 100644 --- a/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt +++ b/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt @@ -73,8 +73,12 @@ elseif(${LIBOMPTARGET_GPU_LIBC_SUPPORT}) find_library(llvmlibc_rpc_server NAMES llvmlibc_rpc_server PATHS ${LIBOMPTARGET_LLVM_LIBRARY_DIR} NO_DEFAULT_PATH) if(llvmlibc_rpc_server) - target_link_libraries(PluginCommon PRIVATE llvmlibc_rpc_server) + target_link_libraries(PluginCommon PRIVATE ${llvmlibc_rpc_server}) target_compile_definitions(PluginCommon PRIVATE LIBOMPTARGET_RPC_SUPPORT) + # We may need to get the headers directly from the 'libc' source directory. + target_include_directories(PluginCommon PRIVATE + ${CMAKE_SOURCE_DIR}/../libc/utils/gpu/server + ${CMAKE_SOURCE_DIR}/../libc/include) endif() endif() diff --git a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp index 54aced11b31c3..cb6a5086bc4dd 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp @@ -18,7 +18,8 @@ #if __has_include() #include #elif defined(LIBOMPTARGET_RPC_SUPPORT) -#include +// Just pull this out of the source if available. 
+#include "rpc_server.h" #endif using namespace llvm; diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg index 565556e64ff29..6c590603079c4 100644 --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -180,8 +180,12 @@ def remove_suffix_if_present(name): def add_libraries(source): if config.libomptarget_has_libc: - return source + " " + config.llvm_library_dir + "/libcgpu.a " + \ - config.llvm_library_intdir + "/libomptarget.devicertl.a" + if config.libomptarget_current_target.startswith('nvptx'): + return source + " " + config.llvm_library_dir + "/libcgpu-nvptx.a " + \ + config.llvm_library_intdir + "/libomptarget.devicertl.a" + elif config.libomptarget_current_target.startswith('amdgcn'): + return source + " " + config.llvm_library_dir + "/libcgpu-amdgpu.a " + \ + config.llvm_library_intdir + "/libomptarget.devicertl.a" return source + " " + config.llvm_library_intdir + "/libomptarget.devicertl.a" # substitutions From 3ed4b95bcf2039e7293f45e3b3fdf26b81dc319f Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 15:37:33 -0600 Subject: [PATCH 097/546] [Flang] Fix test not updated after 'clang' case change Summary: The shared 'clang' code changed this slightly but did not update the flang test. --- flang/test/Driver/omp-driver-offload.f90 | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/flang/test/Driver/omp-driver-offload.f90 b/flang/test/Driver/omp-driver-offload.f90 index b45ed70195fb4..23c2a121a5afa 100644 --- a/flang/test/Driver/omp-driver-offload.f90 +++ b/flang/test/Driver/omp-driver-offload.f90 @@ -172,13 +172,25 @@ ! Check that `-gpulibc` includes the LLVM C libraries for the GPU. ! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp \ -! RUN: --offload-arch=gfx90a --offload-arch=sm_52 \ +! RUN: --offload-arch=sm_52 \ ! RUN: -gpulibc %s 2>&1 \ -! RUN: | FileCheck --check-prefix=LIBC-GPU %s -! LIBC-GPU: "-lcgpu"{{.*}}"-lmgpu" +! 
RUN: | FileCheck --check-prefix=LIBC-GPU-NVPTX %s +! LIBC-GPU-NVPTX: "-lcgpu-nvptx"{{.*}}"-lmgpu-nvptx" ! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp \ -! RUN: --offload-arch=gfx90a --offload-arch=sm_52 \ +! RUN: --offload-arch=sm_52 \ ! RUN: -nogpulibc %s 2>&1 \ -! RUN: | FileCheck --check-prefix=NO-LIBC-GPU %s -! NO-LIBC-GPU-NOT: "-lcgpu"{{.*}}"-lmgpu" +! RUN: | FileCheck --check-prefix=NO-LIBC-GPU-NVPTX %s +! NO-LIBC-GPU-NVPTX-NOT: "-lcgpu-nvptx"{{.*}}"-lmgpu-nvptx" + +! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp \ +! RUN: --offload-arch=gfx90a \ +! RUN: -gpulibc %s 2>&1 \ +! RUN: | FileCheck --check-prefix=LIBC-GPU-AMDGPU %s +! LIBC-GPU-AMDGPU: "-lcgpu-amdgpu"{{.*}}"-lmgpu-amdgpu" + +! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp \ +! RUN: --offload-arch=gfx90a \ +! RUN: -nogpulibc %s 2>&1 \ +! RUN: | FileCheck --check-prefix=NO-LIBC-GPU-AMDGPU %s +! NO-LIBC-GPU-AMDGPU-NOT: "-lcgpu-amdgpu"{{.*}}"-lmgpu-amdgpu" From 72763521c34287bce68402eb2a9d71dcb4eed5a0 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 22 Feb 2024 22:48:47 +0100 Subject: [PATCH 098/546] [LSR] Clear SCEVExpander before calling DeleteDeadPHIs To avoid an assertion failure when an AssertingVH is removed, as reported in: https://github.com/llvm/llvm-project/pull/82362#issuecomment-1960067147 Also remove an unnecessary use of SCEVExpanderCleaner. 
--- .../Transforms/Scalar/LoopStrengthReduce.cpp | 4 +- .../RISCV/term-fold-crash.ll | 43 +++++++++++++++++++ 2 files changed, 44 insertions(+), 3 deletions(-) create mode 100644 llvm/test/Transforms/LoopStrengthReduce/RISCV/term-fold-crash.ll diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 627c863f7091f..08021f3ba853e 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -7033,7 +7033,6 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, // SCEVExpander for both use in preheader and latch const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); SCEVExpander Expander(SE, DL, "lsr_fold_term_cond"); - SCEVExpanderCleaner ExpCleaner(Expander); assert(Expander.isSafeToExpand(TermValueS) && "Terminating value was checked safe in canFoldTerminatingCondition"); @@ -7064,10 +7063,9 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, BI->setCondition(NewTermCond); + Expander.clear(); OldTermCond->eraseFromParent(); DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get()); - - ExpCleaner.markResultUsed(); } } diff --git a/llvm/test/Transforms/LoopStrengthReduce/RISCV/term-fold-crash.ll b/llvm/test/Transforms/LoopStrengthReduce/RISCV/term-fold-crash.ll new file mode 100644 index 0000000000000..8ca7f0010bbbe --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/RISCV/term-fold-crash.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=loop-reduce -mtriple=riscv64-unknown-linux-gnu < %s | FileCheck %s + +define void @test(ptr %p, i8 %arg, i32 %start) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[P:%.*]], i8 [[ARG:%.*]], i32 [[START:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[ARG]] to i32 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[CONV]], 1 +; CHECK-NEXT: 
[[TMP0:%.*]] = add i32 [[START]], [[SHR]] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 1 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[ADD810:%.*]] = phi i32 [ [[START]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[ADD810]] to i64 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[P]], i64 [[IDXPROM2]] +; CHECK-NEXT: [[V:%.*]] = load i8, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[ADD]] = add i32 [[ADD810]], 1 +; CHECK-NEXT: [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq i32 [[ADD]], [[TMP1]] +; CHECK-NEXT: br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %conv = zext i8 %arg to i32 + %shr = lshr i32 %conv, 1 + %wide.trip.count = zext nneg i32 %shr to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %add810 = phi i32 [ %start, %entry ], [ %add, %for.body ] + %idxprom2 = zext i32 %add810 to i64 + %arrayidx3 = getelementptr i8, ptr %p, i64 %idxprom2 + %v = load i8, ptr %arrayidx3, align 1 + %add = add i32 %add810, 1 + %indvars.iv.next = add i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv, %wide.trip.count + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} From d4bfca3b2e673789f7c278d46a199ae8910ddd37 Mon Sep 17 00:00:00 2001 From: Wentao Zhang <35722712+whentojump@users.noreply.github.com> Date: Thu, 22 Feb 2024 16:04:25 -0600 Subject: [PATCH 099/546] [clang][CodeGen] Keep processing the rest of AST after encountering unsupported MC/DC expressions (#82464) Currently, upon seeing unsupported decisions (more than 6 conditions, or split nesting), the post-visitor hook dataTraverseStmtPost() returns a false. As a result, in the rest of tree even supported decisions will be skipped as well. 
Like in the below code: { // CompoundStmt a && b; // 1: BinaryOperator (supported) a && foo(b && c); // 2: BinaryOperator (not yet supported due to split // nesting) a && b; // 3: BinaryOperator (supported) } Decision 3 will not be processed at all. And only one "Decision" region will be emitted. Compiler explorer example: https://godbolt.org/z/Px61sesoo We hope to process such cases and emit two "Decision" regions (1 and 3) in the above example. --- clang/lib/CodeGen/CodeGenPGO.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index 48c5e68a3b7ba..1ef7be3c72593 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -239,9 +239,12 @@ struct MapRegionCounters : public RecursiveASTVisitor { if (MCDCMaxCond == 0) return true; - /// At the top of the logical operator nest, reset the number of conditions. - if (LogOpStack.empty()) + /// At the top of the logical operator nest, reset the number of conditions, + /// also forget previously seen split nesting cases. + if (LogOpStack.empty()) { NumCond = 0; + SplitNestedLogicalOp = false; + } if (const Expr *E = dyn_cast(S)) { const BinaryOperator *BinOp = dyn_cast(E->IgnoreParens()); @@ -292,7 +295,7 @@ struct MapRegionCounters : public RecursiveASTVisitor { "contains an operation with a nested boolean expression. " "Expression will not be covered"); Diag.Report(S->getBeginLoc(), DiagID); - return false; + return true; } /// Was the maximum number of conditions encountered? @@ -303,7 +306,7 @@ struct MapRegionCounters : public RecursiveASTVisitor { "number of conditions (%0) exceeds max (%1). 
" "Expression will not be covered"); Diag.Report(S->getBeginLoc(), DiagID) << NumCond << MCDCMaxCond; - return false; + return true; } // Otherwise, allocate the number of bytes required for the bitmap From ae3e14276b7181ae51e9ef731f44f813a1a3f123 Mon Sep 17 00:00:00 2001 From: Diego Caballero Date: Thu, 22 Feb 2024 22:04:17 +0000 Subject: [PATCH 100/546] Fix test/Dialect/Vector/vector-transfer-flatten.mlir --- mlir/test/Dialect/Vector/vector-transfer-flatten.mlir | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir index 3b6441d0c9560..2766e782a3fb2 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir @@ -475,6 +475,8 @@ func.func @regression_non_contiguous_dim_read(%subview : memref<1x3x3x2xf32, str // CHECK: %[[COLLAPSE:.+]] = memref.collapse_shape %{{.*}} {{\[}}[0], [1], [2, 3]] : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>> into memref<1x3x6xf32, strided<[40, 10, 1], offset: ?>> // CHECK: %[[APPLY:.*]] = affine.apply #[[$MAP]]() +// CHECK-128B-LABEL: func @regression_non_contiguous_dim_read( + // ----- func.func @unsupported_non_contiguous_dim_write(%value : vector<2x2xf32>, @@ -487,3 +489,5 @@ func.func @unsupported_non_contiguous_dim_write(%value : vector<2x2xf32>, // CHECK-LABEL: func.func @unsupported_non_contiguous_dim_write( // CHECK-NOT: memref.collapse_shape + +// CHECK-128B-LABEL: func @unsupported_non_contiguous_dim_write( From e2f08268304dc972440391c43bf1d47e28fad93e Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 22 Feb 2024 14:11:10 -0800 Subject: [PATCH 101/546] [MLIR] Fix LLVM dialect specification to use AnySignlessInteger instead of AnyInteger (#82694) LLVM IR does not support signed integer, the LLVM dialect was underspecified (likely unintentionally) and the AnyInteger constraint was overly lax. 
The arithmetic dialect is already consistently using AnySignlessInteger. --- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 46 ++++++++++----------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index d9b130bdf18cb..3da5deeb4ec7e 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -49,7 +49,7 @@ class LLVM_ArithmeticOpBase traits = []> : - LLVM_ArithmeticOpBase { + LLVM_ArithmeticOpBase { let arguments = commonArgs; string mlirBuilder = [{ $res = $_builder.create<$_qualCppClassName>($_location, $lhs, $rhs); @@ -57,7 +57,7 @@ class LLVM_IntArithmeticOp traits = []> : - LLVM_ArithmeticOpBase], traits)> { dag iofArg = ( ins DefaultValuedAttr:$overflowFlags); @@ -143,9 +143,9 @@ class LLVM_ArithmeticCmpOp traits = []> : // Other integer operations. def LLVM_ICmpOp : LLVM_ArithmeticCmpOp<"icmp", [Pure]> { let arguments = (ins ICmpPredicate:$predicate, - AnyTypeOf<[LLVM_ScalarOrVectorOf, + AnyTypeOf<[LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf]>:$lhs, - AnyTypeOf<[LLVM_ScalarOrVectorOf, + AnyTypeOf<[LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf]>:$rhs); let hasCustomAssemblyFormat = 1; string llvmInstName = "ICmp"; @@ -204,7 +204,7 @@ def LLVM_AllocaOp : LLVM_Op<"alloca", DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]>, LLVM_MemOpPatterns { - let arguments = (ins AnyInteger:$arraySize, + let arguments = (ins AnySignlessInteger:$arraySize, OptionalAttr:$alignment, TypeAttr:$elem_type, UnitAttr:$inalloca); @@ -250,7 +250,7 @@ def LLVM_GEPOp : LLVM_Op<"getelementptr", [Pure, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> { let arguments = (ins LLVM_ScalarOrVectorOf:$base, - Variadic>:$dynamicIndices, + Variadic>:$dynamicIndices, DenseI32ArrayAttr:$rawConstantIndices, TypeAttr:$elem_type, UnitAttr:$inbounds); @@ -499,37 +499,37 @@ def LLVM_AddrSpaceCastOp : LLVM_CastOp<"addrspacecast", 
"AddrSpaceCast", let hasFolder = 1; } def LLVM_IntToPtrOp : LLVM_CastOp<"inttoptr", "IntToPtr", - LLVM_ScalarOrVectorOf, + LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf>; def LLVM_PtrToIntOp : LLVM_CastOp<"ptrtoint", "PtrToInt", LLVM_ScalarOrVectorOf, - LLVM_ScalarOrVectorOf>; + LLVM_ScalarOrVectorOf>; def LLVM_SExtOp : LLVM_CastOp<"sext", "SExt", - LLVM_ScalarOrVectorOf, - LLVM_ScalarOrVectorOf> { + LLVM_ScalarOrVectorOf, + LLVM_ScalarOrVectorOf> { let hasVerifier = 1; } def LLVM_ZExtOp : LLVM_CastOp<"zext", "ZExt", - LLVM_ScalarOrVectorOf, - LLVM_ScalarOrVectorOf> { + LLVM_ScalarOrVectorOf, + LLVM_ScalarOrVectorOf> { let hasFolder = 1; let hasVerifier = 1; } def LLVM_TruncOp : LLVM_CastOp<"trunc", "Trunc", - LLVM_ScalarOrVectorOf, - LLVM_ScalarOrVectorOf>; + LLVM_ScalarOrVectorOf, + LLVM_ScalarOrVectorOf>; def LLVM_SIToFPOp : LLVM_CastOp<"sitofp", "SIToFP", - LLVM_ScalarOrVectorOf, + LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf>; def LLVM_UIToFPOp : LLVM_CastOp<"uitofp", "UIToFP", - LLVM_ScalarOrVectorOf, + LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf>; def LLVM_FPToSIOp : LLVM_CastOp<"fptosi", "FPToSI", LLVM_ScalarOrVectorOf, - LLVM_ScalarOrVectorOf>; + LLVM_ScalarOrVectorOf>; def LLVM_FPToUIOp : LLVM_CastOp<"fptoui", "FPToUI", LLVM_ScalarOrVectorOf, - LLVM_ScalarOrVectorOf>; + LLVM_ScalarOrVectorOf>; def LLVM_FPExtOp : LLVM_CastOp<"fpext", "FPExt", LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf>; @@ -671,7 +671,7 @@ def LLVM_ExtractElementOp : LLVM_Op<"extractelement", [Pure, "LLVM::getVectorElementType($_self)">]> { let summary = "Extract an element from an LLVM vector."; - let arguments = (ins LLVM_AnyVector:$vector, AnyInteger:$position); + let arguments = (ins LLVM_AnyVector:$vector, AnySignlessInteger:$position); let results = (outs LLVM_Type:$res); let assemblyFormat = [{ @@ -733,7 +733,7 @@ def LLVM_InsertElementOp : LLVM_Op<"insertelement", [Pure, let summary = "Insert an element into an LLVM vector."; let arguments = (ins LLVM_AnyVector:$vector, 
LLVM_PrimitiveType:$value, - AnyInteger:$position); + AnySignlessInteger:$position); let results = (outs LLVM_AnyVector:$res); let builders = [LLVM_OneResultOpBuilder]; @@ -971,7 +971,7 @@ def LLVM_SwitchOp : LLVM_TerminatorOp<"switch", DeclareOpInterfaceMethods, Pure]> { let arguments = (ins - AnyInteger:$value, + AnySignlessInteger:$value, Variadic:$defaultOperands, VariadicOfVariadic:$caseOperands, OptionalAttr:$case_values, @@ -1647,7 +1647,7 @@ def LLVM_ConstantOp // Atomic operations. // -def LLVM_AtomicRMWType : AnyTypeOf<[LLVM_AnyFloat, LLVM_AnyPointer, AnyInteger]>; +def LLVM_AtomicRMWType : AnyTypeOf<[LLVM_AnyFloat, LLVM_AnyPointer, AnySignlessInteger]>; def LLVM_AtomicRMWOp : LLVM_MemAccessOpBase<"atomicrmw", [ TypesMatchWith<"result #0 and operand #1 have the same type", @@ -1696,7 +1696,7 @@ def LLVM_AtomicRMWOp : LLVM_MemAccessOpBase<"atomicrmw", [ let hasVerifier = 1; } -def LLVM_AtomicCmpXchgType : AnyTypeOf<[AnyInteger, LLVM_AnyPointer]>; +def LLVM_AtomicCmpXchgType : AnyTypeOf<[AnySignlessInteger, LLVM_AnyPointer]>; def LLVM_AtomicCmpXchgOp : LLVM_MemAccessOpBase<"cmpxchg", [ TypesMatchWith<"operand #1 and operand #2 have the same type", From e314622f204a01ffeda59cbe046dd403b01f8b74 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Thu, 22 Feb 2024 14:26:11 -0800 Subject: [PATCH 102/546] [clang][driver] Allow unaligned access on ARMv7 and higher by default (#82400) ARM's Clang and GCC embedded compilers default to allowing unaligned access for ARMv7+. This patch changes the Clang driver default to match. Users can opt out with `-mno-unaligned-access`. 
Fixes #59560 --- clang/docs/ReleaseNotes.rst | 11 +++++++++++ clang/lib/Driver/ToolChains/Arch/ARM.cpp | 24 ++++++++++++------------ clang/test/Driver/arm-alignment.c | 15 +++++++++++++++ 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 74bb9a07f0b13..19cc5b7756431 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -302,6 +302,17 @@ X86 Support Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ +- ARMv7+ targets now default to allowing unaligned access, except Armv6-M, and + Armv8-M without the Main Extension. Baremetal targets should check that the + new default will work with their system configurations, since it requires + that SCTLR.A is 0, SCTLR.U is 1, and that the memory in question is + configured as "normal" memory. This brings Clang in-line with the default + settings for GCC and Arm Compiler. Aside from making Clang align with other + compilers, changing the default brings major performance and code size + improvements for most targets. We have not changed the default behavior for + ARMv6, but may revisit that decision in the future. Users can restore the old + behavior with -m[no-]unaligned-access. + Android Support ^^^^^^^^^^^^^^^ diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp index e6ee2f88a84ed..ba158b92bb44b 100644 --- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp +++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp @@ -890,25 +890,25 @@ llvm::ARM::FPUKind arm::getARMTargetFeatures(const Driver &D, // SCTLR.U bit, which is architecture-specific. We assume ARMv6 // Darwin and NetBSD targets support unaligned accesses, and others don't. // - // ARMv7 always has SCTLR.U set to 1, but it has a new SCTLR.A bit - // which raises an alignment fault on unaligned accesses. Linux - // defaults this bit to 0 and handles it as a system-wide (not - // per-process) setting. 
It is therefore safe to assume that ARMv7+ - // Linux targets support unaligned accesses. The same goes for NaCl - // and Windows. + // ARMv7 always has SCTLR.U set to 1, but it has a new SCTLR.A bit which + // raises an alignment fault on unaligned accesses. Assume ARMv7+ supports + // unaligned accesses, except ARMv6-M, and ARMv8-M without the Main + // Extension. This aligns with the default behavior of ARM's downstream + // versions of GCC and Clang. // - // The above behavior is consistent with GCC. + // Users can change the default behavior via -m[no-]unaliged-access. int VersionNum = getARMSubArchVersionNumber(Triple); if (Triple.isOSDarwin() || Triple.isOSNetBSD()) { if (VersionNum < 6 || Triple.getSubArch() == llvm::Triple::SubArchType::ARMSubArch_v6m) Features.push_back("+strict-align"); - } else if (Triple.isOSLinux() || Triple.isOSNaCl() || - Triple.isOSWindows()) { - if (VersionNum < 7) - Features.push_back("+strict-align"); - } else + } else if (VersionNum < 7 || + Triple.getSubArch() == + llvm::Triple::SubArchType::ARMSubArch_v6m || + Triple.getSubArch() == + llvm::Triple::SubArchType::ARMSubArch_v8m_baseline) { Features.push_back("+strict-align"); + } } // llvm does not support reserving registers in general. 
There is support diff --git a/clang/test/Driver/arm-alignment.c b/clang/test/Driver/arm-alignment.c index 9177b625729b8..8c915477af9af 100644 --- a/clang/test/Driver/arm-alignment.c +++ b/clang/test/Driver/arm-alignment.c @@ -22,6 +22,21 @@ // RUN: %clang -target armv7-windows -### %s 2> %t // RUN: FileCheck --check-prefix=CHECK-UNALIGNED-ARM < %t %s +// RUN: %clang --target=armv6 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-ALIGNED-ARM < %t %s + +// RUN: %clang --target=armv7 -### %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-UNALIGNED-ARM < %t %s + +// RUN: %clang -target thumbv6m-none-gnueabi -mcpu=cortex-m0 -### %s 2> %t +// RUN: FileCheck --check-prefix CHECK-ALIGNED-ARM <%t %s + +// RUN: %clang -target thumb-none-gnueabi -mcpu=cortex-m0 -### %s 2> %t +// RUN: FileCheck --check-prefix CHECK-ALIGNED-ARM <%t %s + +// RUN: %clang -target thumbv8m.base-none-gnueabi -### %s 2> %t +// RUN: FileCheck --check-prefix CHECK-ALIGNED-ARM <%t %s + // RUN: %clang --target=aarch64 -munaligned-access -### %s 2> %t // RUN: FileCheck --check-prefix=CHECK-UNALIGNED-AARCH64 < %t %s From d5a15f3116f8c3ec32df1f13a2fc521a98b03d96 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 16:27:53 -0600 Subject: [PATCH 103/546] [Clang][NVPTX] Allow passing arguments to the linker while standalone (#73030) Summary: We support standalone compilation for the NVPTX architecture using 'nvlink' as our linker. Because of the special handling required to transform input files to cubins, as nvlink expects for some reason, we didn't use the standard AddLinkerInput method. However, this also meant that we weren't forwarding options passed with -Wl to the linker. Add this support in for the standalone toolchain path. 
Revived from https://reviews.llvm.org/D149978 --- clang/lib/Driver/ToolChains/Cuda.cpp | 43 +++++++++---------- clang/test/Driver/cuda-cross-compiling.c | 9 +++- .../ClangLinkerWrapper.cpp | 4 +- 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index ed5924c3b73b5..94d4982d102bb 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -623,35 +623,34 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA, continue; } - // Currently, we only pass the input files to the linker, we do not pass - // any libraries that may be valid only for the host. - if (!II.isFilename()) - continue; - // The 'nvlink' application performs RDC-mode linking when given a '.o' // file and device linking when given a '.cubin' file. We always want to // perform device linking, so just rename any '.o' files. // FIXME: This should hopefully be removed if NVIDIA updates their tooling. - auto InputFile = getToolChain().getInputFilename(II); - if (llvm::sys::path::extension(InputFile) != ".cubin") { - // If there are no actions above this one then this is direct input and we - // can copy it. Otherwise the input is internal so a `.cubin` file should - // exist. - if (II.getAction() && II.getAction()->getInputs().size() == 0) { - const char *CubinF = - Args.MakeArgString(getToolChain().getDriver().GetTemporaryPath( - llvm::sys::path::stem(InputFile), "cubin")); - if (llvm::sys::fs::copy_file(InputFile, C.addTempFile(CubinF))) - continue; + if (II.isFilename()) { + auto InputFile = getToolChain().getInputFilename(II); + if (llvm::sys::path::extension(InputFile) != ".cubin") { + // If there are no actions above this one then this is direct input and + // we can copy it. Otherwise the input is internal so a `.cubin` file + // should exist. 
+ if (II.getAction() && II.getAction()->getInputs().size() == 0) { + const char *CubinF = + Args.MakeArgString(getToolChain().getDriver().GetTemporaryPath( + llvm::sys::path::stem(InputFile), "cubin")); + if (llvm::sys::fs::copy_file(InputFile, C.addTempFile(CubinF))) + continue; - CmdArgs.push_back(CubinF); + CmdArgs.push_back(CubinF); + } else { + SmallString<256> Filename(InputFile); + llvm::sys::path::replace_extension(Filename, "cubin"); + CmdArgs.push_back(Args.MakeArgString(Filename)); + } } else { - SmallString<256> Filename(InputFile); - llvm::sys::path::replace_extension(Filename, "cubin"); - CmdArgs.push_back(Args.MakeArgString(Filename)); + CmdArgs.push_back(Args.MakeArgString(InputFile)); } - } else { - CmdArgs.push_back(Args.MakeArgString(InputFile)); + } else if (!II.isNothing()) { + II.getInputArg().renderAsInput(Args, CmdArgs); } } diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c index 6c9e2cac736b7..25058358b63a8 100644 --- a/clang/test/Driver/cuda-cross-compiling.c +++ b/clang/test/Driver/cuda-cross-compiling.c @@ -69,6 +69,13 @@ // LOWERING: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-mllvm" "--nvptx-lower-global-ctor-dtor" // +// Test passing arguments directly to nvlink. +// +// RUN: %clang -target nvptx64-nvidia-cuda -Wl,-v -Wl,a,b -### %s 2>&1 \ +// RUN: | FileCheck -check-prefix=LINKER-ARGS %s + +// LINKER-ARGS: nvlink{{.*}}"-v"{{.*}}"a" "b" + // Tests for handling a missing architecture. 
// // RUN: not %clang -target nvptx64-nvidia-cuda %s -### 2>&1 \ @@ -80,4 +87,4 @@ // RUN: %clang -target nvptx64-nvidia-cuda -flto -c %s -### 2>&1 \ // RUN: | FileCheck -check-prefix=GENERIC %s -// GENERIC-NOT: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-target-cpu" +// GENERIC-NOT: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-target-cpu" \ No newline at end of file diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 095cf5ed38169..576e8f2cd7f8f 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -454,9 +454,11 @@ Expected clang(ArrayRef InputFiles, const ArgList &Args) { Triple.isAMDGPU() ? Args.MakeArgString("-mcpu=" + Arch) : Args.MakeArgString("-march=" + Arch), Args.MakeArgString("-" + OptLevel), - "-Wl,--no-undefined", }; + if (!Triple.isNVPTX()) + CmdArgs.push_back("-Wl,--no-undefined"); + for (StringRef InputFile : InputFiles) CmdArgs.push_back(InputFile); From 018c992879248ad28a04fc7d061922f5ccee4e08 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 16:29:09 -0600 Subject: [PATCH 104/546] [Flang] Fix the test ordering of the GPU libraries Summary: Turns out these are out of order --- flang/test/Driver/omp-driver-offload.f90 | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/flang/test/Driver/omp-driver-offload.f90 b/flang/test/Driver/omp-driver-offload.f90 index 23c2a121a5afa..9b62699030c68 100644 --- a/flang/test/Driver/omp-driver-offload.f90 +++ b/flang/test/Driver/omp-driver-offload.f90 @@ -175,22 +175,24 @@ ! RUN: --offload-arch=sm_52 \ ! RUN: -gpulibc %s 2>&1 \ ! RUN: | FileCheck --check-prefix=LIBC-GPU-NVPTX %s -! LIBC-GPU-NVPTX: "-lcgpu-nvptx"{{.*}}"-lmgpu-nvptx" +! LIBC-GPU-NVPTX-DAG: "-lcgpu-nvptx" +! LIBC-GPU-NVPTX-DAG: "-lmgpu-nvptx" ! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp \ ! RUN: --offload-arch=sm_52 \ ! 
RUN: -nogpulibc %s 2>&1 \ ! RUN: | FileCheck --check-prefix=NO-LIBC-GPU-NVPTX %s -! NO-LIBC-GPU-NVPTX-NOT: "-lcgpu-nvptx"{{.*}}"-lmgpu-nvptx" +! NO-LIBC-GPU-NVPTX-NOT: "-lcgpu-nvptx" ! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp \ ! RUN: --offload-arch=gfx90a \ ! RUN: -gpulibc %s 2>&1 \ ! RUN: | FileCheck --check-prefix=LIBC-GPU-AMDGPU %s -! LIBC-GPU-AMDGPU: "-lcgpu-amdgpu"{{.*}}"-lmgpu-amdgpu" +! LIBC-GPU-AMDGPU-DAG: "-lcgpu-amdgpu" +! LIBC-GPU-AMDGPU-DAG: "-lmgpu-amdgpu" ! RUN: %flang -### --target=x86_64-unknown-linux-gnu -fopenmp \ ! RUN: --offload-arch=gfx90a \ ! RUN: -nogpulibc %s 2>&1 \ ! RUN: | FileCheck --check-prefix=NO-LIBC-GPU-AMDGPU %s -! NO-LIBC-GPU-AMDGPU-NOT: "-lcgpu-amdgpu"{{.*}}"-lmgpu-amdgpu" +! NO-LIBC-GPU-AMDGPU-NOT: "-lcgpu-amdgpu" From bc5aba9dd63f919037aded04405f3e05092c9039 Mon Sep 17 00:00:00 2001 From: Michael Liao Date: Thu, 22 Feb 2024 17:31:11 -0500 Subject: [PATCH 105/546] [CodeGen][MIR][UnitTests] Fix shared build. NFC --- llvm/unittests/MIR/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/unittests/MIR/CMakeLists.txt b/llvm/unittests/MIR/CMakeLists.txt index f485dcbd971b6..0ad52134a34da 100644 --- a/llvm/unittests/MIR/CMakeLists.txt +++ b/llvm/unittests/MIR/CMakeLists.txt @@ -1,5 +1,6 @@ set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} + Analysis CodeGen CodeGenTypes Core From 87b410821148402d74ac7a14bed233078a49cb7b Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 16:49:21 -0600 Subject: [PATCH 106/546] [Libomptarget][NFC] Remove concept of optional plugin functions (#82681) Summary: Ever since the introduction of the new plugins we haven't exercised the concept of "optional" plugin functions. This is done in perparation for making the plugins use a static interface as it will greatly simplify the implementation if we assert that every function has the entrypoints. 
Currently some unsupported functions will just return failure or some other default value, so this shouldn't change anything. --- openmp/libomptarget/include/PluginManager.h | 8 ++- .../libomptarget/include/Shared/PluginAPI.inc | 72 +++++++++---------- openmp/libomptarget/src/PluginManager.cpp | 4 +- 3 files changed, 43 insertions(+), 41 deletions(-) diff --git a/openmp/libomptarget/include/PluginManager.h b/openmp/libomptarget/include/PluginManager.h index 5e5306ac776f0..77684285ddf15 100644 --- a/openmp/libomptarget/include/PluginManager.h +++ b/openmp/libomptarget/include/PluginManager.h @@ -69,7 +69,7 @@ struct PluginAdaptorTy { /// Access to the shared object file representing the plugin. std::unique_ptr LibraryHandler; -#define PLUGIN_API_HANDLE(NAME, MANDATORY) \ +#define PLUGIN_API_HANDLE(NAME) \ using NAME##_ty = decltype(__tgt_rtl_##NAME); \ NAME##_ty *NAME = nullptr; @@ -114,8 +114,10 @@ struct PluginManager { // Unregister a shared library from all RTLs. void unregisterLib(__tgt_bin_desc *Desc); - void addDeviceImage(__tgt_bin_desc &TgtBinDesc, __tgt_device_image &TgtDeviceImage) { - DeviceImages.emplace_back(std::make_unique(TgtBinDesc, TgtDeviceImage)); + void addDeviceImage(__tgt_bin_desc &TgtBinDesc, + __tgt_device_image &TgtDeviceImage) { + DeviceImages.emplace_back( + std::make_unique(TgtBinDesc, TgtDeviceImage)); } /// Return the device presented to the user as device \p DeviceNo if it is diff --git a/openmp/libomptarget/include/Shared/PluginAPI.inc b/openmp/libomptarget/include/Shared/PluginAPI.inc index 3b982e30307ac..e445da6852f7b 100644 --- a/openmp/libomptarget/include/Shared/PluginAPI.inc +++ b/openmp/libomptarget/include/Shared/PluginAPI.inc @@ -13,39 +13,39 @@ // No include guards! 
-PLUGIN_API_HANDLE(init_plugin, true); -PLUGIN_API_HANDLE(is_valid_binary, true); -PLUGIN_API_HANDLE(is_data_exchangable, false); -PLUGIN_API_HANDLE(number_of_devices, true); -PLUGIN_API_HANDLE(init_device, true); -PLUGIN_API_HANDLE(load_binary, true); -PLUGIN_API_HANDLE(get_global, true); -PLUGIN_API_HANDLE(get_function, true); -PLUGIN_API_HANDLE(data_alloc, true); -PLUGIN_API_HANDLE(data_submit, true); -PLUGIN_API_HANDLE(data_submit_async, false); -PLUGIN_API_HANDLE(data_retrieve, true); -PLUGIN_API_HANDLE(data_retrieve_async, false); -PLUGIN_API_HANDLE(data_exchange, false); -PLUGIN_API_HANDLE(data_exchange_async, false); -PLUGIN_API_HANDLE(data_delete, true); -PLUGIN_API_HANDLE(launch_kernel, true); -PLUGIN_API_HANDLE(init_requires, false); -PLUGIN_API_HANDLE(synchronize, false); -PLUGIN_API_HANDLE(query_async, false); -PLUGIN_API_HANDLE(set_info_flag, false); -PLUGIN_API_HANDLE(print_device_info, false); -PLUGIN_API_HANDLE(create_event, false); -PLUGIN_API_HANDLE(record_event, false); -PLUGIN_API_HANDLE(wait_event, false); -PLUGIN_API_HANDLE(sync_event, false); -PLUGIN_API_HANDLE(destroy_event, false); -PLUGIN_API_HANDLE(init_async_info, false); -PLUGIN_API_HANDLE(init_device_info, false); -PLUGIN_API_HANDLE(data_lock, false); -PLUGIN_API_HANDLE(data_unlock, false); -PLUGIN_API_HANDLE(data_notify_mapped, false); -PLUGIN_API_HANDLE(data_notify_unmapped, false); -PLUGIN_API_HANDLE(set_device_offset, false); -PLUGIN_API_HANDLE(initialize_record_replay, false); -PLUGIN_API_HANDLE(use_auto_zero_copy, false); +PLUGIN_API_HANDLE(init_plugin); +PLUGIN_API_HANDLE(is_valid_binary); +PLUGIN_API_HANDLE(is_data_exchangable); +PLUGIN_API_HANDLE(number_of_devices); +PLUGIN_API_HANDLE(init_device); +PLUGIN_API_HANDLE(load_binary); +PLUGIN_API_HANDLE(get_global); +PLUGIN_API_HANDLE(get_function); +PLUGIN_API_HANDLE(data_alloc); +PLUGIN_API_HANDLE(data_submit); +PLUGIN_API_HANDLE(data_submit_async); +PLUGIN_API_HANDLE(data_retrieve); +PLUGIN_API_HANDLE(data_retrieve_async); 
+PLUGIN_API_HANDLE(data_exchange); +PLUGIN_API_HANDLE(data_exchange_async); +PLUGIN_API_HANDLE(data_delete); +PLUGIN_API_HANDLE(launch_kernel); +PLUGIN_API_HANDLE(init_requires); +PLUGIN_API_HANDLE(synchronize); +PLUGIN_API_HANDLE(query_async); +PLUGIN_API_HANDLE(set_info_flag); +PLUGIN_API_HANDLE(print_device_info); +PLUGIN_API_HANDLE(create_event); +PLUGIN_API_HANDLE(record_event); +PLUGIN_API_HANDLE(wait_event); +PLUGIN_API_HANDLE(sync_event); +PLUGIN_API_HANDLE(destroy_event); +PLUGIN_API_HANDLE(init_async_info); +PLUGIN_API_HANDLE(init_device_info); +PLUGIN_API_HANDLE(data_lock); +PLUGIN_API_HANDLE(data_unlock); +PLUGIN_API_HANDLE(data_notify_mapped); +PLUGIN_API_HANDLE(data_notify_unmapped); +PLUGIN_API_HANDLE(set_device_offset); +PLUGIN_API_HANDLE(initialize_record_replay); +PLUGIN_API_HANDLE(use_auto_zero_copy); diff --git a/openmp/libomptarget/src/PluginManager.cpp b/openmp/libomptarget/src/PluginManager.cpp index 09f9c6400569c..928913275332c 100644 --- a/openmp/libomptarget/src/PluginManager.cpp +++ b/openmp/libomptarget/src/PluginManager.cpp @@ -56,10 +56,10 @@ PluginAdaptorTy::PluginAdaptorTy(const std::string &Name, Error PluginAdaptorTy::init() { -#define PLUGIN_API_HANDLE(NAME, MANDATORY) \ +#define PLUGIN_API_HANDLE(NAME) \ NAME = reinterpret_cast( \ LibraryHandler->getAddressOfSymbol(GETNAME(__tgt_rtl_##NAME))); \ - if (MANDATORY && !NAME) { \ + if (!NAME) { \ return createStringError(inconvertibleErrorCode(), \ "Invalid plugin as necessary interface function " \ "(%s) was not found.\n", \ From e3cab8fe82eb71fadb251d11fec7df9fa0dbdd27 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 16:54:03 -0600 Subject: [PATCH 107/546] [LinkerWrapper] Fix test after permitting NVPTX linker arguments Summary: Forgot to change this after a previous patch altered its behaviour. 
--- clang/test/Driver/linker-wrapper.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c index 7fd46778ac910..83df2b84adefe 100644 --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -21,7 +21,7 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK -// NVPTX-LINK: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o +// NVPTX-LINK: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 {{.*}}.o {{.*}}.o // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ @@ -30,7 +30,7 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --device-debug -O0 \ // RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK-DEBUG -// NVPTX-LINK-DEBUG: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o -g +// NVPTX-LINK-DEBUG: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 {{.*}}.o {{.*}}.o -g // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ From 4ebee956455caa0da7783280f8515040eac89d08 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Fri, 23 Feb 2024 06:54:39 +0800 Subject: [PATCH 108/546] [mlir][test] Fix -Wunused-variable in PassBuilderCallbacksTest.cpp (NFC) llvm-project/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp:333:10: error: unused variable 'Ret' [-Werror,-Wunused-variable] bool Ret = MIR->parseMachineFunctions(*Mod, MMI); ^ 1 error generated. 
--- llvm/unittests/MIR/PassBuilderCallbacksTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp b/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp index 4b7d7846b0336..8e3738dc91920 100644 --- a/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp +++ b/llvm/unittests/MIR/PassBuilderCallbacksTest.cpp @@ -330,7 +330,7 @@ class MachineFunctionCallbacksTest : public testing::Test { Mod->setModuleIdentifier("module"); Mod->setDataLayout(TM.createDataLayout()); - bool Ret = MIR->parseMachineFunctions(*Mod, MMI); + [[maybe_unused]] bool Ret = MIR->parseMachineFunctions(*Mod, MMI); assert(!Ret); return Mod; From e8740d4eb1c88e968b155f73ac745f80b4681589 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 16:59:09 -0600 Subject: [PATCH 109/546] [Clang] Fix missing architecture on CUDA test Summary: Sorry about the churn here, my local git tree got corrupted so a few broken tests slipped by while trying to fix it. --- clang/test/Driver/cuda-cross-compiling.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c index 25058358b63a8..086840accebe7 100644 --- a/clang/test/Driver/cuda-cross-compiling.c +++ b/clang/test/Driver/cuda-cross-compiling.c @@ -71,7 +71,7 @@ // // Test passing arguments directly to nvlink. 
// -// RUN: %clang -target nvptx64-nvidia-cuda -Wl,-v -Wl,a,b -### %s 2>&1 \ +// RUN: %clang -target nvptx64-nvidia-cuda -Wl,-v -Wl,a,b -march=sm_52 -### %s 2>&1 \ // RUN: | FileCheck -check-prefix=LINKER-ARGS %s // LINKER-ARGS: nvlink{{.*}}"-v"{{.*}}"a" "b" @@ -87,4 +87,4 @@ // RUN: %clang -target nvptx64-nvidia-cuda -flto -c %s -### 2>&1 \ // RUN: | FileCheck -check-prefix=GENERIC %s -// GENERIC-NOT: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-target-cpu" \ No newline at end of file +// GENERIC-NOT: -cc1" "-triple" "nvptx64-nvidia-cuda" {{.*}} "-target-cpu" From 5bd0c44bd0b944230ba05c87c19292304b84e980 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Thu, 22 Feb 2024 15:22:49 -0800 Subject: [PATCH 110/546] [libc] Match the names of BSD sys/queue.h member names (#82696) While these names are technically internal implemenetation detail, there's an existing code which relies on these details and using different names makes LLVM libc implementation incompatible. Since our goal is for LLVM libc to be a drop in replacement, use the same name as BSD sys/queue.h version. --- .../llvm-libc-macros/sys-queue-macros.h | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/libc/include/llvm-libc-macros/sys-queue-macros.h b/libc/include/llvm-libc-macros/sys-queue-macros.h index 59e6a9a392c97..7da643cb72533 100644 --- a/libc/include/llvm-libc-macros/sys-queue-macros.h +++ b/libc/include/llvm-libc-macros/sys-queue-macros.h @@ -22,12 +22,12 @@ #define SLIST_HEAD(name, type) \ struct name { \ - struct type *first; \ + struct type *slh_first; \ } #define SLIST_CLASS_HEAD(name, type) \ struct name { \ - class type *first; \ + class type *slh_first; \ } #define SLIST_HEAD_INITIALIZER(head) \ @@ -45,8 +45,8 @@ // Singly-linked list access methods. 
-#define SLIST_EMPTY(head) ((head)->first == NULL) -#define SLIST_FIRST(head) ((head)->first) +#define SLIST_EMPTY(head) ((head)->slh_first == NULL) +#define SLIST_FIRST(head) ((head)->slh_first) #define SLIST_NEXT(elem, field) ((elem)->field.next) #define SLIST_FOREACH(var, head, field) \ @@ -132,18 +132,18 @@ #define STAILQ_HEAD(name, type) \ struct name { \ - struct type *first; \ - struct type **last; \ + struct type *stqh_first; \ + struct type **stqh_last; \ } #define STAILQ_CLASS_HEAD(name, type) \ struct name { \ - class type *first; \ - class type **last; \ + class type *stqh_first; \ + class type **stqh_last; \ } #define STAILQ_HEAD_INITIALIZER(head) \ - { NULL, &(head).first } + { NULL, &(head).stqh_first } #define STAILQ_ENTRY(type) \ struct { \ @@ -157,12 +157,12 @@ // Singly-linked tail queue access methods. -#define STAILQ_EMPTY(head) ((head)->first == NULL) -#define STAILQ_FIRST(head) ((head)->first) +#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) +#define STAILQ_FIRST(head) ((head)->stqh_first) #define STAILQ_LAST(head, type, field) \ (STAILQ_EMPTY(head) \ ? 
NULL \ - : __containerof((head)->last, QUEUE_TYPEOF(type), field.next)) + : __containerof((head)->stqh_last, QUEUE_TYPEOF(type), field.next)) #define STAILQ_NEXT(elem, field) ((elem)->field.next) #define STAILQ_FOREACH(var, head, field) \ @@ -187,8 +187,8 @@ #define STAILQ_CONCAT(head1, head2, type, field) \ do { \ if (!STAILQ_EMPTY(head2)) { \ - *(head1)->last = (head2)->first; \ - (head1)->last = (head2)->last; \ + *(head1)->stqh_last = (head2)->stqh_first; \ + (head1)->stqh_last = (head2)->stqh_last; \ STAILQ_INIT(head2); \ } \ } while (0) @@ -196,28 +196,28 @@ #define STAILQ_INIT(head) \ do { \ STAILQ_FIRST(head) = NULL; \ - (head)->last = &STAILQ_FIRST(head); \ + (head)->stqh_last = &STAILQ_FIRST(head); \ } while (0) #define STAILQ_INSERT_AFTER(head, listelem, elem, field) \ do { \ if ((STAILQ_NEXT(elem, field) = STAILQ_NEXT(listelem, field)) == NULL) \ - (head)->last = &STAILQ_NEXT(elem, field); \ + (head)->stqh_last = &STAILQ_NEXT(elem, field); \ STAILQ_NEXT(listelem, field) = (elem); \ } while (0) #define STAILQ_INSERT_HEAD(head, elem, field) \ do { \ if ((STAILQ_NEXT(elem, field) = STAILQ_FIRST(head)) == NULL) \ - (head)->last = &STAILQ_NEXT(elem, field); \ + (head)->stqh_last = &STAILQ_NEXT(elem, field); \ STAILQ_FIRST(head) = (elem); \ } while (0) #define STAILQ_INSERT_TAIL(head, elem, field) \ do { \ STAILQ_NEXT(elem, field) = NULL; \ - *(head)->last = (elem); \ - (head)->last = &STAILQ_NEXT(elem, field); \ + *(head)->stqh_last = (elem); \ + (head)->stqh_last = &STAILQ_NEXT(elem, field); \ } while (0) #define STAILQ_REMOVE(head, elem, type, field) \ @@ -236,27 +236,27 @@ do { \ if ((STAILQ_NEXT(elem, field) = \ STAILQ_NEXT(STAILQ_NEXT(elem, field), field)) == NULL) \ - (head)->last = &STAILQ_NEXT(elem, field); \ + (head)->stqh_last = &STAILQ_NEXT(elem, field); \ } while (0) #define STAILQ_REMOVE_HEAD(head, field) \ do { \ if ((STAILQ_FIRST(head) = STAILQ_NEXT(STAILQ_FIRST(head), field)) == NULL) \ - (head)->last = &STAILQ_FIRST(head); \ + 
(head)->stqh_last = &STAILQ_FIRST(head); \ } while (0) #define STAILQ_SWAP(head1, head2, type) \ do { \ QUEUE_TYPEOF(type) *first = STAILQ_FIRST(head1); \ - QUEUE_TYPEOF(type) **last = (head1)->last; \ + QUEUE_TYPEOF(type) **last = (head1)->stqh_last; \ STAILQ_FIRST(head1) = STAILQ_FIRST(head2); \ - (head1)->last = (head2)->last; \ + (head1)->stqh_last = (head2)->stqh_last; \ STAILQ_FIRST(head2) = first; \ - (head2)->last = last; \ + (head2)->stqh_last = last; \ if (STAILQ_EMPTY(head1)) \ - (head1)->last = &STAILQ_FIRST(head1); \ + (head1)->stqh_last = &STAILQ_FIRST(head1); \ if (STAILQ_EMPTY(head2)) \ - (head2)->last = &STAILQ_FIRST(head2); \ + (head2)->stqh_last = &STAILQ_FIRST(head2); \ } while (0) #endif // __LLVM_LIBC_MACROS_SYS_QUEUE_MACROS_H From aaf2d078b62251b867f37eaa94621dbbbfa0e5b0 Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Thu, 22 Feb 2024 17:31:37 -0600 Subject: [PATCH 111/546] [Hexagon] Clean up redundant transfer instructions. (#82663) This patch adds a Hexagon specific backend pass that cleans up redundant transfers after register allocation. 
--- llvm/lib/Target/Hexagon/CMakeLists.txt | 1 + .../Target/Hexagon/HexagonTargetMachine.cpp | 10 + llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp | 324 ++++++++++++++++++ .../Hexagon/atomicrmw-uinc-udec-wrap.ll | 6 +- llvm/test/CodeGen/Hexagon/isel/select-vec.ll | 2 +- llvm/test/CodeGen/Hexagon/reg-by-name.ll | 4 +- llvm/test/CodeGen/Hexagon/tfr-slotindex.ll | 26 ++ 7 files changed, 366 insertions(+), 7 deletions(-) create mode 100644 llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp create mode 100644 llvm/test/CodeGen/Hexagon/tfr-slotindex.ll diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt index 19ccd770f071d..2870f0bb6ad32 100644 --- a/llvm/lib/Target/Hexagon/CMakeLists.txt +++ b/llvm/lib/Target/Hexagon/CMakeLists.txt @@ -62,6 +62,7 @@ add_llvm_target(HexagonCodeGen HexagonTargetMachine.cpp HexagonTargetObjectFile.cpp HexagonTargetTransformInfo.cpp + HexagonTfrCleanup.cpp HexagonVectorCombine.cpp HexagonVectorLoopCarriedReuse.cpp HexagonVectorPrint.cpp diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index f640f76bc47b8..a5ebd64f1f8af 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -65,6 +65,10 @@ static cl::opt EnableExpandCondsets("hexagon-expand-condsets", cl::init(true), cl::Hidden, cl::desc("Early expansion of MUX")); +static cl::opt EnableTfrCleanup("hexagon-tfr-cleanup", cl::init(true), + cl::Hidden, + cl::desc("Cleanup of TFRs/COPYs")); + static cl::opt EnableEarlyIf("hexagon-eif", cl::init(true), cl::Hidden, cl::desc("Enable early if-conversion")); @@ -153,6 +157,7 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler", namespace llvm { extern char &HexagonExpandCondsetsID; + extern char &HexagonTfrCleanupID; void initializeHexagonBitSimplifyPass(PassRegistry&); void initializeHexagonConstExtendersPass(PassRegistry&); void 
initializeHexagonConstPropagationPass(PassRegistry&); @@ -169,6 +174,7 @@ namespace llvm { void initializeHexagonPostIncOptPass(PassRegistry &); void initializeHexagonRDFOptPass(PassRegistry&); void initializeHexagonSplitDoubleRegsPass(PassRegistry&); + void initializeHexagonTfrCleanupPass(PassRegistry &); void initializeHexagonVExtractPass(PassRegistry &); void initializeHexagonVectorCombineLegacyPass(PassRegistry&); void initializeHexagonVectorLoopCarriedReuseLegacyPassPass(PassRegistry &); @@ -204,6 +210,7 @@ namespace llvm { FunctionPass *createHexagonSplitConst32AndConst64(); FunctionPass *createHexagonSplitDoubleRegs(); FunctionPass *createHexagonStoreWidening(); + FunctionPass *createHexagonTfrCleanup(); FunctionPass *createHexagonVectorCombineLegacyPass(); FunctionPass *createHexagonVectorPrint(); FunctionPass *createHexagonVExtract(); @@ -258,6 +265,7 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, (HexagonNoOpt ? CodeGenOptLevel::None : OL)), TLOF(std::make_unique()) { initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry()); + initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry()); initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry()); initAsmInfo(); } @@ -426,6 +434,8 @@ void HexagonPassConfig::addPreRegAlloc() { addPass(createHexagonConstExtenders()); if (EnableExpandCondsets) insertPass(&RegisterCoalescerID, &HexagonExpandCondsetsID); + if (EnableTfrCleanup) + insertPass(&VirtRegRewriterID, &HexagonTfrCleanupID); if (!DisableStoreWidening) addPass(createHexagonStoreWidening()); if (EnableGenMemAbs) diff --git a/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp new file mode 100644 index 0000000000000..a4b359af303a3 --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonTfrCleanup.cpp @@ -0,0 +1,324 @@ +//===------- HexagonTfrCleanup.cpp - Hexagon Transfer Cleanup Pass -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 
with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This pass is to address a situation that appears after register allocaion +// evey now and then, namely a register copy from a source that was defined +// as an immediate value in the same block (usually just before the copy). +// +// Here is an example of actual code emitted that shows this problem: +// +// .LBB0_5: +// { +// r5 = zxtb(r8) +// r6 = or(r6, ##12345) +// } +// { +// r3 = xor(r1, r2) +// r1 = #0 <-- r1 set to #0 +// } +// { +// r7 = r1 <-- r7 set to r1 +// r0 = zxtb(r3) +// } + +#define DEBUG_TYPE "tfr-cleanup" +#include "HexagonTargetMachine.h" + +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +namespace llvm { +FunctionPass *createHexagonTfrCleanup(); +void initializeHexagonTfrCleanupPass(PassRegistry &); +} // namespace llvm + +namespace { +class HexagonTfrCleanup : public MachineFunctionPass { +public: + static char ID; + HexagonTfrCleanup() : MachineFunctionPass(ID), HII(0), TRI(0) { + PassRegistry &R = *PassRegistry::getPassRegistry(); + initializeHexagonTfrCleanupPass(R); + } + StringRef getPassName() const override { return "Hexagon TFR Cleanup"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + 
const HexagonInstrInfo *HII; + const TargetRegisterInfo *TRI; + + typedef DenseMap ImmediateMap; + + bool isIntReg(unsigned Reg, bool &Is32); + void setReg(unsigned R32, uint32_t V32, ImmediateMap &IMap); + bool getReg(unsigned Reg, uint64_t &Val, ImmediateMap &IMap); + bool updateImmMap(MachineInstr *MI, ImmediateMap &IMap); + bool rewriteIfImm(MachineInstr *MI, ImmediateMap &IMap, SlotIndexes *Indexes); + bool eraseIfRedundant(MachineInstr *MI, SlotIndexes *Indexes); +}; +} // namespace + +char HexagonTfrCleanup::ID = 0; + +namespace llvm { +char &HexagonTfrCleanupID = HexagonTfrCleanup::ID; +} + +bool HexagonTfrCleanup::isIntReg(unsigned Reg, bool &Is32) { + Is32 = Hexagon::IntRegsRegClass.contains(Reg); + return Is32 || Hexagon::DoubleRegsRegClass.contains(Reg); +} + +// Assign given value V32 to the specified the register R32 in the map. Only +// 32-bit registers are valid arguments. +void HexagonTfrCleanup::setReg(unsigned R32, uint32_t V32, ImmediateMap &IMap) { + ImmediateMap::iterator F = IMap.find(R32); + if (F == IMap.end()) + IMap.insert(std::make_pair(R32, V32)); + else + F->second = V32; +} + +// Retrieve a value of the provided register Reg and store it into Val. +// Return "true" if a value was found, "false" otherwise. +bool HexagonTfrCleanup::getReg(unsigned Reg, uint64_t &Val, + ImmediateMap &IMap) { + bool Is32; + if (!isIntReg(Reg, Is32)) + return false; + + if (Is32) { + ImmediateMap::iterator F = IMap.find(Reg); + if (F == IMap.end()) + return false; + Val = F->second; + return true; + } + + // For 64-bit registers, compose the value from the values of its + // subregisters. 
+ unsigned SubL = TRI->getSubReg(Reg, Hexagon::isub_lo); + unsigned SubH = TRI->getSubReg(Reg, Hexagon::isub_hi); + ImmediateMap::iterator FL = IMap.find(SubL), FH = IMap.find(SubH); + if (FL == IMap.end() || FH == IMap.end()) + return false; + Val = (FH->second << 32) | FL->second; + return true; +} + +// Process an instruction and record the relevant information in the imme- +// diate map. +bool HexagonTfrCleanup::updateImmMap(MachineInstr *MI, ImmediateMap &IMap) { + using namespace Hexagon; + + if (MI->isCall()) { + IMap.clear(); + return true; + } + + // If this is an instruction that loads a constant into a register, + // record this information in IMap. + unsigned Opc = MI->getOpcode(); + if (Opc == A2_tfrsi || Opc == A2_tfrpi) { + unsigned DefR = MI->getOperand(0).getReg(); + bool Is32; + if (!isIntReg(DefR, Is32)) + return false; + if (!MI->getOperand(1).isImm()) { + if (!Is32) { + IMap.erase(TRI->getSubReg(DefR, isub_lo)); + IMap.erase(TRI->getSubReg(DefR, isub_hi)); + } else { + IMap.erase(DefR); + } + return false; + } + uint64_t Val = MI->getOperand(1).getImm(); + // If it's a 64-bit register, break it up into subregisters. + if (!Is32) { + uint32_t VH = (Val >> 32), VL = (Val & 0xFFFFFFFFU); + setReg(TRI->getSubReg(DefR, isub_lo), VL, IMap); + setReg(TRI->getSubReg(DefR, isub_hi), VH, IMap); + } else { + setReg(DefR, Val, IMap); + } + return true; + } + + // Not a A2_tfr[sp]i. Invalidate all modified registers in IMap. + for (MachineInstr::mop_iterator Mo = MI->operands_begin(), + E = MI->operands_end(); + Mo != E; ++Mo) { + if (Mo->isRegMask()) { + IMap.clear(); + return true; + } + if (!Mo->isReg() || !Mo->isDef()) + continue; + unsigned R = Mo->getReg(); + for (MCRegAliasIterator AR(R, TRI, true); AR.isValid(); ++AR) { + ImmediateMap::iterator F = IMap.find(*AR); + if (F != IMap.end()) + IMap.erase(F); + } + } + return true; +} + +// Rewrite the instruction as A2_tfrsi/A2_tfrpi, it is a copy of a source that +// has a known constant value. 
+bool HexagonTfrCleanup::rewriteIfImm(MachineInstr *MI, ImmediateMap &IMap, + SlotIndexes *Indexes) { + using namespace Hexagon; + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case A2_tfr: + case A2_tfrp: + case COPY: + break; + default: + return false; + } + + unsigned DstR = MI->getOperand(0).getReg(); + unsigned SrcR = MI->getOperand(1).getReg(); + bool Tmp, Is32; + if (!isIntReg(DstR, Is32) || !isIntReg(SrcR, Tmp)) + return false; + assert(Tmp == Is32 && "Register size mismatch"); + uint64_t Val; + bool Found = getReg(SrcR, Val, IMap); + if (!Found) + return false; + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + int64_t SVal = Is32 ? int32_t(Val) : Val; + auto &HST = B.getParent()->getSubtarget(); + MachineInstr *NewMI; + if (Is32) + NewMI = BuildMI(B, MI, DL, HII->get(A2_tfrsi), DstR).addImm(SVal); + else if (isInt<8>(SVal)) + NewMI = BuildMI(B, MI, DL, HII->get(A2_tfrpi), DstR).addImm(SVal); + else if (isInt<8>(SVal >> 32) && isInt<8>(int32_t(Val & 0xFFFFFFFFLL))) + NewMI = BuildMI(B, MI, DL, HII->get(A2_combineii), DstR) + .addImm(int32_t(SVal >> 32)) + .addImm(int32_t(Val & 0xFFFFFFFFLL)); + else if (HST.isTinyCore()) + // Disable generating CONST64 since it requires load resource. + return false; + else + NewMI = BuildMI(B, MI, DL, HII->get(CONST64), DstR).addImm(Val); + + // Replace the MI to reuse the same slot index + if (Indexes) + Indexes->replaceMachineInstrInMaps(*MI, *NewMI); + MI->eraseFromParent(); + return true; +} + +// Remove the instruction if it is a self-assignment. 
+bool HexagonTfrCleanup::eraseIfRedundant(MachineInstr *MI, + SlotIndexes *Indexes) { + unsigned Opc = MI->getOpcode(); + unsigned DefR, SrcR; + bool IsUndef = false; + switch (Opc) { + case Hexagon::A2_tfr: + // Rd = Rd + DefR = MI->getOperand(0).getReg(); + SrcR = MI->getOperand(1).getReg(); + IsUndef = MI->getOperand(1).isUndef(); + break; + case Hexagon::A2_tfrt: + case Hexagon::A2_tfrf: + // if ([!]Pu) Rd = Rd + DefR = MI->getOperand(0).getReg(); + SrcR = MI->getOperand(2).getReg(); + IsUndef = MI->getOperand(2).isUndef(); + break; + default: + return false; + } + if (DefR != SrcR) + return false; + if (IsUndef) { + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + auto DefI = BuildMI(B, MI, DL, HII->get(TargetOpcode::IMPLICIT_DEF), DefR); + for (auto &Op : MI->operands()) + if (Op.isReg() && Op.isDef() && Op.isImplicit()) + DefI->addOperand(Op); + } + + if (Indexes) + Indexes->removeMachineInstrFromMaps(*MI); + MI->eraseFromParent(); + return true; +} + +bool HexagonTfrCleanup::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + // Map: 32-bit register -> immediate value. + // 64-bit registers are stored through their subregisters. 
+ ImmediateMap IMap; + SlotIndexes *Indexes = this->getAnalysisIfAvailable(); + + auto &HST = MF.getSubtarget(); + HII = HST.getInstrInfo(); + TRI = HST.getRegisterInfo(); + + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { + MachineBasicBlock &B = *I; + MachineBasicBlock::iterator J, F, NextJ; + IMap.clear(); + bool Inserted = false, Erased = false; + for (J = B.begin(), F = B.end(); J != F; J = NextJ) { + NextJ = std::next(J); + MachineInstr *MI = &*J; + bool E = eraseIfRedundant(MI, Indexes); + Erased |= E; + if (E) + continue; + Inserted |= rewriteIfImm(MI, IMap, Indexes); + MachineBasicBlock::iterator NewJ = std::prev(NextJ); + updateImmMap(&*NewJ, IMap); + } + bool BlockC = Inserted | Erased; + Changed |= BlockC; + if (BlockC && Indexes) + Indexes->repairIndexesInRange(&B, B.begin(), B.end()); + } + + return Changed; +} + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// +INITIALIZE_PASS(HexagonTfrCleanup, "tfr-cleanup", "Hexagon TFR Cleanup", false, + false) + +FunctionPass *llvm::createHexagonTfrCleanup() { + return new HexagonTfrCleanup(); +} diff --git a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll index 9d7570b9a929e..d51c9554a022c 100644 --- a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll +++ b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll @@ -160,10 +160,8 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) { ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: p0 = cmp.gtu(r3:2,r5:4) -; CHECK-NEXT: } -; CHECK-NEXT: { -; CHECK-NEXT: r8 = mux(p0,r8,r1) -; CHECK-NEXT: r9 = mux(p0,r9,r1) +; CHECK-NEXT: if (!p0.new) r8 = add(r1,#0) +; CHECK-NEXT: if (!p0.new) r9 = add(r1,#0) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: memd_locked(r0,p0) = r9:8 diff --git a/llvm/test/CodeGen/Hexagon/isel/select-vec.ll 
b/llvm/test/CodeGen/Hexagon/isel/select-vec.ll index 4e54aa4212247..7073c1a2a609a 100644 --- a/llvm/test/CodeGen/Hexagon/isel/select-vec.ll +++ b/llvm/test/CodeGen/Hexagon/isel/select-vec.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-expand-condsets=0 < %s | FileCheck %s define <4 x i8> @f0(<4 x i8> %a0, <4 x i8> %a1, i32 %a2) #0 { ; CHECK-LABEL: f0: diff --git a/llvm/test/CodeGen/Hexagon/reg-by-name.ll b/llvm/test/CodeGen/Hexagon/reg-by-name.ll index 4abea83ba6dd7..cc8807e4f4d6b 100644 --- a/llvm/test/CodeGen/Hexagon/reg-by-name.ll +++ b/llvm/test/CodeGen/Hexagon/reg-by-name.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-tfr-cleanup=0 < %s | FileCheck %s target triple = "hexagon" @@ -647,7 +647,7 @@ entry: ret i32 %1 } -attributes #0 = { noinline nounwind optnone "target-cpu"="hexagonv62" } +attributes #0 = { noinline nounwind optnone "target-cpu"="hexagonv73" } attributes #1 = { nounwind } attributes #2 = { nounwind readonly } diff --git a/llvm/test/CodeGen/Hexagon/tfr-slotindex.ll b/llvm/test/CodeGen/Hexagon/tfr-slotindex.ll new file mode 100644 index 0000000000000..cebba9476c687 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/tfr-slotindex.ll @@ -0,0 +1,26 @@ +; Check that after tfr-cleanup COPY to $r0 is converted to tfrsi instruction +; The tfrsi instruction must use the same slot index as the COPY instruction +; to avoid breaking live interval information.
+; Check that there is no machine verifier crash + +; RUN: llc -stop-after=tfr-cleanup -verify-machineinstrs %s -o - | FileCheck %s + +; CHECK: $r0 = A2_tfrsi 34767 + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +; Function Attrs: nounwind optsize +define dso_local i32 @foo() local_unnamed_addr #0 { +entry: + call void @bar(i32 34767) #1 + call void @baz(i32 34767) #1 + ret i32 15 +} + +declare void @bar(i32) local_unnamed_addr + +declare void @baz(i32) local_unnamed_addr + +attributes #0 = { nounwind optsize "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+v68,-long-calls" } +attributes #1 = { noduplicate nomerge nounwind } From 568babab7e769a7793c28aee4f889898bf0bd8ba Mon Sep 17 00:00:00 2001 From: Pavel Iliin Date: Thu, 22 Feb 2024 23:33:54 +0000 Subject: [PATCH 112/546] [AArch64] Implement __builtin_cpu_supports, compiler-rt tests. (#82378) The patch complements https://github.com/llvm/llvm-project/pull/68919 and adds AArch64 support for builtin `__builtin_cpu_supports("feature1+...+featureN")` which returns true if all CPU features specified in the argument are detected.
Also compiler-rt aarch64 native run tests for features detection mechanism were added and 'cpu_model' check was fixed after its refactor merged https://github.com/llvm/llvm-project/pull/75635 Original RFC was https://reviews.llvm.org/D153153 --- clang/lib/Basic/Targets/AArch64.cpp | 8 ++- clang/lib/Basic/Targets/AArch64.h | 2 +- clang/lib/CodeGen/CGBuiltin.cpp | 16 ++++++ clang/lib/CodeGen/CodeGenFunction.h | 2 +- .../CodeGen/aarch64-cpu-supports-target.c | 52 ++++++++++++++++++ clang/test/CodeGen/aarch64-cpu-supports.c | 54 +++++++++++++++++++ clang/test/Preprocessor/has_builtin_cpuid.c | 5 -- clang/test/Sema/aarch64-cpu-supports.c | 26 +++++++++ clang/test/Sema/builtin-cpu-supports.c | 2 +- .../builtins/Unit/aarch64_cpu_features_test.c | 17 ++++++ .../test/builtins/Unit/cpu_model_test.c | 2 +- 11 files changed, 176 insertions(+), 10 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-cpu-supports-target.c create mode 100644 clang/test/CodeGen/aarch64-cpu-supports.c create mode 100644 clang/test/Sema/aarch64-cpu-supports.c create mode 100644 compiler-rt/test/builtins/Unit/aarch64_cpu_features_test.c diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 68032961451d9..5abb060073c51 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -667,7 +667,13 @@ StringRef AArch64TargetInfo::getFeatureDependencies(StringRef Name) const { } bool AArch64TargetInfo::validateCpuSupports(StringRef FeatureStr) const { - return llvm::AArch64::parseArchExtension(FeatureStr).has_value(); + // CPU features might be separated by '+', extract them and check + llvm::SmallVector Features; + FeatureStr.split(Features, "+"); + for (auto &Feature : Features) + if (!llvm::AArch64::parseArchExtension(Feature.trim()).has_value()) + return false; + return true; } bool AArch64TargetInfo::hasFeature(StringRef Feature) const { diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index 
26ee7fa197825..c1ba156860a12 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -165,7 +165,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { DiagnosticsEngine &Diags) override; ParsedTargetAttr parseTargetAttr(StringRef Str) const override; bool supportsTargetAttributeTune() const override { return true; } - + bool supportsCpuSupports() const override { return true; } bool checkArithmeticFenceSupported() const override { return true; } bool hasBFloat16Type() const override; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index d8b2115f1e5e3..734eb5a035ca4 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10638,6 +10638,9 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, BuiltinID <= clang::AArch64::LastSMEBuiltin) return EmitAArch64SMEBuiltinExpr(BuiltinID, E); + if (BuiltinID == Builtin::BI__builtin_cpu_supports) + return EmitAArch64CpuSupports(E); + unsigned HintID = static_cast(-1); switch (BuiltinID) { default: break; @@ -14025,6 +14028,19 @@ Value *CodeGenFunction::EmitX86CpuInit() { return Builder.CreateCall(Func); } +Value *CodeGenFunction::EmitAArch64CpuSupports(const CallExpr *E) { + const Expr *ArgExpr = E->getArg(0)->IgnoreParenCasts(); + StringRef ArgStr = cast(ArgExpr)->getString(); + llvm::SmallVector Features; + ArgStr.split(Features, "+"); + for (auto &Feature : Features) { + Feature = Feature.trim(); + if (Feature != "default") + Features.push_back(Feature); + } + return EmitAArch64CpuSupports(Features); +} + llvm::Value * CodeGenFunction::EmitAArch64CpuSupports(ArrayRef FeaturesStrs) { uint64_t FeaturesMask = llvm::AArch64::getCpuSupportsMask(FeaturesStrs); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index caa6a327550ba..92ce0edeaf9e9 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -5013,10 +5013,10 @@ class 
CodeGenFunction : public CodeGenTypeCache { llvm::Value *EmitAArch64CpuInit(); llvm::Value * FormAArch64ResolverCondition(const MultiVersionResolverOption &RO); + llvm::Value *EmitAArch64CpuSupports(const CallExpr *E); llvm::Value *EmitAArch64CpuSupports(ArrayRef FeatureStrs); }; - inline DominatingLLVMValue::saved_type DominatingLLVMValue::save(CodeGenFunction &CGF, llvm::Value *value) { if (!needsSaving(value)) return saved_type(value, false); diff --git a/clang/test/CodeGen/aarch64-cpu-supports-target.c b/clang/test/CodeGen/aarch64-cpu-supports-target.c new file mode 100644 index 0000000000000..e023944b24e53 --- /dev/null +++ b/clang/test/CodeGen/aarch64-cpu-supports-target.c @@ -0,0 +1,52 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s + +int check_all_feature() { + if (__builtin_cpu_supports("rng+flagm+flagm2+fp16fml+dotprod+sm4")) + return 1; + else if (__builtin_cpu_supports("rdm+lse+fp+simd+crc+sha1+sha2+sha3")) + return 2; + else if (__builtin_cpu_supports("aes+pmull+fp16+dit+dpb+dpb2+jscvt")) + return 3; + else if (__builtin_cpu_supports("fcma+rcpc+rcpc2+rcpc3+frintts+dgh")) + return 4; + else if (__builtin_cpu_supports("i8mm+bf16+ebf16+rpres+sve+sve-bf16")) + return 5; + else if (__builtin_cpu_supports("sve-ebf16+sve-i8mm+f32mm+f64mm")) + return 6; + else if (__builtin_cpu_supports("sve2+sve2-aes+sve2-pmull128")) + return 7; + else if (__builtin_cpu_supports("sve2-bitperm+sve2-sha3+sve2-sm4")) + return 8; + else if (__builtin_cpu_supports("sme+memtag+memtag2+memtag3+sb")) + return 9; + else if (__builtin_cpu_supports("predres+ssbs+ssbs2+bti+ls64+ls64_v")) + return 10; + else if (__builtin_cpu_supports("ls64_accdata+wfxt+sme-f64f64")) + return 11; + else if (__builtin_cpu_supports("sme-i16i64+sme2")) + return 12; + else + return 0; +} + +// CHECK-LABEL: define dso_local i32 @neon_code() #1 +int __attribute__((target("simd"))) neon_code() { return 1; } + +// CHECK-LABEL: define dso_local i32 @sve_code() #2 +int 
__attribute__((target("sve"))) sve_code() { return 2; } + +// CHECK-LABEL: define dso_local i32 @code() #0 +int code() { return 3; } + +// CHECK-LABEL: define dso_local i32 @test_versions() #0 +int test_versions() { + if (__builtin_cpu_supports("sve")) + return sve_code(); + else if (__builtin_cpu_supports("simd")) + return neon_code(); + else + return code(); +} +// CHECK: attributes #0 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// CHECK: attributes #1 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon" } +// CHECK: attributes #2 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" } diff --git a/clang/test/CodeGen/aarch64-cpu-supports.c b/clang/test/CodeGen/aarch64-cpu-supports.c new file mode 100644 index 0000000000000..872fec6827ef1 --- /dev/null +++ b/clang/test/CodeGen/aarch64-cpu-supports.c @@ -0,0 +1,54 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --version 2 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s + +// CHECK: @__aarch64_cpu_features = external dso_local global { i64 } +// CHECK-LABEL: define dso_local i32 @main +// CHECK-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 70368744177664 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70368744177664 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +// CHECK: if.then: +// CHECK-NEXT: store i32 1, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label [[RETURN:%.*]] +// CHECK: if.end: +// 
CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 9070970929152 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 9070970929152 +// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// CHECK-NEXT: br i1 [[TMP7]], label [[IF_THEN1:%.*]], label [[IF_END2:%.*]] +// CHECK: if.then1: +// CHECK-NEXT: store i32 2, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label [[RETURN]] +// CHECK: if.end2: +// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 166633186212708352 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 166633186212708352 +// CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] +// CHECK-NEXT: br i1 [[TMP11]], label [[IF_THEN3:%.*]], label [[IF_END4:%.*]] +// CHECK: if.then3: +// CHECK-NEXT: store i32 3, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label [[RETURN]] +// CHECK: if.end4: +// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// CHECK-NEXT: br label [[RETURN]] +// CHECK: return: +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[RETVAL]], align 4 +// CHECK-NEXT: ret i32 [[TMP12]] +// +int main(void) { + if (__builtin_cpu_supports("sb")) + return 1; + + if (__builtin_cpu_supports("sve2-pmull128+memtag")) + return 2; + + if (__builtin_cpu_supports("sme2+ls64_v+wfxt")) + return 3; + + return 0; +} diff --git a/clang/test/Preprocessor/has_builtin_cpuid.c b/clang/test/Preprocessor/has_builtin_cpuid.c index 8de6331e62d6e..35ef65ecdd9b9 100644 --- a/clang/test/Preprocessor/has_builtin_cpuid.c +++ b/clang/test/Preprocessor/has_builtin_cpuid.c @@ -13,8 +13,3 @@ # error "ARM/PPC shouldn't have __builtin_cpu_init" # endif #endif -#if __has_builtin(__builtin_cpu_supports) -# ifdef ARM -# error "ARM shouldn't have __builtin_cpu_supports" -# endif -#endif diff --git a/clang/test/Sema/aarch64-cpu-supports.c b/clang/test/Sema/aarch64-cpu-supports.c new file mode 100644 index 0000000000000..24aae9542dbc4 --- /dev/null +++ 
b/clang/test/Sema/aarch64-cpu-supports.c @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -fsyntax-only -triple aarch64-linux-gnu -verify %s + +int test_aarch64_features(void) { + char * ssbs2; + // expected-error@+1 {{expression is not a string literal}} + if (__builtin_cpu_supports(ssbs2)) + return 1; + // expected-error@+1 {{invalid cpu feature string}} + if (__builtin_cpu_supports("")) + return 2; + // expected-error@+1 {{invalid cpu feature string}} + if (__builtin_cpu_supports("pmull128")) + return 3; + // expected-error@+1 {{invalid cpu feature string}} + if (__builtin_cpu_supports("sve2,rpres")) + return 4; + // expected-error@+1 {{invalid cpu feature string}} + if (__builtin_cpu_supports("dgh+sve2-pmull")) + return 5; + // expected-error@+1 {{invalid cpu feature string}} + if (__builtin_cpu_supports("default")) + return 6; + if (__builtin_cpu_supports(" ssbs + bti ")) + return 7; + return 0; +} diff --git a/clang/test/Sema/builtin-cpu-supports.c b/clang/test/Sema/builtin-cpu-supports.c index cc6f1beb5d8a7..733d797f3ff8f 100644 --- a/clang/test/Sema/builtin-cpu-supports.c +++ b/clang/test/Sema/builtin-cpu-supports.c @@ -27,7 +27,7 @@ int main(void) { (void)__builtin_cpu_supports("x86-64-v4"); (void)__builtin_cpu_supports("x86-64-v5"); // expected-error {{invalid cpu feature string for builtin}} #else - if (__builtin_cpu_supports("aes")) // expected-error {{builtin is not supported on this target}} + if (__builtin_cpu_supports("neon")) // expected-error {{invalid cpu feature string for builtin}} a("vsx"); if (__builtin_cpu_is("cortex-x3")) // expected-error {{builtin is not supported on this target}} diff --git a/compiler-rt/test/builtins/Unit/aarch64_cpu_features_test.c b/compiler-rt/test/builtins/Unit/aarch64_cpu_features_test.c new file mode 100644 index 0000000000000..7ca2710ea2756 --- /dev/null +++ b/compiler-rt/test/builtins/Unit/aarch64_cpu_features_test.c @@ -0,0 +1,17 @@ +// REQUIRES: aarch64-target-arch +// REQUIRES: native-run +// RUN: %clang_builtins %s 
%librt -o %t && %run %t +// REQUIRES: librt_has_aarch64 +int main(void) { + if (__builtin_cpu_supports("fp+simd+pmull+sha2+crc")) { + if (__builtin_cpu_supports("fp") && __builtin_cpu_supports("simd") && + __builtin_cpu_supports("pmull") && __builtin_cpu_supports("sha2") && + __builtin_cpu_supports("crc")) { + return 0; + } else { + // Something wrong in feature detection + return 1; + } + } + return 0; +} diff --git a/compiler-rt/test/builtins/Unit/cpu_model_test.c b/compiler-rt/test/builtins/Unit/cpu_model_test.c index a8b736802f67b..6d5f17aa12565 100644 --- a/compiler-rt/test/builtins/Unit/cpu_model_test.c +++ b/compiler-rt/test/builtins/Unit/cpu_model_test.c @@ -1,6 +1,6 @@ // REQUIRES: x86-target-arch // RUN: %clang_builtins %s %librt -o %t && %run %t -// REQUIRES: librt_has_cpu_model +// REQUIRES: librt_has_x86 // FIXME: XFAIL the test because it is expected to return non-zero value. // XFAIL: * From 2b0f5667e2b40729f714459093eb16cc53fc9e9a Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 22 Feb 2024 23:37:49 +0000 Subject: [PATCH 113/546] [gn build] Port aaf2d078b622 --- llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn index 09b5811d7d122..747ca8f9c91d3 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn @@ -86,6 +86,7 @@ static_library("LLVMHexagonCodeGen") { "HexagonTargetMachine.cpp", "HexagonTargetObjectFile.cpp", "HexagonTargetTransformInfo.cpp", + "HexagonTfrCleanup.cpp", "HexagonVExtract.cpp", "HexagonVLIWPacketizer.cpp", "HexagonVectorCombine.cpp", From d57f158a9546746219e3b01398886e104d8a0fdb Mon Sep 17 00:00:00 2001 From: Jerry-Ge Date: Thu, 22 Feb 2024 15:54:42 -0800 Subject: [PATCH 114/546] [Tosa] Add Tosa Sin and Cos operators (#82510) - Add Tosa Sin and Cos operators to the MLIR 
dialect - Define the new Tosa_FloatTensor type --------- Signed-off-by: Jerry Ge --- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 40 +++++++++++++++++++ .../mlir/Dialect/Tosa/IR/TosaTypesBase.td | 2 + mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 2 + mlir/test/Dialect/Tosa/ops.mlir | 14 +++++++ 4 files changed, 58 insertions(+) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 0ee9e713724ea..0ecded75c5d8b 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -989,6 +989,26 @@ def Tosa_ClzOp : Tosa_ElementwiseOp<"clz", [SameOperandsAndResultElementType]> { ); } +//===----------------------------------------------------------------------===// +// Operator: cos +//===----------------------------------------------------------------------===// +def Tosa_CosOp : Tosa_ElementwiseOp<"cos", + [SameOperandsAndResultElementType]> { + let summary = "Elementwise cos op"; + + let description = [{ + Elementwise cosine operation for values given in radians. + }]; + + let arguments = (ins + Tosa_FloatTensor:$input + ); + + let results = (outs + Tosa_FloatTensor:$output + ); +} + //===----------------------------------------------------------------------===// // Operator: exp //===----------------------------------------------------------------------===// @@ -1148,6 +1168,26 @@ def Tosa_RsqrtOp : Tosa_ElementwiseOp<"rsqrt", ); } +//===----------------------------------------------------------------------===// +// Operator: sin +//===----------------------------------------------------------------------===// +def Tosa_SinOp : Tosa_ElementwiseOp<"sin", + [SameOperandsAndResultElementType]> { + let summary = "Elementwise sin op"; + + let description = [{ + Elementwise sine operation for values given in radians. 
+ }]; + + let arguments = (ins + Tosa_FloatTensor:$input + ); + + let results = (outs + Tosa_FloatTensor:$output + ); +} + //===----------------------------------------------------------------------===// // TOSA Spec Section 2.6 // Operator Class: Elementwise unary/binary/ternary operators. diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td index c55ddaafdda76..5a4d6ff464f19 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td @@ -113,6 +113,8 @@ def Tosa_Weight : AnyTypeOf<[Tosa_Int4, Tosa_Int8, def Tosa_Int32Tensor : TensorOf<[Tosa_Int32]>; def Tosa_Int32Or64Tensor : TensorOf<[Tosa_Int32Or64]>; +def Tosa_FloatTensor : TensorOf<[Tosa_Float]>; + // Either ranked or unranked tensor of TOSA supported element types. def Tosa_Tensor : TensorOf<[Tosa_AnyNumber]>; def Tosa_Tensor_Plus_F64 : TensorOf<[Tosa_AnyNumber_Plus_F64]>; diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index 950ee597b891b..62d07859e32f6 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -1330,6 +1330,7 @@ NARY_SHAPE_INFER(tosa::CastOp) NARY_SHAPE_INFER(tosa::CeilOp) NARY_SHAPE_INFER(tosa::ClampOp) NARY_SHAPE_INFER(tosa::ClzOp) +NARY_SHAPE_INFER(tosa::CosOp) NARY_SHAPE_INFER(tosa::DivOp) NARY_SHAPE_INFER(tosa::ExpOp) NARY_SHAPE_INFER(tosa::FloorOp) @@ -1352,6 +1353,7 @@ NARY_SHAPE_INFER(tosa::ReciprocalOp) NARY_SHAPE_INFER(tosa::RescaleOp) NARY_SHAPE_INFER(tosa::ReverseOp) NARY_SHAPE_INFER(tosa::RsqrtOp) +NARY_SHAPE_INFER(tosa::SinOp) NARY_SHAPE_INFER(tosa::SelectOp) NARY_SHAPE_INFER(tosa::SubOp) NARY_SHAPE_INFER(tosa::TanhOp) diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir index 3d68464ebf0b3..01b27072a4b64 100644 --- a/mlir/test/Dialect/Tosa/ops.mlir +++ b/mlir/test/Dialect/Tosa/ops.mlir @@ -375,6 +375,13 @@ func.func @test_clz(%arg0: 
tensor<13x21x3xi32>) -> tensor<13x21x3xi32> { return %0 : tensor<13x21x3xi32> } +// ----- +// CHECK-LABEL: cos +func.func @test_cos(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.cos %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + // ----- // CHECK-LABEL: exp func.func @test_exp(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { @@ -424,6 +431,13 @@ func.func @test_rsqrt(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { return %0 : tensor<13x21x3xf32> } +// ----- +// CHECK-LABEL: sin +func.func @test_sin(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %0 = tosa.sin %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + // ----- // CHECK-LABEL: select func.func @test_select(%arg0: tensor<1x1x1xi1>, %arg1: tensor<13x21x3xf32>, %arg2: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { From f37c6d55c6a0c695418932a55bac6a517be4a53a Mon Sep 17 00:00:00 2001 From: vangthao95 Date: Thu, 22 Feb 2024 15:55:26 -0800 Subject: [PATCH 115/546] [AMDGPU][NFC] Refactor SIInsertWaitcnts zero waitcnt generation (#82575) Move the allZero* waitcnt generation methods into WaitcntGenerator class. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 28 +++++++++++++++---- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 9 ------ 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6ecb1c8bf6e1d..a6184c5e1e048 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -480,6 +480,10 @@ class WaitcntGenerator { // WaitEventType to corresponding counter values in InstCounterType. virtual const unsigned *getWaitEventMask() const = 0; + // Returns a new waitcnt with all counters except VScnt set to 0. If + // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u. 
+ virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0; + virtual ~WaitcntGenerator() = default; }; @@ -516,6 +520,8 @@ class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { return WaitEventMaskForInstPreGFX12; } + + virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; }; class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { @@ -549,6 +555,8 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { return WaitEventMaskForInstGFX12Plus; } + + virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; }; class SIInsertWaitcnts : public MachineFunctionPass { @@ -1304,6 +1312,16 @@ bool WaitcntGeneratorPreGFX12::createNewWaitcnt( return Modified; } +AMDGPU::Waitcnt +WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const { + return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u); +} + +AMDGPU::Waitcnt +WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { + return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0); +} + /// Combine consecutive S_WAIT_*CNT instructions that precede \p It and /// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that /// were added by previous passes. Currently this pass conservatively @@ -1613,8 +1631,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, MI.getOpcode() == AMDGPU::SI_RETURN || MI.getOpcode() == AMDGPU::S_SETPC_B64_return || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { - Wait = Wait.combined( - AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts())); + Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); } // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM // stores. In this case it can be useful to send a message to explicitly @@ -1834,8 +1851,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here. 
if (MI.getOpcode() == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { - Wait = Wait.combined( - AMDGPU::Waitcnt::allZero(ST->hasExtendedWaitCounts(), ST->hasVscnt())); + Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); } // TODO: Remove this work-around, enable the assert for Bug 457939 @@ -1851,7 +1867,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ScoreBrackets.simplifyWaitcnt(Wait); if (ForceEmitZeroWaitcnts) - Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()); + Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false); if (ForceEmitWaitcnt[LOAD_CNT]) Wait.LoadCnt = 0; @@ -2089,7 +2105,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, if (callWaitsOnFunctionReturn(Inst)) { // Act as a wait on everything ScoreBrackets->applyWaitcnt( - AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts())); + WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); ScoreBrackets->setStateOnFunctionEntryOrReturn(); } else { // May need to way wait for anything. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index f35e774452829..b38016a581603 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -870,15 +870,6 @@ struct Waitcnt { : LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt), SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt) {} - static Waitcnt allZero(bool Extended, bool HasStorecnt) { - return Extended ? Waitcnt(0, 0, 0, 0, 0, 0, 0) - : Waitcnt(0, 0, 0, HasStorecnt ? 0 : ~0u); - } - - static Waitcnt allZeroExceptVsCnt(bool Extended) { - return Extended ? 
Waitcnt(0, 0, 0, ~0u, 0, 0, 0) : Waitcnt(0, 0, 0, ~0u); - } - bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); } bool hasWaitExceptStoreCnt() const { From cd1d4d8dd31f527615de26f5b62d687c6b2982a6 Mon Sep 17 00:00:00 2001 From: Diego Caballero Date: Thu, 22 Feb 2024 15:56:13 -0800 Subject: [PATCH 116/546] [mlir][Vector] Add missing CHECK rules to vector-transfer-flatten.mlir (#82698) This test failed after landing #81964 due to a bad merge. I provided a quick fix and this PR is adding the rest of CHECK rules that were not merged properly. --- mlir/test/Dialect/Vector/vector-transfer-flatten.mlir | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir index 2766e782a3fb2..788ae9ac044ed 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir @@ -476,6 +476,7 @@ func.func @regression_non_contiguous_dim_read(%subview : memref<1x3x3x2xf32, str // CHECK: %[[APPLY:.*]] = affine.apply #[[$MAP]]() // CHECK-128B-LABEL: func @regression_non_contiguous_dim_read( +// CHECK-128B: memref.collapse_shape // ----- @@ -491,3 +492,4 @@ func.func @unsupported_non_contiguous_dim_write(%value : vector<2x2xf32>, // CHECK-NOT: memref.collapse_shape // CHECK-128B-LABEL: func @unsupported_non_contiguous_dim_write( +// CHECK-128B-NOT: memref.collapse_shape From ac518c7c9916a6fde1d898b8c53b74298fd00d5f Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 22 Feb 2024 16:17:48 -0800 Subject: [PATCH 117/546] [RISCV] Vector sub (zext, zext) -> sext (sub (zext, zext)) (#82455) This is legal as long as the inner zext retains at least one bit of increase so that the sub overflow case (0 - UINT_MAX) can be represented. Alive2 proof: https://alive2.llvm.org/ce/z/BKeV3W For RVV, restrict this to power of two sizes with the operation type being at least e8 to stick to legal extends. 
We could arguably handle i1 source types with some care if we wanted to. This is likely profitable because it may allow us to perform the sub instruction in a narrow LMUL (equivalently, in fewer DLEN-sized pieces) before widening for the user. We could arguably avoid narrowing below DLEN, but the transform should at worst introduce one extra extend and one extra vsetvli toggle if the source could previously be handled via loads explicit w/EEW. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 25 ++++++++++++++- .../CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll | 32 +++++++++---------- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 6bf02cf8c0f87..5c67aaf678566 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -12887,6 +12887,7 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineSubOfBoolean(N, DAG)) return V; + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // fold (sub 0, (setcc x, 0, setlt)) -> (sra x, xlen - 1) @@ -12894,7 +12895,6 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, isNullConstant(N1.getOperand(1))) { ISD::CondCode CCVal = cast(N1.getOperand(2))->get(); if (CCVal == ISD::SETLT) { - EVT VT = N->getValueType(0); SDLoc DL(N); unsigned ShAmt = N0.getValueSizeInBits() - 1; return DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0), @@ -12902,6 +12902,29 @@ static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, } } + // sub (zext, zext) -> sext (sub (zext, zext)) + // where the sum of the extend widths match, and the inner zexts + // add at least one bit. (For profitability on rvv, we use a + // power of two for both inner and outer extend.) 
+ if (VT.isVector() && Subtarget.getTargetLowering()->isTypeLegal(VT) && + N0.getOpcode() == N1.getOpcode() && N0.getOpcode() == ISD::ZERO_EXTEND && + N0.hasOneUse() && N1.hasOneUse()) { + SDValue Src0 = N0.getOperand(0); + SDValue Src1 = N1.getOperand(0); + EVT SrcVT = Src0.getValueType(); + if (Subtarget.getTargetLowering()->isTypeLegal(SrcVT) && + SrcVT == Src1.getValueType() && SrcVT.getScalarSizeInBits() >= 8 && + SrcVT.getScalarSizeInBits() < VT.getScalarSizeInBits() / 2) { + LLVMContext &C = *DAG.getContext(); + EVT ElemVT = VT.getVectorElementType().getHalfSizedIntegerVT(C); + EVT NarrowVT = EVT::getVectorVT(C, ElemVT, VT.getVectorElementCount()); + Src0 = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Src0), NarrowVT, Src0); + Src1 = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(Src1), NarrowVT, Src1); + return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, + DAG.getNode(ISD::SUB, SDLoc(N), NarrowVT, Src0, Src1)); + } + } + // fold (sub x, (select lhs, rhs, cc, 0, y)) -> // (select lhs, rhs, cc, x, (sub x, y)) return combineSelectAndUse(N, N1, N0, DAG, /*AllOnes*/ false, Subtarget); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll index 574c2652ccfac..a084b5383b403 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll @@ -385,12 +385,12 @@ define <32 x i64> @vwsubu_v32i64(ptr %x, ptr %y) nounwind { define <2 x i32> @vwsubu_v2i32_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v2i32_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vzext.vf2 v10, v8 -; CHECK-NEXT: vzext.vf2 v11, v9 -; CHECK-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-NEXT: vwsubu.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, 
ptr %y @@ -899,12 +899,12 @@ define <2 x i64> @vwsubu_vx_v2i64_i64(ptr %x, ptr %y) nounwind { define <2 x i32> @vwsubu_v2i32_of_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v2i32_of_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vzext.vf2 v10, v8 -; CHECK-NEXT: vzext.vf2 v11, v9 -; CHECK-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-NEXT: vwsubu.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -917,12 +917,12 @@ define <2 x i32> @vwsubu_v2i32_of_v2i8(ptr %x, ptr %y) { define <2 x i64> @vwsubu_v2i64_of_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v2i64_of_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vzext.vf4 v10, v8 -; CHECK-NEXT: vzext.vf4 v11, v9 -; CHECK-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-NEXT: vwsubu.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vsext.vf4 v8, v10 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -935,12 +935,12 @@ define <2 x i64> @vwsubu_v2i64_of_v2i8(ptr %x, ptr %y) { define <2 x i64> @vwsubu_v2i64_of_v2i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v2i64_of_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vzext.vf2 v10, v8 -; CHECK-NEXT: vzext.vf2 v11, v9 -; CHECK-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-NEXT: vwsubu.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y From 
9e84a22e6989494709d30a03ce9b304956fc0ae2 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 18:22:16 -0600 Subject: [PATCH 118/546] [libc] Silence warnings when building GPU tests (#82701) Summary: This patch silences two warnings that may occur during the building of GPU tests. These are not informative or helpful and just make the test output longer. --- libc/cmake/modules/LLVMLibCCompileOptionRules.cmake | 4 ++-- libc/cmake/modules/LLVMLibCTestRules.cmake | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index 33ba5da4f8d57..408e25b3469c0 100644 --- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -158,12 +158,12 @@ function(_get_hermetic_test_compile_options output_var flags) # The GPU build requires overriding the default CMake triple and architecture. if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) list(APPEND compile_options - -nogpulib -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto + -Wno-multi-gpu -nogpulib -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}) elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) list(APPEND compile_options "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false" - --cuda-path=${LIBC_CUDA_ROOT} + -Wno-multi-gpu --cuda-path=${LIBC_CUDA_ROOT} -nogpulib -march=${LIBC_GPU_TARGET_ARCHITECTURE} -fno-use-cxa-atexit) endif() set(${output_var} ${compile_options} PARENT_SCOPE) diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 373cbd6853859..1166c26e4d8a5 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -470,6 +470,7 @@ function(add_integration_test test_name) # We need to use the internal object versions for NVPTX. 
set(internal_suffix ".__internal__") target_link_options(${fq_build_target_name} PRIVATE + "-Wl,--suppress-stack-size-warning" -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static "--cuda-path=${LIBC_CUDA_ROOT}") elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP) @@ -650,6 +651,7 @@ function(add_libc_hermetic_test test_name) # We need to use the internal object versions for NVPTX. set(internal_suffix ".__internal__") target_link_options(${fq_build_target_name} PRIVATE + "-Wl,--suppress-stack-size-warning" -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static "--cuda-path=${LIBC_CUDA_ROOT}") elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP) From 7a5c01dbca3ddfc6dd87775ec90346783c8e2c73 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 22 Feb 2024 18:55:46 -0600 Subject: [PATCH 119/546] [libc] Search the compiler's path for GPU utility tools (#82712) Summary: We need some extra tools for the GPU build. Normally we search for these from the build itself, but in the case of a `LLVM_PROJECTS_BUILD` or some other kind of external build, this directory will not be populated. However, the GPU build already requires that the compiler is an up-to-date clang, which should always have these present next to the binary. Simply add this as a fallback search path. Generally we want it to be the second, because it would pick up someone install and then become stale. --- libc/cmake/modules/prepare_libc_gpu_build.cmake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake index 75beef86760c8..752182f67cc01 100644 --- a/libc/cmake/modules/prepare_libc_gpu_build.cmake +++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake @@ -17,9 +17,10 @@ if(NOT LLVM_LIBC_FULL_BUILD) endif() # Identify the program used to package multiple images into a single binary. 
+get_filename_component(compiler_path ${CMAKE_CXX_COMPILER} DIRECTORY) find_program(LIBC_CLANG_OFFLOAD_PACKAGER NAMES clang-offload-packager NO_DEFAULT_PATH - PATHS ${LLVM_BINARY_DIR}/bin) + PATHS ${LLVM_BINARY_DIR}/bin ${compiler_path}) if(NOT LIBC_CLANG_OFFLOAD_PACKAGER) message(FATAL_ERROR "Cannot find the 'clang-offload-packager' for the GPU " "build") @@ -45,7 +46,7 @@ elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) # Using 'check_cxx_compiler_flag' does not work currently due to the link job. find_program(LIBC_NVPTX_ARCH NAMES nvptx-arch NO_DEFAULT_PATH - PATHS ${LLVM_BINARY_DIR}/bin) + PATHS ${LLVM_BINARY_DIR}/bin ${compiler_path}) if(LIBC_NVPTX_ARCH) execute_process(COMMAND ${LIBC_NVPTX_ARCH} OUTPUT_VARIABLE arch_tool_output From 590c968e7943e51bb00ff75d312435f24d983b2a Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Thu, 22 Feb 2024 17:27:28 -0800 Subject: [PATCH 120/546] [NVPTX] fixup support for unaligned parameters and returns (#82562) Add support for unaligned parameters and return values. These must be loaded and stored one byte at a time and then bit manipulation is used to assemble the correct final result. 
--- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 30 ++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 257 +++++++++++- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 4 + llvm/test/CodeGen/NVPTX/param-load-store.ll | 93 ++++- .../NVPTX/unaligned-param-load-store.ll | 385 ++++++++++++++++++ 5 files changed, 730 insertions(+), 39 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index ded2f2584014d..3ff8994602e16 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -2135,6 +2135,21 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16, NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64, NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64); + if (Opcode == NVPTX::StoreRetvalI8) { + // Fine tune the opcode depending on the size of the operand. + // This helps to avoid creating redundant COPY instructions in + // InstrEmitter::AddRegisterOperand(). + switch (Ops[0].getSimpleValueType().SimpleTy) { + default: + break; + case MVT::i32: + Opcode = NVPTX::StoreRetvalI8TruncI32; + break; + case MVT::i64: + Opcode = NVPTX::StoreRetvalI8TruncI64; + break; + } + } break; case 2: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, @@ -2211,6 +2226,21 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { NVPTX::StoreParamI8, NVPTX::StoreParamI16, NVPTX::StoreParamI32, NVPTX::StoreParamI64, NVPTX::StoreParamF32, NVPTX::StoreParamF64); + if (Opcode == NVPTX::StoreParamI8) { + // Fine tune the opcode depending on the size of the operand. + // This helps to avoid creating redundant COPY instructions in + // InstrEmitter::AddRegisterOperand(). 
+ switch (Ops[0].getSimpleValueType().SimpleTy) { + default: + break; + case MVT::i32: + Opcode = NVPTX::StoreParamI8TruncI32; + break; + case MVT::i64: + Opcode = NVPTX::StoreParamI8TruncI64; + break; + } + } break; case 2: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 7d2fe78d14229..66a101036f913 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -47,6 +47,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" @@ -59,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -1529,6 +1531,105 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty, return DL.getABITypeAlign(Ty); } +static bool adjustElementType(EVT &ElementType) { + switch (ElementType.getSimpleVT().SimpleTy) { + default: + return false; + case MVT::f16: + case MVT::bf16: + ElementType = MVT::i16; + return true; + case MVT::f32: + case MVT::v2f16: + case MVT::v2bf16: + ElementType = MVT::i32; + return true; + case MVT::f64: + ElementType = MVT::i64; + return true; + } +} + +// Use byte-store when the param address of the argument value is unaligned. +// This may happen when the return value is a field of a packed structure. +// +// This is called in LowerCall() when passing the param values. 
+static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, + uint64_t Offset, EVT ElementType, + SDValue StVal, SDValue &InGlue, + unsigned ArgID, const SDLoc &dl) { + // Bit logic only works on integer types + if (adjustElementType(ElementType)) + StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal); + + // Store each byte + SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue); + for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { + // Shift the byte to the last byte position + SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal, + DAG.getConstant(i * 8, dl, MVT::i32)); + SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32), + DAG.getConstant(Offset + i, dl, MVT::i32), + ShiftVal, InGlue}; + // Trunc store only the last byte by using + // st.param.b8 + // The register type can be larger than b8. + Chain = DAG.getMemIntrinsicNode( + NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8, + MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); + InGlue = Chain.getValue(1); + } + return Chain; +} + +// Use byte-load when the param adress of the returned value is unaligned. +// This may happen when the returned value is a field of a packed structure. +static SDValue +LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, + EVT ElementType, SDValue &InGlue, + SmallVectorImpl &TempProxyRegOps, + const SDLoc &dl) { + // Bit logic only works on integer types + EVT MergedType = ElementType; + adjustElementType(MergedType); + + // Load each byte and construct the whole value. 
Initial value to 0 + SDValue RetVal = DAG.getConstant(0, dl, MergedType); + // LoadParamMemI8 loads into i16 register only + SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue); + for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { + SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32), + DAG.getConstant(Offset + i, dl, MVT::i32), + InGlue}; + // This will be selected to LoadParamMemI8 + SDValue LdVal = + DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands, + MVT::i8, MachinePointerInfo(), Align(1)); + SDValue TmpLdVal = LdVal.getValue(0); + Chain = LdVal.getValue(1); + InGlue = LdVal.getValue(2); + + TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl, + TmpLdVal.getSimpleValueType(), TmpLdVal); + TempProxyRegOps.push_back(TmpLdVal); + + SDValue CMask = DAG.getConstant(255, dl, MergedType); + SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32); + // Need to extend the i16 register to the whole width. + TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal); + // Mask off the high bits. Leave only the lower 8bits. + // Do this because we are using loadparam.b8. + TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask); + // Shift and merge + TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift); + RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal); + } + if (ElementType != MergedType) + RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); + + return RetVal; +} + SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { @@ -1680,17 +1781,6 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (NeedAlign) PartAlign = commonAlignment(ArgAlign, CurOffset); - // New store. - if (VectorInfo[j] & PVF_FIRST) { - assert(StoreOperands.empty() && "Unfinished preceding store."); - StoreOperands.push_back(Chain); - StoreOperands.push_back( - DAG.getConstant(IsVAArg ? 
FirstVAArg : ParamCount, dl, MVT::i32)); - StoreOperands.push_back(DAG.getConstant( - IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), - dl, MVT::i32)); - } - SDValue StVal = OutVals[OIdx]; MVT PromotedVT; @@ -1723,6 +1813,35 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); } + // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a + // scalar store. In such cases, fall back to byte stores. + if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() && + PartAlign.value() < + DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) { + assert(StoreOperands.empty() && "Unfinished preceeding store."); + Chain = LowerUnalignedStoreParam( + DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT, + StVal, InGlue, ParamCount, dl); + + // LowerUnalignedStoreParam took care of inserting the necessary nodes + // into the SDAG, so just move on to the next element. + if (!IsByVal) + ++OIdx; + continue; + } + + // New store. + if (VectorInfo[j] & PVF_FIRST) { + assert(StoreOperands.empty() && "Unfinished preceding store."); + StoreOperands.push_back(Chain); + StoreOperands.push_back( + DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32)); + + StoreOperands.push_back(DAG.getConstant( + IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), + dl, MVT::i32)); + } + // Record the value to store. StoreOperands.push_back(StVal); @@ -1923,6 +2042,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVector ProxyRegOps; SmallVector, 16> ProxyRegTruncates; + // An item of the vector is filled if the element does not need a ProxyReg + // operation on it and should be added to InVals as is. ProxyRegOps and + // ProxyRegTruncates contain empty/none items at the same index. 
+ SmallVector RetElts; + // A temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()` + // to use the values of `LoadParam`s and to be replaced later then + // `CALLSEQ_END` is added. + SmallVector TempProxyRegOps; // Generate loads from param memory/moves from registers for result if (Ins.size() > 0) { @@ -1966,6 +2093,22 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EltType = MVT::i16; } + // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a + // scalar load. In such cases, fall back to byte loads. + if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() && + EltAlign < DL.getABITypeAlign( + TheLoadType.getTypeForEVT(*DAG.getContext()))) { + assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); + SDValue Ret = LowerUnalignedLoadRetParam( + DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl); + ProxyRegOps.push_back(SDValue()); + ProxyRegTruncates.push_back(std::optional()); + RetElts.resize(i); + RetElts.push_back(Ret); + + continue; + } + // Record index of the very first element of the vector. if (VectorInfo[i] & PVF_FIRST) { assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); @@ -2028,6 +2171,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // will not get lost. Otherwise, during libcalls expansion, the nodes can become // dangling. 
for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { + if (i < RetElts.size() && RetElts[i]) { + InVals.push_back(RetElts[i]); + continue; + } + SDValue Ret = DAG.getNode( NVPTXISD::ProxyReg, dl, DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), @@ -2044,6 +2192,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, InVals.push_back(Ret); } + for (SDValue &T : TempProxyRegOps) { + SDValue Repl = DAG.getNode( + NVPTXISD::ProxyReg, dl, + DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue), + {Chain, T.getOperand(0), InGlue}); + DAG.ReplaceAllUsesWith(T, Repl); + DAG.RemoveDeadNode(T.getNode()); + + Chain = Repl.getValue(1); + InGlue = Repl.getValue(2); + } + // set isTailCall to false for now, until we figure out how to express // tail call optimization in PTX isTailCall = false; @@ -3045,9 +3205,20 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); Value *srcValue = Constant::getNullValue(PointerType::get( EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); + + const MaybeAlign PartAlign = [&]() -> MaybeAlign { + if (aggregateIsPacked) + return Align(1); + if (NumElts != 1) + return std::nullopt; + Align PartAlign = + (Offsets[parti] == 0 && PAL.getParamAlignment(i)) + ? PAL.getParamAlignment(i).value() + : DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext())); + return commonAlignment(PartAlign, Offsets[parti]); + }(); SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr, - MachinePointerInfo(srcValue), - MaybeAlign(aggregateIsPacked ? 1 : 0), + MachinePointerInfo(srcValue), PartAlign, MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); if (P.getNode()) @@ -3113,6 +3284,33 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( return Chain; } +// Use byte-store when the param adress of the return value is unaligned. +// This may happen when the return value is a field of a packed structure. 
+static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, + uint64_t Offset, EVT ElementType, + SDValue RetVal, const SDLoc &dl) { + // Bit logic only works on integer types + if (adjustElementType(ElementType)) + RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); + + // Store each byte + for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { + // Shift the byte to the last byte position + SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal, + DAG.getConstant(i * 8, dl, MVT::i32)); + SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32), + ShiftVal}; + // Trunc store only the last byte by using + // st.param.b8 + // The register type can be larger than b8. + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, + DAG.getVTList(MVT::Other), StoreOperands, + MVT::i8, MachinePointerInfo(), std::nullopt, + MachineMemOperand::MOStore); + } + return Chain; +} + SDValue NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -3162,13 +3360,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SmallVector StoreOperands; for (unsigned i = 0, e = VTs.size(); i != e; ++i) { - // New load/store. Record chain and offset operands. - if (VectorInfo[i] & PVF_FIRST) { - assert(StoreOperands.empty() && "Orphaned operand list."); - StoreOperands.push_back(Chain); - StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); - } - SDValue OutVal = OutVals[i]; SDValue RetVal = PromotedOutVals[i]; @@ -3182,6 +3373,32 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); } + // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned + // for a scalar store. In such cases, fall back to byte stores. + if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) { + EVT ElementType = ExtendIntegerRetVal ? 
MVT::i32 : VTs[i]; + Align ElementTypeAlign = + DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext())); + Align ElementAlign = + commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]); + if (ElementAlign < ElementTypeAlign) { + assert(StoreOperands.empty() && "Orphaned operand list."); + Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType, + RetVal, dl); + + // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes + // into the graph, so just move on to the next element. + continue; + } + } + + // New load/store. Record chain and offset operands. + if (VectorInfo[i] & PVF_FIRST) { + assert(StoreOperands.empty() && "Orphaned operand list."); + StoreOperands.push_back(Chain); + StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); + } + // Record the value to return. StoreOperands.push_back(RetVal); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 55a1955a7f497..b3517ce066b87 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2738,6 +2738,8 @@ def StoreParamI32 : StoreParamInst; def StoreParamI16 : StoreParamInst; def StoreParamI8 : StoreParamInst; +def StoreParamI8TruncI32 : StoreParamInst; +def StoreParamI8TruncI64 : StoreParamInst; def StoreParamV2I64 : StoreParamV2Inst; def StoreParamV2I32 : StoreParamV2Inst; def StoreParamV2I16 : StoreParamV2Inst; @@ -2757,6 +2759,8 @@ def StoreRetvalI64 : StoreRetvalInst; def StoreRetvalI32 : StoreRetvalInst; def StoreRetvalI16 : StoreRetvalInst; def StoreRetvalI8 : StoreRetvalInst; +def StoreRetvalI8TruncI32 : StoreRetvalInst; +def StoreRetvalI8TruncI64 : StoreRetvalInst; def StoreRetvalV2I64 : StoreRetvalV2Inst; def StoreRetvalV2I32 : StoreRetvalV2Inst; def StoreRetvalV2I16 : StoreRetvalV2Inst; diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll index c14dc88431d31..a29d4e1875cd7 100644 --- 
a/llvm/test/CodeGen/NVPTX/param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll @@ -1135,31 +1135,86 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) { ; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2]; ; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1]; ; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0]; -; --- TODO -; --- Unaligned parameter store/ return value load is broken in both nvcc -; --- and llvm and needs to be fixed. ; CHECK: .param .align 1 .b8 param0[25]; -; CHECK-DAG: st.param.b32 [param0+0], -; CHECK-DAG: st.param.b32 [param0+4], +; CHECK-DAG: st.param.b8 [param0+0], +; CHECK-DAG: st.param.b8 [param0+1], +; CHECK-DAG: st.param.b8 [param0+2], +; CHECK-DAG: st.param.b8 [param0+3], +; CHECK-DAG: st.param.b8 [param0+4], +; CHECK-DAG: st.param.b8 [param0+5], +; CHECK-DAG: st.param.b8 [param0+6], +; CHECK-DAG: st.param.b8 [param0+7], ; CHECK-DAG: st.param.b8 [param0+8], -; CHECK-DAG: st.param.b32 [param0+9], -; CHECK-DAG: st.param.b32 [param0+13], -; CHECK-DAG: st.param.b64 [param0+17], +; CHECK-DAG: st.param.b8 [param0+9], +; CHECK-DAG: st.param.b8 [param0+10], +; CHECK-DAG: st.param.b8 [param0+11], +; CHECK-DAG: st.param.b8 [param0+12], +; CHECK-DAG: st.param.b8 [param0+13], +; CHECK-DAG: st.param.b8 [param0+14], +; CHECK-DAG: st.param.b8 [param0+15], +; CHECK-DAG: st.param.b8 [param0+16], +; CHECK-DAG: st.param.b8 [param0+17], +; CHECK-DAG: st.param.b8 [param0+18], +; CHECK-DAG: st.param.b8 [param0+19], +; CHECK-DAG: st.param.b8 [param0+20], +; CHECK-DAG: st.param.b8 [param0+21], +; CHECK-DAG: st.param.b8 [param0+22], +; CHECK-DAG: st.param.b8 [param0+23], +; CHECK-DAG: st.param.b8 [param0+24], ; CHECK: .param .align 1 .b8 retval0[25]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_s_i1i32x4p, -; CHECK-DAG: ld.param.b32 %r41, [retval0+0]; -; CHECK-DAG: ld.param.b32 %r42, [retval0+4]; -; CHECK-DAG: ld.param.b8 %rs2, [retval0+8]; -; CHECK-DAG: ld.param.b32 %r43, [retval0+9]; -; CHECK-DAG: 
ld.param.b32 %r44, [retval0+13]; -; CHECK-DAG: ld.param.b64 %rd23, [retval0+17]; -; CHECK-DAG: st.param.b32 [func_retval0+0], -; CHECK-DAG: st.param.b32 [func_retval0+4], +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+0]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+1]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+2]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+3]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+4]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+5]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+6]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+7]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+8]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+9]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+10]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+11]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+12]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+13]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+14]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+15]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+16]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+17]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+18]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+19]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+20]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+21]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+22]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+23]; +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+24]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b8 [func_retval0+0], +; CHECK-DAG: st.param.b8 [func_retval0+1], +; CHECK-DAG: st.param.b8 [func_retval0+2], +; CHECK-DAG: st.param.b8 [func_retval0+3], +; CHECK-DAG: st.param.b8 [func_retval0+4], +; CHECK-DAG: st.param.b8 [func_retval0+5], +; CHECK-DAG: st.param.b8 [func_retval0+6], +; CHECK-DAG: st.param.b8 [func_retval0+7], ; CHECK-DAG: st.param.b8 [func_retval0+8], -; CHECK-DAG: st.param.b32 [func_retval0+9], 
-; CHECK-DAG: st.param.b32 [func_retval0+13], -; CHECK-DAG: st.param.b64 [func_retval0+17], +; CHECK-DAG: st.param.b8 [func_retval0+9], +; CHECK-DAG: st.param.b8 [func_retval0+10], +; CHECK-DAG: st.param.b8 [func_retval0+11], +; CHECK-DAG: st.param.b8 [func_retval0+12], +; CHECK-DAG: st.param.b8 [func_retval0+13], +; CHECK-DAG: st.param.b8 [func_retval0+14], +; CHECK-DAG: st.param.b8 [func_retval0+15], +; CHECK-DAG: st.param.b8 [func_retval0+16], +; CHECK-DAG: st.param.b8 [func_retval0+17], +; CHECK-DAG: st.param.b8 [func_retval0+18], +; CHECK-DAG: st.param.b8 [func_retval0+19], +; CHECK-DAG: st.param.b8 [func_retval0+20], +; CHECK-DAG: st.param.b8 [func_retval0+21], +; CHECK-DAG: st.param.b8 [func_retval0+22], +; CHECK-DAG: st.param.b8 [func_retval0+23], +; CHECK-DAG: st.param.b8 [func_retval0+24], define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) { %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a); diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll new file mode 100644 index 0000000000000..40a3e9e945a23 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll @@ -0,0 +1,385 @@ +; Verifies correctness of load/store of parameters and return values. +; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | %ptxas-verify %} + +%s_i8i16p = type { <{ i16, i8, i16 }>, i64 } +%s_i8i32p = type { <{ i32, i8, i32 }>, i64 } +%s_i8i64p = type { <{ i64, i8, i64 }>, i64 } +%s_i8f16p = type { <{ half, i8, half }>, i64 } +%s_i8f16x2p = type { <{ <2 x half>, i8, <2 x half> }>, i64 } +%s_i8f32p = type { <{ float, i8, float }>, i64 } +%s_i8f64p = type { <{ double, i8, double }>, i64 } + +; -- All loads/stores from parameters aligned by one must be done one +; byte at a time. 
+; -- Notes: +; -- There are two fields of interest in the packed part of the struct, one +; with a proper offset and one without. The former should be loaded or +; stored as a whole, and the latter by bytes. +; -- Only loading and storing the said fields are checked in the following +; series of tests so that they are more concise. + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[16]) +; CHECK-LABEL: test_s_i8i16p( +; CHECK: .param .align 8 .b8 test_s_i8i16p_param_0[16] +; CHECK-DAG: ld.param.u16 [[P0:%rs[0-9]+]], [test_s_i8i16p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%rs[0-9]+]], [test_s_i8i16p_param_0+3]; +; CHECK-DAG: ld.param.u8 [[P2_1:%rs[0-9]+]], [test_s_i8i16p_param_0+4]; +; CHECK-DAG: shl.b16 [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: or.b16 [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK: { // callseq +; CHECK: .param .align 8 .b8 param0[16]; +; CHECK-DAG: st.param.b16 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+3], [[P2_1_or]]; +; CHECK-DAG: st.param.b8 [param0+4], [[P2_1]]; +; CHECK: .param .align 8 .b8 retval0[16]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8i16p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.b16 [[R0:%rs[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+3]; +; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+4]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b16 [func_retval0+0], [[R0]]; +; CHECK-DAG: shl.b16 [[R2_1_shl:%rs[0-9]+]], [[R2_1]], 8; +; CHECK-DAG: and.b16 [[R2_0_and:%rs[0-9]+]], [[R2_0]], 255; +; CHECK-DAG: or.b16 [[R2:%rs[0-9]+]], [[R2_0_and]], [[R2_1_shl]]; +; CHECK-DAG: st.param.b8 [func_retval0+3], [[R2]]; +; CHECK-DAG: and.b16 [[R2_1_and:%rs[0-9]+]], [[R2_1]], 255; +; CHECK-DAG: st.param.b8 [func_retval0+4], [[R2_1_and]]; +; CHECK: ret; + +define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) { + %r = tail call %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) + ret %s_i8i16p %r +} + +; CHECK: .visible .func (.param 
.align 8 .b8 func_retval0[24]) +; CHECK-LABEL: test_s_i8i32p( +; CHECK: .param .align 8 .b8 test_s_i8i32p_param_0[24] +; CHECK-DAG: ld.param.u32 [[P0:%r[0-9]+]], [test_s_i8i32p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], [test_s_i8i32p_param_0+5]; +; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8i32p_param_0+6]; +; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8i32p_param_0+7]; +; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8i32p_param_0+8]; +; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16; +; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24; +; CHECK-DAG: or.b32 [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK-DAG: or.b32 [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]]; +; CHECK-DAG: or.b32 [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]]; +; CHECK-DAG: shr.u32 [[P2_1_shr:%r[0-9]+]], [[P2]], 8; +; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16; +; CHECK: { // callseq +; CHECK-DAG: .param .align 8 .b8 param0[24]; +; CHECK-DAG: st.param.b32 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+5], [[P2]]; +; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]]; +; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]]; +; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]]; +; CHECK: .param .align 8 .b8 retval0[24]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8i32p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5]; +; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6]; +; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7]; +; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b32 [func_retval0+0], [[R0]]; +; CHECK-DAG: st.param.b8 [func_retval0+5], +; CHECK-DAG: st.param.b8 [func_retval0+6], +; CHECK-DAG: st.param.b8 [func_retval0+7], +; CHECK-DAG: st.param.b8 [func_retval0+8], +; 
CHECK: ret; + +define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) { + %r = tail call %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) + ret %s_i8i32p %r +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[32]) +; CHECK-LABEL: test_s_i8i64p( +; CHECK: .param .align 8 .b8 test_s_i8i64p_param_0[32] +; CHECK-DAG: ld.param.u64 [[P0:%rd[0-9]+]], [test_s_i8i64p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%rd[0-9]+]], [test_s_i8i64p_param_0+9]; +; CHECK-DAG: ld.param.u8 [[P2_1:%rd[0-9]+]], [test_s_i8i64p_param_0+10]; +; CHECK-DAG: ld.param.u8 [[P2_2:%rd[0-9]+]], [test_s_i8i64p_param_0+11]; +; CHECK-DAG: ld.param.u8 [[P2_3:%rd[0-9]+]], [test_s_i8i64p_param_0+12]; +; CHECK-DAG: ld.param.u8 [[P2_4:%rd[0-9]+]], [test_s_i8i64p_param_0+13]; +; CHECK-DAG: ld.param.u8 [[P2_5:%rd[0-9]+]], [test_s_i8i64p_param_0+14]; +; CHECK-DAG: ld.param.u8 [[P2_6:%rd[0-9]+]], [test_s_i8i64p_param_0+15]; +; CHECK-DAG: ld.param.u8 [[P2_7:%rd[0-9]+]], [test_s_i8i64p_param_0+16]; +; CHECK-DAG: shl.b64 [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: shl.b64 [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16; +; CHECK-DAG: shl.b64 [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24; +; CHECK-DAG: or.b64 [[P2_or_0:%rd[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK-DAG: or.b64 [[P2_or_1:%rd[0-9]+]], [[P2_3_shl]], [[P2_2_shl]]; +; CHECK-DAG: or.b64 [[P2_or_2:%rd[0-9]+]], [[P2_or_1]], [[P2_or_0]]; +; CHECK-DAG: shl.b64 [[P2_5_shl:%rd[0-9]+]], [[P2_5]], 8; +; CHECK-DAG: shl.b64 [[P2_6_shl:%rd[0-9]+]], [[P2_6]], 16; +; CHECK-DAG: shl.b64 [[P2_7_shl:%rd[0-9]+]], [[P2_7]], 24; +; CHECK-DAG: or.b64 [[P2_or_3:%rd[0-9]+]], [[P2_5_shl]], [[P2_4]]; +; CHECK-DAG: or.b64 [[P2_or_4:%rd[0-9]+]], [[P2_7_shl]], [[P2_6_shl]]; +; CHECK-DAG: or.b64 [[P2_or_5:%rd[0-9]+]], [[P2_or_4]], [[P2_or_3]]; +; CHECK-DAG: shl.b64 [[P2_or_shl:%rd[0-9]+]], [[P2_or_5]], 32; +; CHECK-DAG: or.b64 [[P2:%rd[0-9]+]], [[P2_or_shl]], [[P2_or_2]]; +; CHECK-DAG: shr.u64 [[P2_shr_1:%rd[0-9]+]], [[P2]], 8; +; CHECK-DAG: shr.u64 [[P2_shr_2:%rd[0-9]+]], [[P2]], 16; +; CHECK-DAG: 
shr.u64 [[P2_shr_3:%rd[0-9]+]], [[P2]], 24; +; CHECK-DAG: bfe.u64 [[P2_bfe_4:%rd[0-9]+]], [[P2_or_5]], 8, 24; +; CHECK-DAG: bfe.u64 [[P2_bfe_5:%rd[0-9]+]], [[P2_or_5]], 16, 16; +; CHECK-DAG: bfe.u64 [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8; +; CHECK: { // callseq +; CHECK: .param .align 8 .b8 param0[32]; +; CHECK-DAG: st.param.b64 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+9], [[P2]]; +; CHECK-DAG: st.param.b8 [param0+10], [[P2_shr_1]]; +; CHECK-DAG: st.param.b8 [param0+11], [[P2_shr_2]]; +; CHECK-DAG: st.param.b8 [param0+12], [[P2_shr_3]]; +; CHECK-DAG: st.param.b8 [param0+13], [[P2_or_5]]; +; CHECK-DAG: st.param.b8 [param0+14], [[P2_bfe_4]]; +; CHECK-DAG: st.param.b8 [param0+15], [[P2_bfe_5]]; +; CHECK-DAG: st.param.b8 [param0+16], [[P2_bfe_6]]; +; CHECK: .param .align 8 .b8 retval0[32]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8i64p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.b64 [[R0:%rd[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+9]; +; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+10]; +; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+11]; +; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+12]; +; CHECK-DAG: ld.param.b8 [[R2_4:%rs[0-9]+]], [retval0+13]; +; CHECK-DAG: ld.param.b8 [[R2_5:%rs[0-9]+]], [retval0+14]; +; CHECK-DAG: ld.param.b8 [[R2_6:%rs[0-9]+]], [retval0+15]; +; CHECK-DAG: ld.param.b8 [[R2_7:%rs[0-9]+]], [retval0+16]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b64 [func_retval0+0], [[R0]]; +; CHECK-DAG: st.param.b8 [func_retval0+9], +; CHECK-DAG: st.param.b8 [func_retval0+10], +; CHECK-DAG: st.param.b8 [func_retval0+11], +; CHECK-DAG: st.param.b8 [func_retval0+12], +; CHECK-DAG: st.param.b8 [func_retval0+13], +; CHECK-DAG: st.param.b8 [func_retval0+14], +; CHECK-DAG: st.param.b8 [func_retval0+15], +; CHECK-DAG: st.param.b8 [func_retval0+16], +; CHECK: ret; + +define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) { + %r = tail call 
%s_i8i64p @test_s_i8i64p(%s_i8i64p %a) + ret %s_i8i64p %r +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[16]) +; CHECK-LABEL: test_s_i8f16p( +; CHECK: .param .align 8 .b8 test_s_i8f16p_param_0[16] +; CHECK-DAG: ld.param.b16 [[P0:%rs[0-9]+]], [test_s_i8f16p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%rs[0-9]+]], [test_s_i8f16p_param_0+3]; +; CHECK-DAG: ld.param.u8 [[P2_1:%rs[0-9]+]], [test_s_i8f16p_param_0+4]; +; CHECK-DAG: shl.b16 [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: or.b16 [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK: { // callseq +; CHECK: .param .align 8 .b8 param0[16]; +; CHECK-DAG: st.param.b16 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+3], [[P2_1_or]]; +; CHECK-DAG: st.param.b8 [param0+4], [[P2_1]]; +; CHECK: .param .align 8 .b8 retval0[16]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8f16p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.b16 [[R0:%rs[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2I_0:%rs[0-9]+]], [retval0+3]; +; CHECK-DAG: ld.param.b8 [[R2I_1:%rs[0-9]+]], [retval0+4]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b16 [func_retval0+0], [[R0]]; +; CHECK-DAG: shl.b16 [[R2I_1_shl:%rs[0-9]+]], [[R2I_1]], 8; +; CHECK-DAG: and.b16 [[R2I_0_and:%rs[0-9]+]], [[R2I_0]], 255; +; CHECK-DAG: or.b16 [[R2I:%rs[0-9]+]], [[R2I_0_and]], [[R2I_1_shl]]; +; CHECK-DAG: st.param.b8 [func_retval0+3], [[R2I]]; +; CHECK-DAG: and.b16 [[R2I_1_and:%rs[0-9]+]], [[R2I_1]], 255; +; CHECK-DAG: st.param.b8 [func_retval0+4], [[R2I_1_and]]; +; CHECK: ret; + +define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) { + %r = tail call %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) + ret %s_i8f16p %r +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[24]) +; CHECK-LABEL: test_s_i8f16x2p( +; CHECK: .param .align 8 .b8 test_s_i8f16x2p_param_0[24] +; CHECK-DAG: ld.param.b32 [[P0:%r[0-9]+]], [test_s_i8f16x2p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], 
[test_s_i8f16x2p_param_0+5]; +; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8f16x2p_param_0+6]; +; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8f16x2p_param_0+7]; +; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8f16x2p_param_0+8]; +; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16; +; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24; +; CHECK-DAG: or.b32 [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK-DAG: or.b32 [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]]; +; CHECK-DAG: or.b32 [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]]; +; CHECK-DAG: shr.u32 [[P2_1_shr:%r[0-9]+]], [[P2]], 8; +; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16; +; CHECK: { // callseq +; CHECK-DAG: .param .align 8 .b8 param0[24]; +; CHECK-DAG: st.param.b32 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+5], [[P2]]; +; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]]; +; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]]; +; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]]; +; CHECK: .param .align 8 .b8 retval0[24]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8f16x2p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5]; +; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6]; +; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7]; +; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8]; +; CHECK: } // callseq +; CHECK-DAG: st.param.b32 [func_retval0+0], [[R0]]; +; CHECK-DAG: st.param.b8 [func_retval0+5], +; CHECK-DAG: st.param.b8 [func_retval0+6], +; CHECK-DAG: st.param.b8 [func_retval0+7], +; CHECK-DAG: st.param.b8 [func_retval0+8], +; CHECK: ret; + +define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) { + %r = tail call %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) + ret %s_i8f16x2p %r +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[24]) +; 
CHECK-LABEL: test_s_i8f32p( +; CHECK: .param .align 8 .b8 test_s_i8f32p_param_0[24] +; CHECK-DAG: ld.param.f32 [[P0:%f[0-9]+]], [test_s_i8f32p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], [test_s_i8f32p_param_0+5]; +; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8f32p_param_0+6]; +; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8f32p_param_0+7]; +; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8f32p_param_0+8]; +; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16; +; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24; +; CHECK-DAG: or.b32 [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK-DAG: or.b32 [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]]; +; CHECK-DAG: or.b32 [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]]; +; CHECK-DAG: shr.u32 [[P2_1_shr:%r[0-9]+]], [[P2]], 8; +; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16; +; CHECK: { // callseq +; CHECK-DAG: .param .align 8 .b8 param0[24]; +; CHECK-DAG: st.param.f32 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+5], [[P2]]; +; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]]; +; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]]; +; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]]; +; CHECK: .param .align 8 .b8 retval0[24]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8f32p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.f32 [[R0:%f[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5]; +; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6]; +; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7]; +; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8]; +; CHECK: } // callseq +; CHECK-DAG: st.param.f32 [func_retval0+0], [[R0]]; +; CHECK-DAG: st.param.b8 [func_retval0+5], +; CHECK-DAG: st.param.b8 [func_retval0+6], +; CHECK-DAG: st.param.b8 [func_retval0+7], +; CHECK-DAG: st.param.b8 [func_retval0+8], +; CHECK: ret; + +define %s_i8f32p 
@test_s_i8f32p(%s_i8f32p %a) { + %r = tail call %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) + ret %s_i8f32p %r +} + +; CHECK: .visible .func (.param .align 8 .b8 func_retval0[32]) +; CHECK-LABEL: test_s_i8f64p( +; CHECK: .param .align 8 .b8 test_s_i8f64p_param_0[32] +; CHECK-DAG: ld.param.f64 [[P0:%fd[0-9]+]], [test_s_i8f64p_param_0]; +; CHECK-DAG: ld.param.u8 [[P2_0:%rd[0-9]+]], [test_s_i8f64p_param_0+9]; +; CHECK-DAG: ld.param.u8 [[P2_1:%rd[0-9]+]], [test_s_i8f64p_param_0+10]; +; CHECK-DAG: ld.param.u8 [[P2_2:%rd[0-9]+]], [test_s_i8f64p_param_0+11]; +; CHECK-DAG: ld.param.u8 [[P2_3:%rd[0-9]+]], [test_s_i8f64p_param_0+12]; +; CHECK-DAG: ld.param.u8 [[P2_4:%rd[0-9]+]], [test_s_i8f64p_param_0+13]; +; CHECK-DAG: ld.param.u8 [[P2_5:%rd[0-9]+]], [test_s_i8f64p_param_0+14]; +; CHECK-DAG: ld.param.u8 [[P2_6:%rd[0-9]+]], [test_s_i8f64p_param_0+15]; +; CHECK-DAG: ld.param.u8 [[P2_7:%rd[0-9]+]], [test_s_i8f64p_param_0+16]; +; CHECK-DAG: shl.b64 [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8; +; CHECK-DAG: shl.b64 [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16; +; CHECK-DAG: shl.b64 [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24; +; CHECK-DAG: or.b64 [[P2_or_0:%rd[0-9]+]], [[P2_1_shl]], [[P2_0]]; +; CHECK-DAG: or.b64 [[P2_or_1:%rd[0-9]+]], [[P2_3_shl]], [[P2_2_shl]]; +; CHECK-DAG: or.b64 [[P2_or_2:%rd[0-9]+]], [[P2_or_1]], [[P2_or_0]]; +; CHECK-DAG: shl.b64 [[P2_5_shl:%rd[0-9]+]], [[P2_5]], 8; +; CHECK-DAG: shl.b64 [[P2_6_shl:%rd[0-9]+]], [[P2_6]], 16; +; CHECK-DAG: shl.b64 [[P2_7_shl:%rd[0-9]+]], [[P2_7]], 24; +; CHECK-DAG: or.b64 [[P2_or_3:%rd[0-9]+]], [[P2_5_shl]], [[P2_4]]; +; CHECK-DAG: or.b64 [[P2_or_4:%rd[0-9]+]], [[P2_7_shl]], [[P2_6_shl]]; +; CHECK-DAG: or.b64 [[P2_or_5:%rd[0-9]+]], [[P2_or_4]], [[P2_or_3]]; +; CHECK-DAG: shl.b64 [[P2_or_shl:%rd[0-9]+]], [[P2_or_5]], 32; +; CHECK-DAG: or.b64 [[P2:%rd[0-9]+]], [[P2_or_shl]], [[P2_or_2]]; +; CHECK-DAG: shr.u64 [[P2_shr_1:%rd[0-9]+]], [[P2]], 8; +; CHECK-DAG: shr.u64 [[P2_shr_2:%rd[0-9]+]], [[P2]], 16; +; CHECK-DAG: shr.u64 [[P2_shr_3:%rd[0-9]+]], 
[[P2]], 24; +; CHECK-DAG: bfe.u64 [[P2_bfe_4:%rd[0-9]+]], [[P2_or_5]], 8, 24; +; CHECK-DAG: bfe.u64 [[P2_bfe_5:%rd[0-9]+]], [[P2_or_5]], 16, 16; +; CHECK-DAG: bfe.u64 [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8; +; CHECK: { // callseq +; CHECK: .param .align 8 .b8 param0[32]; +; CHECK-DAG: st.param.f64 [param0+0], [[P0]]; +; CHECK-DAG: st.param.b8 [param0+9], [[P2]]; +; CHECK-DAG: st.param.b8 [param0+10], [[P2_shr_1]]; +; CHECK-DAG: st.param.b8 [param0+11], [[P2_shr_2]]; +; CHECK-DAG: st.param.b8 [param0+12], [[P2_shr_3]]; +; CHECK-DAG: st.param.b8 [param0+13], [[P2_or_5]]; +; CHECK-DAG: st.param.b8 [param0+14], [[P2_bfe_4]]; +; CHECK-DAG: st.param.b8 [param0+15], [[P2_bfe_5]]; +; CHECK-DAG: st.param.b8 [param0+16], [[P2_bfe_6]]; +; CHECK: .param .align 8 .b8 retval0[32]; +; CHECK-NEXT: call.uni (retval0), +; CHECK-NEXT: test_s_i8f64p, +; CHECK-NEXT: ( +; CHECK-NEXT: param0 +; CHECK-NEXT: ); +; CHECK-DAG: ld.param.f64 [[R0:%fd[0-9]+]], [retval0+0]; +; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+9]; +; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+10]; +; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+11]; +; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+12]; +; CHECK-DAG: ld.param.b8 [[R2_4:%rs[0-9]+]], [retval0+13]; +; CHECK-DAG: ld.param.b8 [[R2_5:%rs[0-9]+]], [retval0+14]; +; CHECK-DAG: ld.param.b8 [[R2_6:%rs[0-9]+]], [retval0+15]; +; CHECK-DAG: ld.param.b8 [[R2_7:%rs[0-9]+]], [retval0+16]; +; CHECK: } // callseq +; CHECK-DAG: st.param.f64 [func_retval0+0], [[R0]]; +; CHECK-DAG: st.param.b8 [func_retval0+9], +; CHECK-DAG: st.param.b8 [func_retval0+10], +; CHECK-DAG: st.param.b8 [func_retval0+11], +; CHECK-DAG: st.param.b8 [func_retval0+12], +; CHECK-DAG: st.param.b8 [func_retval0+13], +; CHECK-DAG: st.param.b8 [func_retval0+14], +; CHECK-DAG: st.param.b8 [func_retval0+15], +; CHECK-DAG: st.param.b8 [func_retval0+16], +; CHECK: ret; + +define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) { + %r = tail call %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) + 
ret %s_i8f64p %r +} From 19e518d2623c0e87a87ebf30405e74448bd1ee70 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Fri, 23 Feb 2024 09:36:32 +0800 Subject: [PATCH 121/546] [Clang][Parser] Have the depth of the abbreviated generic lambdas inside a requires clause differ from the surrounding generic lambda (#80656) A one-line fix, again : ) This fixes https://github.com/llvm/llvm-project/issues/78524 and the similar example at https://github.com/llvm/llvm-project/issues/78524#issuecomment-1899886951. We previously increased the template depth by one after parsing the attaching requires-clause on a lambda expression. This led to a problem where the 'auto' parameters of nested abbreviated generic lambdas, inside of a requires-expression, had the same depth as the template parameters of the surrounding lambda. Consequently, during the concept-checking stage, we ended up substituting these parameters with the wrong template arguments because they were at different levels. --- clang/docs/ReleaseNotes.rst | 4 +++ clang/lib/Parse/ParseExprCXX.cpp | 11 +++++++- .../Parser/cxx-concepts-requires-clause.cpp | 27 +++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 19cc5b7756431..529dd783ab738 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -277,6 +277,10 @@ Bug Fixes to C++ Support (`#82258 `_) - Correctly immediate-escalate lambda conversion functions. (`#82258 `_) +- Fixed an issue where template parameters of a nested abbreviated generic lambda within + a requires-clause lie at the same depth as those of the surrounding lambda. This, + in turn, results in the wrong template argument substitution during constraint checking. 
+ (`#78524 `_) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp index fd262ff31e661..22ee60af4616d 100644 --- a/clang/lib/Parse/ParseExprCXX.cpp +++ b/clang/lib/Parse/ParseExprCXX.cpp @@ -1385,6 +1385,16 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( Diag(RAngleLoc, diag::err_lambda_template_parameter_list_empty); } else { + // We increase the template depth before recursing into a requires-clause. + // + // This depth is used for setting up a LambdaScopeInfo (in + // Sema::RecordParsingTemplateParameterDepth), which is used later when + // inventing template parameters in InventTemplateParameter. + // + // This way, abbreviated generic lambdas could have different template + // depths, avoiding substitution into the wrong template parameters during + // constraint satisfaction check. + ++CurTemplateDepthTracker; ExprResult RequiresClause; if (TryConsumeToken(tok::kw_requires)) { RequiresClause = @@ -1396,7 +1406,6 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( Actions.ActOnLambdaExplicitTemplateParameterList( Intro, LAngleLoc, TemplateParams, RAngleLoc, RequiresClause); - ++CurTemplateDepthTracker; } } diff --git a/clang/test/Parser/cxx-concepts-requires-clause.cpp b/clang/test/Parser/cxx-concepts-requires-clause.cpp index 1ec1eefa12865..5b5bc9ea978bf 100644 --- a/clang/test/Parser/cxx-concepts-requires-clause.cpp +++ b/clang/test/Parser/cxx-concepts-requires-clause.cpp @@ -168,3 +168,30 @@ auto lambda4 = [] requires(sizeof(char) == 1){}; // expected-error {{expected bo #if __cplusplus <= 202002L // expected-warning@-2{{lambda without a parameter clause is a C++23 extension}} #endif + +namespace GH78524 { + +template T Foo; + +template auto C(Foo); + +template struct D { + decltype(T()(C)) Type; +}; + +template D G(T, U) { return {}; } + +struct E {}; + +void F() { + G([] +// ~~~~~~~~~~ T: Depth: 0, Index: 0 + requires requires { [](auto...) 
{}; }(T) +// ~~~~ auto: Depth: 1, Index: 0 + { return T(); }, + E{}); +} + +int a = [] requires requires { [](auto){}; } { return 0; }(); + +} // namespace GH78524 From 5ccf54640a2bdb6f36f65c574feb312da7f75243 Mon Sep 17 00:00:00 2001 From: huaatian <142874007+huaatian@users.noreply.github.com> Date: Fri, 23 Feb 2024 10:25:02 +0800 Subject: [PATCH 122/546] [llvm][cmake] Performing expensive checks requires enabling assert. (#80821) LLVM will intercept errors using assert() when LLVM_ENABLE_EXPENSIVE_CHECKS is ON. So an explicit check is added. --------- Co-authored-by: Hua Tian --- llvm/cmake/modules/HandleLLVMOptions.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 4257083e53ad4..40316b11ceed9 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -128,6 +128,11 @@ if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn" OR endif() if(LLVM_ENABLE_EXPENSIVE_CHECKS) + # When LLVM_ENABLE_EXPENSIVE_CHECKS is ON, LLVM will intercept errors + # using assert(). An explicit check is performed here. + if (NOT LLVM_ENABLE_ASSERTIONS) + message(FATAL_ERROR "LLVM_ENABLE_EXPENSIVE_CHECKS requires LLVM_ENABLE_ASSERTIONS \"ON\".") + endif() add_compile_definitions(EXPENSIVE_CHECKS) # In some libstdc++ versions, std::min_element is not constexpr when From 2e5af56b05c2d39ab2c829bf4c13190523b67ddd Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Fri, 23 Feb 2024 10:59:46 +0800 Subject: [PATCH 123/546] [C++20] [Modules] Allow to compile a pcm with and without -fPIC separately We can compile a module unit in 2 phase compilation: ``` clang++ -std=c++20 a.cppm --precompile -o a.pcm clang++ -std=c++20 a.pcm -c -o a.o ``` And it is a general requirement that we need to compile a translation unit with and without -fPIC for static and shared libraries. 
But for C++20 modules with 2 phase compilation, it may be a waste of time to compile them 2 times completely. It may be fine to generate one BMI and compile it with and without -fPIC separately. e.g., ``` clang++ -std=c++20 a.cppm --precompile -o a.pcm clang++ -std=c++20 a.pcm -c -o a.o clang++ -std=c++20 a.pcm -c -fPIC -o a-PIC.o ``` Then we can save the time to parse a.cppm repeatedly. --- clang/include/clang/Frontend/ASTUnit.h | 23 +++++++++++-------- .../include/clang/Frontend/CompilerInstance.h | 3 +++ .../clang/Frontend/CompilerInvocation.h | 1 + clang/lib/Frontend/ASTUnit.cpp | 15 ++++++++++-- clang/lib/Frontend/FrontendAction.cpp | 2 +- clang/test/Modules/compile-pcm-with-pic.cppm | 21 +++++++++++++++++ clang/tools/c-index-test/core_main.cpp | 2 +- clang/tools/libclang/CIndex.cpp | 2 +- 8 files changed, 54 insertions(+), 15 deletions(-) create mode 100644 clang/test/Modules/compile-pcm-with-pic.cppm diff --git a/clang/include/clang/Frontend/ASTUnit.h b/clang/include/clang/Frontend/ASTUnit.h index 6af712afdcb6d..a2c1b25dd2247 100644 --- a/clang/include/clang/Frontend/ASTUnit.h +++ b/clang/include/clang/Frontend/ASTUnit.h @@ -691,16 +691,19 @@ class ASTUnit { /// lifetime is expected to extend past that of the returned ASTUnit. /// /// \returns - The initialized ASTUnit or null if the AST failed to load. 
- static std::unique_ptr LoadFromASTFile( - const std::string &Filename, const PCHContainerReader &PCHContainerRdr, - WhatToLoad ToLoad, IntrusiveRefCntPtr Diags, - const FileSystemOptions &FileSystemOpts, - std::shared_ptr HSOpts, bool OnlyLocalDecls = false, - CaptureDiagsKind CaptureDiagnostics = CaptureDiagsKind::None, - bool AllowASTWithCompilerErrors = false, - bool UserFilesAreVolatile = false, - IntrusiveRefCntPtr VFS = - llvm::vfs::getRealFileSystem()); + static std::unique_ptr + LoadFromASTFile(const std::string &Filename, + const PCHContainerReader &PCHContainerRdr, WhatToLoad ToLoad, + IntrusiveRefCntPtr Diags, + const FileSystemOptions &FileSystemOpts, + std::shared_ptr HSOpts, + std::shared_ptr LangOpts = nullptr, + bool OnlyLocalDecls = false, + CaptureDiagsKind CaptureDiagnostics = CaptureDiagsKind::None, + bool AllowASTWithCompilerErrors = false, + bool UserFilesAreVolatile = false, + IntrusiveRefCntPtr VFS = + llvm::vfs::getRealFileSystem()); private: /// Helper function for \c LoadFromCompilerInvocation() and diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h index ac2f940769fbe..b97d0c636806a 100644 --- a/clang/include/clang/Frontend/CompilerInstance.h +++ b/clang/include/clang/Frontend/CompilerInstance.h @@ -311,6 +311,9 @@ class CompilerInstance : public ModuleLoader { LangOptions &getLangOpts() { return Invocation->getLangOpts(); } const LangOptions &getLangOpts() const { return Invocation->getLangOpts(); } + std::shared_ptr getLangOptsPtr() const { + return Invocation->getLangOptsPtr(); + } PreprocessorOptions &getPreprocessorOpts() { return Invocation->getPreprocessorOpts(); diff --git a/clang/include/clang/Frontend/CompilerInvocation.h b/clang/include/clang/Frontend/CompilerInvocation.h index c6528779bde7b..8fc51e6ec03b6 100644 --- a/clang/include/clang/Frontend/CompilerInvocation.h +++ b/clang/include/clang/Frontend/CompilerInvocation.h @@ -271,6 +271,7 @@ class 
CompilerInvocation : public CompilerInvocationBase { std::shared_ptr getPreprocessorOptsPtr() { return PPOpts; } + std::shared_ptr getLangOptsPtr() { return LangOpts; } /// @} /// Create a compiler invocation from a list of input options. diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp index f09a01b5dd4af..3610a08831e79 100644 --- a/clang/lib/Frontend/ASTUnit.cpp +++ b/clang/lib/Frontend/ASTUnit.cpp @@ -540,7 +540,17 @@ class ASTInfoCollector : public ASTReaderListener { if (InitializedLanguage) return false; + // FIXME: We did similar things in ReadHeaderSearchOptions too. But such + // style is not scaling. Probably we need to invite some mechanism to + // handle such patterns generally. + auto PICLevel = LangOpt.PICLevel; + auto PIE = LangOpt.PIE; + LangOpt = LangOpts; + + LangOpt.PICLevel = PICLevel; + LangOpt.PIE = PIE; + InitializedLanguage = true; updated(); @@ -790,7 +800,8 @@ std::unique_ptr ASTUnit::LoadFromASTFile( const std::string &Filename, const PCHContainerReader &PCHContainerRdr, WhatToLoad ToLoad, IntrusiveRefCntPtr Diags, const FileSystemOptions &FileSystemOpts, - std::shared_ptr HSOpts, bool OnlyLocalDecls, + std::shared_ptr HSOpts, + std::shared_ptr LangOpts, bool OnlyLocalDecls, CaptureDiagsKind CaptureDiagnostics, bool AllowASTWithCompilerErrors, bool UserFilesAreVolatile, IntrusiveRefCntPtr VFS) { std::unique_ptr AST(new ASTUnit(true)); @@ -804,7 +815,7 @@ std::unique_ptr ASTUnit::LoadFromASTFile( ConfigureDiags(Diags, *AST, CaptureDiagnostics); - AST->LangOpts = std::make_shared(); + AST->LangOpts = LangOpts ? 
LangOpts : std::make_shared(); AST->OnlyLocalDecls = OnlyLocalDecls; AST->CaptureDiagnostics = CaptureDiagnostics; AST->Diagnostics = Diags; diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index eff785b99a09a..b9fd9b8897b7e 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -689,7 +689,7 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI, std::unique_ptr AST = ASTUnit::LoadFromASTFile( std::string(InputFile), CI.getPCHContainerReader(), ASTUnit::LoadEverything, Diags, CI.getFileSystemOpts(), - CI.getHeaderSearchOptsPtr()); + CI.getHeaderSearchOptsPtr(), CI.getLangOptsPtr()); if (!AST) return false; diff --git a/clang/test/Modules/compile-pcm-with-pic.cppm b/clang/test/Modules/compile-pcm-with-pic.cppm new file mode 100644 index 0000000000000..3d818dde0cd2f --- /dev/null +++ b/clang/test/Modules/compile-pcm-with-pic.cppm @@ -0,0 +1,21 @@ +// REQUIRES: x86-registered-target + +// RUN: rm -rf %t +// RUN: mkdir %t + +// RUN: %clang_cc1 -std=c++20 %s -pic-level 2 -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -std=c++20 %s -pic-level 2 -fmodule-output=%t/m.pcm -emit-llvm -o - \ +// RUN: | FileCheck %s +// +// RUN: %clang_cc1 -std=c++20 %s -emit-module-interface -o %t/m.pcm +// RUN: %clang_cc1 -std=c++20 %t/m.pcm -pic-level 2 -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -std=c++20 %t/m.pcm -emit-llvm -o - | FileCheck %s --check-prefix=NOPIC + +export module m; +export int x; +export int func() { + return x; +} + +// CHECK: ![[METADATA_NUM:[0-9]+]] = !{{{.*}}, !"PIC Level", i32 2} +// NOPIC-NOT: ![[METADATA_NUM:[0-9]+]] = !{{{.*}}, !"PIC Level", i32 2} diff --git a/clang/tools/c-index-test/core_main.cpp b/clang/tools/c-index-test/core_main.cpp index 56bf7c91acc7b..c552466c9a188 100644 --- a/clang/tools/c-index-test/core_main.cpp +++ b/clang/tools/c-index-test/core_main.cpp @@ -276,7 +276,7 @@ static bool printSourceSymbolsFromModule(StringRef modulePath, 
CompilerInstance::createDiagnostics(new DiagnosticOptions()); std::unique_ptr AU = ASTUnit::LoadFromASTFile( std::string(modulePath), *pchRdr, ASTUnit::LoadASTOnly, Diags, - FileSystemOpts, HSOpts, + FileSystemOpts, HSOpts, /*LangOpts=*/nullptr, /*OnlyLocalDecls=*/true, CaptureDiagsKind::None, /*AllowASTWithCompilerErrors=*/true, /*UserFilesAreVolatile=*/false); diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 4ded92cbe9aea..418b152ba4a13 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -3890,7 +3890,7 @@ enum CXErrorCode clang_createTranslationUnit2(CXIndex CIdx, std::unique_ptr AU = ASTUnit::LoadFromASTFile( ast_filename, CXXIdx->getPCHContainerOperations()->getRawReader(), ASTUnit::LoadEverything, Diags, FileSystemOpts, HSOpts, - CXXIdx->getOnlyLocalDecls(), CaptureDiagsKind::All, + /*LangOpts=*/nullptr, CXXIdx->getOnlyLocalDecls(), CaptureDiagsKind::All, /*AllowASTWithCompilerErrors=*/true, /*UserFilesAreVolatile=*/true); *out_TU = MakeCXTranslationUnit(CXXIdx, std::move(AU)); From 6e6bf9f81756ba6655b4eea8dc45469a47f89b39 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Thu, 22 Feb 2024 19:17:15 -0800 Subject: [PATCH 124/546] [WebAssembly] Disable multivalue emission temporarily (#82714) We plan to enable multivalue in the features section soon (#80923) for other reasons, such as the feature having been standardized for many years and other features being developed (e.g. EH) depending on it. This is separate from enabling Clang experimental multivalue ABI (`-Xclang -target-abi -Xclang experimental-mv`), but it turned out we generate some multivalue code in the backend as well if it is enabled in the features section. 
Given that our backend multivalue generation still has not been much used nor tested, and enabling the feature in the features section can be a separate decision from how much multivalue (including none) we decide to generate for now, I'd like to temporarily disable the actual generation of multivalue in our backend. To do that, this adds an internal flag `-wasm-emit-multivalue` that defaults to false. All our existing multivalue tests can use this to test multivalue code. This flag can be removed later when we are confident the multivalue generation is well tested. --- .../WebAssembly/WebAssemblyISelLowering.cpp | 7 +++-- .../WebAssemblyMachineFunctionInfo.cpp | 5 +++- .../WebAssemblyRuntimeLibcallSignatures.cpp | 26 ++++++++++--------- .../WebAssembly/WebAssemblyTargetMachine.cpp | 9 +++++++ .../lower-em-ehsjlj-multi-return.ll | 4 +-- .../multivalue-dont-move-def-past-use.mir | 2 +- .../WebAssembly/multivalue-stackify.ll | 2 +- llvm/test/CodeGen/WebAssembly/multivalue.ll | 10 ++++--- .../CodeGen/WebAssembly/multivalue_libcall.ll | 2 +- 9 files changed, 43 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 7c47790d1e351..36f067956e63a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -43,6 +43,8 @@ using namespace llvm; #define DEBUG_TYPE "wasm-lower" +extern cl::opt WasmEmitMultiValue; + WebAssemblyTargetLowering::WebAssemblyTargetLowering( const TargetMachine &TM, const WebAssemblySubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -1288,7 +1290,7 @@ bool WebAssemblyTargetLowering::CanLowerReturn( const SmallVectorImpl &Outs, LLVMContext & /*Context*/) const { // WebAssembly can only handle returning tuples with multivalue enabled - return Subtarget->hasMultivalue() || Outs.size() <= 1; + return (Subtarget->hasMultivalue() && WasmEmitMultiValue) || Outs.size() <= 
1; } SDValue WebAssemblyTargetLowering::LowerReturn( @@ -1296,7 +1298,8 @@ SDValue WebAssemblyTargetLowering::LowerReturn( const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { - assert((Subtarget->hasMultivalue() || Outs.size() <= 1) && + assert(((Subtarget->hasMultivalue() && WasmEmitMultiValue) || + Outs.size() <= 1) && "MVP WebAssembly can only return up to one value"); if (!callingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp index 1e959111a4dbc..b969b8370a3e5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp @@ -22,6 +22,8 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; +extern cl::opt WasmEmitMultiValue; + WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() = default; // anchor. MachineFunctionInfo *WebAssemblyFunctionInfo::clone( @@ -71,7 +73,8 @@ void llvm::computeSignatureVTs(const FunctionType *Ty, MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits()); if (Results.size() > 1 && - !TM.getSubtarget(ContextFunc).hasMultivalue()) { + (!TM.getSubtarget(ContextFunc).hasMultivalue() || + !WasmEmitMultiValue)) { // WebAssembly can't lower returns of multiple values without demoting to // sret unless multivalue is enabled (see // WebAssemblyTargetLowering::CanLowerReturn). 
So replace multiple return diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 3e2e029695ab6..2a84c90c89602 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -24,6 +24,8 @@ using namespace llvm; +extern cl::opt WasmEmitMultiValue; + namespace { enum RuntimeLibcallSignature { @@ -694,7 +696,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(PtrTy); break; case i64_i64_func_f32: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -703,7 +705,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::F32); break; case i64_i64_func_f64: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -712,7 +714,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::F64); break; case i16_i16_func_i16_i16: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I32); Rets.push_back(wasm::ValType::I32); } else { @@ -722,7 +724,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I32); break; case i32_i32_func_i32_i32: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I32); Rets.push_back(wasm::ValType::I32); } else { @@ -732,7 +734,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I32); break; case i64_i64_func_i64_i64: - if 
(Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -742,7 +744,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i64_i64: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -754,7 +756,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i64_i64_iPTR: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -767,7 +769,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(PtrTy); break; case i64_i64_i64_i64_func_i64_i64_i64_i64: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); @@ -781,7 +783,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i32: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -851,7 +853,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i64_i64_i64_i64: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -865,7 +867,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget 
&Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i32: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -874,7 +876,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I32); break; case i64_i64_func_i64: - if (Subtarget.hasMultivalue()) { + if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 42043a7b8680a..3120b6b67906e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -54,6 +54,15 @@ static cl::opt WasmDisableFixIrreducibleControlFlowPass( " irreducible control flow optimization pass"), cl::init(false)); +// A temporary option to control emission of multivalue until multivalue +// implementation is stable enough. We currently don't emit multivalue by +// default even if the feature section allows it. +// TODO Stabilize multivalue and delete this option +cl::opt + WasmEmitMultiValue("wasm-emit-multivalue", cl::Hidden, + cl::desc("WebAssembly: Emit multivalue in the backend"), + cl::init(false)); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() { // Register the target. 
RegisterTargetMachine X( diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll index 4f33439db770d..daf46c6eef025 100644 --- a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll +++ b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll @@ -1,5 +1,5 @@ -; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -mattr=+multivalue 2>&1 | FileCheck %s --check-prefix=EH -; RUN: not --crash llc < %s -enable-emscripten-sjlj -mattr=+multivalue 2>&1 | FileCheck %s --check-prefix=SJLJ +; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -mattr=+multivalue -wasm-emit-multivalue 2>&1 | FileCheck %s --check-prefix=EH +; RUN: not --crash llc < %s -enable-emscripten-sjlj -mattr=+multivalue 2>&1 -wasm-emit-multivalue | FileCheck %s --check-prefix=SJLJ ; Currently multivalue returning functions are not supported in Emscripten EH / ; SjLj. Make sure they error out. diff --git a/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir b/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir index 4b4661b144667..4fadbd5f07e6d 100644 --- a/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir +++ b/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -run-pass=wasm-reg-stackify -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -wasm-emit-multivalue -run-pass=wasm-reg-stackify -verify-machineinstrs %s -o - | FileCheck %s --- | target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" diff --git a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll index 52a8c686824d3..f4f93ac2f30ce 100644 --- 
a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll +++ b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Test functions have been generated by multivalue-stackify.py. -; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue -wasm-emit-multivalue | FileCheck %s ; Test that the multivalue stackification works diff --git a/llvm/test/CodeGen/WebAssembly/multivalue.ll b/llvm/test/CodeGen/WebAssembly/multivalue.ll index 675009c8f3e54..846691e5ff0cd 100644 --- a/llvm/test/CodeGen/WebAssembly/multivalue.ll +++ b/llvm/test/CodeGen/WebAssembly/multivalue.ll @@ -1,7 +1,8 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+reference-types,+multivalue,+tail-call | FileCheck --check-prefix REF %s -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s --check-prefix REGS -; RUN: llc < %s --filetype=obj -mcpu=mvp -mattr=+multivalue,+tail-call | obj2yaml | FileCheck %s --check-prefix OBJ +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | FileCheck %s +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+reference-types,+multivalue,+tail-call -wasm-emit-multivalue | FileCheck --check-prefix REF %s +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | FileCheck %s --check-prefix REGS +; RUN: llc < %s --filetype=obj -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | 
obj2yaml | FileCheck %s --check-prefix OBJ +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s --check-prefix NO-MULTIVALUE ; Test that the multivalue calls, returns, function types, and block ; types work as expected. @@ -19,6 +20,7 @@ declare void @use_i64(i64) ; CHECK-NEXT: i32.const 42{{$}} ; CHECK-NEXT: i64.const 42{{$}} ; CHECK-NEXT: end_function{{$}} +; NO-MULTIVALUE-NOT: .functype pair_const () -> (i32, i64) define %pair @pair_const() { ret %pair { i32 42, i64 42 } } diff --git a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll index 47c5ae7b457dd..7bf37b59353ad 100644 --- a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll +++ b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc < %s -verify-machineinstrs -mcpu=mvp -mattr=+multivalue | FileCheck %s --check-prefix=MULTIVALUE +; RUN: llc < %s -verify-machineinstrs -mcpu=mvp -mattr=+multivalue -wasm-emit-multivalue | FileCheck %s --check-prefix=MULTIVALUE ; RUN: llc < %s -verify-machineinstrs -mcpu=mvp | FileCheck %s --check-prefix=NO_MULTIVALUE ; Test libcall signatures when multivalue is enabled and disabled From ca09e08239008759f92f4aff39c7640da3e1bfa9 Mon Sep 17 00:00:00 2001 From: Derek Schuff Date: Thu, 22 Feb 2024 19:41:15 -0800 Subject: [PATCH 125/546] [Symbolizer][WebAssembly] Use wasm-specific getSymbolSize (#82083) getSymbolSize was recently added to WasmObjectFile and has correct sizes for most symbol types. This makes llvm-symbolizer correctly symbolize addresses in the middle of the symbol. When reworking the test I also noticed that the DWARF info seems to be wrong for the first instruction in each function. I noted that in the test comments but didn't attempt to fix here. 
--- llvm/lib/Object/SymbolSize.cpp | 7 +++ llvm/test/tools/llvm-symbolizer/wasm-basic.s | 53 ++++++++++++++++---- 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Object/SymbolSize.cpp b/llvm/lib/Object/SymbolSize.cpp index cb20feffb710b..635cd8373afbf 100644 --- a/llvm/lib/Object/SymbolSize.cpp +++ b/llvm/lib/Object/SymbolSize.cpp @@ -65,6 +65,13 @@ llvm::object::computeSymbolSizes(const ObjectFile &O) { return Ret; } + if (const auto *E = dyn_cast(&O)) { + for (SymbolRef Sym : E->symbols()) { + Ret.push_back({Sym, E->getSymbolSize(Sym)}); + } + return Ret; + } + // Collect sorted symbol addresses. Include dummy addresses for the end // of each section. std::vector Addresses; diff --git a/llvm/test/tools/llvm-symbolizer/wasm-basic.s b/llvm/test/tools/llvm-symbolizer/wasm-basic.s index cc189abcfca80..1f425e5259316 100644 --- a/llvm/test/tools/llvm-symbolizer/wasm-basic.s +++ b/llvm/test/tools/llvm-symbolizer/wasm-basic.s @@ -1,24 +1,59 @@ # REQUIRES: webassembly-registered-target # RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj %s -o %t.o -g +# RUN: llvm-symbolizer --basenames --output-style=GNU -e %t.o 1 2 3 4 5 6 7 8 9 10 11 12 13 | FileCheck %s foo: .functype foo () -> () nop + return end_function bar: .functype bar (i32) -> (i32) local.get 0 + nop return end_function -# RUN: llvm-symbolizer -e %t.o 3 4 7 8 | FileCheck %s -## Byte 1 is the function length and 2 is the locals declaration. -## Currently no line corresponds to them. -## TODO: create a loc for .functype? -## Test 2 functions to ensure wasm's function-sections system works. -# CHECK: wasm-basic.s:6:0 -# CHECK: wasm-basic.s:7:0 -# CHECK: wasm-basic.s:11:0 -# CHECK: wasm-basic.s:11:0 +## Symbols start from (including) the function length and should cover all the +## way to the next symbol start. +## TODO: create a loc for .functype? It could go with the local declarations. 
+ +## Byte 1 is the function length, has no loc but the symbol table considers it +## the start of the function +# CHECK: foo +# CHECK-NEXT: ??:0 +## Byte 2 is the local declaration, but for some reason DWARF is marking it as line 7. +## TODO: figure out why. +# CHECK-NEXT: foo +# CHECK-NEXT: wasm-basic.s:7 +## Byte 3 is actually the nop, line 7 +# CHECK-NEXT: foo +# CHECK-NEXT: wasm-basic.s:7 +## Byte 4 is the return, line 8 +# CHECK-NEXT: foo +# CHECK-NEXT: wasm-basic.s:8 +## Byte 5 is the end_function, line 9 +# CHECK-NEXT: foo +# CHECK-NEXT: wasm-basic.s:9 +## Byte 6 is bar's function length, symbol table considers it part of bar +# CHECK-NEXT: bar +# CHECK-NEXT: ??:0 +## Byte 7 bar's local declaration, but DWARF marks it as line 13, like above +# CHECK-NEXT: bar +# CHECK-NEXT: wasm-basic.s:13 +## Byte 8 and 9 are actually the local.get on line 13 +# CHECK-NEXT: bar +# CHECK-NEXT: wasm-basic.s:13 +# CHECK-NEXT: bar +# CHECK-NEXT: wasm-basic.s:13 +## Byte 10 is the nop +# CHECK-NEXT: bar +# CHECK-NEXT: wasm-basic.s:14 +## Byte b is the return +# CHECK-NEXT: bar +# CHECK-NEXT: wasm-basic.s:15 +## Byte c is end_function +# CHECK-NEXT: bar +# CHECK-NEXT: wasm-basic.s:16 From de41eae41f0dc2a844b439e0246e29c1bcbb2d03 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 22 Feb 2024 20:18:52 -0800 Subject: [PATCH 126/546] [SelectionDAG][RISCV] Use FP type for legality query for LRINT/LLRINT in LegalizeVectorOps. (#82728) This matches how LRINT/LLRINT is queried for scalar types in LegalizeDAG. It's confusing if they do different things since a "Legal" vector LRINT/LLRINT would get through to LegalizeDAG which would then consider it illegal. This doesn't happen currently because RISC-V uses Custom. 
--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 4 ++-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 2a7aaf88847ea..6074498d9144f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -404,8 +404,6 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FCEIL: case ISD::FTRUNC: case ISD::FRINT: - case ISD::LRINT: - case ISD::LLRINT: case ISD::FNEARBYINT: case ISD::FROUND: case ISD::FROUNDEVEN: @@ -455,6 +453,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { Node->getValueType(0), Scale); break; } + case ISD::LRINT: + case ISD::LLRINT: case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::VECREDUCE_ADD: diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 5c67aaf678566..04d5e60500ce6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -830,7 +830,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, VT, Custom); setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT, Custom); - setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom); setOperationAction({ISD::AVGFLOORU, ISD::AVGCEILU, ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}, VT, Legal); @@ -956,6 +955,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // between vXf16 and vXf64 must be lowered as sequences which convert via // vXf32. setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom); + setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom); // Custom-lower insert/extract operations to simplify patterns. 
setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, VT, Custom); From 2d50703ddd4fcf7826e4b62cba38e3151314ca60 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 23 Feb 2024 12:46:37 +0800 Subject: [PATCH 127/546] [RISCV] Use RISCVSubtarget::getRealVLen() in more places. NFC Catching a couple of more places where we can use the new query added in 8603a7b2. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 31 +++++++++------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 04d5e60500ce6..7540b22d13b7f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3848,11 +3848,10 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // If we're compiling for an exact VLEN value, we can split our work per // register in the register group. - const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); - if (MinVLen == MaxVLen && VT.getSizeInBits().getKnownMinValue() > MinVLen) { + if (const auto VLen = Subtarget.getRealVLen(); + VLen && VT.getSizeInBits().getKnownMinValue() > *VLen) { MVT ElemVT = VT.getVectorElementType(); - unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits(); + unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget); MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg); MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget); @@ -4763,9 +4762,8 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN, // If we don't know exact data layout, not much we can do. If this // is already m1 or smaller, no point in splitting further. 
- const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); - if (MinVLen != MaxVLen || VT.getSizeInBits().getFixedValue() <= MinVLen) + const auto VLen = Subtarget.getRealVLen(); + if (!VLen || VT.getSizeInBits().getFixedValue() <= *VLen) return SDValue(); // Avoid picking up bitrotate patterns which we have a linear-in-lmul @@ -4776,7 +4774,7 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN, return SDValue(); MVT ElemVT = VT.getVectorElementType(); - unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits(); + unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits(); unsigned VRegsPerSrc = NumElts / ElemsPerVReg; SmallVector>> @@ -8328,15 +8326,13 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, // constant index, we can always perform the extract in m1 (or // smaller) as we can determine the register corresponding to // the index in the register group. - const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); + const auto VLen = Subtarget.getRealVLen(); if (auto *IdxC = dyn_cast(Idx); - IdxC && MinVLen == MaxVLen && - VecVT.getSizeInBits().getKnownMinValue() > MinVLen) { + IdxC && VLen && VecVT.getSizeInBits().getKnownMinValue() > *VLen) { MVT M1VT = getLMUL1VT(ContainerVT); unsigned OrigIdx = IdxC->getZExtValue(); EVT ElemVT = VecVT.getVectorElementType(); - unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits(); + unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits(); unsigned RemIdx = OrigIdx % ElemsPerVReg; unsigned SubRegIdx = OrigIdx / ElemsPerVReg; unsigned ExtractIdx = @@ -9797,15 +9793,14 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op, if (OrigIdx == 0) return Op; - const unsigned MinVLen = Subtarget.getRealMinVLen(); - const unsigned MaxVLen = Subtarget.getRealMaxVLen(); + const auto VLen = Subtarget.getRealVLen(); // If the subvector vector is a fixed-length type and we 
don't know VLEN // exactly, we cannot use subregister manipulation to simplify the codegen; we // don't know which register of a LMUL group contains the specific subvector // as we only know the minimum register size. Therefore we must slide the // vector group down the full amount. - if (SubVecVT.isFixedLengthVector() && MinVLen != MaxVLen) { + if (SubVecVT.isFixedLengthVector() && !VLen) { MVT ContainerVT = VecVT; if (VecVT.isFixedLengthVector()) { ContainerVT = getContainerForFixedLengthVector(VecVT); @@ -9852,8 +9847,8 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op, // and decomposeSubvectorInsertExtractToSubRegs takes this into account. So if // we have a fixed length subvector, we need to adjust the index by 1/vscale. if (SubVecVT.isFixedLengthVector()) { - assert(MinVLen == MaxVLen); - unsigned Vscale = MinVLen / RISCV::RVVBitsPerBlock; + assert(VLen); + unsigned Vscale = *VLen / RISCV::RVVBitsPerBlock; auto Decompose = RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs( VecVT, ContainerSubVecVT, OrigIdx / Vscale, TRI); From 0d72fe9777e7c131dfb50c172b944d64437e2ece Mon Sep 17 00:00:00 2001 From: shkoo Date: Thu, 22 Feb 2024 21:27:01 -0800 Subject: [PATCH 128/546] [mlir] Fix FunctionOpInterface extraSharedClassDeclaration to be fully namespace qualified (#82682) `extraSharedClassDeclaration` of `FunctionOpInterface` can be inherited by other `OpInterfaces` into foreign namespaces, thus types must be fully qualified to prevent compiler errors, for example: def MyFunc : OpInterface<"MyFunc", [FunctionOpInterface]> { let cppNamespace = "::MyNamespace"; } --- .../mlir/Interfaces/FunctionInterfaces.td | 226 +++++++++--------- 1 file changed, 113 insertions(+), 113 deletions(-) diff --git a/mlir/include/mlir/Interfaces/FunctionInterfaces.td b/mlir/include/mlir/Interfaces/FunctionInterfaces.td index 98e002565cf19..970a781c998b9 100644 --- a/mlir/include/mlir/Interfaces/FunctionInterfaces.td +++ 
b/mlir/include/mlir/Interfaces/FunctionInterfaces.td @@ -147,12 +147,12 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ }]; let extraSharedClassDeclaration = [{ /// Block list iterator types. - using BlockListType = Region::BlockListType; + using BlockListType = ::mlir::Region::BlockListType; using iterator = BlockListType::iterator; using reverse_iterator = BlockListType::reverse_iterator; /// Block argument iterator types. - using BlockArgListType = Region::BlockArgListType; + using BlockArgListType = ::mlir::Region::BlockArgListType; using args_iterator = BlockArgListType::iterator; //===------------------------------------------------------------------===// @@ -163,7 +163,7 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ bool isExternal() { return empty(); } /// Return the region containing the body of this function. - Region &getFunctionBody() { return $_op->getRegion(0); } + ::mlir::Region &getFunctionBody() { return $_op->getRegion(0); } /// Delete all blocks from this function. void eraseBody() { @@ -183,39 +183,39 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ bool empty() { return getFunctionBody().empty(); } /// Push a new block to the back of the body region. - void push_back(Block *block) { getFunctionBody().push_back(block); } + void push_back(::mlir::Block *block) { getFunctionBody().push_back(block); } /// Push a new block to the front of the body region. - void push_front(Block *block) { getFunctionBody().push_front(block); } + void push_front(::mlir::Block *block) { getFunctionBody().push_front(block); } /// Return the last block in the body region. - Block &back() { return getFunctionBody().back(); } + ::mlir::Block &back() { return getFunctionBody().back(); } /// Return the first block in the body region. 
- Block &front() { return getFunctionBody().front(); } + ::mlir::Block &front() { return getFunctionBody().front(); } /// Add an entry block to an empty function, and set up the block arguments /// to match the signature of the function. The newly inserted entry block /// is returned. - Block *addEntryBlock() { + ::mlir::Block *addEntryBlock() { assert(empty() && "function already has an entry block"); - Block *entry = new Block(); + ::mlir::Block *entry = new ::mlir::Block(); push_back(entry); // FIXME: Allow for passing in locations for these arguments instead of using // the operations location. - ArrayRef inputTypes = $_op.getArgumentTypes(); - SmallVector locations(inputTypes.size(), - $_op.getOperation()->getLoc()); + ::llvm::ArrayRef<::mlir::Type> inputTypes = $_op.getArgumentTypes(); + ::llvm::SmallVector<::mlir::Location> locations(inputTypes.size(), + $_op.getOperation()->getLoc()); entry->addArguments(inputTypes, locations); return entry; } /// Add a normal block to the end of the function's block list. The function /// should at least already have an entry block. - Block *addBlock() { + ::mlir::Block *addBlock() { assert(!empty() && "function should at least have an entry block"); - push_back(new Block()); + push_back(new ::mlir::Block()); return &back(); } @@ -230,8 +230,8 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ /// - the argument/result attributes may need an update: if the new type /// has less parameters we drop the extra attributes, if there are more /// parameters they won't have any attributes. 
- void setType(Type newType) { - function_interface_impl::setFunctionType($_op, newType); + void setType(::mlir::Type newType) { + ::mlir::function_interface_impl::setFunctionType($_op, newType); } //===------------------------------------------------------------------===// @@ -245,7 +245,7 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ unsigned getNumResults() { return $_op.getResultTypes().size(); } /// Returns the entry block function argument at the given index. - BlockArgument getArgument(unsigned idx) { + ::mlir::BlockArgument getArgument(unsigned idx) { return getFunctionBody().getArgument(idx); } @@ -256,8 +256,8 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ /// Insert a single argument of type `argType` with attributes `argAttrs` and /// location `argLoc` at `argIndex`. - void insertArgument(unsigned argIndex, Type argType, DictionaryAttr argAttrs, - Location argLoc) { + void insertArgument(unsigned argIndex, ::mlir::Type argType, ::mlir::DictionaryAttr argAttrs, + ::mlir::Location argLoc) { insertArguments({argIndex}, {argType}, {argAttrs}, {argLoc}); } @@ -265,20 +265,20 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ /// listed indices. `argIndices` must be sorted. Arguments are inserted in the /// order they are listed, such that arguments with identical index will /// appear in the same order that they were listed here. 
- void insertArguments(ArrayRef argIndices, TypeRange argTypes, - ArrayRef argAttrs, - ArrayRef argLocs) { + void insertArguments(::llvm::ArrayRef argIndices, ::mlir::TypeRange argTypes, + ::llvm::ArrayRef<::mlir::DictionaryAttr> argAttrs, + ::llvm::ArrayRef<::mlir::Location> argLocs) { unsigned originalNumArgs = $_op.getNumArguments(); - Type newType = $_op.getTypeWithArgsAndResults( + ::mlir::Type newType = $_op.getTypeWithArgsAndResults( argIndices, argTypes, /*resultIndices=*/{}, /*resultTypes=*/{}); - function_interface_impl::insertFunctionArguments( + ::mlir::function_interface_impl::insertFunctionArguments( $_op, argIndices, argTypes, argAttrs, argLocs, originalNumArgs, newType); } /// Insert a single result of type `resultType` at `resultIndex`. - void insertResult(unsigned resultIndex, Type resultType, - DictionaryAttr resultAttrs) { + void insertResult(unsigned resultIndex, ::mlir::Type resultType, + ::mlir::DictionaryAttr resultAttrs) { insertResults({resultIndex}, {resultType}, {resultAttrs}); } @@ -286,41 +286,41 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ /// `resultIndices` must be sorted. Results are inserted in the order they are /// listed, such that results with identical index will appear in the same /// order that they were listed here. - void insertResults(ArrayRef resultIndices, TypeRange resultTypes, - ArrayRef resultAttrs) { + void insertResults(::llvm::ArrayRef resultIndices, ::mlir::TypeRange resultTypes, + ::llvm::ArrayRef<::mlir::DictionaryAttr> resultAttrs) { unsigned originalNumResults = $_op.getNumResults(); - Type newType = $_op.getTypeWithArgsAndResults( + ::mlir::Type newType = $_op.getTypeWithArgsAndResults( /*argIndices=*/{}, /*argTypes=*/{}, resultIndices, resultTypes); - function_interface_impl::insertFunctionResults( + ::mlir::function_interface_impl::insertFunctionResults( $_op, resultIndices, resultTypes, resultAttrs, originalNumResults, newType); } /// Erase a single argument at `argIndex`. 
void eraseArgument(unsigned argIndex) { - BitVector argsToErase($_op.getNumArguments()); + ::llvm::BitVector argsToErase($_op.getNumArguments()); argsToErase.set(argIndex); eraseArguments(argsToErase); } /// Erases the arguments listed in `argIndices`. - void eraseArguments(const BitVector &argIndices) { - Type newType = $_op.getTypeWithoutArgs(argIndices); - function_interface_impl::eraseFunctionArguments( + void eraseArguments(const ::llvm::BitVector &argIndices) { + ::mlir::Type newType = $_op.getTypeWithoutArgs(argIndices); + ::mlir::function_interface_impl::eraseFunctionArguments( $_op, argIndices, newType); } /// Erase a single result at `resultIndex`. void eraseResult(unsigned resultIndex) { - BitVector resultsToErase($_op.getNumResults()); + ::llvm::BitVector resultsToErase($_op.getNumResults()); resultsToErase.set(resultIndex); eraseResults(resultsToErase); } /// Erases the results listed in `resultIndices`. - void eraseResults(const BitVector &resultIndices) { - Type newType = $_op.getTypeWithoutResults(resultIndices); - function_interface_impl::eraseFunctionResults( + void eraseResults(const ::llvm::BitVector &resultIndices) { + ::mlir::Type newType = $_op.getTypeWithoutResults(resultIndices); + ::mlir::function_interface_impl::eraseFunctionResults( $_op, resultIndices, newType); } @@ -328,13 +328,13 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ /// results inserted. This is used to update the function's signature in /// the `insertArguments` and `insertResults` methods. The arrays must be /// sorted by increasing index. 
- Type getTypeWithArgsAndResults( - ArrayRef argIndices, TypeRange argTypes, - ArrayRef resultIndices, TypeRange resultTypes) { - SmallVector argStorage, resultStorage; - TypeRange newArgTypes = insertTypesInto( + ::mlir::Type getTypeWithArgsAndResults( + ::llvm::ArrayRef argIndices, ::mlir::TypeRange argTypes, + ::llvm::ArrayRef resultIndices, ::mlir::TypeRange resultTypes) { + ::llvm::SmallVector<::mlir::Type> argStorage, resultStorage; + ::mlir::TypeRange newArgTypes = insertTypesInto( $_op.getArgumentTypes(), argIndices, argTypes, argStorage); - TypeRange newResultTypes = insertTypesInto( + ::mlir::TypeRange newResultTypes = insertTypesInto( $_op.getResultTypes(), resultIndices, resultTypes, resultStorage); return $_op.cloneTypeWith(newArgTypes, newResultTypes); } @@ -342,24 +342,24 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ /// Return the type of this function without the specified arguments and /// results. This is used to update the function's signature in the /// `eraseArguments` and `eraseResults` methods. 
- Type getTypeWithoutArgsAndResults( - const BitVector &argIndices, const BitVector &resultIndices) { - SmallVector argStorage, resultStorage; - TypeRange newArgTypes = filterTypesOut( + ::mlir::Type getTypeWithoutArgsAndResults( + const ::llvm::BitVector &argIndices, const ::llvm::BitVector &resultIndices) { + ::llvm::SmallVector<::mlir::Type> argStorage, resultStorage; + ::mlir::TypeRange newArgTypes = filterTypesOut( $_op.getArgumentTypes(), argIndices, argStorage); - TypeRange newResultTypes = filterTypesOut( + ::mlir::TypeRange newResultTypes = filterTypesOut( $_op.getResultTypes(), resultIndices, resultStorage); return $_op.cloneTypeWith(newArgTypes, newResultTypes); } - Type getTypeWithoutArgs(const BitVector &argIndices) { - SmallVector argStorage; - TypeRange newArgTypes = filterTypesOut( + ::mlir::Type getTypeWithoutArgs(const ::llvm::BitVector &argIndices) { + ::llvm::SmallVector<::mlir::Type> argStorage; + ::mlir::TypeRange newArgTypes = filterTypesOut( $_op.getArgumentTypes(), argIndices, argStorage); return $_op.cloneTypeWith(newArgTypes, $_op.getResultTypes()); } - Type getTypeWithoutResults(const BitVector &resultIndices) { - SmallVector resultStorage; - TypeRange newResultTypes = filterTypesOut( + ::mlir::Type getTypeWithoutResults(const ::llvm::BitVector &resultIndices) { + ::llvm::SmallVector<::mlir::Type> resultStorage; + ::mlir::TypeRange newResultTypes = filterTypesOut( $_op.getResultTypes(), resultIndices, resultStorage); return $_op.cloneTypeWith($_op.getArgumentTypes(), newResultTypes); } @@ -369,88 +369,88 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ //===------------------------------------------------------------------===// /// Return all of the attributes for the argument at 'index'. 
- ArrayRef getArgAttrs(unsigned index) { - return function_interface_impl::getArgAttrs($_op, index); + ::llvm::ArrayRef<::mlir::NamedAttribute> getArgAttrs(unsigned index) { + return ::mlir::function_interface_impl::getArgAttrs($_op, index); } /// Return an ArrayAttr containing all argument attribute dictionaries of /// this function, or nullptr if no arguments have attributes. - ArrayAttr getAllArgAttrs() { return $_op.getArgAttrsAttr(); } + ::mlir::ArrayAttr getAllArgAttrs() { return $_op.getArgAttrsAttr(); } /// Return all argument attributes of this function. - void getAllArgAttrs(SmallVectorImpl &result) { - if (ArrayAttr argAttrs = getAllArgAttrs()) { - auto argAttrRange = argAttrs.template getAsRange(); + void getAllArgAttrs(::llvm::SmallVectorImpl<::mlir::DictionaryAttr> &result) { + if (::mlir::ArrayAttr argAttrs = getAllArgAttrs()) { + auto argAttrRange = argAttrs.template getAsRange<::mlir::DictionaryAttr>(); result.append(argAttrRange.begin(), argAttrRange.end()); } else { result.append($_op.getNumArguments(), - DictionaryAttr::get(this->getOperation()->getContext())); + ::mlir::DictionaryAttr::get(this->getOperation()->getContext())); } } /// Return the specified attribute, if present, for the argument at 'index', /// null otherwise. - Attribute getArgAttr(unsigned index, StringAttr name) { + ::mlir::Attribute getArgAttr(unsigned index, ::mlir::StringAttr name) { auto argDict = getArgAttrDict(index); return argDict ? argDict.get(name) : nullptr; } - Attribute getArgAttr(unsigned index, StringRef name) { + ::mlir::Attribute getArgAttr(unsigned index, ::llvm::StringRef name) { auto argDict = getArgAttrDict(index); return argDict ? 
argDict.get(name) : nullptr; } template - AttrClass getArgAttrOfType(unsigned index, StringAttr name) { + AttrClass getArgAttrOfType(unsigned index, ::mlir::StringAttr name) { return ::llvm::dyn_cast_or_null(getArgAttr(index, name)); } template - AttrClass getArgAttrOfType(unsigned index, StringRef name) { + AttrClass getArgAttrOfType(unsigned index, ::llvm::StringRef name) { return ::llvm::dyn_cast_or_null(getArgAttr(index, name)); } /// Set the attributes held by the argument at 'index'. - void setArgAttrs(unsigned index, ArrayRef attributes) { - function_interface_impl::setArgAttrs($_op, index, attributes); + void setArgAttrs(unsigned index, ::llvm::ArrayRef<::mlir::NamedAttribute> attributes) { + ::mlir::function_interface_impl::setArgAttrs($_op, index, attributes); } /// Set the attributes held by the argument at 'index'. `attributes` may be /// null, in which case any existing argument attributes are removed. - void setArgAttrs(unsigned index, DictionaryAttr attributes) { - function_interface_impl::setArgAttrs($_op, index, attributes); + void setArgAttrs(unsigned index, ::mlir::DictionaryAttr attributes) { + ::mlir::function_interface_impl::setArgAttrs($_op, index, attributes); } - void setAllArgAttrs(ArrayRef attributes) { + void setAllArgAttrs(::llvm::ArrayRef<::mlir::DictionaryAttr> attributes) { assert(attributes.size() == $_op.getNumArguments()); - function_interface_impl::setAllArgAttrDicts($_op, attributes); + ::mlir::function_interface_impl::setAllArgAttrDicts($_op, attributes); } - void setAllArgAttrs(ArrayRef attributes) { + void setAllArgAttrs(::llvm::ArrayRef<::mlir::Attribute> attributes) { assert(attributes.size() == $_op.getNumArguments()); - function_interface_impl::setAllArgAttrDicts($_op, attributes); + ::mlir::function_interface_impl::setAllArgAttrDicts($_op, attributes); } - void setAllArgAttrs(ArrayAttr attributes) { + void setAllArgAttrs(::mlir::ArrayAttr attributes) { assert(attributes.size() == $_op.getNumArguments()); 
$_op.setArgAttrsAttr(attributes); } /// If the an attribute exists with the specified name, change it to the new /// value. Otherwise, add a new attribute with the specified name/value. - void setArgAttr(unsigned index, StringAttr name, Attribute value) { - function_interface_impl::setArgAttr($_op, index, name, value); + void setArgAttr(unsigned index, ::mlir::StringAttr name, ::mlir::Attribute value) { + ::mlir::function_interface_impl::setArgAttr($_op, index, name, value); } - void setArgAttr(unsigned index, StringRef name, Attribute value) { + void setArgAttr(unsigned index, ::llvm::StringRef name, ::mlir::Attribute value) { setArgAttr(index, - StringAttr::get(this->getOperation()->getContext(), name), + ::mlir::StringAttr::get(this->getOperation()->getContext(), name), value); } /// Remove the attribute 'name' from the argument at 'index'. Return the /// attribute that was erased, or nullptr if there was no attribute with /// such name. - Attribute removeArgAttr(unsigned index, StringAttr name) { - return function_interface_impl::removeArgAttr($_op, index, name); + ::mlir::Attribute removeArgAttr(unsigned index, ::mlir::StringAttr name) { + return ::mlir::function_interface_impl::removeArgAttr($_op, index, name); } - Attribute removeArgAttr(unsigned index, StringRef name) { + ::mlir::Attribute removeArgAttr(unsigned index, ::llvm::StringRef name) { return removeArgAttr( - index, StringAttr::get(this->getOperation()->getContext(), name)); + index, ::mlir::StringAttr::get(this->getOperation()->getContext(), name)); } //===------------------------------------------------------------------===// @@ -458,102 +458,102 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ //===------------------------------------------------------------------===// /// Return all of the attributes for the result at 'index'. 
- ArrayRef getResultAttrs(unsigned index) { - return function_interface_impl::getResultAttrs($_op, index); + ::llvm::ArrayRef<::mlir::NamedAttribute> getResultAttrs(unsigned index) { + return ::mlir::function_interface_impl::getResultAttrs($_op, index); } /// Return an ArrayAttr containing all result attribute dictionaries of this /// function, or nullptr if no result have attributes. - ArrayAttr getAllResultAttrs() { return $_op.getResAttrsAttr(); } + ::mlir::ArrayAttr getAllResultAttrs() { return $_op.getResAttrsAttr(); } /// Return all result attributes of this function. - void getAllResultAttrs(SmallVectorImpl &result) { - if (ArrayAttr argAttrs = getAllResultAttrs()) { - auto argAttrRange = argAttrs.template getAsRange(); + void getAllResultAttrs(::llvm::SmallVectorImpl<::mlir::DictionaryAttr> &result) { + if (::mlir::ArrayAttr argAttrs = getAllResultAttrs()) { + auto argAttrRange = argAttrs.template getAsRange<::mlir::DictionaryAttr>(); result.append(argAttrRange.begin(), argAttrRange.end()); } else { result.append($_op.getNumResults(), - DictionaryAttr::get(this->getOperation()->getContext())); + ::mlir::DictionaryAttr::get(this->getOperation()->getContext())); } } /// Return the specified attribute, if present, for the result at 'index', /// null otherwise. - Attribute getResultAttr(unsigned index, StringAttr name) { + ::mlir::Attribute getResultAttr(unsigned index, ::mlir::StringAttr name) { auto argDict = getResultAttrDict(index); return argDict ? argDict.get(name) : nullptr; } - Attribute getResultAttr(unsigned index, StringRef name) { + ::mlir::Attribute getResultAttr(unsigned index, ::llvm::StringRef name) { auto argDict = getResultAttrDict(index); return argDict ? 
argDict.get(name) : nullptr; } template - AttrClass getResultAttrOfType(unsigned index, StringAttr name) { + AttrClass getResultAttrOfType(unsigned index, ::mlir::StringAttr name) { return ::llvm::dyn_cast_or_null(getResultAttr(index, name)); } template - AttrClass getResultAttrOfType(unsigned index, StringRef name) { + AttrClass getResultAttrOfType(unsigned index, ::llvm::StringRef name) { return ::llvm::dyn_cast_or_null(getResultAttr(index, name)); } /// Set the attributes held by the result at 'index'. - void setResultAttrs(unsigned index, ArrayRef attributes) { - function_interface_impl::setResultAttrs($_op, index, attributes); + void setResultAttrs(unsigned index, ::llvm::ArrayRef<::mlir::NamedAttribute> attributes) { + ::mlir::function_interface_impl::setResultAttrs($_op, index, attributes); } /// Set the attributes held by the result at 'index'. `attributes` may be /// null, in which case any existing argument attributes are removed. - void setResultAttrs(unsigned index, DictionaryAttr attributes) { - function_interface_impl::setResultAttrs($_op, index, attributes); + void setResultAttrs(unsigned index, ::mlir::DictionaryAttr attributes) { + ::mlir::function_interface_impl::setResultAttrs($_op, index, attributes); } - void setAllResultAttrs(ArrayRef attributes) { + void setAllResultAttrs(::llvm::ArrayRef<::mlir::DictionaryAttr> attributes) { assert(attributes.size() == $_op.getNumResults()); - function_interface_impl::setAllResultAttrDicts( + ::mlir::function_interface_impl::setAllResultAttrDicts( $_op, attributes); } - void setAllResultAttrs(ArrayRef attributes) { + void setAllResultAttrs(::llvm::ArrayRef<::mlir::Attribute> attributes) { assert(attributes.size() == $_op.getNumResults()); - function_interface_impl::setAllResultAttrDicts( + ::mlir::function_interface_impl::setAllResultAttrDicts( $_op, attributes); } - void setAllResultAttrs(ArrayAttr attributes) { + void setAllResultAttrs(::mlir::ArrayAttr attributes) { assert(attributes.size() == 
$_op.getNumResults()); $_op.setResAttrsAttr(attributes); } /// If the an attribute exists with the specified name, change it to the new /// value. Otherwise, add a new attribute with the specified name/value. - void setResultAttr(unsigned index, StringAttr name, Attribute value) { - function_interface_impl::setResultAttr($_op, index, name, value); + void setResultAttr(unsigned index, ::mlir::StringAttr name, ::mlir::Attribute value) { + ::mlir::function_interface_impl::setResultAttr($_op, index, name, value); } - void setResultAttr(unsigned index, StringRef name, Attribute value) { + void setResultAttr(unsigned index, ::llvm::StringRef name, ::mlir::Attribute value) { setResultAttr(index, - StringAttr::get(this->getOperation()->getContext(), name), + ::mlir::StringAttr::get(this->getOperation()->getContext(), name), value); } /// Remove the attribute 'name' from the result at 'index'. Return the /// attribute that was erased, or nullptr if there was no attribute with /// such name. - Attribute removeResultAttr(unsigned index, StringAttr name) { - return function_interface_impl::removeResultAttr($_op, index, name); + ::mlir::Attribute removeResultAttr(unsigned index, ::mlir::StringAttr name) { + return ::mlir::function_interface_impl::removeResultAttr($_op, index, name); } /// Returns the dictionary attribute corresponding to the argument at /// 'index'. If there are no argument attributes at 'index', a null /// attribute is returned. - DictionaryAttr getArgAttrDict(unsigned index) { + ::mlir::DictionaryAttr getArgAttrDict(unsigned index) { assert(index < $_op.getNumArguments() && "invalid argument number"); - return function_interface_impl::getArgAttrDict($_op, index); + return ::mlir::function_interface_impl::getArgAttrDict($_op, index); } /// Returns the dictionary attribute corresponding to the result at 'index'. /// If there are no result attributes at 'index', a null attribute is /// returned. 
- DictionaryAttr getResultAttrDict(unsigned index) { + ::mlir::DictionaryAttr getResultAttrDict(unsigned index) { assert(index < $_op.getNumResults() && "invalid result number"); - return function_interface_impl::getResultAttrDict($_op, index); + return ::mlir::function_interface_impl::getResultAttrDict($_op, index); } }]; From afd469023aad10786eaea3d444047a558ad8d5c1 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 22 Feb 2024 21:48:49 -0800 Subject: [PATCH 129/546] [lldb] Fix term-width setting (#82736) I noticed that the term-width setting would always report its default value (80) despite the driver correctly setting the value with SBDebugger::SetTerminalWidth. ``` (lldb) settings show term-width term-width (int) = 80 ``` The issue is that the setting was defined as a SInt64 instead of a UInt64 while the getter returned an unsigned value. There's no reason the terminal width should be a signed value. My best guess it that it was using SInt64 because UInt64 didn't support min and max values. 
I fixed that and correct the type and now lldb reports the correct terminal width: ``` (lldb) settings show term-width term-width (unsigned) = 189 ``` rdar://123488999 --- .../lldb/Interpreter/OptionValueSInt64.h | 4 +-- .../lldb/Interpreter/OptionValueUInt64.h | 26 +++++++++++++++++-- lldb/source/Core/CoreProperties.td | 2 +- lldb/source/Core/Debugger.cpp | 4 +-- lldb/source/Interpreter/OptionValueUInt64.cpp | 13 +++++++--- .../API/commands/settings/TestSettings.py | 15 ++++++++--- .../TestTrimmedProgressReporting.py | 3 ++- 7 files changed, 52 insertions(+), 15 deletions(-) diff --git a/lldb/include/lldb/Interpreter/OptionValueSInt64.h b/lldb/include/lldb/Interpreter/OptionValueSInt64.h index 5efae627758ac..3cf41d38c0ef0 100644 --- a/lldb/include/lldb/Interpreter/OptionValueSInt64.h +++ b/lldb/include/lldb/Interpreter/OptionValueSInt64.h @@ -86,8 +86,8 @@ class OptionValueSInt64 : public Cloneable { protected: int64_t m_current_value = 0; int64_t m_default_value = 0; - int64_t m_min_value = INT64_MIN; - int64_t m_max_value = INT64_MAX; + int64_t m_min_value = std::numeric_limits::min(); + int64_t m_max_value = std::numeric_limits::max(); }; } // namespace lldb_private diff --git a/lldb/include/lldb/Interpreter/OptionValueUInt64.h b/lldb/include/lldb/Interpreter/OptionValueUInt64.h index 30c27bf73d99c..07076075790c6 100644 --- a/lldb/include/lldb/Interpreter/OptionValueUInt64.h +++ b/lldb/include/lldb/Interpreter/OptionValueUInt64.h @@ -64,13 +64,35 @@ class OptionValueUInt64 : public Cloneable { uint64_t GetDefaultValue() const { return m_default_value; } - void SetCurrentValue(uint64_t value) { m_current_value = value; } + bool SetCurrentValue(uint64_t value) { + if (value >= m_min_value && value <= m_max_value) { + m_current_value = value; + return true; + } + return false; + } + + bool SetDefaultValue(uint64_t value) { + if (value >= m_min_value && value <= m_max_value) { + m_default_value = value; + return true; + } + return false; + } + + void 
SetMinimumValue(int64_t v) { m_min_value = v; } + + uint64_t GetMinimumValue() const { return m_min_value; } + + void SetMaximumValue(int64_t v) { m_max_value = v; } - void SetDefaultValue(uint64_t value) { m_default_value = value; } + uint64_t GetMaximumValue() const { return m_max_value; } protected: uint64_t m_current_value = 0; uint64_t m_default_value = 0; + uint64_t m_min_value = std::numeric_limits::min(); + uint64_t m_max_value = std::numeric_limits::max(); }; } // namespace lldb_private diff --git a/lldb/source/Core/CoreProperties.td b/lldb/source/Core/CoreProperties.td index 4cfff805688c5..a6cb951187a04 100644 --- a/lldb/source/Core/CoreProperties.td +++ b/lldb/source/Core/CoreProperties.td @@ -132,7 +132,7 @@ let Definition = "debugger" in { Global, DefaultStringValue<"${ansi.normal}">, Desc<"When displaying the line marker in a color-enabled terminal, use the ANSI terminal code specified in this format immediately after the line to be marked.">; - def TerminalWidth: Property<"term-width", "SInt64">, + def TerminalWidth: Property<"term-width", "UInt64">, Global, DefaultUnsignedValue<80>, Desc<"The maximum number of columns to use for displaying text.">; diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 97311b4716ac2..bb81110ae35a5 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -886,8 +886,8 @@ Debugger::Debugger(lldb::LogOutputCallback log_callback, void *baton) } assert(m_dummy_target_sp.get() && "Couldn't construct dummy target?"); - OptionValueSInt64 *term_width = - m_collection_sp->GetPropertyAtIndexAsOptionValueSInt64( + OptionValueUInt64 *term_width = + m_collection_sp->GetPropertyAtIndexAsOptionValueUInt64( ePropertyTerminalWidth); term_width->SetMinimumValue(10); term_width->SetMaximumValue(1024); diff --git a/lldb/source/Interpreter/OptionValueUInt64.cpp b/lldb/source/Interpreter/OptionValueUInt64.cpp index 1999c63d11aff..2e69c164e32ac 100644 --- 
a/lldb/source/Interpreter/OptionValueUInt64.cpp +++ b/lldb/source/Interpreter/OptionValueUInt64.cpp @@ -47,9 +47,16 @@ Status OptionValueUInt64::SetValueFromString(llvm::StringRef value_ref, llvm::StringRef value_trimmed = value_ref.trim(); uint64_t value; if (llvm::to_integer(value_trimmed, value)) { - m_value_was_set = true; - m_current_value = value; - NotifyValueChanged(); + if (value >= m_min_value && value <= m_max_value) { + m_value_was_set = true; + m_current_value = value; + NotifyValueChanged(); + } else { + error.SetErrorStringWithFormat( + "%" PRIu64 " is out of range, valid values must be between %" PRIu64 + " and %" PRIu64 ".", + value, m_min_value, m_max_value); + } } else { error.SetErrorStringWithFormat("invalid uint64_t string value: '%s'", value_ref.str().c_str()); diff --git a/lldb/test/API/commands/settings/TestSettings.py b/lldb/test/API/commands/settings/TestSettings.py index a2d845493d1df..104a9f09788c3 100644 --- a/lldb/test/API/commands/settings/TestSettings.py +++ b/lldb/test/API/commands/settings/TestSettings.py @@ -2,7 +2,6 @@ Test lldb settings command. """ - import json import os import re @@ -151,14 +150,22 @@ def test_set_term_width(self): self.expect( "settings show term-width", SETTING_MSG("term-width"), - startstr="term-width (int) = 70", + startstr="term-width (unsigned) = 70", ) # The overall display should also reflect the new setting. 
self.expect( "settings show", SETTING_MSG("term-width"), - substrs=["term-width (int) = 70"], + substrs=["term-width (unsigned) = 70"], + ) + + self.dbg.SetTerminalWidth(60) + + self.expect( + "settings show", + SETTING_MSG("term-width"), + substrs=["term-width (unsigned) = 60"], ) # rdar://problem/10712130 @@ -593,7 +600,7 @@ def test_settings_with_trailing_whitespace(self): self.expect( "settings show term-width", SETTING_MSG("term-width"), - startstr="term-width (int) = 60", + startstr="term-width (unsigned) = 60", ) self.runCmd("settings clear term-width", check=False) # string diff --git a/lldb/test/API/functionalities/progress_reporting/TestTrimmedProgressReporting.py b/lldb/test/API/functionalities/progress_reporting/TestTrimmedProgressReporting.py index 357999b6f5619..ee35dbd23b3db 100644 --- a/lldb/test/API/functionalities/progress_reporting/TestTrimmedProgressReporting.py +++ b/lldb/test/API/functionalities/progress_reporting/TestTrimmedProgressReporting.py @@ -24,7 +24,8 @@ def do_test(self, term_width, pattern_list): ) self.expect("set set term-width " + str(term_width)) self.expect( - "set show term-width", substrs=["term-width (int) = " + str(term_width)] + "set show term-width", + substrs=["term-width (unsigned) = " + str(term_width)], ) self.child.send("file " + self.getBuildArtifact("a.out") + "\n") From 850dde063b7f70bb592723064385e9f9ad39c96e Mon Sep 17 00:00:00 2001 From: Yeting Kuo <46629943+yetingk@users.noreply.github.com> Date: Fri, 23 Feb 2024 14:17:15 +0800 Subject: [PATCH 130/546] [RISCV][VP] Introduce vp saturating addition/subtraction and RISC-V support. (#82370) This patch also pick the MatchContext framework from DAGCombiner to an indiviual header file to make the framework be used from other files in llvm/lib/CodeGen/SelectionDAG/. 
--- llvm/docs/LangRef.rst | 203 ++ llvm/include/llvm/IR/Intrinsics.td | 20 + llvm/include/llvm/IR/VPIntrinsics.def | 24 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 137 +- .../SelectionDAG/LegalizeIntegerTypes.cpp | 37 +- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 + .../SelectionDAG/LegalizeVectorTypes.cpp | 16 +- llvm/lib/CodeGen/SelectionDAG/MatchContext.h | 175 ++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 12 +- .../RISCV/rvv/fixed-vectors-vsadd-vp.ll | 1701 ++++++++++++++ .../RISCV/rvv/fixed-vectors-vsaddu-vp.ll | 1697 ++++++++++++++ .../RISCV/rvv/fixed-vectors-vssub-vp.ll | 1745 ++++++++++++++ .../RISCV/rvv/fixed-vectors-vssubu-vp.ll | 1740 ++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll | 2015 ++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll | 2014 ++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll | 2067 +++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll | 2065 ++++++++++++++++ llvm/unittests/IR/VPIntrinsicTest.cpp | 8 + 18 files changed, 15521 insertions(+), 157 deletions(-) create mode 100644 llvm/lib/CodeGen/SelectionDAG/MatchContext.h create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 8f4495e25d0fa..19ca9f6ae3fe3 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -16749,6 +16749,7 @@ an operation is greater than the maximum value, the result is set (or "clamped") to this maximum. If it is below the minimum, it is clamped to this minimum. +.. 
_int_sadd_sat: '``llvm.sadd.sat.*``' Intrinsics ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16798,6 +16799,8 @@ Examples %res = call i4 @llvm.sadd.sat.i4(i4 -4, i4 -5) ; %res = -8 +.. _int_uadd_sat: + '``llvm.uadd.sat.*``' Intrinsics ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16845,6 +16848,8 @@ Examples %res = call i4 @llvm.uadd.sat.i4(i4 8, i4 8) ; %res = 15 +.. _int_ssub_sat: + '``llvm.ssub.sat.*``' Intrinsics ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16893,6 +16898,8 @@ Examples %res = call i4 @llvm.ssub.sat.i4(i4 4, i4 -5) ; %res = 7 +.. _int_usub_sat: + '``llvm.usub.sat.*``' Intrinsics ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -23610,6 +23617,202 @@ Examples: %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison +.. _int_vp_sadd_sat: + +'``llvm.vp.sadd.sat.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.sadd.sat.v16i32 (<16 x i32> <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.sadd.sat.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.sadd.sat.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated signed saturating addition of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.sadd.sat``' intrinsic performs sadd.sat (:ref:`sadd.sat `) +of the first and second vector operands on each enabled lane. The result on +disabled lanes is a :ref:`poison value `. + + +Examples: +""""""""" + +.. 
code-block:: llvm + + %r = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b) + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison + + +.. _int_vp_uadd_sat: + +'``llvm.vp.uadd.sat.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.uadd.sat.v16i32 (<16 x i32> <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.uadd.sat.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.uadd.sat.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated unsigned saturating addition of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.uadd.sat``' intrinsic performs uadd.sat (:ref:`uadd.sat `) +of the first and second vector operands on each enabled lane. The result on +disabled lanes is a :ref:`poison value `. + + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b) + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison + + +.. _int_vp_ssub_sat: + +'``llvm.vp.ssub.sat.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. 
+ +:: + + declare <16 x i32> @llvm.vp.ssub.sat.v16i32 (<16 x i32> <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.ssub.sat.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.ssub.sat.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated signed saturating subtraction of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.ssub.sat``' intrinsic performs ssub.sat (:ref:`ssub.sat `) +of the first and second vector operands on each enabled lane. The result on +disabled lanes is a :ref:`poison value `. + + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> %b) + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison + + +.. _int_vp_usub_sat: + +'``llvm.vp.usub.sat.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.usub.sat.v16i32 (<16 x i32> <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.usub.sat.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.usub.sat.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated unsigned saturating subtraction of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. 
+ +Semantics: +"""""""""" + +The '``llvm.vp.usub.sat``' intrinsic performs usub.sat (:ref:`usub.sat `) +of the first and second vector operands on each enabled lane. The result on +disabled lanes is a :ref:`poison value `. + + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %b) + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison + + .. _int_vp_fshl: '``llvm.vp.fshl.*``' Intrinsics diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 8c0d4d5db32d8..d7c1ce153a6c8 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1933,6 +1933,26 @@ let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in { LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty]>; + def int_vp_sadd_sat : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_uadd_sat : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_ssub_sat : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_usub_sat : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; // Floating-point arithmetic def int_vp_fadd : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def index 3b32b60609f53..4089acf9ec3f0 100644 --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ 
b/llvm/include/llvm/IR/VPIntrinsics.def @@ -293,6 +293,30 @@ BEGIN_REGISTER_VP(vp_fshr, 3, 4, VP_FSHR, -1) VP_PROPERTY_FUNCTIONAL_INTRINSIC(fshr) VP_PROPERTY_FUNCTIONAL_SDOPC(FSHR) END_REGISTER_VP(vp_fshr, VP_FSHR) + +// llvm.vp.sadd.sat(x,y,mask,vlen) +BEGIN_REGISTER_VP(vp_sadd_sat, 2, 3, VP_SADDSAT, -1) +VP_PROPERTY_FUNCTIONAL_INTRINSIC(sadd_sat) +VP_PROPERTY_FUNCTIONAL_SDOPC(SADDSAT) +END_REGISTER_VP(vp_sadd_sat, VP_SADDSAT) + +// llvm.vp.uadd.sat(x,y,mask,vlen) +BEGIN_REGISTER_VP(vp_uadd_sat, 2, 3, VP_UADDSAT, -1) +VP_PROPERTY_FUNCTIONAL_INTRINSIC(uadd_sat) +VP_PROPERTY_FUNCTIONAL_SDOPC(UADDSAT) +END_REGISTER_VP(vp_uadd_sat, VP_UADDSAT) + +// llvm.vp.ssub.sat(x,y,mask,vlen) +BEGIN_REGISTER_VP(vp_ssub_sat, 2, 3, VP_SSUBSAT, -1) +VP_PROPERTY_FUNCTIONAL_INTRINSIC(ssub_sat) +VP_PROPERTY_FUNCTIONAL_SDOPC(SSUBSAT) +END_REGISTER_VP(vp_ssub_sat, VP_SSUBSAT) + +// llvm.vp.usub.sat(x,y,mask,vlen) +BEGIN_REGISTER_VP(vp_usub_sat, 2, 3, VP_USUBSAT, -1) +VP_PROPERTY_FUNCTIONAL_INTRINSIC(usub_sat) +VP_PROPERTY_FUNCTIONAL_SDOPC(USUBSAT) +END_REGISTER_VP(vp_usub_sat, VP_USUBSAT) ///// } Integer Arithmetic ///// Floating-Point Arithmetic { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ed43dd7f52882..6a28bc8da223b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -76,6 +76,8 @@ #include #include +#include "MatchContext.h" + using namespace llvm; #define DEBUG_TYPE "dagcombine" @@ -888,141 +890,6 @@ class WorklistInserter : public SelectionDAG::DAGUpdateListener { void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); } }; -class EmptyMatchContext { - SelectionDAG &DAG; - const TargetLowering &TLI; - -public: - EmptyMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root) - : DAG(DAG), TLI(TLI) {} - - bool match(SDValue OpN, unsigned Opcode) const { - return Opcode == OpN->getOpcode(); - } - - // Same as 
SelectionDAG::getNode(). - template SDValue getNode(ArgT &&...Args) { - return DAG.getNode(std::forward(Args)...); - } - - bool isOperationLegalOrCustom(unsigned Op, EVT VT, - bool LegalOnly = false) const { - return TLI.isOperationLegalOrCustom(Op, VT, LegalOnly); - } -}; - -class VPMatchContext { - SelectionDAG &DAG; - const TargetLowering &TLI; - SDValue RootMaskOp; - SDValue RootVectorLenOp; - -public: - VPMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root) - : DAG(DAG), TLI(TLI), RootMaskOp(), RootVectorLenOp() { - assert(Root->isVPOpcode()); - if (auto RootMaskPos = ISD::getVPMaskIdx(Root->getOpcode())) - RootMaskOp = Root->getOperand(*RootMaskPos); - else if (Root->getOpcode() == ISD::VP_SELECT) - RootMaskOp = DAG.getAllOnesConstant(SDLoc(Root), - Root->getOperand(0).getValueType()); - - if (auto RootVLenPos = - ISD::getVPExplicitVectorLengthIdx(Root->getOpcode())) - RootVectorLenOp = Root->getOperand(*RootVLenPos); - } - - /// whether \p OpVal is a node that is functionally compatible with the - /// NodeType \p Opc - bool match(SDValue OpVal, unsigned Opc) const { - if (!OpVal->isVPOpcode()) - return OpVal->getOpcode() == Opc; - - auto BaseOpc = ISD::getBaseOpcodeForVP(OpVal->getOpcode(), - !OpVal->getFlags().hasNoFPExcept()); - if (BaseOpc != Opc) - return false; - - // Make sure the mask of OpVal is true mask or is same as Root's. - unsigned VPOpcode = OpVal->getOpcode(); - if (auto MaskPos = ISD::getVPMaskIdx(VPOpcode)) { - SDValue MaskOp = OpVal.getOperand(*MaskPos); - if (RootMaskOp != MaskOp && - !ISD::isConstantSplatVectorAllOnes(MaskOp.getNode())) - return false; - } - - // Make sure the EVL of OpVal is same as Root's. - if (auto VLenPos = ISD::getVPExplicitVectorLengthIdx(VPOpcode)) - if (RootVectorLenOp != OpVal.getOperand(*VLenPos)) - return false; - return true; - } - - // Specialize based on number of operands. 
- // TODO emit VP intrinsics where MaskOp/VectorLenOp != null - // SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT) { return - // DAG.getNode(Opcode, DL, VT); } - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand) { - unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 1 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2); - return DAG.getNode(VPOpcode, DL, VT, - {Operand, RootMaskOp, RootVectorLenOp}); - } - - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, - SDValue N2) { - unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 2 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3); - return DAG.getNode(VPOpcode, DL, VT, - {N1, N2, RootMaskOp, RootVectorLenOp}); - } - - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, - SDValue N2, SDValue N3) { - unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 3 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4); - return DAG.getNode(VPOpcode, DL, VT, - {N1, N2, N3, RootMaskOp, RootVectorLenOp}); - } - - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand, - SDNodeFlags Flags) { - unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 1 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2); - return DAG.getNode(VPOpcode, DL, VT, {Operand, RootMaskOp, RootVectorLenOp}, - Flags); - } - - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, - SDValue N2, SDNodeFlags Flags) { - unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 2 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3); - return DAG.getNode(VPOpcode, DL, VT, {N1, N2, RootMaskOp, RootVectorLenOp}, - Flags); - } - - SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, - SDValue N2, SDValue N3, SDNodeFlags Flags) { - unsigned 
VPOpcode = ISD::getVPForBaseOpcode(Opcode); - assert(ISD::getVPMaskIdx(VPOpcode) == 3 && - ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4); - return DAG.getNode(VPOpcode, DL, VT, - {N1, N2, N3, RootMaskOp, RootVectorLenOp}, Flags); - } - - bool isOperationLegalOrCustom(unsigned Op, EVT VT, - bool LegalOnly = false) const { - unsigned VPOp = ISD::getVPForBaseOpcode(Op); - return TLI.isOperationLegalOrCustom(VPOp, VT, LegalOnly); - } -}; - } // end anonymous namespace //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index a4ba261686c68..df17d6530b0de 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -217,7 +217,15 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::SSUBSAT: case ISD::USUBSAT: case ISD::SSHLSAT: - case ISD::USHLSAT: Res = PromoteIntRes_ADDSUBSHLSAT(N); break; + case ISD::USHLSAT: + Res = PromoteIntRes_ADDSUBSHLSAT(N); + break; + case ISD::VP_SADDSAT: + case ISD::VP_UADDSAT: + case ISD::VP_SSUBSAT: + case ISD::VP_USUBSAT: + Res = PromoteIntRes_ADDSUBSHLSAT(N); + break; case ISD::SMULFIX: case ISD::SMULFIXSAT: @@ -934,6 +942,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { return DAG.getBoolExtOrTrunc(Res.getValue(1), dl, NVT, VT); } +template SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) { // If the promoted type is legal, we can convert this to: // 1. 
ANY_EXTEND iN to iM @@ -945,11 +954,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) { SDLoc dl(N); SDValue Op1 = N->getOperand(0); SDValue Op2 = N->getOperand(1); + MatchContextClass matcher(DAG, TLI, N); unsigned OldBits = Op1.getScalarValueSizeInBits(); - unsigned Opcode = N->getOpcode(); + unsigned Opcode = matcher.getRootBaseOpcode(); bool IsShift = Opcode == ISD::USHLSAT || Opcode == ISD::SSHLSAT; + // FIXME: We need vp-aware PromotedInteger functions. SDValue Op1Promoted, Op2Promoted; if (IsShift) { Op1Promoted = GetPromotedInteger(Op1); @@ -968,18 +979,18 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) { APInt MaxVal = APInt::getAllOnes(OldBits).zext(NewBits); SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType); SDValue Add = - DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted); - return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax); + matcher.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted); + return matcher.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax); } // USUBSAT can always be promoted as long as we have zero-extended the args. if (Opcode == ISD::USUBSAT) - return DAG.getNode(ISD::USUBSAT, dl, PromotedType, Op1Promoted, - Op2Promoted); + return matcher.getNode(ISD::USUBSAT, dl, PromotedType, Op1Promoted, + Op2Promoted); // Shift cannot use a min/max expansion, we can't detect overflow if all of // the bits have been shifted out. 
- if (IsShift || TLI.isOperationLegal(Opcode, PromotedType)) { + if (IsShift || matcher.isOperationLegal(Opcode, PromotedType)) { unsigned ShiftOp; switch (Opcode) { case ISD::SADDSAT: @@ -1002,11 +1013,11 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) { DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, ShiftAmount); if (!IsShift) Op2Promoted = - DAG.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount); + matcher.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount); SDValue Result = - DAG.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted); - return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount); + matcher.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted); + return matcher.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount); } unsigned AddOp = Opcode == ISD::SADDSAT ? ISD::ADD : ISD::SUB; @@ -1015,9 +1026,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) { SDValue SatMin = DAG.getConstant(MinVal, dl, PromotedType); SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType); SDValue Result = - DAG.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted); - Result = DAG.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax); - Result = DAG.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin); + matcher.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted); + Result = matcher.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax); + Result = matcher.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin); return Result; } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 9114987162857..3c84f67653eca 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H #define LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H +#include "MatchContext.h" #include "llvm/ADT/DenseMap.h" #include 
"llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" @@ -355,6 +356,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_VAARG(SDNode *N); SDValue PromoteIntRes_VSCALE(SDNode *N); SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo); + template SDValue PromoteIntRes_ADDSUBSHLSAT(SDNode *N); SDValue PromoteIntRes_MULFIX(SDNode *N); SDValue PromoteIntRes_DIVFIX(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 7fc252600534f..90cda2a1155b6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1163,10 +1163,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::SMAX: case ISD::VP_SMAX: case ISD::UMIN: case ISD::VP_UMIN: case ISD::UMAX: case ISD::VP_UMAX: - case ISD::SADDSAT: - case ISD::UADDSAT: - case ISD::SSUBSAT: - case ISD::USUBSAT: + case ISD::SADDSAT: case ISD::VP_SADDSAT: + case ISD::UADDSAT: case ISD::VP_UADDSAT: + case ISD::SSUBSAT: case ISD::VP_SSUBSAT: + case ISD::USUBSAT: case ISD::VP_USUBSAT: case ISD::SSHLSAT: case ISD::USHLSAT: case ISD::ROTL: @@ -4140,10 +4140,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::SMAX: case ISD::VP_SMAX: case ISD::UMIN: case ISD::VP_UMIN: case ISD::UMAX: case ISD::VP_UMAX: - case ISD::UADDSAT: - case ISD::SADDSAT: - case ISD::USUBSAT: - case ISD::SSUBSAT: + case ISD::UADDSAT: case ISD::VP_UADDSAT: + case ISD::SADDSAT: case ISD::VP_SADDSAT: + case ISD::USUBSAT: case ISD::VP_USUBSAT: + case ISD::SSUBSAT: case ISD::VP_SSUBSAT: case ISD::SSHLSAT: case ISD::USHLSAT: case ISD::ROTL: diff --git a/llvm/lib/CodeGen/SelectionDAG/MatchContext.h b/llvm/lib/CodeGen/SelectionDAG/MatchContext.h new file mode 100644 index 0000000000000..f965cb952f97a --- /dev/null +++ b/llvm/lib/CodeGen/SelectionDAG/MatchContext.h @@ -0,0 +1,175 @@ +//===---------------- 
llvm/CodeGen/MatchContext.h --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the EmptyMatchContext class and VPMatchContext class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_MATCHCONTEXT_H +#define LLVM_LIB_CODEGEN_SELECTIONDAG_MATCHCONTEXT_H + +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLowering.h" + +using namespace llvm; + +namespace { +class EmptyMatchContext { + SelectionDAG &DAG; + const TargetLowering &TLI; + SDNode *Root; + +public: + EmptyMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root) + : DAG(DAG), TLI(TLI), Root(Root) {} + + unsigned getRootBaseOpcode() { return Root->getOpcode(); } + bool match(SDValue OpN, unsigned Opcode) const { + return Opcode == OpN->getOpcode(); + } + + // Same as SelectionDAG::getNode(). 
+ template SDValue getNode(ArgT &&...Args) { + return DAG.getNode(std::forward(Args)...); + } + + bool isOperationLegal(unsigned Op, EVT VT) const { + return TLI.isOperationLegal(Op, VT); + } + + bool isOperationLegalOrCustom(unsigned Op, EVT VT, + bool LegalOnly = false) const { + return TLI.isOperationLegalOrCustom(Op, VT, LegalOnly); + } +}; + +class VPMatchContext { + SelectionDAG &DAG; + const TargetLowering &TLI; + SDValue RootMaskOp; + SDValue RootVectorLenOp; + SDNode *Root; + +public: + VPMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *_Root) + : DAG(DAG), TLI(TLI), RootMaskOp(), RootVectorLenOp() { + Root = _Root; + assert(Root->isVPOpcode()); + if (auto RootMaskPos = ISD::getVPMaskIdx(Root->getOpcode())) + RootMaskOp = Root->getOperand(*RootMaskPos); + else if (Root->getOpcode() == ISD::VP_SELECT) + RootMaskOp = DAG.getAllOnesConstant(SDLoc(Root), + Root->getOperand(0).getValueType()); + + if (auto RootVLenPos = ISD::getVPExplicitVectorLengthIdx(Root->getOpcode())) + RootVectorLenOp = Root->getOperand(*RootVLenPos); + } + + unsigned getRootBaseOpcode() { + std::optional Opcode = ISD::getBaseOpcodeForVP( + Root->getOpcode(), !Root->getFlags().hasNoFPExcept()); + assert(Opcode.has_value()); + return *Opcode; + } + + /// whether \p OpVal is a node that is functionally compatible with the + /// NodeType \p Opc + bool match(SDValue OpVal, unsigned Opc) const { + if (!OpVal->isVPOpcode()) + return OpVal->getOpcode() == Opc; + + auto BaseOpc = ISD::getBaseOpcodeForVP(OpVal->getOpcode(), + !OpVal->getFlags().hasNoFPExcept()); + if (BaseOpc != Opc) + return false; + + // Make sure the mask of OpVal is true mask or is same as Root's. + unsigned VPOpcode = OpVal->getOpcode(); + if (auto MaskPos = ISD::getVPMaskIdx(VPOpcode)) { + SDValue MaskOp = OpVal.getOperand(*MaskPos); + if (RootMaskOp != MaskOp && + !ISD::isConstantSplatVectorAllOnes(MaskOp.getNode())) + return false; + } + + // Make sure the EVL of OpVal is same as Root's. 
+ if (auto VLenPos = ISD::getVPExplicitVectorLengthIdx(VPOpcode)) + if (RootVectorLenOp != OpVal.getOperand(*VLenPos)) + return false; + return true; + } + + // Specialize based on number of operands. + // TODO emit VP intrinsics where MaskOp/VectorLenOp != null + // SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT) { return + // DAG.getNode(Opcode, DL, VT); } + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 1 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2); + return DAG.getNode(VPOpcode, DL, VT, + {Operand, RootMaskOp, RootVectorLenOp}); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 2 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3); + return DAG.getNode(VPOpcode, DL, VT, {N1, N2, RootMaskOp, RootVectorLenOp}); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2, SDValue N3) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 3 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4); + return DAG.getNode(VPOpcode, DL, VT, + {N1, N2, N3, RootMaskOp, RootVectorLenOp}); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand, + SDNodeFlags Flags) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 1 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2); + return DAG.getNode(VPOpcode, DL, VT, {Operand, RootMaskOp, RootVectorLenOp}, + Flags); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2, SDNodeFlags Flags) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 2 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3); + return DAG.getNode(VPOpcode, 
DL, VT, {N1, N2, RootMaskOp, RootVectorLenOp}, + Flags); + } + + SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1, + SDValue N2, SDValue N3, SDNodeFlags Flags) { + unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode); + assert(ISD::getVPMaskIdx(VPOpcode) == 3 && + ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4); + return DAG.getNode(VPOpcode, DL, VT, + {N1, N2, N3, RootMaskOp, RootVectorLenOp}, Flags); + } + + bool isOperationLegal(unsigned Op, EVT VT) const { + unsigned VPOp = ISD::getVPForBaseOpcode(Op); + return TLI.isOperationLegal(VPOp, VT); + } + + bool isOperationLegalOrCustom(unsigned Op, EVT VT, + bool LegalOnly = false) const { + unsigned VPOp = ISD::getVPForBaseOpcode(Op); + return TLI.isOperationLegalOrCustom(VPOp, VT, LegalOnly); + } +}; +} // end anonymous namespace +#endif diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 7540b22d13b7f..540c2e7476dc1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -691,7 +691,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_FP_TO_UINT, ISD::VP_SETCC, ISD::VP_SIGN_EXTEND, ISD::VP_ZERO_EXTEND, ISD::VP_TRUNCATE, ISD::VP_SMIN, ISD::VP_SMAX, ISD::VP_UMIN, ISD::VP_UMAX, - ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE}; + ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE, + ISD::VP_SADDSAT, ISD::VP_UADDSAT, ISD::VP_SSUBSAT, + ISD::VP_USUBSAT}; static const unsigned FloatingPointVPOps[] = { ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL, @@ -5752,6 +5754,10 @@ static unsigned getRISCVVLOp(SDValue Op) { VP_CASE(SINT_TO_FP) // VP_SINT_TO_FP VP_CASE(UINT_TO_FP) // VP_UINT_TO_FP VP_CASE(BITREVERSE) // VP_BITREVERSE + VP_CASE(SADDSAT) // VP_SADDSAT + VP_CASE(UADDSAT) // VP_UADDSAT + VP_CASE(SSUBSAT) // VP_SSUBSAT + VP_CASE(USUBSAT) // VP_USUBSAT VP_CASE(BSWAP) // VP_BSWAP VP_CASE(CTLZ) // VP_CTLZ VP_CASE(CTTZ) // VP_CTTZ 
@@ -6791,6 +6797,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::VP_UDIV: case ISD::VP_SREM: case ISD::VP_UREM: + case ISD::VP_UADDSAT: + case ISD::VP_USUBSAT: + case ISD::VP_SADDSAT: + case ISD::VP_SSUBSAT: return lowerVPOp(Op, DAG); case ISD::VP_AND: case ISD::VP_OR: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll new file mode 100644 index 0000000000000..6c5dd0403dff1 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll @@ -0,0 +1,1701 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare <8 x i7> @llvm.vp.sadd.sat.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) + +define <8 x i7> @vsadd_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vadd.vv v9, v9, v9 +; CHECK-NEXT: vsra.vi v9, v9, 1 +; CHECK-NEXT: vadd.vv v8, v8, v8 +; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 192 +; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %v = call <8 x i7> @llvm.vp.sadd.sat.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) + ret <8 x i7> %v +} + +declare <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32) + +define <2 x i8> @vsadd_vv_v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i8> 
@llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsadd_vv_v2i8_unmasked(<2 x i8> %va, <2 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsadd_vx_v2i8(<2 x i8> %va, i8 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsadd_vx_v2i8_unmasked(<2 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsadd_vi_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = 
insertelement <2 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsadd_vi_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +declare <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32) + +define <4 x i8> @vsadd_vv_v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsadd_vv_v4i8_unmasked(<4 x i8> %va, <4 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsadd_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, 
e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsadd_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v4i8_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsadd_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsadd_vi_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + 
+define <4 x i8> @vsadd_vi_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +declare <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8>, <5 x i8>, <5 x i1>, i32) + +define <5 x i8> @vsadd_vv_v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsadd_vv_v5i8_unmasked(<5 x i8> %va, <5 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsadd_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> 
@llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsadd_vx_v5i8_unmasked(<5 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsadd_vi_v5i8(<5 x i8> %va, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsadd_vi_v5i8_unmasked(<5 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +declare <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32) + +define <8 x i8> @vsadd_vv_v8i8(<8 x i8> %va, <8 x 
i8> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsadd_vv_v8i8_unmasked(<8 x i8> %va, <8 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsadd_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsadd_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> 
@vsadd_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsadd_vi_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +declare <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32) + +define <16 x i8> @vsadd_vv_v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsadd_vv_v16i8_unmasked(<16 x i8> %va, <16 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> 
@llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsadd_vx_v16i8(<16 x i8> %va, i8 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsadd_vx_v16i8_unmasked(<16 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsadd_vi_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsadd_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: 
vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +declare <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32) + +define <256 x i8> @vsadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v258i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: addi a0, a1, -128 +; CHECK-NEXT: sltu a3, a1, a0 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a0, a3, a0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: bltu a1, a2, .LBB32_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: .LBB32_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl) + ret <256 x i8> %v +} + +define <256 x i8> @vsadd_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v258i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB33_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: addi a1, a0, -128 +; CHECK-NEXT: 
sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %head = insertelement <256 x i1> poison, i1 true, i32 0 + %m = shufflevector <256 x i1> %head, <256 x i1> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl) + ret <256 x i8> %v +} + +; Test splitting when the %evl is a known constant. + +define <256 x i8> @vsadd_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) { +; CHECK-LABEL: vsadd_vi_v258i8_evl129: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129) + ret <256 x i8> %v +} + +; FIXME: The upper half is doing nothing. 
+ +define <256 x i8> @vsadd_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { +; CHECK-LABEL: vsadd_vi_v258i8_evl128: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128) + ret <256 x i8> %v +} + +declare <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32) + +define <2 x i16> @vsadd_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsadd_vv_v2i16_unmasked(<2 x i16> %va, <2 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsadd_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 + %vb = 
shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsadd_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsadd_vi_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsadd_vi_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + 
ret <2 x i16> %v +} + +declare <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32) + +define <4 x i16> @vsadd_vv_v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsadd_vv_v4i16_unmasked(<4 x i16> %va, <4 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsadd_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsadd_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x 
i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsadd_vi_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsadd_vi_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +declare <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16>, <8 x i16>, <8 x i1>, i32) + +define <8 x i16> @vsadd_vv_v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsadd_vv_v8i16_unmasked(<8 x i16> %va, <8 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma 
+; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsadd_vx_v8i16(<8 x i16> %va, i16 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsadd_vx_v8i16_unmasked(<8 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsadd_vi_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + 
+define <8 x i16> @vsadd_vi_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +declare <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16>, <16 x i16>, <16 x i1>, i32) + +define <16 x i16> @vsadd_vv_v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsadd_vv_v16i16_unmasked(<16 x i16> %va, <16 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsadd_vx_v16i16(<16 x i16> %va, i16 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <16 x 
i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsadd_vx_v16i16_unmasked(<16 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsadd_vi_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsadd_vi_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x 
i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +declare <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32) + +define <2 x i32> @vsadd_vv_v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsadd_vv_v2i32_unmasked(<2 x i32> %va, <2 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsadd_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsadd_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 
true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsadd_vi_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsadd_vi_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +declare <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vsadd_vv_v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsadd_vv_v4i32_unmasked(<4 x i32> %va, <4 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v4i32_unmasked: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsadd_vx_v4i32(<4 x i32> %va, i32 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsadd_vx_v4i32_unmasked(<4 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsadd_vi_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, 
<4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsadd_vi_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +declare <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) + +define <8 x i32> @vsadd_vv_v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsadd_vv_v8i32_unmasked(<8 x i32> %va, <8 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsadd_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 + %vb = 
shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsadd_vx_v8i32_unmasked(<8 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsadd_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsadd_vi_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret 
<8 x i32> %v +} + +declare <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32>, <16 x i32>, <16 x i1>, i32) + +define <16 x i32> @vsadd_vv_v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsadd_vv_v16i32_unmasked(<16 x i32> %va, <16 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsadd_vx_v16i32(<16 x i32> %va, i32 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsadd_vx_v16i32_unmasked(<16 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> 
poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsadd_vi_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsadd_vi_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +declare <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32) + +define <2 x i64> @vsadd_vv_v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsadd_vv_v2i64_unmasked(<2 x i64> %va, <2 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: 
vsadd_vv_v2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsadd_vx_v2i64(<2 x i64> %va, i64 %b, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v9, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsadd_vx_v2i64_unmasked(<2 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_v2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, 
a0 +; RV64-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsadd_vi_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsadd_vi_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +declare <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32) + +define <4 x i64> @vsadd_vv_v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %b, 
<4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsadd_vv_v4i64_unmasked(<4 x i64> %va, <4 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsadd_vx_v4i64(<4 x i64> %va, i64 %b, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsadd_vx_v4i64_unmasked(<4 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_v4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 
+; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsadd_vi_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsadd_vi_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +declare <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32) + +define <8 x i64> @vsadd_vv_v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli 
zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsadd_vv_v8i64_unmasked(<8 x i64> %va, <8 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsadd_vx_v8i64(<8 x i64> %va, i64 %b, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_v8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsadd_vx_v8i64_unmasked(<8 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_v8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, 
m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsadd_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsadd_vi_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +declare <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32) + 
+define <16 x i64> @vsadd_vv_v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsadd_vv_v16i64_unmasked(<16 x i64> %va, <16 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_v16i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsadd_vx_v16i64(<16 x i64> %va, i64 %b, <16 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_v16i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v16, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v16i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsadd_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: 
vsadd_vx_v16i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v16i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsadd_vi_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsadd_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_v16i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector 
<16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +; Test that split-legalization works as expected. + +declare <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) + +define <32 x i64> @vsadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_v32i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB108_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB108_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vsadd.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v32i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: li a2, 16 +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB108_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB108_2: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsadd.vi v8, v8, -1, v0.t +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vsadd.vi v16, v16, -1, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> 
@llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl) + ret <32 x i64> %v +} + +define <32 x i64> @vsadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vi_v32i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB109_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB109_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v24 +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v16, v16, v24 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vi_v32i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: li a2, 16 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB109_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB109_2: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsadd.vi v8, v8, -1 +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsadd.vi v16, v16, -1 +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %head = insertelement <32 x i1> poison, i1 true, i32 0 + %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl) + ret <32 x i64> %v +} + +; FIXME: We don't match vsadd.vi on RV32. 
+ +define <32 x i64> @vsadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { +; RV32-LABEL: vsadd_vx_v32i64_evl12: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v24, v0.t +; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vsadd.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v32i64_evl12: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; RV64-NEXT: vsadd.vi v8, v8, -1, v0.t +; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vsadd.vi v16, v16, -1, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12) + ret <32 x i64> %v +} + +define <32 x i64> @vsadd_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { +; RV32-LABEL: vsadd_vx_v32i64_evl27: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v24, v0.t +; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vsadd.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_v32i64_evl27: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsadd.vi v8, v8, -1, v0.t +; RV64-NEXT: 
vsetivli zero, 11, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vsadd.vi v16, v16, -1, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27) + ret <32 x i64> %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll new file mode 100644 index 0000000000000..6227f8abe599e --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll @@ -0,0 +1,1697 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare <8 x i7> @llvm.vp.uadd.sat.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) + +define <8 x i7> @vsaddu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 127 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: vminu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %v = call <8 x i7> @llvm.vp.uadd.sat.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) + ret <8 x i7> %v +} + +declare <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32) + +define <2 x i8> @vsaddu_vv_v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i8> 
@llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsaddu_vv_v2i8_unmasked(<2 x i8> %va, <2 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsaddu_vx_v2i8(<2 x i8> %va, i8 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsaddu_vx_v2i8_unmasked(<2 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsaddu_vi_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + 
%elt.head = insertelement <2 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vsaddu_vi_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +declare <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32) + +define <4 x i8> @vsaddu_vv_v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsaddu_vv_v4i8_unmasked(<4 x i8> %va, <4 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsaddu_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v4i8: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsaddu_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v4i8_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsaddu_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsaddu_vi_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x 
i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vsaddu_vi_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +declare <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8>, <5 x i8>, <5 x i1>, i32) + +define <5 x i8> @vsaddu_vv_v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsaddu_vv_v5i8_unmasked(<5 x i8> %va, <5 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsaddu_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, 
<5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsaddu_vx_v5i8_unmasked(<5 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsaddu_vi_v5i8(<5 x i8> %va, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vsaddu_vi_v5i8_unmasked(<5 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +declare <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, 
i32) + +define <8 x i8> @vsaddu_vv_v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsaddu_vv_v8i8_unmasked(<8 x i8> %va, <8 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsaddu_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsaddu_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> 
%m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsaddu_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vsaddu_vi_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +declare <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32) + +define <16 x i8> @vsaddu_vv_v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsaddu_vv_v16i8_unmasked(<16 x i8> %va, <16 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x 
i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsaddu_vx_v16i8(<16 x i8> %va, i8 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsaddu_vx_v16i8_unmasked(<16 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsaddu_vi_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vsaddu_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v16i8_unmasked: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +declare <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32) + +define <256 x i8> @vsaddu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v258i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: addi a0, a1, -128 +; CHECK-NEXT: sltu a3, a1, a0 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a0, a3, a0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t +; CHECK-NEXT: bltu a1, a2, .LBB32_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: .LBB32_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl) + ret <256 x i8> %v +} + +define <256 x i8> @vsaddu_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v258i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB33_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; 
CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: addi a1, a0, -128 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %head = insertelement <256 x i1> poison, i1 true, i32 0 + %m = shufflevector <256 x i1> %head, <256 x i1> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl) + ret <256 x i8> %v +} + +; Test splitting when the %evl is a known constant. + +define <256 x i8> @vsaddu_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) { +; CHECK-LABEL: vsaddu_vi_v258i8_evl129: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129) + ret <256 x i8> %v +} + +; FIXME: The upper half is doing nothing. 
+ +define <256 x i8> @vsaddu_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { +; CHECK-LABEL: vsaddu_vi_v258i8_evl128: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128) + ret <256 x i8> %v +} + +declare <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32) + +define <2 x i16> @vsaddu_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsaddu_vv_v2i16_unmasked(<2 x i16> %va, <2 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsaddu_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 %b, 
i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsaddu_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsaddu_vi_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vsaddu_vi_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x 
i1> %m, i32 %evl) + ret <2 x i16> %v +} + +declare <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32) + +define <4 x i16> @vsaddu_vv_v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsaddu_vv_v4i16_unmasked(<4 x i16> %va, <4 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsaddu_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsaddu_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, 
i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsaddu_vi_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vsaddu_vi_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +declare <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16>, <8 x i16>, <8 x i1>, i32) + +define <8 x i16> @vsaddu_vv_v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsaddu_vv_v8i16_unmasked(<8 x i16> %va, <8 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i16_unmasked: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsaddu_vx_v8i16(<8 x i16> %va, i16 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsaddu_vx_v8i16_unmasked(<8 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsaddu_vi_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x 
i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vsaddu_vi_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +declare <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16>, <16 x i16>, <16 x i1>, i32) + +define <16 x i16> @vsaddu_vv_v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsaddu_vv_v16i16_unmasked(<16 x i16> %va, <16 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsaddu_vx_v16i16(<16 x i16> %va, i16 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = 
insertelement <16 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsaddu_vx_v16i16_unmasked(<16 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsaddu_vi_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vsaddu_vi_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> 
zeroinitializer + %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +declare <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32) + +define <2 x i32> @vsaddu_vv_v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsaddu_vv_v2i32_unmasked(<2 x i32> %va, <2 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsaddu_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsaddu_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <2 x i32> 
%elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsaddu_vi_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vsaddu_vi_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +declare <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vsaddu_vv_v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsaddu_vv_v4i32_unmasked(<4 x i32> 
%va, <4 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsaddu_vx_v4i32(<4 x i32> %va, i32 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsaddu_vx_v4i32_unmasked(<4 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsaddu_vi_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x 
i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vsaddu_vi_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +declare <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) + +define <8 x i32> @vsaddu_vv_v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsaddu_vv_v8i32_unmasked(<8 x i32> %va, <8 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsaddu_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: 
vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsaddu_vx_v8i32_unmasked(<8 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsaddu_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vsaddu_vi_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> 
zeroinitializer + %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +declare <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32>, <16 x i32>, <16 x i1>, i32) + +define <16 x i32> @vsaddu_vv_v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsaddu_vv_v16i32_unmasked(<16 x i32> %va, <16 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsaddu_vx_v16i32(<16 x i32> %va, i32 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsaddu_vx_v16i32_unmasked(<16 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 
%b, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsaddu_vi_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vsaddu_vi_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +declare <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32) + +define <2 x i64> @vsaddu_vv_v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 
%evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsaddu_vv_v2i64_unmasked(<2 x i64> %va, <2 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsaddu_vx_v2i64(<2 x i64> %va, i64 %b, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v9, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsaddu_vx_v2i64_unmasked(<2 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; 
RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsaddu_vi_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vsaddu_vi_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +declare <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32) + +define <4 x i64> @vsaddu_vv_v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsaddu_vv_v4i64_unmasked(<4 x i64> %va, <4 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsaddu_vx_v4i64(<4 x i64> %va, i64 %b, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsaddu_vx_v4i64_unmasked(<4 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: 
vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsaddu_vi_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vsaddu_vi_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +declare <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64>, <8 x 
i64>, <8 x i1>, i32) + +define <8 x i64> @vsaddu_vv_v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsaddu_vv_v8i64_unmasked(<8 x i64> %va, <8 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsaddu_vx_v8i64(<8 x i64> %va, i64 %b, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsaddu_vx_v8i64_unmasked(<8 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v8i64_unmasked: 
+; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsaddu_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vsaddu_vi_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer 
+ %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +declare <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32) + +define <16 x i64> @vsaddu_vv_v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsaddu_vv_v16i64_unmasked(<16 x i64> %va, <16 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_v16i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsaddu_vx_v16i64(<16 x i64> %va, i64 %b, <16 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v16i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v16, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v16i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> 
@llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsaddu_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v16i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v16i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsaddu_vi_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vsaddu_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_v16i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + 
%elt.head = insertelement <16 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +; Test that split-legalization works as expected. + +declare <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) + +define <32 x i64> @vsaddu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_v32i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB108_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB108_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vsaddu.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v32i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: li a2, 16 +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB108_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB108_2: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vi v8, v8, -1, v0.t +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vsaddu.vi 
v16, v16, -1, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl) + ret <32 x i64> %v +} + +define <32 x i64> @vsaddu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vi_v32i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB109_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB109_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v24 +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v16, v16, v24 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vi_v32i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: li a2, 16 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB109_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB109_2: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vi v8, v8, -1 +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vi v16, v16, -1 +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %head = insertelement <32 x i1> poison, i1 true, i32 0 + %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl) + ret <32 x i64> %v +} + +; FIXME: We don't match 
vsaddu.vi on RV32. + +define <32 x i64> @vsaddu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { +; RV32-LABEL: vsaddu_vx_v32i64_evl12: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v24, v0.t +; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vsaddu.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v32i64_evl12: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vi v8, v8, -1, v0.t +; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vsaddu.vi v16, v16, -1, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12) + ret <32 x i64> %v +} + +define <32 x i64> @vsaddu_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { +; RV32-LABEL: vsaddu_vx_v32i64_evl27: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v24, v0.t +; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vsaddu.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_v32i64_evl27: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vi v8, 
v8, -1, v0.t +; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vsaddu.vi v16, v16, -1, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27) + ret <32 x i64> %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll new file mode 100644 index 0000000000000..6360cf49d8d47 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll @@ -0,0 +1,1745 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare <8 x i7> @llvm.vp.ssub.sat.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) + +define <8 x i7> @vssub_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vadd.vv v9, v9, v9 +; CHECK-NEXT: vsra.vi v9, v9, 1 +; CHECK-NEXT: vadd.vv v8, v8, v8 +; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 192 +; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %v = call <8 x i7> @llvm.vp.ssub.sat.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) + ret <8 x i7> %v +} + +declare <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32) + +define <2 x i8> @vssub_vv_v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v2i8: +; CHECK: # %bb.0: 
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssub_vv_v2i8_unmasked(<2 x i8> %va, <2 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssub_vx_v2i8(<2 x i8> %va, i8 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssub_vx_v2i8_unmasked(<2 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssub_vi_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v2i8: +; CHECK: # 
%bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssub_vi_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +declare <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32) + +define <4 x i8> @vssub_vv_v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssub_vv_v4i8_unmasked(<4 x i8> %va, <4 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl) + ret <4 x 
i8> %v +} + +define <4 x i8> @vssub_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssub_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v4i8_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v9, v8, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssub_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssub_vi_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, 
v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssub_vi_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +declare <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8>, <5 x i8>, <5 x i1>, i32) + +define <5 x i8> @vssub_vv_v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssub_vv_v5i8_unmasked(<5 x i8> %va, <5 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssub_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: 
vssub_vx_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssub_vx_v5i8_unmasked(<5 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssub_vi_v5i8(<5 x i8> %va, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssub_vi_v5i8_unmasked(<5 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %head = insertelement 
<5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +declare <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32) + +define <8 x i8> @vssub_vv_v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssub_vv_v8i8_unmasked(<8 x i8> %va, <8 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssub_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssub_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 + %vb = 
shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssub_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssub_vi_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +declare <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32) + +define <16 x i8> @vssub_vv_v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> 
@vssub_vv_v16i8_unmasked(<16 x i8> %va, <16 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssub_vx_v16i8(<16 x i8> %va, i8 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssub_vx_v16i8_unmasked(<16 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssub_vi_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 -1, 
i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssub_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +declare <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32) + +define <256 x i8> @vssub_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v258i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: addi a0, a1, -128 +; CHECK-NEXT: sltu a3, a1, a0 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a0 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t +; CHECK-NEXT: bltu a1, a2, .LBB32_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: .LBB32_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 
%evl) + ret <256 x i8> %v +} + +define <256 x i8> @vssub_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v258i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB33_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a2 +; CHECK-NEXT: addi a1, a0, -128 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a2 +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %head = insertelement <256 x i1> poison, i1 true, i32 0 + %m = shufflevector <256 x i1> %head, <256 x i1> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl) + ret <256 x i8> %v +} + +; Test splitting when the %evl is a known constant. + +define <256 x i8> @vssub_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) { +; CHECK-LABEL: vssub_vi_v258i8_evl129: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129) + ret <256 x i8> %v +} + +; FIXME: The upper half is doing nothing. 
+ +define <256 x i8> @vssub_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { +; CHECK-LABEL: vssub_vi_v258i8_evl128: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128) + ret <256 x i8> %v +} + +declare <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32) + +define <2 x i16> @vssub_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssub_vv_v2i16_unmasked(<2 x i16> %va, <2 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssub_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> 
poison, i16 %b, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssub_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssub_vi_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssub_vi_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> 
@llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +declare <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32) + +define <4 x i16> @vssub_vv_v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssub_vv_v4i16_unmasked(<4 x i16> %va, <4 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssub_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssub_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + 
%head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssub_vi_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssub_vi_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +declare <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16>, <8 x i16>, <8 x i1>, i32) + +define <8 x i16> @vssub_vv_v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssub_vv_v8i16_unmasked(<8 x i16> %va, <8 x i16> %b, 
i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssub_vx_v8i16(<8 x i16> %va, i16 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssub_vx_v8i16_unmasked(<8 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssub_vi_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> 
poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssub_vi_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +declare <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16>, <16 x i16>, <16 x i1>, i32) + +define <16 x i16> @vssub_vv_v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssub_vv_v16i16_unmasked(<16 x i16> %va, <16 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssub_vx_v16i16(<16 x i16> %va, i16 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssub_vx_v16i16_unmasked(<16 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssub_vi_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssub_vi_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x 
i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +declare <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32) + +define <2 x i32> @vssub_vv_v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssub_vv_v2i32_unmasked(<2 x i32> %va, <2 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssub_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssub_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: 
vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssub_vi_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssub_vi_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +declare <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vssub_vv_v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v 
= call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssub_vv_v4i32_unmasked(<4 x i32> %va, <4 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssub_vx_v4i32(<4 x i32> %va, i32 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssub_vx_v4i32_unmasked(<4 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssub_vi_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, 
e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssub_vi_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +declare <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) + +define <8 x i32> @vssub_vv_v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssub_vv_v8i32_unmasked(<8 x i32> %va, <8 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + 
+define <8 x i32> @vssub_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssub_vx_v8i32_unmasked(<8 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssub_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssub_vi_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = 
insertelement <8 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +declare <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32>, <16 x i32>, <16 x i1>, i32) + +define <16 x i32> @vssub_vv_v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssub_vv_v16i32_unmasked(<16 x i32> %va, <16 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssub_vx_v16i32(<16 x i32> %va, i32 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssub_vx_v16i32_unmasked(<16 x i32> %va, i32 
%b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssub_vi_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssub_vi_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +declare <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32) + +define <2 x i64> @vssub_vv_v2i64(<2 x i64> %va, <2 x i64> %b, <2 x 
i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssub_vv_v2i64_unmasked(<2 x i64> %va, <2 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssub_vx_v2i64(<2 x i64> %va, i64 %b, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v9, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssub_vx_v2i64_unmasked(<2 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 
12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssub_vi_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssub_vi_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x 
i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +declare <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32) + +define <4 x i64> @vssub_vv_v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vssub_vv_v4i64_unmasked(<4 x i64> %va, <4 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vssub_vx_v4i64(<4 x i64> %va, i64 %b, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> 
@vssub_vx_v4i64_unmasked(<4 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vssub_vi_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vssub_vi_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> 
zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +declare <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32) + +define <8 x i64> @vssub_vv_v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssub_vv_v8i64_unmasked(<8 x i64> %va, <8 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssub_vx_v8i64(<8 x i64> %va, i64 %b, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 + %vb = 
shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssub_vx_v8i64_unmasked(<8 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssub_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssub_vi_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: 
vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +declare <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32) + +define <16 x i64> @vssub_vv_v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssub_vv_v16i64_unmasked(<16 x i64> %va, <16 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_v16i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssub_vx_v16i64(<16 x i64> %va, i64 %b, <16 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v16i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v16, v0.t +; RV32-NEXT: addi sp, 
sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v16i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssub_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v16i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v16i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssub_vi_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %v = call 
<16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssub_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_v16i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +; Test that split-legalization works as expected. + +declare <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) + +define <32 x i64> @vssub_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_v32i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB108_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB108_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vssub.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v32i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: li a2, 16 +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB108_2 
+; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB108_2: +; RV64-NEXT: li a2, -1 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a2, v0.t +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vssub.vx v16, v16, a2, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl) + ret <32 x i64> %v +} + +define <32 x i64> @vssub_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { +; RV32-LABEL: vssub_vi_v32i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB109_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB109_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v24 +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v16, v16, v24 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vi_v32i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: li a2, 16 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB109_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB109_2: +; RV64-NEXT: li a2, -1 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a2 +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v16, v16, a2 +; RV64-NEXT: ret + %elt.head = 
insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %head = insertelement <32 x i1> poison, i1 true, i32 0 + %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl) + ret <32 x i64> %v +} + +; FIXME: We don't match vssub.vi on RV32. + +define <32 x i64> @vssub_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { +; RV32-LABEL: vssub_vx_v32i64_evl12: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v24, v0.t +; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vssub.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v32i64_evl12: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: li a0, -1 +; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vssub.vx v16, v16, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12) + ret <32 x i64> %v +} + +define <32 x i64> @vssub_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { +; RV32-LABEL: vssub_vx_v32i64_evl27: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v24, -1 +; 
RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v24, v0.t +; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vssub.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_v32i64_evl27: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: li a0, -1 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vssub.vx v16, v16, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27) + ret <32 x i64> %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll new file mode 100644 index 0000000000000..6ea9758871230 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll @@ -0,0 +1,1740 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare <8 x i7> @llvm.vp.usub.sat.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32) + +define <8 x i7> @vssubu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 127 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i7> 
@llvm.vp.usub.sat.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl) + ret <8 x i7> %v +} + +declare <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32) + +define <2 x i8> @vssubu_vv_v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssubu_vv_v2i8_unmasked(<2 x i8> %va, <2 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssubu_vx_v2i8(<2 x i8> %va, i8 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssubu_vx_v2i8_unmasked(<2 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, 
i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssubu_vi_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +define <2 x i8> @vssubu_vi_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i8> %v +} + +declare <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32) + +define <4 x i8> @vssubu_vv_v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssubu_vv_v4i8_unmasked(<4 x i8> %va, <4 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v4i8_unmasked: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssubu_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssubu_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v4i8_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v9, v8, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssubu_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = 
call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssubu_vi_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +define <4 x i8> @vssubu_vi_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i8> %v +} + +declare <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8>, <5 x i8>, <5 x i1>, i32) + +define <5 x i8> @vssubu_vv_v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssubu_vv_v5i8_unmasked(<5 x i8> %va, <5 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: 
ret + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssubu_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssubu_vx_v5i8_unmasked(<5 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssubu_vi_v5i8(<5 x i8> %va, <5 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v5i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +define <5 x i8> @vssubu_vi_v5i8_unmasked(<5 x i8> %va, i32 
zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v5i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer + %head = insertelement <5 x i1> poison, i1 true, i32 0 + %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer + %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl) + ret <5 x i8> %v +} + +declare <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32) + +define <8 x i8> @vssubu_vv_v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssubu_vv_v8i8_unmasked(<8 x i8> %va, <8 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssubu_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x 
i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssubu_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssubu_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +define <8 x i8> @vssubu_vi_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i8> %v +} + +declare <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32) + +define <16 x 
i8> @vssubu_vv_v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssubu_vv_v16i8_unmasked(<16 x i8> %va, <16 x i8> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssubu_vx_v16i8(<16 x i8> %va, i8 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssubu_vx_v16i8_unmasked(<16 x i8> %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x 
i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssubu_vi_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +define <16 x i8> @vssubu_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i8> %v +} + +declare <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32) + +define <256 x i8> @vssubu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v258i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: addi a0, a1, -128 +; CHECK-NEXT: sltu a3, a1, a0 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a0 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t +; CHECK-NEXT: bltu a1, a2, .LBB32_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: 
.LBB32_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl) + ret <256 x i8> %v +} + +define <256 x i8> @vssubu_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v258i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 128 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB33_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a2 +; CHECK-NEXT: addi a1, a0, -128 +; CHECK-NEXT: sltu a0, a0, a1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a2 +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %head = insertelement <256 x i1> poison, i1 true, i32 0 + %m = shufflevector <256 x i1> %head, <256 x i1> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl) + ret <256 x i8> %v +} + +; Test splitting when the %evl is a known constant. 
+ +define <256 x i8> @vssubu_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) { +; CHECK-LABEL: vssubu_vi_v258i8_evl129: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129) + ret <256 x i8> %v +} + +; FIXME: The upper half is doing nothing. + +define <256 x i8> @vssubu_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { +; CHECK-LABEL: vssubu_vi_v258i8_evl128: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 128 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v24, (a0) +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0 + %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer + %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128) + ret <256 x i8> %v +} + +declare <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32) + +define <2 x i16> @vssubu_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> 
@vssubu_vv_v2i16_unmasked(<2 x i16> %va, <2 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssubu_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssubu_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssubu_vi_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, 
i16 -1, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vssubu_vi_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +declare <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32) + +define <4 x i16> @vssubu_vv_v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssubu_vv_v4i16_unmasked(<4 x i16> %va, <4 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssubu_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: 
vssubu_vx_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssubu_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssubu_vi_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vssubu_vi_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <4 x i16> %elt.head, <4 x i16> 
poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +declare <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16>, <8 x i16>, <8 x i1>, i32) + +define <8 x i16> @vssubu_vv_v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssubu_vv_v8i16_unmasked(<8 x i16> %va, <8 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssubu_vx_v8i16(<8 x i16> %va, i16 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssubu_vx_v8i16_unmasked(<8 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; 
CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssubu_vi_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vssubu_vi_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +declare <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16>, <16 x i16>, <16 x i1>, i32) + +define <16 x i16> @vssubu_vv_v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, 
v10, v0.t +; CHECK-NEXT: ret + %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssubu_vv_v16i16_unmasked(<16 x i16> %va, <16 x i16> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssubu_vx_v16i16(<16 x i16> %va, i16 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssubu_vx_v16i16_unmasked(<16 x i16> %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssubu_vi_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) { +; 
CHECK-LABEL: vssubu_vi_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +define <16 x i16> @vssubu_vi_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0 + %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +declare <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32) + +define <2 x i32> @vssubu_vv_v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssubu_vv_v2i32_unmasked(<2 x i32> %va, <2 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x 
i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssubu_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssubu_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssubu_vi_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vssubu_vi_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v2i32_unmasked: +; CHECK: # 
%bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +declare <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vssubu_vv_v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssubu_vv_v4i32_unmasked(<4 x i32> %va, <4 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssubu_vx_v4i32(<4 x i32> %va, i32 %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, 
<4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssubu_vx_v4i32_unmasked(<4 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssubu_vi_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vssubu_vi_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +declare <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) 
+ +define <8 x i32> @vssubu_vv_v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssubu_vv_v8i32_unmasked(<8 x i32> %va, <8 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssubu_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssubu_vx_v8i32_unmasked(<8 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> 
@llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssubu_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> @vssubu_vi_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +declare <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32>, <16 x i32>, <16 x i1>, i32) + +define <16 x i32> @vssubu_vv_v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssubu_vv_v16i32_unmasked(<16 x i32> %va, <16 x i32> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, 
ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssubu_vx_v16i32(<16 x i32> %va, i32 %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssubu_vx_v16i32_unmasked(<16 x i32> %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssubu_vi_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> 
@llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vssubu_vi_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0 + %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +declare <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32) + +define <2 x i64> @vssubu_vv_v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssubu_vv_v2i64_unmasked(<2 x i64> %va, <2 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssubu_vx_v2i64(<2 x i64> %va, i64 %b, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; 
RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v9, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssubu_vx_v2i64_unmasked(<2 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssubu_vi_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: 
vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vssubu_vi_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +declare <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32) + +define <4 x i64> @vssubu_vv_v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vssubu_vv_v4i64_unmasked(<4 x i64> %va, <4 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl) + 
ret <4 x i64> %v +} + +define <4 x i64> @vssubu_vx_v4i64(<4 x i64> %va, i64 %b, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vssubu_vx_v4i64_unmasked(<4 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x 
i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vssubu_vi_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vssubu_vi_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +declare <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32) + +define <8 x i64> @vssubu_vv_v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssubu_vv_v8i64_unmasked(<8 x i64> %va, <8 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12 +; CHECK-NEXT: ret + 
%head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssubu_vx_v8i64(<8 x i64> %va, i64 %b, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssubu_vx_v8i64_unmasked(<8 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> 
poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssubu_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> @vssubu_vi_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +declare <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32) + +define <16 x i64> @vssubu_vv_v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define 
<16 x i64> @vssubu_vv_v16i64_unmasked(<16 x i64> %va, <16 x i64> %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_v16i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssubu_vx_v16i64(<16 x i64> %va, i64 %b, <16 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v16i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v16, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v16i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssubu_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v16i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; 
RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v16i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssubu_vi_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +define <16 x i64> @vssubu_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_v16i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl) + ret <16 x i64> %v +} + +; Test that split-legalization works as expected. 
+ +declare <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32) + +define <32 x i64> @vssubu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_v32i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB108_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB108_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vssubu.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v32i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: li a2, 16 +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB108_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB108_2: +; RV64-NEXT: li a2, -1 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a2, v0.t +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vssubu.vx v16, v16, a2, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl) + ret <32 x i64> %v +} + +define <32 x i64> @vssubu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vi_v32i64_unmasked: +; 
RV32: # %bb.0: +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB109_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB109_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v24 +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v16, v16, v24 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vi_v32i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: li a2, 16 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB109_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB109_2: +; RV64-NEXT: li a2, -1 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a2 +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vssubu.vx v16, v16, a2 +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %head = insertelement <32 x i1> poison, i1 true, i32 0 + %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl) + ret <32 x i64> %v +} + +; FIXME: We don't match vssubu.vi on RV32. 
+ +define <32 x i64> @vssubu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { +; RV32-LABEL: vssubu_vx_v32i64_evl12: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v24, v0.t +; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vssubu.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v32i64_evl12: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: li a0, -1 +; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vssubu.vx v16, v16, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12) + ret <32 x i64> %v +} + +define <32 x i64> @vssubu_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) { +; RV32-LABEL: vssubu_vx_v32i64_evl27: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vslidedown.vi v1, v0, 2 +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vmv.v.i v24, -1 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v24, v0.t +; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vssubu.vv v16, v16, v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_v32i64_evl27: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v0, 2 +; RV64-NEXT: li a0, -1 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, 
ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v0, v24 +; RV64-NEXT: vssubu.vx v16, v16, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0 + %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer + %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27) + ret <32 x i64> %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll new file mode 100644 index 0000000000000..caaeae55ed78e --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll @@ -0,0 +1,2015 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare @llvm.vp.sadd.sat.nxv8i7(, , , i32) + +define @vsadd_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v8 +; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 192 +; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i7 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i7( %a, %vb, %mask, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv1i8(, , , i32) + +define @vsadd_vv_nxv1i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call 
@llvm.vp.sadd.sat.nxv1i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv1i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i8_commute( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv1i8_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i8( %vb, %va, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv1i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t 
+; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv1i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv2i8(, , , i32) + +define @vsadd_vv_nxv2i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv2i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv2i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv2i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv2i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv2i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv2i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv3i8(, , , i32) + +define @vsadd_vv_nxv3i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv3i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv3i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv3i8( %va, %b, %m, i32 
%evl) + ret %v +} + +define @vsadd_vx_nxv3i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv3i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv3i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv3i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv4i8(, , , i32) + +define @vsadd_vv_nxv4i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv4i8: +; CHECK: # %bb.0: 
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv4i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv4i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv4i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv4i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv4i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv4i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: 
vsadd_vi_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv8i8(, , , i32) + +define @vsadd_vv_nxv8i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv8i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv8i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv8i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, 
zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv8i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv16i8(, , , i32) + +define @vsadd_vv_nxv16i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv16i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv16i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv16i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement 
poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv16i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv16i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv16i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv32i8(, , , i32) + +define @vsadd_vv_nxv32i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv32i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv32i8_unmasked( %va, %b, i32 zeroext %evl) { +; 
CHECK-LABEL: vsadd_vv_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv32i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv32i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv32i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv32i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = 
insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv64i8(, , , i32) + +define @vsadd_vv_nxv64i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv64i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv64i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv64i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv64i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv64i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv64i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; 
CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv64i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +; Test that split-legalization works when the mask itself needs splitting. + +declare @llvm.vp.sadd.sat.nxv128i8(, , , i32) + +define @vsadd_vi_nxv128i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv128i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub a2, a1, a0 +; CHECK-NEXT: sltu a3, a1, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: bltu a1, a0, .LBB50_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: .LBB50_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv128i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv128i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv128i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; 
CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1 +; CHECK-NEXT: bltu a0, a1, .LBB51_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB51_2: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv128i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv1i16(, , , i32) + +define @vsadd_vv_nxv1i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv1i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv1i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv1i16_unmasked: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv1i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv1i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv2i16(, , , i32) + +define @vsadd_vv_nxv2i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv2i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv2i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call 
@llvm.vp.sadd.sat.nxv2i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv2i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv2i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv2i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv2i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv4i16(, , , i32) + +define @vsadd_vv_nxv4i16( %va, %b, %m, i32 
zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv4i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv4i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv4i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv4i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define 
@vsadd_vi_nxv4i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv8i16(, , , i32) + +define @vsadd_vv_nxv8i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv8i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv8i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv8i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv8i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, 
zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv8i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv8i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv16i16(, , , i32) + +define @vsadd_vv_nxv16i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv16i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv16i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv16i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv16i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv16i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv16i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv32i16(, , , i32) + +define @vsadd_vv_nxv32i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = 
call @llvm.vp.sadd.sat.nxv32i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv32i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv32i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv32i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv32i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv32i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; 
CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv1i32(, , , i32) + +define @vsadd_vv_nxv1i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv1i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv1i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i32( %va, %vb, %m, i32 
%evl) + ret %v +} + +define @vsadd_vi_nxv1i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv1i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv2i32(, , , i32) + +define @vsadd_vv_nxv2i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv2i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv2i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv2i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, 
poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv2i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv2i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv4i32(, , , i32) + +define @vsadd_vv_nxv4i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv4i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv4i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv4i32_unmasked: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv4i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv4i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv4i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv4i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m 
= shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv8i32(, , , i32) + +define @vsadd_vv_nxv8i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv8i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv8i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv8i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv8i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, 
v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv8i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv16i32(, , , i32) + +define @vsadd_vv_nxv16i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv16i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv16i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv16i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vx_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv16i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: 
vsadd_vx_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv16i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv16i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +; Test that split-legalization works then the mask needs manual splitting. 
+ +declare @llvm.vp.sadd.sat.nxv32i32(, , , i32) + +define @vsadd_vi_nxv32i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t +; CHECK-NEXT: bltu a0, a1, .LBB118_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB118_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv32i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv32i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vi v16, v16, -1 +; CHECK-NEXT: bltu a0, a1, .LBB119_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB119_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv32i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv1i64(, , , i32) + +define @vsadd_vv_nxv1i64( %va, 
%b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv1i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv1i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v9, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv1i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_nxv1i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: 
ret +; +; RV64-LABEL: vsadd_vx_nxv1i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv1i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv2i64(, , , i32) + +define @vsadd_vv_nxv2i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv2i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv2i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = 
shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv2i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv2i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_nxv2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_nxv2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv2i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv2i64: +; 
CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv2i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv4i64(, , , i32) + +define @vsadd_vv_nxv4i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv4i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv4i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv4i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsadd.vv v8, 
v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv4i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_nxv4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_nxv4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv4i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv4i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = 
insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.sadd.sat.nxv8i64(, , , i32) + +define @vsadd_vv_nxv8i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.sadd.sat.nxv8i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vv_nxv8i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vv_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv8i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v16, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vx_nxv8i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsadd_vx_nxv8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: 
addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsadd.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsadd_vx_nxv8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsadd.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv8i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsadd_vi_nxv8i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsadd_vi_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsadd.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.sadd.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll new file mode 100644 index 0000000000000..c0779e508c0a9 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll @@ -0,0 +1,2014 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare @llvm.vp.uadd.sat.nxv8i7(, , , i32) + +define @vsaddu_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 127 +; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vand.vx v9, v9, a2 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t +; CHECK-NEXT: vminu.vx v8, v8, a2, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i7 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i7( %a, %vb, %mask, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv1i8(, , , i32) + +define @vsaddu_vv_nxv1i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv1i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv1i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector 
%elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i8_commute( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv1i8_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i8( %vb, %va, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv1i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv1i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv2i8(, , , 
i32) + +define @vsaddu_vv_nxv2i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv2i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv2i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv2i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv2i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv2i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i8( 
%va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv2i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv3i8(, , , i32) + +define @vsaddu_vv_nxv3i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv3i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv3i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv3i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv3i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv3i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = 
shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv3i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv3i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv4i8(, , , i32) + +define @vsaddu_vv_nxv4i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv4i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv4i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv4i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv4i8: +; CHECK: # %bb.0: 
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv4i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv4i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv4i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv8i8(, , , i32) + +define @vsaddu_vv_nxv8i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call 
@llvm.vp.uadd.sat.nxv8i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv8i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv8i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv8i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; 
CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv16i8(, , , i32) + +define @vsaddu_vv_nxv16i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv16i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv16i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv16i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv16i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define 
@vsaddu_vi_nxv16i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv16i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv32i8(, , , i32) + +define @vsaddu_vv_nxv32i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv32i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv32i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv32i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v 
= call @llvm.vp.uadd.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv32i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv32i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv32i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv64i8(, , , i32) + +define @vsaddu_vv_nxv64i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv64i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv64i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv64i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv64i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv64i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv64i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv64i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector 
%head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +; Test that split-legalization works when the mask itself needs splitting. + +declare @llvm.vp.uadd.sat.nxv128i8(, , , i32) + +define @vsaddu_vi_nxv128i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv128i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: sub a2, a1, a0 +; CHECK-NEXT: sltu a3, a1, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t +; CHECK-NEXT: bltu a1, a0, .LBB50_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: .LBB50_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv128i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv128i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv128i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1 +; CHECK-NEXT: bltu a0, a1, .LBB51_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB51_2: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call 
@llvm.vp.uadd.sat.nxv128i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv1i16(, , , i32) + +define @vsaddu_vv_nxv1i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv1i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv1i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv1i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = 
insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv1i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv2i16(, , , i32) + +define @vsaddu_vv_nxv2i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv2i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv2i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv2i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv2i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv2i16_unmasked: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv2i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv2i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv4i16(, , , i32) + +define @vsaddu_vv_nxv4i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv4i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv4i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call 
@llvm.vp.uadd.sat.nxv4i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv4i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv4i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv4i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv8i16(, , , i32) + +define @vsaddu_vv_nxv8i16( %va, %b, 
%m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv8i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv8i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv8i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv8i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv8i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i16( %va, %vb, %m, i32 %evl) + 
ret %v +} + +define @vsaddu_vi_nxv8i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv16i16(, , , i32) + +define @vsaddu_vv_nxv16i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv16i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv16i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv16i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv16i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + 
%vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv16i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv16i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv32i16(, , , i32) + +define @vsaddu_vv_nxv32i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv32i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv32i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv32i16( %va, i16 %b, %m, i32 zeroext %evl) { +; 
CHECK-LABEL: vsaddu_vx_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv32i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv32i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv32i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv1i32(, , , i32) + +define @vsaddu_vv_nxv1i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, 
ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv1i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv1i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv1i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv1i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv1i32_unmasked: 
+; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv2i32(, , , i32) + +define @vsaddu_vv_nxv2i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv2i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv2i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv2i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv2i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, 
poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv2i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv4i32(, , , i32) + +define @vsaddu_vv_nxv4i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv4i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv4i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv4i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; 
CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv4i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv4i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv4i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv8i32(, , , i32) + +define @vsaddu_vv_nxv8i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv8i32( %va, %b, %m, i32 %evl) + ret %v +} + +define 
@vsaddu_vv_nxv8i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv8i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv8i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv8i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement 
poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv16i32(, , , i32) + +define @vsaddu_vv_nxv16i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv16i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv16i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv16i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv16i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vx_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv16i32( %va, %m, 
i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv16i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +; Test that split-legalization works then the mask needs manual splitting. + +declare @llvm.vp.uadd.sat.nxv32i32(, , , i32) + +define @vsaddu_vi_nxv32i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 2 +; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t +; CHECK-NEXT: bltu a0, a1, .LBB118_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB118_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define 
@vsaddu_vi_nxv32i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv32i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v16, v16, -1 +; CHECK-NEXT: bltu a0, a1, .LBB119_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB119_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv32i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv1i64(, , , i32) + +define @vsaddu_vv_nxv1i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv1i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv1i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), 
zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v9, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv1i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_nxv1i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_nxv1i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv1i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, 
e64, m1, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv2i64(, , , i32) + +define @vsaddu_vv_nxv2i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv2i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv2i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv2i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv2i64_unmasked( %va, i64 
%b, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_nxv2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_nxv2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv2i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv2i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv4i64(, , , i32) + +define @vsaddu_vv_nxv4i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv4i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv4i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv4i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv4i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_nxv4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_nxv4i64_unmasked: +; RV64: # %bb.0: +; 
RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv4i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv4i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.uadd.sat.nxv8i64(, , , i32) + +define @vsaddu_vv_nxv8i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.uadd.sat.nxv8i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vv_nxv8i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vv_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call 
@llvm.vp.uadd.sat.nxv8i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv8i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v16, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vx_nxv8i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vsaddu_vx_nxv8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vsaddu.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vsaddu_vx_nxv8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vsaddu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv8i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, 
e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vsaddu_vi_nxv8i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vsaddu_vi_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vsaddu.vi v8, v8, -1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.uadd.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll new file mode 100644 index 0000000000000..2d51a2ee44f65 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll @@ -0,0 +1,2067 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare @llvm.vp.ssub.sat.nxv8i7(, , , i32) + +define @vssub_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v8 +; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 63 +; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t +; CHECK-NEXT: li a0, 192 +; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i7 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i7( %a, %vb, 
%mask, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv1i8(, , , i32) + +define @vssub_vv_nxv1i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv1i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv1i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i8_commute( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv1i8_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vv v8, v9, v8, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i8( %vb, %va, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, 
poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv1i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv1i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv2i8(, , , i32) + +define @vssub_vv_nxv2i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv2i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv2i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv2i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv2i8: +; 
CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv2i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv2i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv2i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv3i8(, , , i32) + +define @vssub_vv_nxv3i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: 
vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv3i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv3i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv3i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv3i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv3i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv3i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv3i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: 
li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv4i8(, , , i32) + +define @vssub_vv_nxv4i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv4i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv4i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv4i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv4i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call 
@llvm.vp.ssub.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv4i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv4i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv8i8(, , , i32) + +define @vssub_vv_nxv8i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv8i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv8i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = 
insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv8i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv8i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv16i8(, , , i32) + +define @vssub_vv_nxv16i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv16i8( %va, %b, %m, i32 %evl) + ret %v +} + +define 
@vssub_vv_nxv16i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv16i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv16i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv16i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv16i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + 
%elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv32i8(, , , i32) + +define @vssub_vv_nxv32i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv32i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv32i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv32i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv32i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv32i8( %va, %m, i32 zeroext 
%evl) { +; CHECK-LABEL: vssub_vi_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv32i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv64i8(, , , i32) + +define @vssub_vv_nxv64i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv64i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv64i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv64i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv64i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = 
call @llvm.vp.ssub.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv64i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv64i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv64i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +; Test that split-legalization works when the mask itself needs splitting. 
+ +declare @llvm.vp.ssub.sat.nxv128i8(, , , i32) + +define @vssub_vi_nxv128i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv128i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub a0, a1, a2 +; CHECK-NEXT: sltu a3, a1, a0 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a0 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t +; CHECK-NEXT: bltu a1, a2, .LBB50_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: .LBB50_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv128i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv128i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv128i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a2 +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a2 +; CHECK-NEXT: bltu a0, a1, .LBB51_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB51_2: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a2 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv128i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv1i16(, , , i32) + +define @vssub_vv_nxv1i16( %va, %b, 
%m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv1i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv1i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv1i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i16( %va, %vb, 
%m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv1i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv2i16(, , , i32) + +define @vssub_vv_nxv2i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv2i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv2i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv2i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv2i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, 
i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv2i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv2i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv4i16(, , , i32) + +define @vssub_vv_nxv4i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv4i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv4i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv4i16( %va, i16 
%b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv4i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv4i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv8i16(, , , i32) + +define @vssub_vv_nxv8i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv8i16: +; 
CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv8i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv8i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv8i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv8i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv8i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv8i16_unmasked( 
%va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv16i16(, , , i32) + +define @vssub_vv_nxv16i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv16i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv16i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv16i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv16i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, 
zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv16i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv16i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv32i16(, , , i32) + +define @vssub_vv_nxv32i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv32i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv32i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv32i16( %va, i16 %b, %m, i32 zeroext %evl) { +; 
CHECK-LABEL: vssub_vx_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv32i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv32i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv32i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv1i32(, , , i32) + +define @vssub_vv_nxv1i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv1i32: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv1i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv1i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv1i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv1i32_unmasked( %va, i32 zeroext 
%evl) { +; CHECK-LABEL: vssub_vi_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv2i32(, , , i32) + +define @vssub_vv_nxv2i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv2i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv2i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv2i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv2i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = 
insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv2i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv4i32(, , , i32) + +define @vssub_vv_nxv4i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv4i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv4i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv4i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv4i32: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv4i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv4i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv4i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv8i32(, , , i32) + +define @vssub_vv_nxv8i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: 
vssub.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv8i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv8i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv8i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv8i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv8i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv8i32_unmasked: +; CHECK: # 
%bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv16i32(, , , i32) + +define @vssub_vv_nxv16i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv16i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv16i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv16i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv16i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vx_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = 
shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv16i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv16i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +; Test that split-legalization works then the mask needs manual splitting. 
+ +declare @llvm.vp.ssub.sat.nxv32i32(, , , i32) + +define @vssub_vi_nxv32i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a1, a2, 2 +; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a1 +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: sub a1, a0, a2 +; CHECK-NEXT: sltu a3, a0, a1 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a1 +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a1, v0.t +; CHECK-NEXT: bltu a0, a2, .LBB118_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: .LBB118_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv32i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv32i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a2 +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vssub.vx v16, v16, a2 +; CHECK-NEXT: bltu a0, a1, .LBB119_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB119_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a2 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv32i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare 
@llvm.vp.ssub.sat.nxv1i64(, , , i32) + +define @vssub_vv_nxv1i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv1i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv1i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v9, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv1i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_nxv1i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; 
RV32-NEXT: vssub.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_nxv1i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv1i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv2i64(, , , i32) + +define @vssub_vv_nxv2i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv2i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv2i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, 
m2, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv2i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv2i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_nxv2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_nxv2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i64( %va, %vb, %m, 
i32 %evl) + ret %v +} + +define @vssub_vi_nxv2i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv2i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv4i64(, , , i32) + +define @vssub_vv_nxv4i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv4i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv4i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv4i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, 
sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv4i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_nxv4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_nxv4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv4i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv4i64_unmasked( %va, i32 zeroext 
%evl) { +; CHECK-LABEL: vssub_vi_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.ssub.sat.nxv8i64(, , , i32) + +define @vssub_vv_nxv8i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.ssub.sat.nxv8i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vv_nxv8i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vv_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv8i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v16, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = 
call @llvm.vp.ssub.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vx_nxv8i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssub_vx_nxv8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vssub.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssub_vx_nxv8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssub.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv8i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssub_vi_nxv8i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssub_vi_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssub.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.ssub.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} diff --git 
a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll new file mode 100644 index 0000000000000..e5589ce1a9bc6 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll @@ -0,0 +1,2065 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare @llvm.vp.usub.sat.nxv8i7(, , , i32) + +define @vssubu_vx_nxv8i7( %a, i7 signext %b, %mask, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv8i7: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 127 +; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vand.vx v9, v9, a2 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i7 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i7( %a, %vb, %mask, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv1i8(, , , i32) + +define @vssubu_vv_nxv1i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv1i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv1i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv1i8( %va, i8 %b, %m, i32 zeroext %evl) { +; 
CHECK-LABEL: vssubu_vx_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv1i8_commute( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv1i8_commute: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v9, v8, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i8( %vb, %va, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv1i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv1i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv1i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: 
vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv2i8(, , , i32) + +define @vssubu_vv_nxv2i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv2i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv2i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv2i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv2i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + 
+define @vssubu_vi_nxv2i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv2i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv3i8(, , , i32) + +define @vssubu_vv_nxv3i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv3i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv3i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv3i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv3i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = 
shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv3i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv3i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv3i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv3i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv3i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv3i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv4i8(, , , i32) + +define @vssubu_vv_nxv4i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv4i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv4i8_unmasked( %va, %b, i32 
zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv4i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv4i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv4i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv4i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, 
i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv8i8(, , , i32) + +define @vssubu_vv_nxv8i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv8i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv8i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv8i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv8i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv8i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv8i8: +; 
CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv8i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv16i8(, , , i32) + +define @vssubu_vv_nxv16i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv16i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv16i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv16i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i8( %va, 
%vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv16i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv16i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv16i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv32i8(, , , i32) + +define @vssubu_vv_nxv32i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv32i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv32i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv32i8_unmasked: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv32i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv32i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv32i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv32i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = 
insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv64i8(, , , i32) + +define @vssubu_vv_nxv64i8( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv64i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv64i8_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv64i8( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv64i8( %va, i8 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv64i8_unmasked( %va, i8 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv64i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; 
CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv64i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv64i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv64i8( %va, %vb, %m, i32 %evl) + ret %v +} + +; Test that split-legalization works when the mask itself needs splitting. + +declare @llvm.vp.usub.sat.nxv128i8(, , , i32) + +define @vssubu_vi_nxv128i8( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv128i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma +; CHECK-NEXT: vlm.v v0, (a0) +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: sub a0, a1, a2 +; CHECK-NEXT: sltu a3, a1, a0 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a0 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t +; CHECK-NEXT: bltu a1, a2, .LBB50_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: .LBB50_2: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv128i8( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv128i8_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: 
vssubu_vi_nxv128i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a2 +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a2 +; CHECK-NEXT: bltu a0, a1, .LBB51_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB51_2: +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a2 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv128i8( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv1i16(, , , i32) + +define @vssubu_vv_nxv1i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv1i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv1i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv1i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i16( %va, %vb, %m, i32 %evl) + 
ret %v +} + +define @vssubu_vx_nxv1i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv1i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv1i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv2i16(, , , i32) + +define @vssubu_vv_nxv2i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv2i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv2i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv2i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv2i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv2i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv2i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + 
%head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv4i16(, , , i32) + +define @vssubu_vv_nxv4i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv4i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv4i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv4i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv4i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv4i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv8i16(, , , i32) + +define @vssubu_vv_nxv8i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv8i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv8i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv8i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v 
+} + +define @vssubu_vx_nxv8i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv8i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv8i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv16i16(, , , i32) + +define @vssubu_vv_nxv16i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv16i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv16i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli 
zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv16i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv16i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv16i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv16i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + 
%head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv32i16(, , , i32) + +define @vssubu_vv_nxv32i16( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv32i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv32i16_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i16( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv32i16( %va, i16 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv32i16_unmasked( %va, i16 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv32i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv32i16: +; CHECK: # %bb.0: +; 
CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv32i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv1i32(, , , i32) + +define @vssubu_vv_nxv1i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv1i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv1i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv1i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i32( %va, 
%vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv1i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv1i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv1i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv2i32(, , , i32) + +define @vssubu_vv_nxv2i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv2i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv2i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv2i32_unmasked: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv2i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv2i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv2i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv2i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, 
zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv4i32(, , , i32) + +define @vssubu_vv_nxv4i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv4i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv4i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv4i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv4i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv4i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv4i32: +; CHECK: # %bb.0: +; 
CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv4i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv8i32(, , , i32) + +define @vssubu_vv_nxv8i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv8i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv8i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv8i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i32( %va, %vb, 
%m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv8i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv8i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv8i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv16i32(, , , i32) + +define @vssubu_vv_nxv16i32( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv16i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv16i32_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv16i32_unmasked: +; CHECK: # %bb.0: 
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i32( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv16i32( %va, i32 %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv16i32_unmasked( %va, i32 %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vx_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv16i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv16i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, 
zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv16i32( %va, %vb, %m, i32 %evl) + ret %v +} + +; Test that split-legalization works then the mask needs manual splitting. + +declare @llvm.vp.usub.sat.nxv32i32(, , , i32) + +define @vssubu_vi_nxv32i32( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv32i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a1, a2, 2 +; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a1 +; CHECK-NEXT: slli a2, a2, 1 +; CHECK-NEXT: sub a1, a0, a2 +; CHECK-NEXT: sltu a3, a0, a1 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a1 +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a1, v0.t +; CHECK-NEXT: bltu a0, a2, .LBB118_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: .LBB118_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv32i32_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv32i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a2 +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vx v16, v16, a2 +; CHECK-NEXT: bltu a0, a1, .LBB119_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB119_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a2 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 
-1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv32i32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv1i64(, , , i32) + +define @vssubu_vv_nxv1i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv1i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv1i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv1i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v9, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv1i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_nxv1i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, 
-16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v9 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_nxv1i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv1i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv1i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv1i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv2i64(, , , i32) + +define @vssubu_vv_nxv2i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, 
v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv2i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv2i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv2i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv2i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_nxv2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_nxv2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vssubu.vx v8, 
v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv2i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv2i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv2i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv4i64(, , , i32) + +define @vssubu_vv_nxv4i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv4i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv4i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v12 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i64( 
%va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv4i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv4i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_nxv4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_nxv4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv4i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, 
e64, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv4i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv4i64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.usub.sat.nxv8i64(, , , i32) + +define @vssubu_vv_nxv8i64( %va, %b, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.usub.sat.nxv8i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vv_nxv8i64_unmasked( %va, %b, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vv_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i64( %va, %b, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv8i64( %va, i64 %b, %m, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v16, 
v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vx_nxv8i64_unmasked( %va, i64 %b, i32 zeroext %evl) { +; RV32-LABEL: vssubu_vx_nxv8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vssubu.vv v8, v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vssubu_vx_nxv8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vssubu.vx v8, v8, a0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv8i64( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vssubu_vi_nxv8i64_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vssubu_vi_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -1 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: 
vssubu.vx v8, v8, a1 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i64 -1, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.usub.sat.nxv8i64( %va, %vb, %m, i32 %evl) + ret %v +} diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp index 7a9d91cc76b34..e3462f0f33f11 100644 --- a/llvm/unittests/IR/VPIntrinsicTest.cpp +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -163,6 +163,14 @@ class VPIntrinsicTest : public testing::Test { << "(<8 x i16>, i1 immarg, <8 x i1>, i32) "; Str << " declare <8 x i16> @llvm.vp.cttz.v8i16" << "(<8 x i16>, i1 immarg, <8 x i1>, i32) "; + Str << " declare <8 x i16> @llvm.vp.sadd.sat.v8i16" + << "(<8 x i16>, <8 x i16>, <8 x i1>, i32) "; + Str << " declare <8 x i16> @llvm.vp.uadd.sat.v8i16" + << "(<8 x i16>, <8 x i16>, <8 x i1>, i32) "; + Str << " declare <8 x i16> @llvm.vp.ssub.sat.v8i16" + << "(<8 x i16>, <8 x i16>, <8 x i1>, i32) "; + Str << " declare <8 x i16> @llvm.vp.usub.sat.v8i16" + << "(<8 x i16>, <8 x i16>, <8 x i1>, i32) "; Str << " declare <8 x i16> @llvm.vp.fshl.v8i16" << "(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i1>, i32) "; Str << " declare <8 x i16> @llvm.vp.fshr.v8i16" From d7a28f7ad77504694ad8bdc6b2aaa8938f08fbdd Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 23 Feb 2024 14:34:57 +0800 Subject: [PATCH 131/546] [RISCV] Add asserts for insert/extract_subvector invariants. NFC We can currently select insert_subvector and extract_subvector nodes in RISCVISelDAGToDAG (this is after custom legalizing in RISCVISelLowering) with fixed subvector types. However decomposeSubvectorInsertExtractToSubRegs is based off of scalable subvectors where the indices are scaled by vscale, so any index other than 0 will be wrong. 
For insert_subvector the vector being inserted into needs to be undef as well, because it assumes we can replace a whole subregister which isn't always the case for fixed subvectors (e.g. insert <2 x i32> into <4 x i32> at index 0 with vlen=128). We currently maintain these invariants in RISCVISelLowering, so this adds asserts in RISCVISelDAGToDAG so we don't break them. --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 904f1d7fdf906..c922098c55094 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2062,8 +2062,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering(); MVT SubVecContainerVT = SubVecVT; // Establish the correct scalable-vector types for any fixed-length type. - if (SubVecVT.isFixedLengthVector()) + if (SubVecVT.isFixedLengthVector()) { + assert(Idx == 0 && V.isUndef()); SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT); + } if (VT.isFixedLengthVector()) VT = TLI.getContainerForFixedLengthVector(VT); @@ -2115,8 +2117,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { const RISCVTargetLowering &TLI = *Subtarget->getTargetLowering(); MVT SubVecContainerVT = VT; // Establish the correct scalable-vector types for any fixed-length type. - if (VT.isFixedLengthVector()) + if (VT.isFixedLengthVector()) { + assert(Idx == 0); SubVecContainerVT = TLI.getContainerForFixedLengthVector(VT); + } if (InVT.isFixedLengthVector()) InVT = TLI.getContainerForFixedLengthVector(InVT); From 1fe6be8794964c011aeba7a66bd2dcd891d21ab0 Mon Sep 17 00:00:00 2001 From: Freddy Ye Date: Fri, 23 Feb 2024 15:18:42 +0800 Subject: [PATCH 132/546] [X86] Support APXF to enable __builtin_cpu_supports. 
(#80636) For referring, APX's spec: https://cdrdv2.intel.com/v1/dl/getContent/784266 APX's index in libgcc: https://github.com/gcc-mirror/gcc/blob/master/gcc/common/config/i386/i386-cpuinfo.h#L267 --- clang/lib/Headers/cpuid.h | 1 + clang/test/CodeGen/target-builtin-noerror.c | 1 + compiler-rt/lib/builtins/cpu_model/x86.c | 6 ++++-- llvm/include/llvm/TargetParser/X86TargetParser.def | 1 + llvm/lib/TargetParser/Host.cpp | 7 +++++++ 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h index c968d37fb8cd6..0bb9912b465ff 100644 --- a/clang/lib/Headers/cpuid.h +++ b/clang/lib/Headers/cpuid.h @@ -219,6 +219,7 @@ #define bit_PREFETCHI 0x00004000 #define bit_USERMSR 0x00008000 #define bit_AVX10 0x00080000 +#define bit_APXF 0x00200000 /* Features in %eax for leaf 13 sub-leaf 1 */ #define bit_XSAVEOPT 0x00000001 diff --git a/clang/test/CodeGen/target-builtin-noerror.c b/clang/test/CodeGen/target-builtin-noerror.c index 9608b5f37baaa..b438e50848a4b 100644 --- a/clang/test/CodeGen/target-builtin-noerror.c +++ b/clang/test/CodeGen/target-builtin-noerror.c @@ -141,6 +141,7 @@ void verifyfeaturestrings(void) { (void)__builtin_cpu_supports("sm3"); (void)__builtin_cpu_supports("sha512"); (void)__builtin_cpu_supports("sm4"); + (void)__builtin_cpu_supports("apxf"); (void)__builtin_cpu_supports("usermsr"); (void)__builtin_cpu_supports("avx10.1-256"); (void)__builtin_cpu_supports("avx10.1-512"); diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c index 1afa468c4ae8c..7e8acb3e73eda 100644 --- a/compiler-rt/lib/builtins/cpu_model/x86.c +++ b/compiler-rt/lib/builtins/cpu_model/x86.c @@ -217,8 +217,8 @@ enum ProcessorFeatures { FEATURE_SM3, FEATURE_SHA512, FEATURE_SM4, - // FEATURE_APXF, - FEATURE_USERMSR = 112, + FEATURE_APXF, + FEATURE_USERMSR, FEATURE_AVX10_1_256, FEATURE_AVX10_1_512, CPU_FEATURE_MAX @@ -983,6 +983,8 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, 
unsigned MaxLeaf, setFeature(FEATURE_USERMSR); if (HasLeaf7Subleaf1 && ((EDX >> 19) & 1)) setFeature(FEATURE_AVX10_1_256); + if (HasLeaf7Subleaf1 && ((EDX >> 21) & 1)) + setFeature(FEATURE_APXF); unsigned MaxLevel; getX86CpuIDAndInfo(0, &MaxLevel, &EBX, &ECX, &EDX); diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index 4c630c1eb06e8..a9ed56fcd4700 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -265,6 +265,7 @@ X86_MICROARCH_LEVEL(X86_64_BASELINE,"x86-64", 95) X86_MICROARCH_LEVEL(X86_64_V2, "x86-64-v2", 96) X86_MICROARCH_LEVEL(X86_64_V3, "x86-64-v3", 97) X86_MICROARCH_LEVEL(X86_64_V4, "x86-64-v4", 98) +X86_MICROARCH_LEVEL(APXF, "apxf", 111) #undef X86_FEATURE_COMPAT #undef X86_FEATURE #undef X86_MICROARCH_LEVEL diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 4466d50458e19..a4cc757a9214e 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1846,6 +1846,13 @@ bool sys::getHostCPUFeatures(StringMap &Features) { Features["prefetchi"] = HasLeaf7Subleaf1 && ((EDX >> 14) & 1); Features["usermsr"] = HasLeaf7Subleaf1 && ((EDX >> 15) & 1); Features["avx10.1-256"] = HasLeaf7Subleaf1 && ((EDX >> 19) & 1); + bool HasAPXF = HasLeaf7Subleaf1 && ((EDX >> 21) & 1); + Features["egpr"] = HasAPXF; + Features["push2pop2"] = HasAPXF; + Features["ppx"] = HasAPXF; + Features["ndd"] = HasAPXF; + Features["ccmp"] = HasAPXF; + Features["cf"] = HasAPXF; bool HasLeafD = MaxLevel >= 0xd && !getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX); From 354401f8d3dc08ed41895d03a12a122e9cc0482c Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 22 Feb 2024 23:53:12 -0800 Subject: [PATCH 133/546] [lldb] Fix GetTerminalWidth after afd469023aad afd469023aad fixed the type of the term-width setting but the getter (Debugger::GetTerminalWidth) was still trying to get the terminal width 
as an unsigned. This fixes TestXMLRegisterFlags.py. --- lldb/source/Core/Debugger.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index bb81110ae35a5..c3e603dbae896 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -365,7 +365,7 @@ bool Debugger::SetREPLLanguage(lldb::LanguageType repl_lang) { uint64_t Debugger::GetTerminalWidth() const { const uint32_t idx = ePropertyTerminalWidth; - return GetPropertyAtIndexAs( + return GetPropertyAtIndexAs( idx, g_debugger_properties[idx].default_uint_value); } From 531e8c26b3f2626e7f1a997e0e8b61d67d10aded Mon Sep 17 00:00:00 2001 From: Dani Date: Fri, 23 Feb 2024 09:04:33 +0100 Subject: [PATCH 134/546] [llvm][AArch64] Autoupgrade function attributes from Module attributes. (#80640) `sign-return-address` and similar module attributes should be propagated to the function level before modules got merged because module flags may contradict and this information is not recoverable. Generated code will match with the normal linking flow. --- llvm/include/llvm/IR/AutoUpgrade.h | 3 +- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 +- llvm/lib/IR/AutoUpgrade.cpp | 72 ++++++++++++++++++- llvm/lib/Linker/IRMover.cpp | 4 ++ .../test/Bitcode/upgrade-arc-runtime-calls.ll | 4 +- .../AArch64/link-branch-target-enforcement.ll | 1 + .../LTO/AArch64/link-sign-return-address.ll | 43 +++++++++++ llvm/test/Linker/link-arm-and-thumb.ll | 7 +- 8 files changed, 128 insertions(+), 8 deletions(-) create mode 100644 llvm/test/LTO/AArch64/link-sign-return-address.ll diff --git a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h index 152f781ffa9b3..c0d96efc54752 100644 --- a/llvm/include/llvm/IR/AutoUpgrade.h +++ b/llvm/include/llvm/IR/AutoUpgrade.h @@ -67,7 +67,8 @@ namespace llvm { void UpgradeSectionAttributes(Module &M); /// Correct any IR that is relying on old function attribute behavior. 
- void UpgradeFunctionAttributes(Function &F); + void UpgradeFunctionAttributes(Function &F, + bool ModuleMetadataIsMaterialized = false); /// If the given TBAA tag uses the scalar TBAA format, create a new node /// corresponding to the upgrade to the struct-path aware TBAA format. diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 832907a3f53f5..8c860101afa02 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -6706,7 +6706,7 @@ Error BitcodeReader::materialize(GlobalValue *GV) { } // Look for functions that rely on old function attribute behavior. - UpgradeFunctionAttributes(*F); + UpgradeFunctionAttributes(*F, true); // Bring in any functions that this function forward-referenced via // blockaddresses. diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index b90bbe71ac189..edff13c796b31 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -5155,7 +5155,46 @@ struct StrictFPUpgradeVisitor : public InstVisitor { }; } // namespace -void llvm::UpgradeFunctionAttributes(Function &F) { +// Check if the module attribute is present and not zero. +static bool isModuleAttributeSet(const Module *M, const StringRef &ModAttr) { + const auto *Attr = + mdconst::extract_or_null(M->getModuleFlag(ModAttr)); + return Attr && Attr->getZExtValue(); +} + +// Copy an attribute from module to the function if exists. +// First value of the pair is used when the module attribute is not zero +// the second otherwise. +static void +CopyModuleAttributeToFunction(Function &F, StringRef FnAttrName, + StringRef ModAttrName, + std::pair Values) { + if (F.hasFnAttribute(FnAttrName)) + return; + F.addFnAttr(FnAttrName, isModuleAttributeSet(F.getParent(), ModAttrName) + ? Values.first + : Values.second); +} + +// Copy a boolean attribute from module to the function if exists. +// Module attribute treated false if zero otherwise true. 
+static void CopyModuleAttributeToFunction(Function &F, StringRef AttrName) { + CopyModuleAttributeToFunction( + F, AttrName, AttrName, + std::make_pair("true", "false")); +} + +// Copy an attribute from module to the function if exists. +// First value of the pair is used when the module attribute is not zero +// the second otherwise. +static void +CopyModuleAttributeToFunction(Function &F, StringRef AttrName, + std::pair Values) { + CopyModuleAttributeToFunction(F, AttrName, AttrName, Values); +} + +void llvm::UpgradeFunctionAttributes(Function &F, + bool ModuleMetadataIsMaterialized) { // If a function definition doesn't have the strictfp attribute, // convert any callsite strictfp attributes to nobuiltin. if (!F.isDeclaration() && !F.hasFnAttribute(Attribute::StrictFP)) { @@ -5167,6 +5206,37 @@ void llvm::UpgradeFunctionAttributes(Function &F) { F.removeRetAttrs(AttributeFuncs::typeIncompatible(F.getReturnType())); for (auto &Arg : F.args()) Arg.removeAttrs(AttributeFuncs::typeIncompatible(Arg.getType())); + + if (!ModuleMetadataIsMaterialized) + return; + if (F.isDeclaration()) + return; + Module *M = F.getParent(); + if (!M) + return; + + Triple T(M->getTargetTriple()); + // Convert module level attributes to function level attributes because + // after merging modules the attributes might change and would have different + // effect on the functions as the original module would have. 
+ if (T.isThumb() || T.isARM() || T.isAArch64()) { + if (!F.hasFnAttribute("sign-return-address")) { + StringRef SignType = "none"; + if (isModuleAttributeSet(M, "sign-return-address")) + SignType = "non-leaf"; + + if (isModuleAttributeSet(M, "sign-return-address-all")) + SignType = "all"; + + F.addFnAttr("sign-return-address", SignType); + } + CopyModuleAttributeToFunction(F, "branch-target-enforcement"); + CopyModuleAttributeToFunction(F, "branch-protection-pauth-lr"); + CopyModuleAttributeToFunction(F, "guarded-control-stack"); + CopyModuleAttributeToFunction( + F, "sign-return-address-key", + std::make_pair("b_key", "a_key")); + } } static bool isOldLoopArgument(Metadata *MD) { diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index 37d21119447b9..9f45ebc6eda01 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -1606,6 +1606,10 @@ Error IRLinker::run() { // Loop over all of the linked values to compute type mappings. computeTypeMapping(); + // Update function attributes before copying them to destation module. + for (Function &F : SrcM->getFunctionList()) + UpgradeFunctionAttributes(F, true); + std::reverse(Worklist.begin(), Worklist.end()); while (!Worklist.empty()) { GlobalValue *GV = Worklist.back(); diff --git a/llvm/test/Bitcode/upgrade-arc-runtime-calls.ll b/llvm/test/Bitcode/upgrade-arc-runtime-calls.ll index 19f25f98953fa..d2edec18d55e5 100644 --- a/llvm/test/Bitcode/upgrade-arc-runtime-calls.ll +++ b/llvm/test/Bitcode/upgrade-arc-runtime-calls.ll @@ -55,7 +55,7 @@ unwindBlock: // Check that auto-upgrader converts function calls to intrinsic calls. Note that // the auto-upgrader doesn't touch invoke instructions. 
-// ARC: define void @testRuntimeCalls(ptr %[[A:.*]], ptr %[[B:.*]], ptr %[[C:.*]], ptr %[[D:.*]], ptr %[[E:.*]]) personality +// ARC: define void @testRuntimeCalls(ptr %[[A:.*]], ptr %[[B:.*]], ptr %[[C:.*]], ptr %[[D:.*]], ptr %[[E:.*]]) #0 personality // ARC: %[[V0:.*]] = tail call ptr @llvm.objc.autorelease(ptr %[[A]]) // ARC-NEXT: tail call void @llvm.objc.autoreleasePoolPop(ptr %[[A]]) // ARC-NEXT: %[[V1:.*]] = tail call ptr @llvm.objc.autoreleasePoolPush() @@ -88,7 +88,7 @@ unwindBlock: // ARC-NEXT: tail call void @llvm.objc.arc.annotation.bottomup.bbend(ptr %[[B]], ptr %[[C]]) // ARC-NEXT: invoke void @objc_autoreleasePoolPop(ptr %[[A]]) -// NOUPGRADE: define void @testRuntimeCalls(ptr %[[A:.*]], ptr %[[B:.*]], ptr %[[C:.*]], ptr %[[D:.*]], ptr %[[E:.*]]) personality +// NOUPGRADE: define void @testRuntimeCalls(ptr %[[A:.*]], ptr %[[B:.*]], ptr %[[C:.*]], ptr %[[D:.*]], ptr %[[E:.*]]) #0 personality // NOUPGRADE: %[[V0:.*]] = tail call ptr @objc_autorelease(ptr %[[A]]) // NOUPGRADE-NEXT: tail call void @objc_autoreleasePoolPop(ptr %[[A]]) // NOUPGRADE-NEXT: %[[V1:.*]] = tail call ptr @objc_autoreleasePoolPush() diff --git a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll index ccf8cf67ede6d..74d9c86881d52 100644 --- a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll +++ b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll @@ -32,6 +32,7 @@ entry: ; CHECK-DUMP:
: ; CHECK-DUMP: bl 0x8 ; CHECK-DUMP: : +; CHECK-DUMP: paciasp ; `main` doesn't support BTI while `foo` does, so in the binary ; we should see only PAC which is supported by both. diff --git a/llvm/test/LTO/AArch64/link-sign-return-address.ll b/llvm/test/LTO/AArch64/link-sign-return-address.ll new file mode 100644 index 0000000000000..c25857ceed7b4 --- /dev/null +++ b/llvm/test/LTO/AArch64/link-sign-return-address.ll @@ -0,0 +1,43 @@ +; Testcase to check that module with different branch-target-enforcement can +; be mixed. +; +; RUN: llvm-as %s -o %t1.bc +; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc +; RUN: llvm-lto -exported-symbol main \ +; RUN: -exported-symbol foo \ +; RUN: -filetype=obj \ +; RUN: %t2.bc %t1.bc \ +; RUN: -o %t1.exe 2>&1 +; RUN: llvm-objdump -d %t1.exe | FileCheck --check-prefix=CHECK-DUMP %s +; RUN: llvm-readelf -n %t1.exe | FileCheck --allow-empty --check-prefix=CHECK-PROP %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +declare i32 @foo(); + +define i32 @main() { +entry: + %add = call i32 @foo() + ret i32 %add +} + +!llvm.module.flags = !{!0, !1, !2, !3 } +!0 = !{i32 8, !"branch-target-enforcement", i32 0} +!1 = !{i32 8, !"sign-return-address", i32 0} +!2 = !{i32 8, !"sign-return-address-all", i32 0} +!3 = !{i32 8, !"sign-return-address-with-bkey", i32 0} + +; CHECK-DUMP: : +; CHECK-DUMP: paciasp +; CHECK-DUMP: mov w0, #0x2a +; CHECK-DUMP: autiasp +; CHECK-DUMP: ret +; CHECK-DUMP:
: +; CHECK-DUMP-NOT: paciasp +; CHECK-DUMP: str x30, +; CHECK-DUMP: bl 0x14 + +; `main` doesn't support PAC sign-return-address while `foo` does, so in the binary +; we should not see anything. +; CHECK-PROP-NOT: Properties: aarch64 feature: PAC \ No newline at end of file diff --git a/llvm/test/Linker/link-arm-and-thumb.ll b/llvm/test/Linker/link-arm-and-thumb.ll index a90f2128e4430..37bd8c37f8b5e 100644 --- a/llvm/test/Linker/link-arm-and-thumb.ll +++ b/llvm/test/Linker/link-arm-and-thumb.ll @@ -13,11 +13,12 @@ entry: ret i32 %add } -; CHECK: define i32 @main() { +; CHECK: define i32 @main() [[MAIN_ATTRS:#[0-9]+]] ; CHECK: define i32 @foo(i32 %a, i32 %b) [[ARM_ATTRS:#[0-9]+]] ; CHECK: define i32 @bar(i32 %a, i32 %b) [[THUMB_ATTRS:#[0-9]+]] -; CHECK: attributes [[ARM_ATTRS]] = { "target-features"="-thumb-mode" } -; CHECK: attributes [[THUMB_ATTRS]] = { "target-features"="+thumb-mode" } +; CHECK: attributes [[MAIN_ATTRS]] = { {{.*}} } +; CHECK: attributes [[ARM_ATTRS]] = { {{.*}} "target-features"="-thumb-mode" } +; CHECK: attributes [[THUMB_ATTRS]] = { {{.*}} "target-features"="+thumb-mode" } ; STDERR-NOT: warning: Linking two modules of different target triples: From 6fae3e784472751002570f367c378cb2dbd82c26 Mon Sep 17 00:00:00 2001 From: Dani Date: Fri, 23 Feb 2024 09:30:36 +0100 Subject: [PATCH 135/546] [llvm][AArch64] Do not inline a function with different signing scheme. (#80642) If the signing scheme is different, the functions may assume different behaviours, and it is dangerous to inline them without analysing them. This should be a rare case.
--- llvm/include/llvm/IR/Attributes.td | 28 +++-- llvm/lib/IR/Attributes.cpp | 5 + .../Inline/inline-sign-return-address.ll | 104 ++++++++++++++++++ llvm/utils/TableGen/Attributes.cpp | 6 +- 4 files changed, 135 insertions(+), 8 deletions(-) create mode 100644 llvm/test/Transforms/Inline/inline-sign-return-address.ll diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 864f87f338389..d22eb76d2292d 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -339,14 +339,26 @@ def UseSampleProfile : StrBoolAttr<"use-sample-profile">; def DenormalFPMath : ComplexStrAttr<"denormal-fp-math", [FnAttr]>; def DenormalFPMathF32 : ComplexStrAttr<"denormal-fp-math-f32", [FnAttr]>; +// Attribute compatiblity rules are generated to check the attribute of the +// caller and callee and decide whether inlining should be allowed. CompatRule +// and child classes are used for the rule generation. CompatRule takes only a +// compare function which could be templated with the attribute type. +// CompatRuleStrAttr takes the compare function and the string attribute for +// checking compatibility for inline substitution. class CompatRule { - // The name of the function called to check the attribute of the caller and - // callee and decide whether inlining should be allowed. The function's - // signature must match "bool(const Function&, const Function &)", where the - // first parameter is the reference to the caller and the second parameter is - // the reference to the callee. It must return false if the attributes of the - // caller and callee are incompatible, and true otherwise. + // The function's signature must match "bool(const Function&, const + // Function&)", where the first parameter is the reference to the caller and + // the second parameter is the reference to the callee. It must return false + // if the attributes of the caller and callee are incompatible, and true + // otherwise. 
string CompatFunc = F; + string AttrName = ""; +} + +class CompatRuleStrAttr : CompatRule { + // The checker function is extended with an third argument as the function + // attribute string "bool(const Function&, const Function&, const StringRef&)". + string AttrName = Attr; } def : CompatRule<"isEqual">; @@ -359,7 +371,9 @@ def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"checkDenormMode">; - +def : CompatRuleStrAttr<"isEqual", "sign-return-address">; +def : CompatRuleStrAttr<"isEqual", "sign-return-address-key">; +def : CompatRuleStrAttr<"isEqual", "branch-protection-pauth-lr">; class MergeRule { // The name of the function called to merge the attributes of the caller and diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index fd5160209506f..19076771ff2ea 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -2045,6 +2045,11 @@ static bool isEqual(const Function &Caller, const Function &Callee) { Callee.getFnAttribute(AttrClass::getKind()); } +static bool isEqual(const Function &Caller, const Function &Callee, + const StringRef &AttrName) { + return Caller.getFnAttribute(AttrName) == Callee.getFnAttribute(AttrName); +} + /// Compute the logical AND of the attributes of the caller and the /// callee. /// diff --git a/llvm/test/Transforms/Inline/inline-sign-return-address.ll b/llvm/test/Transforms/Inline/inline-sign-return-address.ll new file mode 100644 index 0000000000000..c4d85fa671a4f --- /dev/null +++ b/llvm/test/Transforms/Inline/inline-sign-return-address.ll @@ -0,0 +1,104 @@ +; Check the inliner doesn't inline a function with different sign return address schemes. 
+; RUN: opt < %s -passes=inline -S | FileCheck %s + +define internal void @foo_all() #0 { + ret void +} + +define internal void @foo_nonleaf() #1 { + ret void +} + +define internal void @foo_none() #2 { + ret void +} + +define internal void @foo_lr() #3 { + ret void +} + +define internal void @foo_bkey() #4 { + ret void +} + +define dso_local void @bar_all() #0 { +; CHECK-LABEL: bar_all +; CHECK-NOT: call void @foo_all() +; CHECK-NEXT: call void @foo_nonleaf() +; CHECK-NEXT: call void @foo_none() +; CHECK-NEXT: call void @foo_lr() +; CHECK-NEXT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + +define dso_local void @bar_nonleaf() #1 { +; CHECK-LABEL: bar_nonleaf +; CHECK-NEXT: call void @foo_all() +; CHECK-NOT: call void @foo_nonleaf() +; CHECK-NEXT: call void @foo_none() +; CHECK-NEXT: call void @foo_lr() +; CHECK-NEXT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + +define dso_local void @bar_none() #2 { +; CHECK-LABEL: bar_none +; CHECK-NEXT: call void @foo_all() +; CHECK-NEXT: call void @foo_nonleaf() +; CHECK-NOT: call void @foo_none() +; CHECK-NEXT: call void @foo_lr() +; CHECK-NEXT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + +define dso_local void @bar_lr() #3 { +; CHECK-LABEL: bar_lr +; CHECK-NEXT: call void @foo_all() +; CHECK-NEXT: call void @foo_nonleaf() +; CHECK-NEXT: call void @foo_none() +; CHECK-NOT: call void @foo_lr() +; CHECK-NEXT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + +define dso_local void @bar_bkey() #4 { +; CHECK-LABEL: bar_bkey +; CHECK-NEXT: call void @foo_all() +; CHECK-NEXT: call void 
@foo_nonleaf() +; CHECK-NEXT: call void @foo_none() +; CHECK-NEXT: call void @foo_lr() +; CHECK-NOT: call void @foo_bkey() + call void @foo_all() + call void @foo_nonleaf() + call void @foo_none() + call void @foo_lr() + call void @foo_bkey() + ret void +} + + +attributes #0 = { "branch-protection-pauth-lr"="false" "sign-return-address"="all" } +attributes #1 = { "branch-protection-pauth-lr"="false" "sign-return-address"="non-leaf" } +attributes #2 = { "branch-protection-pauth-lr"="false" "sign-return-address"="none" } +attributes #3 = { "branch-protection-pauth-lr"="true" "sign-return-address"="non-leaf" } +attributes #4 = { "branch-protection-pauth-lr"="true" "sign-return-address"="non-leaf" "sign-return-address-key"="b_key" } \ No newline at end of file diff --git a/llvm/utils/TableGen/Attributes.cpp b/llvm/utils/TableGen/Attributes.cpp index 474042a3e9a33..db3c4decccb4c 100644 --- a/llvm/utils/TableGen/Attributes.cpp +++ b/llvm/utils/TableGen/Attributes.cpp @@ -87,7 +87,11 @@ void Attributes::emitFnAttrCompatCheck(raw_ostream &OS, bool IsStringAttr) { for (auto *Rule : CompatRules) { StringRef FuncName = Rule->getValueAsString("CompatFunc"); - OS << " Ret &= " << FuncName << "(Caller, Callee);\n"; + OS << " Ret &= " << FuncName << "(Caller, Callee"; + StringRef AttrName = Rule->getValueAsString("AttrName"); + if (!AttrName.empty()) + OS << ", \"" << AttrName << "\""; + OS << ");\n"; } OS << "\n"; From 5ca877591e65acf18b5a8d3234ff88b215b4f369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Fri, 23 Feb 2024 09:35:38 +0100 Subject: [PATCH 136/546] [clang][analyzer] Fix argument invalidations in StreamChecker. (#79470) Specific arguments passed to stream handling functions are changed by the function, this means these should be invalidated ("escaped") by the analyzer. This change adds the argument invalidation (in specific cases) to the checker. 
--- .../StaticAnalyzer/Checkers/StreamChecker.cpp | 39 ++++- clang/test/Analysis/stream-invalidate.c | 147 ++++++++++++++++++ 2 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 clang/test/Analysis/stream-invalidate.c diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp index a070f451694a3..65bdc4cac3094 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp @@ -21,6 +21,7 @@ #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h" #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h" #include "clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h" +#include "llvm/ADT/Sequence.h" #include #include @@ -629,6 +630,21 @@ const ExplodedNode *StreamChecker::getAcquisitionSite(const ExplodedNode *N, return nullptr; } +static ProgramStateRef escapeArgs(ProgramStateRef State, CheckerContext &C, + const CallEvent &Call, + ArrayRef EscapingArgs) { + const auto *CE = Call.getOriginExpr(); + + SmallVector EscapingVals; + EscapingVals.reserve(EscapingArgs.size()); + for (auto EscArgIdx : EscapingArgs) + EscapingVals.push_back(Call.getArgSVal(EscArgIdx)); + State = State->invalidateRegions(EscapingVals, CE, C.blockCount(), + C.getLocationContext(), + /*CausesPointerEscape=*/false); + return State; +} + //===----------------------------------------------------------------------===// // Methods of StreamChecker. //===----------------------------------------------------------------------===// @@ -819,6 +835,11 @@ void StreamChecker::evalFreadFwrite(const FnDescription *Desc, return; } + // At read, invalidate the buffer in any case of error or success, + // except if EOF was already present. + if (IsFread && !E.isStreamEof()) + State = escapeArgs(State, C, Call, {0}); + // Generate a transition for the success state. // If we know the state to be FEOF at fread, do not add a success state. 
if (!IsFread || !E.isStreamEof()) { @@ -863,6 +884,9 @@ void StreamChecker::evalFgetx(const FnDescription *Desc, const CallEvent &Call, return; if (!E.isStreamEof()) { + // If there was already EOF, assume that read buffer is not changed. + // Otherwise it may change at success or failure. + State = escapeArgs(State, C, Call, {0}); if (SingleChar) { // Generate a transition for the success state of `fgetc`. NonLoc RetVal = makeRetVal(C, E.CE).castAs(); @@ -1011,6 +1035,14 @@ void StreamChecker::evalFscanf(const FnDescription *Desc, const CallEvent &Call, State->BindExpr(E.CE, C.getLocationContext(), RetVal); StateNotFailed = E.assumeBinOpNN(StateNotFailed, BO_GE, RetVal, E.getZeroVal(Call)); + if (!StateNotFailed) + return; + + SmallVector EscArgs; + for (auto EscArg : llvm::seq(2u, Call.getNumArgs())) + EscArgs.push_back(EscArg); + StateNotFailed = escapeArgs(StateNotFailed, C, Call, EscArgs); + if (StateNotFailed) C.addTransition(StateNotFailed); } @@ -1073,8 +1105,12 @@ void StreamChecker::evalGetdelim(const FnDescription *Desc, // return -1. // If an error occurs, the function shall return -1 and set 'errno'. - // Add transition for the successful state. if (!E.isStreamEof()) { + // Escape buffer and size (may change by the call). + // May happen even at error (partial read?). + State = escapeArgs(State, C, Call, {0, 1}); + + // Add transition for the successful state. NonLoc RetVal = makeRetVal(C, E.CE).castAs(); ProgramStateRef StateNotFailed = State->BindExpr(E.CE, C.getLocationContext(), RetVal); @@ -1161,6 +1197,7 @@ void StreamChecker::evalFgetpos(const FnDescription *Desc, ProgramStateRef StateNotFailed, StateFailed; std::tie(StateFailed, StateNotFailed) = E.makeRetValAndAssumeDual(State, C); + StateNotFailed = escapeArgs(StateNotFailed, C, Call, {1}); // This function does not affect the stream state. // Still we add success and failure state with the appropriate return value. 
diff --git a/clang/test/Analysis/stream-invalidate.c b/clang/test/Analysis/stream-invalidate.c new file mode 100644 index 0000000000000..6745d11a2fe70 --- /dev/null +++ b/clang/test/Analysis/stream-invalidate.c @@ -0,0 +1,147 @@ +// RUN: %clang_analyze_cc1 -verify %s \ +// RUN: -analyzer-checker=core \ +// RUN: -analyzer-checker=alpha.unix.Stream \ +// RUN: -analyzer-checker=debug.ExprInspection + +#include "Inputs/system-header-simulator.h" + +void clang_analyzer_eval(int); +void clang_analyzer_dump(int); + +void test_fread(void) { + FILE *F = fopen("file", "r+"); + if (!F) + return; + + char Buf[3] = {10, 10, 10}; + fread(Buf, 1, 3, F); + // The check applies to success and failure. + clang_analyzer_dump(Buf[0]); // expected-warning {{conj_$}} Should not preserve the previous value, thus should not be 10. + clang_analyzer_dump(Buf[2]); // expected-warning {{conj_$}} + if (feof(F)) { + char Buf1[3] = {10, 10, 10}; + fread(Buf1, 1, 3, F); // expected-warning {{is in EOF state}} + clang_analyzer_dump(Buf1[0]); // expected-warning {{10 S32b}} + clang_analyzer_dump(Buf1[2]); // expected-warning {{10 S32b}} + } + + fclose(F); +} + +void test_fwrite(void) { + FILE *F = fopen("file", "r+"); + if (!F) + return; + + char Buf[3] = {10, 10, 10}; + fwrite(Buf, 1, 3, F); + // The check applies to success and failure. + clang_analyzer_dump(Buf[0]); // expected-warning {{10 S32b}} + clang_analyzer_dump(Buf[2]); // expected-warning {{10 S32b}} + + fclose(F); +} + +void test_fgets() { + FILE *F = tmpfile(); + if (!F) + return; + + char Buf[3] = {10, 10, 10}; + fgets(Buf, 3, F); + // The check applies to success and failure. + clang_analyzer_dump(Buf[0]); // expected-warning {{conj_$}} Should not preserve the previous value, thus should not be 10. 
+ clang_analyzer_dump(Buf[2]); // expected-warning {{conj_$}} + if (feof(F)) { + char Buf1[3] = {10, 10, 10}; + fgets(Buf1, 3, F); // expected-warning {{is in EOF state}} + clang_analyzer_dump(Buf1[0]); // expected-warning {{10 S32b}} + clang_analyzer_dump(Buf1[2]); // expected-warning {{10 S32b}} + } + + fclose(F); +} + +void test_fputs() { + FILE *F = tmpfile(); + if (!F) + return; + + char *Buf = "aaa"; + fputs(Buf, F); + // The check applies to success and failure. + clang_analyzer_dump(Buf[0]); // expected-warning {{97 S32b}} + clang_analyzer_dump(Buf[2]); // expected-warning {{97 S32b}} + clang_analyzer_dump(Buf[3]); // expected-warning {{0 S32b}} + + fclose(F); +} + +void test_fscanf() { + FILE *F = tmpfile(); + if (!F) + return; + + int a = 1; + unsigned b; + int Ret = fscanf(F, "%d %u", &a, &b); + if (Ret == 0) { + clang_analyzer_dump(a); // expected-warning {{conj_$}} + // FIXME: should be {{1 S32b}}. + clang_analyzer_dump(b); // expected-warning {{conj_$}} + // FIXME: should be {{uninitialized value}}. + } else if (Ret == 1) { + clang_analyzer_dump(a); // expected-warning {{conj_$}} + clang_analyzer_dump(b); // expected-warning {{conj_$}} + // FIXME: should be {{uninitialized value}}. + } else if (Ret >= 2) { + clang_analyzer_dump(a); // expected-warning {{conj_$}} + clang_analyzer_dump(b); // expected-warning {{conj_$}} + clang_analyzer_eval(Ret == 2); // expected-warning {{FALSE}} expected-warning {{TRUE}} + // FIXME: should be only TRUE. 
+ } else { + clang_analyzer_dump(a); // expected-warning {{1 S32b}} + clang_analyzer_dump(b); // expected-warning {{uninitialized value}} + } + + fclose(F); +} + +void test_getdelim(char *P, size_t Sz) { + FILE *F = tmpfile(); + if (!F) + return; + + char *P1 = P; + size_t Sz1 = Sz; + ssize_t Ret = getdelim(&P, &Sz, '\t', F); + if (Ret < 0) { + clang_analyzer_eval(P == P1); // expected-warning {{FALSE}} \ + // expected-warning {{TRUE}} + clang_analyzer_eval(Sz == Sz1); // expected-warning {{FALSE}} \ + // expected-warning {{TRUE}} + } else { + clang_analyzer_eval(P == P1); // expected-warning {{FALSE}} \ + // expected-warning {{TRUE}} + clang_analyzer_eval(Sz == Sz1); // expected-warning {{FALSE}} \ + // expected-warning {{TRUE}} + } + + fclose(F); +} + +void test_fgetpos() { + FILE *F = tmpfile(); + if (!F) + return; + + fpos_t Pos = 1; + int Ret = fgetpos(F, &Pos); + if (Ret == 0) { + clang_analyzer_dump(Pos); // expected-warning {{conj_$}} + } else { + clang_analyzer_dump(Pos); // expected-warning {{1 S32b}} + } + + fclose(F); +} From d68d29516102252f6bf6dc23fb22cef144ca1cb3 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Feb 2024 09:48:13 +0100 Subject: [PATCH 137/546] [mlir][Transforms][NFC] Turn op/block arg replacements into `IRRewrite`s (#81757) This commit is a refactoring of the dialect conversion. The dialect conversion maintains a list of "IR rewrites" that can be committed (upon success) or rolled back (upon failure). Until now, op replacements and block argument replacements were kept track in separate data structures inside the dialect conversion. This commit turns them into `IRRewrite`s, so that they can be committed or rolled back just like any other rewrite. This simplifies the internal state of the dialect conversion. Overview of changes: * Add two new rewrite classes: `ReplaceBlockArgRewrite` and `ReplaceOperationRewrite`. Remove the `OpReplacement` helper class; it is now part of `ReplaceOperationRewrite`. 
* Simplify `RewriterState`: `numReplacements` and `numArgReplacements` are no longer needed. (Now being kept track of by `numRewrites`.) * Add `IRRewrite::cleanup`. Operations should not be erased in `commit` because they may still be referenced in other internal state of the dialect conversion (`mapping`). Detaching operations is fine. * `trackedOps` are now updated during the "commit" phase instead of after applying all rewrites. --- .../Transforms/Utils/DialectConversion.cpp | 297 +++++++++--------- 1 file changed, 157 insertions(+), 140 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index db41b9f19e7e8..dec68048dc1d3 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -153,14 +153,12 @@ namespace { /// This is useful when saving and undoing a set of rewrites. struct RewriterState { RewriterState(unsigned numCreatedOps, unsigned numUnresolvedMaterializations, - unsigned numReplacements, unsigned numArgReplacements, unsigned numRewrites, unsigned numIgnoredOperations, unsigned numErased) : numCreatedOps(numCreatedOps), numUnresolvedMaterializations(numUnresolvedMaterializations), - numReplacements(numReplacements), - numArgReplacements(numArgReplacements), numRewrites(numRewrites), - numIgnoredOperations(numIgnoredOperations), numErased(numErased) {} + numRewrites(numRewrites), numIgnoredOperations(numIgnoredOperations), + numErased(numErased) {} /// The current number of created operations. unsigned numCreatedOps; @@ -168,12 +166,6 @@ struct RewriterState { /// The current number of unresolved materializations. unsigned numUnresolvedMaterializations; - /// The current number of replacements queued. - unsigned numReplacements; - - /// The current number of argument replacements queued. - unsigned numArgReplacements; - /// The current number of rewrites performed. 
unsigned numRewrites; @@ -184,20 +176,6 @@ struct RewriterState { unsigned numErased; }; -//===----------------------------------------------------------------------===// -// OpReplacement - -/// This class represents one requested operation replacement via 'replaceOp' or -/// 'eraseOp`. -struct OpReplacement { - OpReplacement(const TypeConverter *converter = nullptr) - : converter(converter) {} - - /// An optional type converter that can be used to materialize conversions - /// between the new and old values if necessary. - const TypeConverter *converter; -}; - //===----------------------------------------------------------------------===// // UnresolvedMaterialization @@ -321,19 +299,27 @@ class IRRewrite { MoveBlock, SplitBlock, BlockTypeConversion, + ReplaceBlockArg, // Operation rewrites MoveOperation, - ModifyOperation + ModifyOperation, + ReplaceOperation }; virtual ~IRRewrite() = default; - /// Roll back the rewrite. + /// Roll back the rewrite. Operations may be erased during rollback. virtual void rollback() = 0; - /// Commit the rewrite. + /// Commit the rewrite. Operations may be unlinked from their blocks during + /// the commit phase, but they must not be erased yet. This is because + /// internal dialect conversion state (such as `mapping`) may still be using + /// them. Operations must be erased during cleanup. virtual void commit() {} + /// Cleanup operations. Cleanup is called after commit. + virtual void cleanup() {} + Kind getKind() const { return kind; } static bool classof(const IRRewrite *rewrite) { return true; } @@ -360,7 +346,7 @@ class BlockRewrite : public IRRewrite { static bool classof(const IRRewrite *rewrite) { return rewrite->getKind() >= Kind::CreateBlock && - rewrite->getKind() <= Kind::BlockTypeConversion; + rewrite->getKind() <= Kind::ReplaceBlockArg; } protected: @@ -428,6 +414,8 @@ class EraseBlockRewrite : public BlockRewrite { void commit() override { // Erase the block. 
assert(block && "expected block"); + assert(block->empty() && "expected empty block"); + block->dropAllDefinedValueUses(); delete block; block = nullptr; } @@ -589,6 +577,27 @@ class BlockTypeConversionRewrite : public BlockRewrite { const TypeConverter *converter; }; +/// Replacing a block argument. This rewrite is not immediately reflected in the +/// IR. An internal IR mapping is updated, but the actual replacement is delayed +/// until the rewrite is committed. +class ReplaceBlockArgRewrite : public BlockRewrite { +public: + ReplaceBlockArgRewrite(ConversionPatternRewriterImpl &rewriterImpl, + Block *block, BlockArgument arg) + : BlockRewrite(Kind::ReplaceBlockArg, rewriterImpl, block), arg(arg) {} + + static bool classof(const IRRewrite *rewrite) { + return rewrite->getKind() == Kind::ReplaceBlockArg; + } + + void commit() override; + + void rollback() override; + +private: + BlockArgument arg; +}; + /// An operation rewrite. class OperationRewrite : public IRRewrite { public: @@ -597,7 +606,7 @@ class OperationRewrite : public IRRewrite { static bool classof(const IRRewrite *rewrite) { return rewrite->getKind() >= Kind::MoveOperation && - rewrite->getKind() <= Kind::ModifyOperation; + rewrite->getKind() <= Kind::ReplaceOperation; } protected: @@ -698,6 +707,39 @@ class ModifyOperationRewrite : public OperationRewrite { SmallVector successors; void *propertiesStorage = nullptr; }; + +/// Replacing an operation. Erasing an operation is treated as a special case +/// with "null" replacements. This rewrite is not immediately reflected in the +/// IR. An internal IR mapping is updated, but values are not replaced and the +/// original op is not erased until the rewrite is committed. 
+class ReplaceOperationRewrite : public OperationRewrite { +public: + ReplaceOperationRewrite(ConversionPatternRewriterImpl &rewriterImpl, + Operation *op, const TypeConverter *converter, + bool changedResults) + : OperationRewrite(Kind::ReplaceOperation, rewriterImpl, op), + converter(converter), changedResults(changedResults) {} + + static bool classof(const IRRewrite *rewrite) { + return rewrite->getKind() == Kind::ReplaceOperation; + } + + void commit() override; + + void rollback() override; + + void cleanup() override; + +private: + friend struct OperationConverter; + + /// An optional type converter that can be used to materialize conversions + /// between the new and old values if necessary. + const TypeConverter *converter; + + /// A boolean flag that indicates whether result types have changed or not. + bool changedResults; +}; } // namespace /// Return "true" if there is an operation rewrite that matches the specified @@ -890,6 +932,7 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { void eraseBlock(Block *block) override { if (erased.contains(block)) return; + assert(block->empty() && "expected empty block"); block->dropAllDefinedValueUses(); RewriterBase::eraseBlock(block); } @@ -921,12 +964,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// conversion. SmallVector unresolvedMaterializations; - /// Ordered map of requested operation replacements. - llvm::MapVector replacements; - - /// Ordered vector of any requested block argument replacements. - SmallVector argReplacements; - /// Ordered list of block operations (creations, splits, motions). SmallVector> rewrites; @@ -941,11 +978,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// operation was ignored. SetVector ignoredOps; - /// A vector of indices into `replacements` of operations that were replaced - /// with values with different result types than the original operation, e.g. - /// 1->N conversion of some kind. 
- SmallVector operationsWithChangedResults; - /// The current type converter, or nullptr if no type converter is currently /// active. const TypeConverter *currentTypeConverter = nullptr; @@ -957,6 +989,12 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// This allows the user to collect the match failure message. function_ref notifyCallback; + /// A set of pre-existing operations. When mode == OpConversionMode::Analysis, + /// this is populated with ops found to be legalizable to the target. + /// When mode == OpConversionMode::Partial, this is populated with ops found + /// *not* to be legalizable to the target. + DenseSet *trackedOps = nullptr; + #ifndef NDEBUG /// A set of operations that have pending updates. This tracking isn't /// strictly necessary, and is thus only active during debug builds for extra @@ -1001,6 +1039,8 @@ void BlockTypeConversionRewrite::commit() { } } + assert(origBlock->empty() && "expected empty block"); + origBlock->dropAllDefinedValueUses(); delete origBlock; origBlock = nullptr; } @@ -1063,6 +1103,47 @@ LogicalResult BlockTypeConversionRewrite::materializeLiveConversions( return success(); } +void ReplaceBlockArgRewrite::commit() { + Value repl = rewriterImpl.mapping.lookupOrNull(arg, arg.getType()); + if (!repl) + return; + + if (isa(repl)) { + arg.replaceAllUsesWith(repl); + return; + } + + // If the replacement value is an operation, we check to make sure that we + // don't replace uses that are within the parent operation of the + // replacement value. 
+ Operation *replOp = cast(repl).getOwner(); + Block *replBlock = replOp->getBlock(); + arg.replaceUsesWithIf(repl, [&](OpOperand &operand) { + Operation *user = operand.getOwner(); + return user->getBlock() != replBlock || replOp->isBeforeInBlock(user); + }); +} + +void ReplaceBlockArgRewrite::rollback() { rewriterImpl.mapping.erase(arg); } + +void ReplaceOperationRewrite::commit() { + for (OpResult result : op->getResults()) + if (Value newValue = + rewriterImpl.mapping.lookupOrNull(result, result.getType())) + result.replaceAllUsesWith(newValue); + if (rewriterImpl.trackedOps) + rewriterImpl.trackedOps->erase(op); + // Do not erase the operation yet. It may still be referenced in `mapping`. + op->getBlock()->getOperations().remove(op); +} + +void ReplaceOperationRewrite::rollback() { + for (auto result : op->getResults()) + rewriterImpl.mapping.erase(result); +} + +void ReplaceOperationRewrite::cleanup() { eraseOp(op); } + void ConversionPatternRewriterImpl::detachNestedAndErase(Operation *op) { for (Region ®ion : op->getRegions()) { for (Block &block : region.getBlocks()) { @@ -1085,51 +1166,16 @@ void ConversionPatternRewriterImpl::discardRewrites() { } void ConversionPatternRewriterImpl::applyRewrites() { - // Apply all of the rewrites replacements requested during conversion. - for (auto &repl : replacements) { - for (OpResult result : repl.first->getResults()) - if (Value newValue = mapping.lookupOrNull(result, result.getType())) - result.replaceAllUsesWith(newValue); - } - - // Apply all of the requested argument replacements. - for (BlockArgument arg : argReplacements) { - Value repl = mapping.lookupOrNull(arg, arg.getType()); - if (!repl) - continue; - - if (isa(repl)) { - arg.replaceAllUsesWith(repl); - continue; - } - - // If the replacement value is an operation, we check to make sure that we - // don't replace uses that are within the parent operation of the - // replacement value. 
- Operation *replOp = cast(repl).getOwner(); - Block *replBlock = replOp->getBlock(); - arg.replaceUsesWithIf(repl, [&](OpOperand &operand) { - Operation *user = operand.getOwner(); - return user->getBlock() != replBlock || replOp->isBeforeInBlock(user); - }); - } + // Commit all rewrites. + for (auto &rewrite : rewrites) + rewrite->commit(); + for (auto &rewrite : rewrites) + rewrite->cleanup(); // Drop all of the unresolved materialization operations created during // conversion. for (auto &mat : unresolvedMaterializations) eraseRewriter.eraseOp(mat.getOp()); - - // In a second pass, erase all of the replaced operations in reverse. This - // allows processing nested operations before their parent region is - // destroyed. Because we process in reverse order, producers may be deleted - // before their users (a pattern deleting a producer and then the consumer) - // so we first drop all uses explicitly. - for (auto &repl : llvm::reverse(replacements)) - eraseRewriter.eraseOp(repl.first); - - // Commit all rewrites. - for (auto &rewrite : rewrites) - rewrite->commit(); } //===----------------------------------------------------------------------===// @@ -1137,28 +1183,14 @@ void ConversionPatternRewriterImpl::applyRewrites() { RewriterState ConversionPatternRewriterImpl::getCurrentState() { return RewriterState(createdOps.size(), unresolvedMaterializations.size(), - replacements.size(), argReplacements.size(), rewrites.size(), ignoredOps.size(), eraseRewriter.erased.size()); } void ConversionPatternRewriterImpl::resetState(RewriterState state) { - // Reset any replaced arguments. - for (BlockArgument replacedArg : - llvm::drop_begin(argReplacements, state.numArgReplacements)) - mapping.erase(replacedArg); - argReplacements.resize(state.numArgReplacements); - // Undo any rewrites. undoRewrites(state.numRewrites); - // Reset any replaced operations and undo any saved mappings. 
- for (auto &repl : llvm::drop_begin(replacements, state.numReplacements)) - for (auto result : repl.first->getResults()) - mapping.erase(result); - while (replacements.size() != state.numReplacements) - replacements.pop_back(); - // Pop all of the newly inserted materializations. while (unresolvedMaterializations.size() != state.numUnresolvedMaterializations) { @@ -1183,11 +1215,6 @@ void ConversionPatternRewriterImpl::resetState(RewriterState state) { while (ignoredOps.size() != state.numIgnoredOperations) ignoredOps.pop_back(); - // Reset operations with changed results. - while (!operationsWithChangedResults.empty() && - operationsWithChangedResults.back() >= state.numReplacements) - operationsWithChangedResults.pop_back(); - while (eraseRewriter.erased.size() != state.numErased) eraseRewriter.erased.pop_back(); } @@ -1256,7 +1283,8 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( bool ConversionPatternRewriterImpl::isOpIgnored(Operation *op) const { // Check to see if this operation was replaced or its parent ignored. 
- return replacements.count(op) || ignoredOps.count(op->getParentOp()); + return ignoredOps.count(op->getParentOp()) || + hasRewrite(rewrites, op); } void ConversionPatternRewriterImpl::markNestedOpsIgnored(Operation *op) { @@ -1396,7 +1424,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( "invalid to provide a replacement value when the argument isn't " "dropped"); mapping.map(origArg, inputMap->replacementValue); - argReplacements.push_back(origArg); + appendRewrite(block, origArg); continue; } @@ -1430,7 +1458,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( } mapping.map(origArg, newArg); - argReplacements.push_back(origArg); + appendRewrite(block, origArg); argInfo[i] = ConvertedArgInfo(inputMap->inputNo, inputMap->size, newArg); } @@ -1462,7 +1490,12 @@ void ConversionPatternRewriterImpl::notifyOperationInserted( void ConversionPatternRewriterImpl::notifyOpReplaced(Operation *op, ValueRange newValues) { assert(newValues.size() == op->getNumResults()); - assert(!replacements.count(op) && "operation was already replaced"); +#ifndef NDEBUG + for (auto &rewrite : rewrites) + if (auto *opReplacement = dyn_cast(rewrite.get())) + assert(opReplacement->getOperation() != op && + "operation was already replaced"); +#endif // NDEBUG // Track if any of the results changed, e.g. erased and replaced with null. bool resultChanged = false; @@ -1477,11 +1510,9 @@ void ConversionPatternRewriterImpl::notifyOpReplaced(Operation *op, mapping.map(result, newValue); resultChanged |= (newValue.getType() != result.getType()); } - if (resultChanged) - operationsWithChangedResults.push_back(replacements.size()); - // Record the requested operation replacement. - replacements.insert(std::make_pair(op, OpReplacement(currentTypeConverter))); + appendRewrite(op, currentTypeConverter, + resultChanged); // Mark this operation as recursively ignored so that we don't need to // convert any nested operations. 
@@ -1576,8 +1607,6 @@ void ConversionPatternRewriter::eraseOp(Operation *op) { } void ConversionPatternRewriter::eraseBlock(Block *block) { - impl->notifyBlockIsBeingErased(block); - // Mark all ops for erasure. for (Operation &op : *block) eraseOp(&op); @@ -1586,6 +1615,7 @@ void ConversionPatternRewriter::eraseBlock(Block *block) { // object and will be actually destroyed when rewrites are applied. This // allows us to keep the operations in the block live and undo the removal by // re-inserting the block. + impl->notifyBlockIsBeingErased(block); block->getParent()->getBlocks().remove(block); } @@ -1615,7 +1645,7 @@ void ConversionPatternRewriter::replaceUsesOfBlockArgument(BlockArgument from, << "'(in region of '" << parentOp->getName() << "'(" << from.getOwner()->getParentOp() << ")\n"; }); - impl->argReplacements.push_back(from); + impl->appendRewrite(from.getOwner(), from); impl->mapping.map(impl->mapping.lookupOrDefault(from), to); } @@ -2039,16 +2069,13 @@ OperationLegalizer::legalizePatternResult(Operation *op, const Pattern &pattern, #ifndef NDEBUG assert(impl.pendingRootUpdates.empty() && "dangling root updates"); - // Check that the root was either replaced or updated in place. + auto newRewrites = llvm::drop_begin(impl.rewrites, curState.numRewrites); auto replacedRoot = [&] { - return llvm::any_of( - llvm::drop_begin(impl.replacements, curState.numReplacements), - [op](auto &it) { return it.first == op; }); + return hasRewrite(newRewrites, op); }; auto updatedRootInPlace = [&] { - return hasRewrite( - llvm::drop_begin(impl.rewrites, curState.numRewrites), op); + return hasRewrite(newRewrites, op); }; assert((replacedRoot() || updatedRootInPlace()) && "expected pattern to replace the root operation"); @@ -2081,7 +2108,8 @@ LogicalResult OperationLegalizer::legalizePatternBlockRewrites( if (!rewrite) continue; Block *block = rewrite->getBlock(); - if (isa(rewrite)) + if (isa(rewrite)) continue; // Only check blocks outside of the current operation. 
Operation *parentOp = block->getParentOp(); @@ -2476,6 +2504,7 @@ LogicalResult OperationConverter::convertOperations( ConversionPatternRewriter rewriter(ops.front()->getContext()); ConversionPatternRewriterImpl &rewriterImpl = rewriter.getImpl(); rewriterImpl.notifyCallback = notifyCallback; + rewriterImpl.trackedOps = trackedOps; for (auto *op : toConvert) if (failed(convert(rewriter, op))) @@ -2493,13 +2522,6 @@ LogicalResult OperationConverter::convertOperations( rewriterImpl.discardRewrites(); } else { rewriterImpl.applyRewrites(); - - // It is possible for a later pattern to erase an op that was originally - // identified as illegal and added to the trackedOps, remove it now after - // replacements have been computed. - if (trackedOps) - for (auto &repl : rewriterImpl.replacements) - trackedOps->erase(repl.first); } return success(); } @@ -2513,21 +2535,20 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) { failed(legalizeConvertedArgumentTypes(rewriter, rewriterImpl))) return failure(); - if (rewriterImpl.operationsWithChangedResults.empty()) - return success(); - // Process requested operation replacements. - for (unsigned i = 0, e = rewriterImpl.operationsWithChangedResults.size(); - i != e; ++i) { - unsigned replIdx = rewriterImpl.operationsWithChangedResults[i]; - auto &repl = *(rewriterImpl.replacements.begin() + replIdx); - for (OpResult result : repl.first->getResults()) { + for (unsigned i = 0; i < rewriterImpl.rewrites.size(); ++i) { + auto *opReplacement = + dyn_cast(rewriterImpl.rewrites[i].get()); + if (!opReplacement || !opReplacement->changedResults) + continue; + Operation *op = opReplacement->getOperation(); + for (OpResult result : op->getResults()) { Value newValue = rewriterImpl.mapping.lookupOrNull(result); // If the operation result was replaced with null, all of the uses of this // value should be replaced. 
if (!newValue) { - if (failed(legalizeErasedResult(repl.first, result, rewriterImpl))) + if (failed(legalizeErasedResult(op, result, rewriterImpl))) return failure(); continue; } @@ -2541,15 +2562,11 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) { inverseMapping = rewriterImpl.mapping.getInverse(); // Legalize this result. - rewriter.setInsertionPoint(repl.first); - if (failed(legalizeChangedResultType(repl.first, result, newValue, - repl.second.converter, rewriter, + rewriter.setInsertionPoint(op); + if (failed(legalizeChangedResultType(op, result, newValue, + opReplacement->converter, rewriter, rewriterImpl, *inverseMapping))) return failure(); - - // Update the end iterator for this loop in the case it was updated - // when legalizing generated conversion operations. - e = rewriterImpl.operationsWithChangedResults.size(); } } return success(); From b014944e47ba6e2031e968268b15fba43a9e1dbf Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Fri, 23 Feb 2024 16:54:11 +0800 Subject: [PATCH 138/546] [NFC] [doc] Mentioning to include the guard headers from imported modules --- clang/docs/StandardCPlusPlusModules.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst index 0347ff077fdb8..c5478bba45f38 100644 --- a/clang/docs/StandardCPlusPlusModules.rst +++ b/clang/docs/StandardCPlusPlusModules.rst @@ -868,6 +868,9 @@ headers to: ... #endif +If the modules imported by your library provides such headers too, remember to add them to +your ``your_library_imported.h`` too. + Importing modules ~~~~~~~~~~~~~~~~~ From ace83da316fbd2196fa35e8fd90218dcf84a020c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 23 Feb 2024 09:09:45 +0100 Subject: [PATCH 139/546] [clang][Interp][NFC] Improve Program dump()ing Add colors as well as more details for global variables. 
--- clang/lib/AST/Interp/Descriptor.h | 3 ++ clang/lib/AST/Interp/Disasm.cpp | 71 ++++++++++++++++++++++++++++--- clang/lib/AST/Interp/Program.h | 1 + 3 files changed, 68 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h index ac8707a521e19..0f64d678f3ef6 100644 --- a/clang/lib/AST/Interp/Descriptor.h +++ b/clang/lib/AST/Interp/Descriptor.h @@ -213,6 +213,9 @@ struct Descriptor final { bool isRecord() const { return !IsArray && ElemRecord; } /// Checks if this is a dummy descriptor. bool isDummy() const { return IsDummy; } + + void dump() const; + void dump(llvm::raw_ostream &OS) const; }; /// Bitfield tracking the initialisation status of elements of primitive arrays. diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp index eba437e05f59d..3bc9312debeb7 100644 --- a/clang/lib/AST/Interp/Disasm.cpp +++ b/clang/lib/AST/Interp/Disasm.cpp @@ -16,6 +16,7 @@ #include "Opcode.h" #include "PrimType.h" #include "Program.h" +#include "clang/AST/ASTDumperUtils.h" #include "clang/AST/DeclCXX.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Format.h" @@ -55,7 +56,10 @@ inline IntegralAP ReadArg>(Program &P, CodePtr &OpPC) { LLVM_DUMP_METHOD void Function::dump() const { dump(llvm::errs()); } LLVM_DUMP_METHOD void Function::dump(llvm::raw_ostream &OS) const { - OS << getName() << " " << (const void *)this << "\n"; + { + ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_GREEN, true}); + OS << getName() << " " << (const void *)this << "\n"; + } OS << "frame size: " << getFrameSize() << "\n"; OS << "arg size: " << getArgSize() << "\n"; OS << "rvo: " << hasRVO() << "\n"; @@ -83,14 +87,67 @@ LLVM_DUMP_METHOD void Function::dump(llvm::raw_ostream &OS) const { LLVM_DUMP_METHOD void Program::dump() const { dump(llvm::errs()); } LLVM_DUMP_METHOD void Program::dump(llvm::raw_ostream &OS) const { - OS << ":: Program\n"; - OS << "Global Variables: " << Globals.size() << "\n"; - OS << 
"Functions: " << Funcs.size() << "\n"; - OS << "\n"; - for (auto &Func : Funcs) { + { + ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_RED, true}); + OS << "\n:: Program\n"; + } + + { + ColorScope SC(OS, true, {llvm::raw_ostream::WHITE, true}); + OS << "Total memory : " << Allocator.getTotalMemory() << " bytes\n"; + OS << "Global Variables: " << Globals.size() << "\n"; + } + unsigned GI = 0; + for (const Global *G : Globals) { + const Descriptor *Desc = G->block()->getDescriptor(); + OS << GI << ": " << (void *)G->block() << " "; + Desc->dump(OS); + OS << "\n"; + ++GI; + } + + { + ColorScope SC(OS, true, {llvm::raw_ostream::WHITE, true}); + OS << "Functions: " << Funcs.size() << "\n"; + } + for (const auto &Func : Funcs) { Func.second->dump(); } - for (auto &Anon : AnonFuncs) { + for (const auto &Anon : AnonFuncs) { Anon->dump(); } } + +LLVM_DUMP_METHOD void Descriptor::dump() const { + dump(llvm::errs()); + llvm::errs() << '\n'; +} + +LLVM_DUMP_METHOD void Descriptor::dump(llvm::raw_ostream &OS) const { + // Source + { + ColorScope SC(OS, true, {llvm::raw_ostream::BLUE, true}); + if (const auto *ND = dyn_cast_if_present(asDecl())) + OS << ND->getName(); + else if (asExpr()) + OS << "expr (TODO)"; + } + + // Print a few interesting bits about the descriptor. + if (isPrimitiveArray()) + OS << " primitive-array"; + else if (isCompositeArray()) + OS << " composite-array"; + else if (isRecord()) + OS << " record"; + else if (isPrimitive()) + OS << " primitive"; + + if (isZeroSizeArray()) + OS << " zero-size-arrary"; + else if (isUnknownSizeArray()) + OS << " unknown-size-array"; + + if (isDummy()) + OS << " dummy"; +} diff --git a/clang/lib/AST/Interp/Program.h b/clang/lib/AST/Interp/Program.h index 364a63dbf477a..7922eafbeb2d0 100644 --- a/clang/lib/AST/Interp/Program.h +++ b/clang/lib/AST/Interp/Program.h @@ -190,6 +190,7 @@ class Program final { std::byte *data() { return B.data(); } /// Return a pointer to the block. 
Block *block() { return &B; } + const Block *block() const { return &B; } private: /// Required metadata - does not actually track pointers. From 9ca70d72f4f217ff4f6ab337ad4a8e6666860791 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Feb 2024 10:03:26 +0100 Subject: [PATCH 140/546] [mlir][Transforms][NFC] Turn op creation into `IRRewrite` (#81759) This commit is a refactoring of the dialect conversion. The dialect conversion maintains a list of "IR rewrites" that can be committed (upon success) or rolled back (upon failure). Until now, the dialect conversion kept track of "op creation" in separate internal data structures. This commit turns "op creation" into an `IRRewrite` that can be committed and rolled back just like any other rewrite. This commit simplifies the internal state of the dialect conversion. --- .../Transforms/Utils/DialectConversion.cpp | 102 +++++++++++------- 1 file changed, 64 insertions(+), 38 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index dec68048dc1d3..704597148dfac 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -152,17 +152,12 @@ namespace { /// This class contains a snapshot of the current conversion rewriter state. /// This is useful when saving and undoing a set of rewrites. 
struct RewriterState { - RewriterState(unsigned numCreatedOps, unsigned numUnresolvedMaterializations, - unsigned numRewrites, unsigned numIgnoredOperations, - unsigned numErased) - : numCreatedOps(numCreatedOps), - numUnresolvedMaterializations(numUnresolvedMaterializations), + RewriterState(unsigned numUnresolvedMaterializations, unsigned numRewrites, + unsigned numIgnoredOperations, unsigned numErased) + : numUnresolvedMaterializations(numUnresolvedMaterializations), numRewrites(numRewrites), numIgnoredOperations(numIgnoredOperations), numErased(numErased) {} - /// The current number of created operations. - unsigned numCreatedOps; - /// The current number of unresolved materializations. unsigned numUnresolvedMaterializations; @@ -303,7 +298,8 @@ class IRRewrite { // Operation rewrites MoveOperation, ModifyOperation, - ReplaceOperation + ReplaceOperation, + CreateOperation }; virtual ~IRRewrite() = default; @@ -376,7 +372,10 @@ class CreateBlockRewrite : public BlockRewrite { auto &blockOps = block->getOperations(); while (!blockOps.empty()) blockOps.remove(blockOps.begin()); - eraseBlock(block); + if (block->getParent()) + eraseBlock(block); + else + delete block; } }; @@ -606,7 +605,7 @@ class OperationRewrite : public IRRewrite { static bool classof(const IRRewrite *rewrite) { return rewrite->getKind() >= Kind::MoveOperation && - rewrite->getKind() <= Kind::ReplaceOperation; + rewrite->getKind() <= Kind::CreateOperation; } protected: @@ -740,6 +739,19 @@ class ReplaceOperationRewrite : public OperationRewrite { /// A boolean flag that indicates whether result types have changed or not. 
bool changedResults; }; + +class CreateOperationRewrite : public OperationRewrite { +public: + CreateOperationRewrite(ConversionPatternRewriterImpl &rewriterImpl, + Operation *op) + : OperationRewrite(Kind::CreateOperation, rewriterImpl, op) {} + + static bool classof(const IRRewrite *rewrite) { + return rewrite->getKind() == Kind::CreateOperation; + } + + void rollback() override; +}; } // namespace /// Return "true" if there is an operation rewrite that matches the specified @@ -957,9 +969,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { // replacing a value with one of a different type. ConversionValueMapping mapping; - /// Ordered vector of all of the newly created operations during conversion. - SmallVector createdOps; - /// Ordered vector of all unresolved type conversion materializations during /// conversion. SmallVector unresolvedMaterializations; @@ -1144,6 +1153,15 @@ void ReplaceOperationRewrite::rollback() { void ReplaceOperationRewrite::cleanup() { eraseOp(op); } +void CreateOperationRewrite::rollback() { + for (Region ®ion : op->getRegions()) { + while (!region.getBlocks().empty()) + region.getBlocks().remove(region.getBlocks().begin()); + } + op->dropAllUses(); + eraseOp(op); +} + void ConversionPatternRewriterImpl::detachNestedAndErase(Operation *op) { for (Region ®ion : op->getRegions()) { for (Block &block : region.getBlocks()) { @@ -1161,8 +1179,6 @@ void ConversionPatternRewriterImpl::discardRewrites() { // Remove any newly created ops. 
for (UnresolvedMaterialization &materialization : unresolvedMaterializations) detachNestedAndErase(materialization.getOp()); - for (auto *op : llvm::reverse(createdOps)) - detachNestedAndErase(op); } void ConversionPatternRewriterImpl::applyRewrites() { @@ -1182,9 +1198,8 @@ void ConversionPatternRewriterImpl::applyRewrites() { // State Management RewriterState ConversionPatternRewriterImpl::getCurrentState() { - return RewriterState(createdOps.size(), unresolvedMaterializations.size(), - rewrites.size(), ignoredOps.size(), - eraseRewriter.erased.size()); + return RewriterState(unresolvedMaterializations.size(), rewrites.size(), + ignoredOps.size(), eraseRewriter.erased.size()); } void ConversionPatternRewriterImpl::resetState(RewriterState state) { @@ -1205,12 +1220,6 @@ void ConversionPatternRewriterImpl::resetState(RewriterState state) { detachNestedAndErase(op); } - // Pop all of the newly created operations. - while (createdOps.size() != state.numCreatedOps) { - detachNestedAndErase(createdOps.back()); - createdOps.pop_back(); - } - // Pop all of the recorded ignored operations that are no longer valid. while (ignoredOps.size() != state.numIgnoredOperations) ignoredOps.pop_back(); @@ -1478,7 +1487,7 @@ void ConversionPatternRewriterImpl::notifyOperationInserted( }); if (!previous.isSet()) { // This is a newly created op. - createdOps.push_back(op); + appendRewrite(op); return; } Operation *prevOp = previous.getPoint() == previous.getBlock()->end() @@ -1979,13 +1988,16 @@ OperationLegalizer::legalizeWithFold(Operation *op, rewriter.replaceOp(op, replacementValues); // Recursively legalize any new constant operations. 
- for (unsigned i = curState.numCreatedOps, e = rewriterImpl.createdOps.size(); + for (unsigned i = curState.numRewrites, e = rewriterImpl.rewrites.size(); i != e; ++i) { - Operation *cstOp = rewriterImpl.createdOps[i]; - if (failed(legalize(cstOp, rewriter))) { + auto *createOp = + dyn_cast(rewriterImpl.rewrites[i].get()); + if (!createOp) + continue; + if (failed(legalize(createOp->getOperation(), rewriter))) { LLVM_DEBUG(logFailure(rewriterImpl.logger, "failed to legalize generated constant '{0}'", - cstOp->getName())); + createOp->getOperation()->getName())); rewriterImpl.resetState(curState); return failure(); } @@ -2132,9 +2144,14 @@ LogicalResult OperationLegalizer::legalizePatternBlockRewrites( // blocks in regions created by this pattern will already be legalized later // on. If we haven't built the set yet, build it now. if (operationsToIgnore.empty()) { - auto createdOps = ArrayRef(impl.createdOps) - .drop_front(state.numCreatedOps); - operationsToIgnore.insert(createdOps.begin(), createdOps.end()); + for (unsigned i = state.numRewrites, e = impl.rewrites.size(); i != e; + ++i) { + auto *createOp = + dyn_cast(impl.rewrites[i].get()); + if (!createOp) + continue; + operationsToIgnore.insert(createOp->getOperation()); + } } // If this operation should be considered for re-legalization, try it. 
@@ -2152,8 +2169,11 @@ LogicalResult OperationLegalizer::legalizePatternBlockRewrites( LogicalResult OperationLegalizer::legalizePatternCreatedOperations( ConversionPatternRewriter &rewriter, ConversionPatternRewriterImpl &impl, RewriterState &state, RewriterState &newState) { - for (int i = state.numCreatedOps, e = newState.numCreatedOps; i != e; ++i) { - Operation *op = impl.createdOps[i]; + for (int i = state.numRewrites, e = newState.numRewrites; i != e; ++i) { + auto *createOp = dyn_cast(impl.rewrites[i].get()); + if (!createOp) + continue; + Operation *op = createOp->getOperation(); if (failed(legalize(op, rewriter))) { LLVM_DEBUG(logFailure(impl.logger, "failed to legalize generated operation '{0}'({1})", @@ -2583,10 +2603,16 @@ LogicalResult OperationConverter::legalizeConvertedArgumentTypes( }); return liveUserIt == val.user_end() ? nullptr : *liveUserIt; }; - for (auto &r : rewriterImpl.rewrites) - if (auto *rewrite = dyn_cast(r.get())) - if (failed(rewrite->materializeLiveConversions(findLiveUser))) + // Note: `rewrites` may be reallocated as the loop is running. + for (int64_t i = 0; i < static_cast(rewriterImpl.rewrites.size()); + ++i) { + auto &rewrite = rewriterImpl.rewrites[i]; + if (auto *blockTypeConversionRewrite = + dyn_cast(rewrite.get())) + if (failed(blockTypeConversionRewrite->materializeLiveConversions( + findLiveUser))) return failure(); + } return success(); } From 59ff4d131c7d6b3bfcbe8e96cac99c9d8a65bf4e Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Feb 2024 10:15:12 +0100 Subject: [PATCH 141/546] [mlir][Transforms][NFC] Turn unresolved materializations into `IRRewrite`s (#81761) This commit is a refactoring of the dialect conversion. The dialect conversion maintains a list of "IR rewrites" that can be committed (upon success) or rolled back (upon failure). This commit turns the creation of unresolved materializations (`unrealized_conversion_cast`) into `IRRewrite` objects. 
After this commit, all steps in `applyRewrites` and `discardRewrites` are calls to `IRRewrite::commit` and `IRRewrite::rollback`. --- .../Transforms/Utils/DialectConversion.cpp | 369 +++++++++--------- 1 file changed, 176 insertions(+), 193 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 704597148dfac..635a2cb00f388 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -152,15 +152,11 @@ namespace { /// This class contains a snapshot of the current conversion rewriter state. /// This is useful when saving and undoing a set of rewrites. struct RewriterState { - RewriterState(unsigned numUnresolvedMaterializations, unsigned numRewrites, - unsigned numIgnoredOperations, unsigned numErased) - : numUnresolvedMaterializations(numUnresolvedMaterializations), - numRewrites(numRewrites), numIgnoredOperations(numIgnoredOperations), + RewriterState(unsigned numRewrites, unsigned numIgnoredOperations, + unsigned numErased) + : numRewrites(numRewrites), numIgnoredOperations(numIgnoredOperations), numErased(numErased) {} - /// The current number of unresolved materializations. - unsigned numUnresolvedMaterializations; - /// The current number of rewrites performed. unsigned numRewrites; @@ -171,109 +167,10 @@ struct RewriterState { unsigned numErased; }; -//===----------------------------------------------------------------------===// -// UnresolvedMaterialization - -/// This class represents an unresolved materialization, i.e. a materialization -/// that was inserted during conversion that needs to be legalized at the end of -/// the conversion process. -class UnresolvedMaterialization { -public: - /// The type of materialization. - enum Kind { - /// This materialization materializes a conversion for an illegal block - /// argument type, to a legal one. 
- Argument, - - /// This materialization materializes a conversion from an illegal type to a - /// legal one. - Target - }; - - UnresolvedMaterialization(UnrealizedConversionCastOp op = nullptr, - const TypeConverter *converter = nullptr, - Kind kind = Target, Type origOutputType = nullptr) - : op(op), converterAndKind(converter, kind), - origOutputType(origOutputType) {} - - /// Return the temporary conversion operation inserted for this - /// materialization. - UnrealizedConversionCastOp getOp() const { return op; } - - /// Return the type converter of this materialization (which may be null). - const TypeConverter *getConverter() const { - return converterAndKind.getPointer(); - } - - /// Return the kind of this materialization. - Kind getKind() const { return converterAndKind.getInt(); } - - /// Set the kind of this materialization. - void setKind(Kind kind) { converterAndKind.setInt(kind); } - - /// Return the original illegal output type of the input values. - Type getOrigOutputType() const { return origOutputType; } - -private: - /// The unresolved materialization operation created during conversion. - UnrealizedConversionCastOp op; - - /// The corresponding type converter to use when resolving this - /// materialization, and the kind of this materialization. - llvm::PointerIntPair converterAndKind; - - /// The original output type. This is only used for argument conversions. - Type origOutputType; -}; -} // namespace - -/// Build an unresolved materialization operation given an output type and set -/// of input operands. -static Value buildUnresolvedMaterialization( - UnresolvedMaterialization::Kind kind, Block *insertBlock, - Block::iterator insertPt, Location loc, ValueRange inputs, Type outputType, - Type origOutputType, const TypeConverter *converter, - SmallVectorImpl &unresolvedMaterializations) { - // Avoid materializing an unnecessary cast. 
- if (inputs.size() == 1 && inputs.front().getType() == outputType) - return inputs.front(); - - // Create an unresolved materialization. We use a new OpBuilder to avoid - // tracking the materialization like we do for other operations. - OpBuilder builder(insertBlock, insertPt); - auto convertOp = - builder.create(loc, outputType, inputs); - unresolvedMaterializations.emplace_back(convertOp, converter, kind, - origOutputType); - return convertOp.getResult(0); -} -static Value buildUnresolvedArgumentMaterialization( - PatternRewriter &rewriter, Location loc, ValueRange inputs, - Type origOutputType, Type outputType, const TypeConverter *converter, - SmallVectorImpl &unresolvedMaterializations) { - return buildUnresolvedMaterialization( - UnresolvedMaterialization::Argument, rewriter.getInsertionBlock(), - rewriter.getInsertionPoint(), loc, inputs, outputType, origOutputType, - converter, unresolvedMaterializations); -} -static Value buildUnresolvedTargetMaterialization( - Location loc, Value input, Type outputType, const TypeConverter *converter, - SmallVectorImpl &unresolvedMaterializations) { - Block *insertBlock = input.getParentBlock(); - Block::iterator insertPt = insertBlock->begin(); - if (OpResult inputRes = dyn_cast(input)) - insertPt = ++inputRes.getOwner()->getIterator(); - - return buildUnresolvedMaterialization( - UnresolvedMaterialization::Target, insertBlock, insertPt, loc, input, - outputType, outputType, converter, unresolvedMaterializations); -} - //===----------------------------------------------------------------------===// // IR rewrites //===----------------------------------------------------------------------===// -namespace { /// An IR rewrite that can be committed (upon success) or rolled back (upon /// failure). 
/// @@ -299,7 +196,8 @@ class IRRewrite { MoveOperation, ModifyOperation, ReplaceOperation, - CreateOperation + CreateOperation, + UnresolvedMaterialization }; virtual ~IRRewrite() = default; @@ -605,7 +503,7 @@ class OperationRewrite : public IRRewrite { static bool classof(const IRRewrite *rewrite) { return rewrite->getKind() >= Kind::MoveOperation && - rewrite->getKind() <= Kind::CreateOperation; + rewrite->getKind() <= Kind::UnresolvedMaterialization; } protected: @@ -752,6 +650,70 @@ class CreateOperationRewrite : public OperationRewrite { void rollback() override; }; + +/// The type of materialization. +enum MaterializationKind { + /// This materialization materializes a conversion for an illegal block + /// argument type, to a legal one. + Argument, + + /// This materialization materializes a conversion from an illegal type to a + /// legal one. + Target +}; + +/// An unresolved materialization, i.e., a "builtin.unrealized_conversion_cast" +/// op. Unresolved materializations are erased at the end of the dialect +/// conversion. +class UnresolvedMaterializationRewrite : public OperationRewrite { +public: + UnresolvedMaterializationRewrite( + ConversionPatternRewriterImpl &rewriterImpl, + UnrealizedConversionCastOp op, const TypeConverter *converter = nullptr, + MaterializationKind kind = MaterializationKind::Target, + Type origOutputType = nullptr) + : OperationRewrite(Kind::UnresolvedMaterialization, rewriterImpl, op), + converterAndKind(converter, kind), origOutputType(origOutputType) {} + + static bool classof(const IRRewrite *rewrite) { + return rewrite->getKind() == Kind::UnresolvedMaterialization; + } + + UnrealizedConversionCastOp getOperation() const { + return cast(op); + } + + void rollback() override; + + void cleanup() override; + + /// Return the type converter of this materialization (which may be null). + const TypeConverter *getConverter() const { + return converterAndKind.getPointer(); + } + + /// Return the kind of this materialization. 
+ MaterializationKind getMaterializationKind() const { + return converterAndKind.getInt(); + } + + /// Set the kind of this materialization. + void setMaterializationKind(MaterializationKind kind) { + converterAndKind.setInt(kind); + } + + /// Return the original illegal output type of the input values. + Type getOrigOutputType() const { return origOutputType; } + +private: + /// The corresponding type converter to use when resolving this + /// materialization, and the kind of this materialization. + llvm::PointerIntPair + converterAndKind; + + /// The original output type. This is only used for argument conversions. + Type origOutputType; +}; } // namespace /// Return "true" if there is an operation rewrite that matches the specified @@ -794,14 +756,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { : rewriter(rewriter), eraseRewriter(rewriter.getContext()), notifyCallback(nullptr) {} - /// Cleanup and destroy any generated rewrite operations. This method is - /// invoked when the conversion process fails. - void discardRewrites(); - - /// Apply all requested operation rewrites. This method is invoked when the - /// conversion process succeeds. - void applyRewrites(); - //===--------------------------------------------------------------------===// // State Management //===--------------------------------------------------------------------===// @@ -809,6 +763,10 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Return the current state of the rewriter. RewriterState getCurrentState(); + /// Apply all requested operation rewrites. This method is invoked when the + /// conversion process succeeds. + void applyRewrites(); + /// Reset the state of the rewriter to a previously saved point. void resetState(RewriterState state); @@ -841,17 +799,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// removes them from being considered for legalization. 
void markNestedOpsIgnored(Operation *op); - /// Detach any operations nested in the given operation from their parent - /// blocks, and erase the given operation. This can be used when the nested - /// operations are scheduled for erasure themselves, so deleting the regions - /// of the given operation together with their content would result in - /// double-free. This happens, for example, when rolling back op creation in - /// the reverse order and if the nested ops were created before the parent op. - /// This function does not need to collect nested ops recursively because it - /// is expected to also be called for each nested op when it is about to be - /// deleted. - void detachNestedAndErase(Operation *op); - //===--------------------------------------------------------------------===// // Type Conversion //===--------------------------------------------------------------------===// @@ -890,6 +837,28 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { Block *block, const TypeConverter *converter, TypeConverter::SignatureConversion &signatureConversion); + //===--------------------------------------------------------------------===// + // Materializations + //===--------------------------------------------------------------------===// + /// Build an unresolved materialization operation given an output type and set + /// of input operands. 
+ Value buildUnresolvedMaterialization(MaterializationKind kind, + Block *insertBlock, + Block::iterator insertPt, Location loc, + ValueRange inputs, Type outputType, + Type origOutputType, + const TypeConverter *converter); + + Value buildUnresolvedArgumentMaterialization(PatternRewriter &rewriter, + Location loc, ValueRange inputs, + Type origOutputType, + Type outputType, + const TypeConverter *converter); + + Value buildUnresolvedTargetMaterialization(Location loc, Value input, + Type outputType, + const TypeConverter *converter); + //===--------------------------------------------------------------------===// // Rewriter Notification Hooks //===--------------------------------------------------------------------===// @@ -969,10 +938,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { // replacing a value with one of a different type. ConversionValueMapping mapping; - /// Ordered vector of all unresolved type conversion materializations during - /// conversion. - SmallVector unresolvedMaterializations; - /// Ordered list of block operations (creations, splits, motions). SmallVector> rewrites; @@ -1162,24 +1127,15 @@ void CreateOperationRewrite::rollback() { eraseOp(op); } -void ConversionPatternRewriterImpl::detachNestedAndErase(Operation *op) { - for (Region ®ion : op->getRegions()) { - for (Block &block : region.getBlocks()) { - while (!block.getOperations().empty()) - block.getOperations().remove(block.getOperations().begin()); - block.dropAllDefinedValueUses(); - } +void UnresolvedMaterializationRewrite::rollback() { + if (getMaterializationKind() == MaterializationKind::Target) { + for (Value input : op->getOperands()) + rewriterImpl.mapping.erase(input); } - eraseRewriter.eraseOp(op); + eraseOp(op); } -void ConversionPatternRewriterImpl::discardRewrites() { - undoRewrites(); - - // Remove any newly created ops. 
- for (UnresolvedMaterialization &materialization : unresolvedMaterializations) - detachNestedAndErase(materialization.getOp()); -} +void UnresolvedMaterializationRewrite::cleanup() { eraseOp(op); } void ConversionPatternRewriterImpl::applyRewrites() { // Commit all rewrites. @@ -1187,39 +1143,20 @@ void ConversionPatternRewriterImpl::applyRewrites() { rewrite->commit(); for (auto &rewrite : rewrites) rewrite->cleanup(); - - // Drop all of the unresolved materialization operations created during - // conversion. - for (auto &mat : unresolvedMaterializations) - eraseRewriter.eraseOp(mat.getOp()); } //===----------------------------------------------------------------------===// // State Management RewriterState ConversionPatternRewriterImpl::getCurrentState() { - return RewriterState(unresolvedMaterializations.size(), rewrites.size(), - ignoredOps.size(), eraseRewriter.erased.size()); + return RewriterState(rewrites.size(), ignoredOps.size(), + eraseRewriter.erased.size()); } void ConversionPatternRewriterImpl::resetState(RewriterState state) { // Undo any rewrites. undoRewrites(state.numRewrites); - // Pop all of the newly inserted materializations. - while (unresolvedMaterializations.size() != - state.numUnresolvedMaterializations) { - UnresolvedMaterialization mat = unresolvedMaterializations.pop_back_val(); - UnrealizedConversionCastOp op = mat.getOp(); - - // If this was a target materialization, drop the mapping that was inserted. - if (mat.getKind() == UnresolvedMaterialization::Target) { - for (Value input : op->getOperands()) - mapping.erase(input); - } - detachNestedAndErase(op); - } - // Pop all of the recorded ignored operations that are no longer valid. while (ignoredOps.size() != state.numIgnoredOperations) ignoredOps.pop_back(); @@ -1280,8 +1217,7 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( if (currentTypeConverter && desiredType && newOperandType != desiredType) { Location operandLoc = inputLoc ? 
*inputLoc : operand.getLoc(); Value castValue = buildUnresolvedTargetMaterialization( - operandLoc, newOperand, desiredType, currentTypeConverter, - unresolvedMaterializations); + operandLoc, newOperand, desiredType, currentTypeConverter); mapping.map(mapping.lookupOrDefault(newOperand), castValue); newOperand = castValue; } @@ -1463,7 +1399,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( newArg = buildUnresolvedArgumentMaterialization( rewriter, origArg.getLoc(), replArgs, origOutputType, outputType, - converter, unresolvedMaterializations); + converter); } mapping.map(origArg, newArg); @@ -1476,6 +1412,50 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( return newBlock; } +//===----------------------------------------------------------------------===// +// Materializations +//===----------------------------------------------------------------------===// + +/// Build an unresolved materialization operation given an output type and set +/// of input operands. +Value ConversionPatternRewriterImpl::buildUnresolvedMaterialization( + MaterializationKind kind, Block *insertBlock, Block::iterator insertPt, + Location loc, ValueRange inputs, Type outputType, Type origOutputType, + const TypeConverter *converter) { + // Avoid materializing an unnecessary cast. + if (inputs.size() == 1 && inputs.front().getType() == outputType) + return inputs.front(); + + // Create an unresolved materialization. We use a new OpBuilder to avoid + // tracking the materialization like we do for other operations. 
+ OpBuilder builder(insertBlock, insertPt); + auto convertOp = + builder.create(loc, outputType, inputs); + appendRewrite(convertOp, converter, kind, + origOutputType); + return convertOp.getResult(0); +} +Value ConversionPatternRewriterImpl::buildUnresolvedArgumentMaterialization( + PatternRewriter &rewriter, Location loc, ValueRange inputs, + Type origOutputType, Type outputType, const TypeConverter *converter) { + return buildUnresolvedMaterialization( + MaterializationKind::Argument, rewriter.getInsertionBlock(), + rewriter.getInsertionPoint(), loc, inputs, outputType, origOutputType, + converter); +} +Value ConversionPatternRewriterImpl::buildUnresolvedTargetMaterialization( + Location loc, Value input, Type outputType, + const TypeConverter *converter) { + Block *insertBlock = input.getParentBlock(); + Block::iterator insertPt = insertBlock->begin(); + if (OpResult inputRes = dyn_cast(input)) + insertPt = ++inputRes.getOwner()->getIterator(); + + return buildUnresolvedMaterialization(MaterializationKind::Target, + insertBlock, insertPt, loc, input, + outputType, outputType, converter); +} + //===----------------------------------------------------------------------===// // Rewriter Notification Hooks @@ -2528,18 +2508,18 @@ LogicalResult OperationConverter::convertOperations( for (auto *op : toConvert) if (failed(convert(rewriter, op))) - return rewriterImpl.discardRewrites(), failure(); + return rewriterImpl.undoRewrites(), failure(); // Now that all of the operations have been converted, finalize the conversion // process to ensure any lingering conversion artifacts are cleaned up and // legalized. if (failed(finalize(rewriter))) - return rewriterImpl.discardRewrites(), failure(); + return rewriterImpl.undoRewrites(), failure(); // After a successful conversion, apply rewrites if this is not an analysis // conversion. 
if (mode == OpConversionMode::Analysis) { - rewriterImpl.discardRewrites(); + rewriterImpl.undoRewrites(); } else { rewriterImpl.applyRewrites(); } @@ -2645,11 +2625,12 @@ replaceMaterialization(ConversionPatternRewriterImpl &rewriterImpl, /// Compute all of the unresolved materializations that will persist beyond the /// conversion process, and require inserting a proper user materialization for. static void computeNecessaryMaterializations( - DenseMap &materializationOps, + DenseMap + &materializationOps, ConversionPatternRewriter &rewriter, ConversionPatternRewriterImpl &rewriterImpl, DenseMap> &inverseMapping, - SetVector &necessaryMaterializations) { + SetVector &necessaryMaterializations) { auto isLive = [&](Value value) { auto findFn = [&](Operation *user) { auto matIt = materializationOps.find(user); @@ -2684,14 +2665,17 @@ static void computeNecessaryMaterializations( return Value(); }; - SetVector worklist; - for (auto &mat : rewriterImpl.unresolvedMaterializations) { - materializationOps.try_emplace(mat.getOp(), &mat); - worklist.insert(&mat); + SetVector worklist; + for (auto &rewrite : rewriterImpl.rewrites) { + auto *mat = dyn_cast(rewrite.get()); + if (!mat) + continue; + materializationOps.try_emplace(mat->getOperation(), mat); + worklist.insert(mat); } while (!worklist.empty()) { - UnresolvedMaterialization *mat = worklist.pop_back_val(); - UnrealizedConversionCastOp op = mat->getOp(); + UnresolvedMaterializationRewrite *mat = worklist.pop_back_val(); + UnrealizedConversionCastOp op = mat->getOperation(); // We currently only handle target materializations here. 
assert(op->getNumResults() == 1 && "unexpected materialization type"); @@ -2733,7 +2717,7 @@ static void computeNecessaryMaterializations( auto isBlockArg = [](Value v) { return isa(v); }; if (llvm::any_of(op->getOperands(), isBlockArg) || llvm::any_of(inverseMapping[op->getResult(0)], isBlockArg)) { - mat->setKind(UnresolvedMaterialization::Argument); + mat->setMaterializationKind(MaterializationKind::Argument); } // If the materialization does not have any live users, we don't need to @@ -2743,7 +2727,7 @@ static void computeNecessaryMaterializations( // value replacement even if the types differ in some cases. When those // patterns are fixed, we can drop the argument special case here. bool isMaterializationLive = isLive(opResult); - if (mat->getKind() == UnresolvedMaterialization::Argument) + if (mat->getMaterializationKind() == MaterializationKind::Argument) isMaterializationLive |= llvm::any_of(inverseMapping[opResult], isLive); if (!isMaterializationLive) continue; @@ -2763,8 +2747,9 @@ static void computeNecessaryMaterializations( /// Legalize the given unresolved materialization. Returns success if the /// materialization was legalized, failure otherise. 
static LogicalResult legalizeUnresolvedMaterialization( - UnresolvedMaterialization &mat, - DenseMap &materializationOps, + UnresolvedMaterializationRewrite &mat, + DenseMap + &materializationOps, ConversionPatternRewriter &rewriter, ConversionPatternRewriterImpl &rewriterImpl, DenseMap> &inverseMapping) { @@ -2784,7 +2769,7 @@ static LogicalResult legalizeUnresolvedMaterialization( return Value(); }; - UnrealizedConversionCastOp op = mat.getOp(); + UnrealizedConversionCastOp op = mat.getOperation(); if (!rewriterImpl.ignoredOps.insert(op)) return success(); @@ -2834,8 +2819,8 @@ static LogicalResult legalizeUnresolvedMaterialization( rewriter.setInsertionPoint(op); Value newMaterialization; - switch (mat.getKind()) { - case UnresolvedMaterialization::Argument: + switch (mat.getMaterializationKind()) { + case MaterializationKind::Argument: // Try to materialize an argument conversion. // FIXME: The current argument materialization hook expects the original // output type, even though it doesn't use that as the actual output type @@ -2852,7 +2837,7 @@ static LogicalResult legalizeUnresolvedMaterialization( // If an argument materialization failed, fallback to trying a target // materialization. [[fallthrough]]; - case UnresolvedMaterialization::Target: + case MaterializationKind::Target: newMaterialization = converter->materializeTargetConversion( rewriter, op->getLoc(), outputType, inputOperands); break; @@ -2880,14 +2865,12 @@ LogicalResult OperationConverter::legalizeUnresolvedMaterializations( ConversionPatternRewriter &rewriter, ConversionPatternRewriterImpl &rewriterImpl, std::optional>> &inverseMapping) { - if (rewriterImpl.unresolvedMaterializations.empty()) - return success(); inverseMapping = rewriterImpl.mapping.getInverse(); // As an initial step, compute all of the inserted materializations that we // expect to persist beyond the conversion process. 
- DenseMap materializationOps; - SetVector necessaryMaterializations; + DenseMap materializationOps; + SetVector necessaryMaterializations; computeNecessaryMaterializations(materializationOps, rewriter, rewriterImpl, *inverseMapping, necessaryMaterializations); From b13c8e5099ec7886fcd198b1f6aec14f928c963c Mon Sep 17 00:00:00 2001 From: Daniel Kiss Date: Fri, 23 Feb 2024 10:20:54 +0100 Subject: [PATCH 142/546] Revert "[llvm][AArch64] Autoupgrade function attributes from Module attributes. (#80640)" This reverts commit 531e8c26b3f2626e7f1a997e0e8b61d67d10aded. --- llvm/include/llvm/IR/AutoUpgrade.h | 3 +- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 +- llvm/lib/IR/AutoUpgrade.cpp | 72 +------------------ llvm/lib/Linker/IRMover.cpp | 4 -- .../test/Bitcode/upgrade-arc-runtime-calls.ll | 4 +- .../AArch64/link-branch-target-enforcement.ll | 1 - .../LTO/AArch64/link-sign-return-address.ll | 43 ----------- llvm/test/Linker/link-arm-and-thumb.ll | 7 +- 8 files changed, 8 insertions(+), 128 deletions(-) delete mode 100644 llvm/test/LTO/AArch64/link-sign-return-address.ll diff --git a/llvm/include/llvm/IR/AutoUpgrade.h b/llvm/include/llvm/IR/AutoUpgrade.h index c0d96efc54752..152f781ffa9b3 100644 --- a/llvm/include/llvm/IR/AutoUpgrade.h +++ b/llvm/include/llvm/IR/AutoUpgrade.h @@ -67,8 +67,7 @@ namespace llvm { void UpgradeSectionAttributes(Module &M); /// Correct any IR that is relying on old function attribute behavior. - void UpgradeFunctionAttributes(Function &F, - bool ModuleMetadataIsMaterialized = false); + void UpgradeFunctionAttributes(Function &F); /// If the given TBAA tag uses the scalar TBAA format, create a new node /// corresponding to the upgrade to the struct-path aware TBAA format. 
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 8c860101afa02..832907a3f53f5 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -6706,7 +6706,7 @@ Error BitcodeReader::materialize(GlobalValue *GV) { } // Look for functions that rely on old function attribute behavior. - UpgradeFunctionAttributes(*F, true); + UpgradeFunctionAttributes(*F); // Bring in any functions that this function forward-referenced via // blockaddresses. diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index edff13c796b31..b90bbe71ac189 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -5155,46 +5155,7 @@ struct StrictFPUpgradeVisitor : public InstVisitor { }; } // namespace -// Check if the module attribute is present and not zero. -static bool isModuleAttributeSet(const Module *M, const StringRef &ModAttr) { - const auto *Attr = - mdconst::extract_or_null(M->getModuleFlag(ModAttr)); - return Attr && Attr->getZExtValue(); -} - -// Copy an attribute from module to the function if exists. -// First value of the pair is used when the module attribute is not zero -// the second otherwise. -static void -CopyModuleAttributeToFunction(Function &F, StringRef FnAttrName, - StringRef ModAttrName, - std::pair Values) { - if (F.hasFnAttribute(FnAttrName)) - return; - F.addFnAttr(FnAttrName, isModuleAttributeSet(F.getParent(), ModAttrName) - ? Values.first - : Values.second); -} - -// Copy a boolean attribute from module to the function if exists. -// Module attribute treated false if zero otherwise true. -static void CopyModuleAttributeToFunction(Function &F, StringRef AttrName) { - CopyModuleAttributeToFunction( - F, AttrName, AttrName, - std::make_pair("true", "false")); -} - -// Copy an attribute from module to the function if exists. -// First value of the pair is used when the module attribute is not zero -// the second otherwise. 
-static void -CopyModuleAttributeToFunction(Function &F, StringRef AttrName, - std::pair Values) { - CopyModuleAttributeToFunction(F, AttrName, AttrName, Values); -} - -void llvm::UpgradeFunctionAttributes(Function &F, - bool ModuleMetadataIsMaterialized) { +void llvm::UpgradeFunctionAttributes(Function &F) { // If a function definition doesn't have the strictfp attribute, // convert any callsite strictfp attributes to nobuiltin. if (!F.isDeclaration() && !F.hasFnAttribute(Attribute::StrictFP)) { @@ -5206,37 +5167,6 @@ void llvm::UpgradeFunctionAttributes(Function &F, F.removeRetAttrs(AttributeFuncs::typeIncompatible(F.getReturnType())); for (auto &Arg : F.args()) Arg.removeAttrs(AttributeFuncs::typeIncompatible(Arg.getType())); - - if (!ModuleMetadataIsMaterialized) - return; - if (F.isDeclaration()) - return; - Module *M = F.getParent(); - if (!M) - return; - - Triple T(M->getTargetTriple()); - // Convert module level attributes to function level attributes because - // after merging modules the attributes might change and would have different - // effect on the functions as the original module would have. 
- if (T.isThumb() || T.isARM() || T.isAArch64()) { - if (!F.hasFnAttribute("sign-return-address")) { - StringRef SignType = "none"; - if (isModuleAttributeSet(M, "sign-return-address")) - SignType = "non-leaf"; - - if (isModuleAttributeSet(M, "sign-return-address-all")) - SignType = "all"; - - F.addFnAttr("sign-return-address", SignType); - } - CopyModuleAttributeToFunction(F, "branch-target-enforcement"); - CopyModuleAttributeToFunction(F, "branch-protection-pauth-lr"); - CopyModuleAttributeToFunction(F, "guarded-control-stack"); - CopyModuleAttributeToFunction( - F, "sign-return-address-key", - std::make_pair("b_key", "a_key")); - } } static bool isOldLoopArgument(Metadata *MD) { diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index 9f45ebc6eda01..37d21119447b9 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -1606,10 +1606,6 @@ Error IRLinker::run() { // Loop over all of the linked values to compute type mappings. computeTypeMapping(); - // Update function attributes before copying them to destation module. - for (Function &F : SrcM->getFunctionList()) - UpgradeFunctionAttributes(F, true); - std::reverse(Worklist.begin(), Worklist.end()); while (!Worklist.empty()) { GlobalValue *GV = Worklist.back(); diff --git a/llvm/test/Bitcode/upgrade-arc-runtime-calls.ll b/llvm/test/Bitcode/upgrade-arc-runtime-calls.ll index d2edec18d55e5..19f25f98953fa 100644 --- a/llvm/test/Bitcode/upgrade-arc-runtime-calls.ll +++ b/llvm/test/Bitcode/upgrade-arc-runtime-calls.ll @@ -55,7 +55,7 @@ unwindBlock: // Check that auto-upgrader converts function calls to intrinsic calls. Note that // the auto-upgrader doesn't touch invoke instructions. 
-// ARC: define void @testRuntimeCalls(ptr %[[A:.*]], ptr %[[B:.*]], ptr %[[C:.*]], ptr %[[D:.*]], ptr %[[E:.*]]) #0 personality +// ARC: define void @testRuntimeCalls(ptr %[[A:.*]], ptr %[[B:.*]], ptr %[[C:.*]], ptr %[[D:.*]], ptr %[[E:.*]]) personality // ARC: %[[V0:.*]] = tail call ptr @llvm.objc.autorelease(ptr %[[A]]) // ARC-NEXT: tail call void @llvm.objc.autoreleasePoolPop(ptr %[[A]]) // ARC-NEXT: %[[V1:.*]] = tail call ptr @llvm.objc.autoreleasePoolPush() @@ -88,7 +88,7 @@ unwindBlock: // ARC-NEXT: tail call void @llvm.objc.arc.annotation.bottomup.bbend(ptr %[[B]], ptr %[[C]]) // ARC-NEXT: invoke void @objc_autoreleasePoolPop(ptr %[[A]]) -// NOUPGRADE: define void @testRuntimeCalls(ptr %[[A:.*]], ptr %[[B:.*]], ptr %[[C:.*]], ptr %[[D:.*]], ptr %[[E:.*]]) #0 personality +// NOUPGRADE: define void @testRuntimeCalls(ptr %[[A:.*]], ptr %[[B:.*]], ptr %[[C:.*]], ptr %[[D:.*]], ptr %[[E:.*]]) personality // NOUPGRADE: %[[V0:.*]] = tail call ptr @objc_autorelease(ptr %[[A]]) // NOUPGRADE-NEXT: tail call void @objc_autoreleasePoolPop(ptr %[[A]]) // NOUPGRADE-NEXT: %[[V1:.*]] = tail call ptr @objc_autoreleasePoolPush() diff --git a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll index 74d9c86881d52..ccf8cf67ede6d 100644 --- a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll +++ b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll @@ -32,7 +32,6 @@ entry: ; CHECK-DUMP:
: ; CHECK-DUMP: bl 0x8 ; CHECK-DUMP: : -; CHECK-DUMP: paciasp ; `main` doesn't support BTI while `foo` does, so in the binary ; we should see only PAC which is supported by both. diff --git a/llvm/test/LTO/AArch64/link-sign-return-address.ll b/llvm/test/LTO/AArch64/link-sign-return-address.ll deleted file mode 100644 index c25857ceed7b4..0000000000000 --- a/llvm/test/LTO/AArch64/link-sign-return-address.ll +++ /dev/null @@ -1,43 +0,0 @@ -; Testcase to check that module with different branch-target-enforcement can -; be mixed. -; -; RUN: llvm-as %s -o %t1.bc -; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc -; RUN: llvm-lto -exported-symbol main \ -; RUN: -exported-symbol foo \ -; RUN: -filetype=obj \ -; RUN: %t2.bc %t1.bc \ -; RUN: -o %t1.exe 2>&1 -; RUN: llvm-objdump -d %t1.exe | FileCheck --check-prefix=CHECK-DUMP %s -; RUN: llvm-readelf -n %t1.exe | FileCheck --allow-empty --check-prefix=CHECK-PROP %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-unknown-linux-gnu" - -declare i32 @foo(); - -define i32 @main() { -entry: - %add = call i32 @foo() - ret i32 %add -} - -!llvm.module.flags = !{!0, !1, !2, !3 } -!0 = !{i32 8, !"branch-target-enforcement", i32 0} -!1 = !{i32 8, !"sign-return-address", i32 0} -!2 = !{i32 8, !"sign-return-address-all", i32 0} -!3 = !{i32 8, !"sign-return-address-with-bkey", i32 0} - -; CHECK-DUMP: : -; CHECK-DUMP: paciasp -; CHECK-DUMP: mov w0, #0x2a -; CHECK-DUMP: autiasp -; CHECK-DUMP: ret -; CHECK-DUMP:
: -; CHECK-DUMP-NOT: paciasp -; CHECK-DUMP: str x30, -; CHECK-DUMP: bl 0x14 - -; `main` doesn't support PAC sign-return-address while `foo` does, so in the binary -; we should not see anything. -; CHECK-PROP-NOT: Properties: aarch64 feature: PAC \ No newline at end of file diff --git a/llvm/test/Linker/link-arm-and-thumb.ll b/llvm/test/Linker/link-arm-and-thumb.ll index 37bd8c37f8b5e..a90f2128e4430 100644 --- a/llvm/test/Linker/link-arm-and-thumb.ll +++ b/llvm/test/Linker/link-arm-and-thumb.ll @@ -13,12 +13,11 @@ entry: ret i32 %add } -; CHECK: define i32 @main() [[MAIN_ATTRS:#[0-9]+]] +; CHECK: define i32 @main() { ; CHECK: define i32 @foo(i32 %a, i32 %b) [[ARM_ATTRS:#[0-9]+]] ; CHECK: define i32 @bar(i32 %a, i32 %b) [[THUMB_ATTRS:#[0-9]+]] -; CHECK: attributes [[MAIN_ATTRS]] = { {{.*}} } -; CHECK: attributes [[ARM_ATTRS]] = { {{.*}} "target-features"="-thumb-mode" } -; CHECK: attributes [[THUMB_ATTRS]] = { {{.*}} "target-features"="+thumb-mode" } +; CHECK: attributes [[ARM_ATTRS]] = { "target-features"="-thumb-mode" } +; CHECK: attributes [[THUMB_ATTRS]] = { "target-features"="+thumb-mode" } ; STDERR-NOT: warning: Linking two modules of different target triples: From 2ae8bee8f11f8d5cc26cf6b4bb71001706ca0104 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Fri, 23 Feb 2024 10:28:58 +0100 Subject: [PATCH 143/546] [ARM][GlobalISel] Remove legacy legalizer rules (#82619) I've been looking at LegacyLegalizerInfo and what its place in GISel is. It seems like it's very close to being deleted so I'm checking if we can remove the last remaining uses of it. Looks like we can do a drop-in replacement with the new legalizer for ARM. 
--- llvm/lib/Target/ARM/ARMLegalizerInfo.cpp | 56 ++++-------------------- 1 file changed, 9 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp index c5199aab75272..00a29f8ecb232 100644 --- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -25,42 +25,6 @@ using namespace llvm; using namespace LegalizeActions; -/// FIXME: The following static functions are SizeChangeStrategy functions -/// that are meant to temporarily mimic the behaviour of the old legalization -/// based on doubling/halving non-legal types as closely as possible. This is -/// not entirly possible as only legalizing the types that are exactly a power -/// of 2 times the size of the legal types would require specifying all those -/// sizes explicitly. -/// In practice, not specifying those isn't a problem, and the below functions -/// should disappear quickly as we add support for legalizing non-power-of-2 -/// sized types further. 
-static void addAndInterleaveWithUnsupported( - LegacyLegalizerInfo::SizeAndActionsVec &result, - const LegacyLegalizerInfo::SizeAndActionsVec &v) { - for (unsigned i = 0; i < v.size(); ++i) { - result.push_back(v[i]); - if (i + 1 < v[i].first && i + 1 < v.size() && - v[i + 1].first != v[i].first + 1) - result.push_back({v[i].first + 1, LegacyLegalizeActions::Unsupported}); - } -} - -static LegacyLegalizerInfo::SizeAndActionsVec -widen_8_16(const LegacyLegalizerInfo::SizeAndActionsVec &v) { - assert(v.size() >= 1); - assert(v[0].first > 17); - LegacyLegalizerInfo::SizeAndActionsVec result = { - {1, LegacyLegalizeActions::Unsupported}, - {8, LegacyLegalizeActions::WidenScalar}, - {9, LegacyLegalizeActions::Unsupported}, - {16, LegacyLegalizeActions::WidenScalar}, - {17, LegacyLegalizeActions::Unsupported}}; - addAndInterleaveWithUnsupported(result, v); - auto Largest = result.back().first; - result.push_back({Largest + 1, LegacyLegalizeActions::Unsupported}); - return result; -} - static bool AEABI(const ARMSubtarget &ST) { return ST.isTargetAEABI() || ST.isTargetGNUAEABI() || ST.isTargetMuslAEABI(); } @@ -118,15 +82,14 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { .libcallFor({s32}) .clampScalar(0, s32, s32); - for (unsigned Op : {G_SREM, G_UREM}) { - LegacyInfo.setLegalizeScalarToDifferentSizeStrategy(Op, 0, widen_8_16); - if (HasHWDivide) - LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Lower); - else if (AEABI(ST)) - LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Custom); - else - LegacyInfo.setAction({Op, s32}, LegacyLegalizeActions::Libcall); - } + auto &REMBuilder = + getActionDefinitionsBuilder({G_SREM, G_UREM}).minScalar(0, s32); + if (HasHWDivide) + REMBuilder.lowerFor({s32}); + else if (AEABI(ST)) + REMBuilder.customFor({s32}); + else + REMBuilder.libcallFor({s32}); getActionDefinitionsBuilder(G_INTTOPTR) .legalFor({{p0, s32}}) @@ -202,8 +165,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { 
LoadStoreBuilder.maxScalar(0, s32); - for (auto Ty : {s32, s64}) - LegacyInfo.setAction({G_FNEG, Ty}, LegacyLegalizeActions::Lower); + getActionDefinitionsBuilder(G_FNEG).lowerFor({s32, s64}); getActionDefinitionsBuilder(G_FCONSTANT).customFor({s32, s64}); From bbdc62e7180168effd0c480979bdaf933d0615d1 Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Fri, 23 Feb 2024 09:29:45 +0000 Subject: [PATCH 144/546] [AArch64][CostModel] Improve scalar frem cost (#80423) In AArch64 the cost of scalar frem is the cost of a call to 'fmod'. --- .../AArch64/AArch64TargetTransformInfo.cpp | 7 ++ .../CostModel/AArch64/arith-fp-frem.ll | 68 +++++++++---------- .../Analysis/CostModel/AArch64/arith-fp.ll | 22 +++--- 3 files changed, 52 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 6655931181c2d..010e569809e27 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2972,6 +2972,13 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info); + case ISD::FREM: + // Pass nullptr as fmod/fmodf calls are emitted by the backend even when + // those functions are not declared in the module. 
+ if (!Ty->isVectorTy()) + return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind); + return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, + Op2Info); } } diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp-frem.ll b/llvm/test/Analysis/CostModel/AArch64/arith-fp-frem.ll index 20e0ef7ea3428..63149adfa2158 100644 --- a/llvm/test/Analysis/CostModel/AArch64/arith-fp-frem.ll +++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp-frem.ll @@ -22,44 +22,44 @@ target triple = "aarch64-unknown-linux-gnu" define void @frem_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; NEON-NO-VECLIB-LABEL: 'frem_f64' -; NEON-NO-VECLIB: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in -; NEON-NO-VECLIB: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in +; NEON-NO-VECLIB: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in +; NEON-NO-VECLIB: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem double %in, %in ; ; SVE-NO-VECLIB-LABEL: 'frem_f64' -; SVE-NO-VECLIB: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in -; SVE-NO-VECLIB: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in +; SVE-NO-VECLIB: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in +; SVE-NO-VECLIB: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem double %in, %in ; SVE-NO-VECLIB: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in ; SVE-NO-VECLIB: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem double %in, %in ; ; NEON-ARMPL-LABEL: 'frem_f64' -; NEON-ARMPL: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in -; NEON-ARMPL: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in +; 
NEON-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in +; NEON-ARMPL: LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in ; ; NEON-SLEEF-LABEL: 'frem_f64' -; NEON-SLEEF: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in -; NEON-SLEEF: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in +; NEON-SLEEF: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in +; NEON-SLEEF: LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in ; ; SVE-ARMPL-LABEL: 'frem_f64' -; SVE-ARMPL: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in -; SVE-ARMPL: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in +; SVE-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in +; SVE-ARMPL: LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in ; SVE-ARMPL: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in ; SVE-ARMPL: LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in ; ; SVE-SLEEF-LABEL: 'frem_f64' -; SVE-SLEEF: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in -; SVE-SLEEF: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in +; SVE-SLEEF: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in +; SVE-SLEEF: LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in ; SVE-SLEEF: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in ; SVE-SLEEF: LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in ; ; 
SVE-ARMPL-TAILFOLD-LABEL: 'frem_f64' -; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in -; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in +; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in +; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in ; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in ; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in ; ; SVE-SLEEF-TAILFOLD-LABEL: 'frem_f64' -; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem double %in, %in -; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem double %in, %in +; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem double %in, %in +; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 10 for VF 2 For instruction: %res = frem double %in, %in ; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem double %in, %in ; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 10 for VF vscale x 2 For instruction: %res = frem double %in, %in ; @@ -83,55 +83,55 @@ define void @frem_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { define void @frem_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; NEON-NO-VECLIB-LABEL: 'frem_f32' -; NEON-NO-VECLIB: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in -; NEON-NO-VECLIB: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in -; NEON-NO-VECLIB: LV: Found an estimated cost of 20 for VF 4 For instruction: %res = frem float %in, %in +; NEON-NO-VECLIB: LV: Found an estimated 
cost of 10 for VF 1 For instruction: %res = frem float %in, %in +; NEON-NO-VECLIB: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in +; NEON-NO-VECLIB: LV: Found an estimated cost of 52 for VF 4 For instruction: %res = frem float %in, %in ; ; SVE-NO-VECLIB-LABEL: 'frem_f32' -; SVE-NO-VECLIB: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in -; SVE-NO-VECLIB: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in -; SVE-NO-VECLIB: LV: Found an estimated cost of 20 for VF 4 For instruction: %res = frem float %in, %in +; SVE-NO-VECLIB: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in +; SVE-NO-VECLIB: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in +; SVE-NO-VECLIB: LV: Found an estimated cost of 52 for VF 4 For instruction: %res = frem float %in, %in ; SVE-NO-VECLIB: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in ; SVE-NO-VECLIB: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in ; SVE-NO-VECLIB: LV: Found an estimated cost of Invalid for VF vscale x 4 For instruction: %res = frem float %in, %in ; ; NEON-ARMPL-LABEL: 'frem_f32' -; NEON-ARMPL: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in -; NEON-ARMPL: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in +; NEON-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in +; NEON-ARMPL: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in ; NEON-ARMPL: LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in ; ; NEON-SLEEF-LABEL: 'frem_f32' -; NEON-SLEEF: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in -; NEON-SLEEF: LV: Found an 
estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in +; NEON-SLEEF: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in +; NEON-SLEEF: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in ; NEON-SLEEF: LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in ; ; SVE-ARMPL-LABEL: 'frem_f32' -; SVE-ARMPL: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in -; SVE-ARMPL: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in +; SVE-ARMPL: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in +; SVE-ARMPL: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in ; SVE-ARMPL: LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in ; SVE-ARMPL: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in ; SVE-ARMPL: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in ; SVE-ARMPL: LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in ; ; SVE-SLEEF-LABEL: 'frem_f32' -; SVE-SLEEF: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in -; SVE-SLEEF: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in +; SVE-SLEEF: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in +; SVE-SLEEF: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in ; SVE-SLEEF: LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in ; SVE-SLEEF: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in ; SVE-SLEEF: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem 
float %in, %in ; SVE-SLEEF: LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in ; ; SVE-ARMPL-TAILFOLD-LABEL: 'frem_f32' -; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in -; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in +; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in +; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in ; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in ; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in ; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in ; SVE-ARMPL-TAILFOLD: LV: Found an estimated cost of 10 for VF vscale x 4 For instruction: %res = frem float %in, %in ; ; SVE-SLEEF-TAILFOLD-LABEL: 'frem_f32' -; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 2 for VF 1 For instruction: %res = frem float %in, %in -; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 8 for VF 2 For instruction: %res = frem float %in, %in +; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 10 for VF 1 For instruction: %res = frem float %in, %in +; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 24 for VF 2 For instruction: %res = frem float %in, %in ; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of 10 for VF 4 For instruction: %res = frem float %in, %in ; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %res = frem float %in, %in ; SVE-SLEEF-TAILFOLD: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %res = frem float %in, %in diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll 
b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll index c352892354fc2..497ade4f2f613 100644 --- a/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll +++ b/llvm/test/Analysis/CostModel/AArch64/arith-fp.ll @@ -197,17 +197,17 @@ define i32 @fdiv(i32 %arg) { define i32 @frem(i32 %arg) { ; CHECK-LABEL: 'frem' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F16 = frem <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8F16 = frem <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V16F16 = frem <16 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = frem <2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = frem <4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = frem <2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = frem <4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F16 = frem half undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4F16 = frem <4 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8F16 = frem <8 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %V16F16 = frem <16 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 10 for instruction: %F32 = frem float undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2F32 = frem <2 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4F32 = frem <4 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8F32 = frem <8 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = frem double undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2F64 = frem <2 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4F64 = frem <4 x double> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %F16 = frem half undef, undef From 335d34d9eae8c943e2164373c7eab1e450eaf435 Mon Sep 17 00:00:00 2001 From: Tobias Gysi Date: Fri, 23 Feb 2024 10:30:19 +0100 Subject: [PATCH 145/546] [MLIR][LLVM] Fix debug intrinsic import (#82637) This revision handles the case that the translation of a scope fails due to cyclic metadata. This mainly affects the import of debug intrinsics that indirectly take such a scope as metadata argument (e.g. via local variable or label metadata). This commit ensures we drop intrinsics with such a dependency on cyclic metadata. 
--- .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td | 6 ++- mlir/lib/Target/LLVMIR/DebugImporter.cpp | 35 +++++++++++----- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 9 ++++- .../Target/LLVMIR/Import/import-failure.ll | 40 +++++++++++++++++-- 4 files changed, 73 insertions(+), 17 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index feb3578fe2d49..b88f1186a44b4 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -513,7 +513,11 @@ def LLVM_DbgLabelOp : LLVM_IntrOp<"dbg.label", [], [], [], 0> { }); }]; let mlirBuilder = [{ - $_op = $_builder.create<$_qualCppClassName>($_location, $_label_attr($label)); + DILabelAttr labelAttr = $_label_attr($label); + // Drop the intrinsic if the label translation fails due to cylic metadata. + if (!labelAttr) + return success(); + $_op = $_builder.create<$_qualCppClassName>($_location, labelAttr); }]; let assemblyFormat = "$label attr-dict"; } diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.cpp b/mlir/lib/Target/LLVMIR/DebugImporter.cpp index 6521295230091..c631617f97354 100644 --- a/mlir/lib/Target/LLVMIR/DebugImporter.cpp +++ b/mlir/lib/Target/LLVMIR/DebugImporter.cpp @@ -99,21 +99,31 @@ DIFileAttr DebugImporter::translateImpl(llvm::DIFile *node) { } DILabelAttr DebugImporter::translateImpl(llvm::DILabel *node) { - return DILabelAttr::get(context, translate(node->getScope()), + // Return nullptr if the scope or type is a cyclic dependency. 
+ DIScopeAttr scope = translate(node->getScope()); + if (node->getScope() && !scope) + return nullptr; + return DILabelAttr::get(context, scope, getStringAttrOrNull(node->getRawName()), translate(node->getFile()), node->getLine()); } DILexicalBlockAttr DebugImporter::translateImpl(llvm::DILexicalBlock *node) { - return DILexicalBlockAttr::get(context, translate(node->getScope()), - translate(node->getFile()), node->getLine(), - node->getColumn()); + // Return nullptr if the scope or type is a cyclic dependency. + DIScopeAttr scope = translate(node->getScope()); + if (node->getScope() && !scope) + return nullptr; + return DILexicalBlockAttr::get(context, scope, translate(node->getFile()), + node->getLine(), node->getColumn()); } DILexicalBlockFileAttr DebugImporter::translateImpl(llvm::DILexicalBlockFile *node) { - return DILexicalBlockFileAttr::get(context, translate(node->getScope()), - translate(node->getFile()), + // Return nullptr if the scope or type is a cyclic dependency. + DIScopeAttr scope = translate(node->getScope()); + if (node->getScope() && !scope) + return nullptr; + return DILexicalBlockFileAttr::get(context, scope, translate(node->getFile()), node->getDiscriminator()); } @@ -135,11 +145,14 @@ DebugImporter::translateImpl(llvm::DIGlobalVariable *node) { } DILocalVariableAttr DebugImporter::translateImpl(llvm::DILocalVariable *node) { - return DILocalVariableAttr::get(context, translate(node->getScope()), - getStringAttrOrNull(node->getRawName()), - translate(node->getFile()), node->getLine(), - node->getArg(), node->getAlignInBits(), - translate(node->getType())); + // Return nullptr if the scope or type is a cyclic dependency. 
+ DIScopeAttr scope = translate(node->getScope()); + if (node->getScope() && !scope) + return nullptr; + return DILocalVariableAttr::get( + context, scope, getStringAttrOrNull(node->getRawName()), + translate(node->getFile()), node->getLine(), node->getArg(), + node->getAlignInBits(), translate(node->getType())); } DIScopeAttr DebugImporter::translateImpl(llvm::DIScope *node) { diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index 97ccb2b29f3ae..d63ea12ecd49b 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1966,6 +1966,13 @@ ModuleImport::processDebugIntrinsic(llvm::DbgVariableIntrinsic *dbgIntr, // TODO: find a way to support this case. if (isMetadataKillLocation(dbgIntr)) return emitUnsupportedWarning(); + // Drop debug intrinsics if the associated variable information cannot be + // translated due to cyclic debug metadata. + // TODO: Support cyclic debug metadata. + DILocalVariableAttr localVariableAttr = + matchLocalVariableAttr(dbgIntr->getArgOperand(1)); + if (!localVariableAttr) + return emitUnsupportedWarning(); FailureOr argOperand = convertMetadataValue(dbgIntr->getArgOperand(0)); if (failed(argOperand)) return emitError(loc) << "failed to convert a debug intrinsic operand: " @@ -1991,8 +1998,6 @@ ModuleImport::processDebugIntrinsic(llvm::DbgVariableIntrinsic *dbgIntr, } else { builder.setInsertionPointAfterValue(*argOperand); } - DILocalVariableAttr localVariableAttr = - matchLocalVariableAttr(dbgIntr->getArgOperand(1)); auto locationExprAttr = debugImporter->translateExpression(dbgIntr->getExpression()); Operation *op = diff --git a/mlir/test/Target/LLVMIR/Import/import-failure.ll b/mlir/test/Target/LLVMIR/Import/import-failure.ll index 0962134665663..9a4e939d10651 100644 --- a/mlir/test/Target/LLVMIR/Import/import-failure.ll +++ b/mlir/test/Target/LLVMIR/Import/import-failure.ll @@ -59,13 +59,15 @@ define void @unhandled_intrinsic() gc "example" { ; // 
----- +; Check that debug intrinsics with an unsupported argument are dropped. + declare void @llvm.dbg.value(metadata, metadata, metadata) ; CHECK: import-failure.ll -; CHECK-SAME: warning: dropped intrinsic: call void @llvm.dbg.value(metadata !DIArgList(i64 %arg1, i64 undef), metadata !3, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_constu, 1, DW_OP_mul, DW_OP_plus, DW_OP_stack_value)), !dbg !5 +; CHECK-SAME: warning: dropped intrinsic: call void @llvm.dbg.value(metadata !DIArgList(i64 %{{.*}}, i64 undef), metadata !3, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_constu, 1, DW_OP_mul, DW_OP_plus, DW_OP_stack_value)) ; CHECK: import-failure.ll -; CHECK-SAME: warning: dropped intrinsic: call void @llvm.dbg.value(metadata !6, metadata !3, metadata !DIExpression()), !dbg !5 -define void @dropped_instruction(i64 %arg1) { +; CHECK-SAME: warning: dropped intrinsic: call void @llvm.dbg.value(metadata !6, metadata !3, metadata !DIExpression()) +define void @unsupported_argument(i64 %arg1) { call void @llvm.dbg.value(metadata !DIArgList(i64 %arg1, i64 undef), metadata !3, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_constu, 1, DW_OP_mul, DW_OP_plus, DW_OP_stack_value)), !dbg !5 call void @llvm.dbg.value(metadata !6, metadata !3, metadata !DIExpression()), !dbg !5 ret void @@ -83,6 +85,38 @@ define void @dropped_instruction(i64 %arg1) { ; // ----- +; Check that debug intrinsics that depend on cyclic metadata are dropped. 
+ +declare void @llvm.dbg.value(metadata, metadata, metadata) + +; CHECK: import-failure.ll +; CHECK-SAME: warning: dropped instruction: call void @llvm.dbg.label(metadata !{{.*}}) +; CHECK: import-failure.ll +; CHECK-SAME: warning: dropped intrinsic: call void @llvm.dbg.value(metadata i64 %{{.*}}, metadata !3, metadata !DIExpression()) +define void @cylic_metadata(i64 %arg1) { + call void @llvm.dbg.value(metadata i64 %arg1, metadata !10, metadata !DIExpression()), !dbg !14 + call void @llvm.dbg.label(metadata !13), !dbg !14 + ret void +} + +!llvm.dbg.cu = !{!1} +!llvm.module.flags = !{!0} +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2) +!2 = !DIFile(filename: "import-failure.ll", directory: "/") +!3 = !DICompositeType(tag: DW_TAG_array_type, size: 42, baseType: !4) +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !3) +!5 = distinct !DISubprogram(name: "class_method", scope: !2, file: !2, type: !6, spFlags: DISPFlagDefinition, unit: !1) +!6 = !DISubroutineType(types: !7) +!7 = !{!3} +!10 = !DILocalVariable(scope: !5, name: "arg1", file: !2, line: 1, arg: 1, align: 64); +!11 = !DILexicalBlock(scope: !5) +!12 = !DILexicalBlockFile(scope: !11, discriminator: 0) +!13 = !DILabel(scope: !12, name: "label", file: !2, line: 42) +!14 = !DILocation(line: 1, column: 2, scope: !5) + +; // ----- + ; global_dtors with non-null data fields cannot be represented in MLIR. ; CHECK: ; CHECK-SAME: error: unhandled global variable: @llvm.global_dtors From a622b21f4607ee787c6fe63032a849c24374882b Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Feb 2024 10:31:55 +0100 Subject: [PATCH 146/546] [mlir][Transforms] Make `ConversionPatternRewriter` constructor private (#82244) `ConversionPatternRewriter` objects should not be constructed outside of dialect conversions. 
Some IR modifications performed through a `ConversionPatternRewriter` are reflected in the IR in a delayed fashion (e.g., only when the dialect conversion is guaranteed to succeed). Using a `ConversionPatternRewriter` outside of the dialect conversion is incorrect API usage and can bring the IR in an inconsistent state. Migration guide: Use `IRRewriter` instead of `ConversionPatternRewriter`. --- flang/lib/Frontend/FrontendActions.cpp | 2 +- .../mlir/Transforms/DialectConversion.h | 10 +++++++++- .../lib/Transforms/Utils/DialectConversion.cpp | 18 +++++++++++------- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index 44e80e946ed83..849b3c8e4dc02 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -177,7 +177,7 @@ static void addAMDGPUSpecificMLIRItems(mlir::ModuleOp &mlirModule, return; } - mlir::ConversionPatternRewriter builder(mlirModule.getContext()); + mlir::IRRewriter builder(mlirModule.getContext()); unsigned oclcABIVERsion = codeGenOpts.CodeObjectVersion; auto int32Type = builder.getI32Type(); diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 2575be4cdea1a..5c91a9498b35d 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -27,6 +27,7 @@ class Block; class ConversionPatternRewriter; class MLIRContext; class Operation; +struct OperationConverter; class Type; class Value; @@ -657,7 +658,6 @@ struct ConversionPatternRewriterImpl; /// hooks. class ConversionPatternRewriter final : public PatternRewriter { public: - explicit ConversionPatternRewriter(MLIRContext *ctx); ~ConversionPatternRewriter() override; /// Apply a signature conversion to the entry block of the given region. 
This @@ -764,6 +764,14 @@ class ConversionPatternRewriter final : public PatternRewriter { detail::ConversionPatternRewriterImpl &getImpl(); private: + // Allow OperationConverter to construct new rewriters. + friend struct OperationConverter; + + /// Conversion pattern rewriters must not be used outside of dialect + /// conversions. They apply some IR rewrites in a delayed fashion and could + /// bring the IR into an inconsistent state when used standalone. + explicit ConversionPatternRewriter(MLIRContext *ctx); + // Hide unsupported pattern rewriter API. using OpBuilder::setListener; diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 635a2cb00f388..2cdbfb78faf27 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -627,9 +627,11 @@ class ReplaceOperationRewrite : public OperationRewrite { void cleanup() override; -private: - friend struct OperationConverter; + const TypeConverter *getConverter() const { return converter; } + + bool hasChangedResults() const { return changedResults; } +private: /// An optional type converter that can be used to materialize conversions /// between the new and old values if necessary. const TypeConverter *converter; @@ -2387,7 +2389,9 @@ enum OpConversionMode { /// applied to the operations on success. Analysis, }; +} // namespace +namespace mlir { // This class converts operations to a given conversion target via a set of // rewrite patterns. The conversion behaves differently depending on the // conversion mode. @@ -2447,7 +2451,7 @@ struct OperationConverter { /// *not* to be legalizable to the target. 
DenseSet *trackedOps; }; -} // namespace +} // namespace mlir LogicalResult OperationConverter::convert(ConversionPatternRewriter &rewriter, Operation *op) { @@ -2539,7 +2543,7 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) { for (unsigned i = 0; i < rewriterImpl.rewrites.size(); ++i) { auto *opReplacement = dyn_cast(rewriterImpl.rewrites[i].get()); - if (!opReplacement || !opReplacement->changedResults) + if (!opReplacement || !opReplacement->hasChangedResults()) continue; Operation *op = opReplacement->getOperation(); for (OpResult result : op->getResults()) { @@ -2563,9 +2567,9 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) { // Legalize this result. rewriter.setInsertionPoint(op); - if (failed(legalizeChangedResultType(op, result, newValue, - opReplacement->converter, rewriter, - rewriterImpl, *inverseMapping))) + if (failed(legalizeChangedResultType( + op, result, newValue, opReplacement->getConverter(), rewriter, + rewriterImpl, *inverseMapping))) return failure(); } } From b39f5660a408b47307e57a0882eb8af85d72e283 Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Fri, 23 Feb 2024 09:42:08 +0000 Subject: [PATCH 147/546] [mlir][ArmSME] Add test-lower-to-arm-sme pipeline (#81732) The ArmSME compilation pipeline has evolved significantly and is now sufficiently complex enough that it warrants a proper lowering pipeline that encapsulates the various passes and orderings. Currently the pipeline is loosely defined in our integration tests, but these have diverged and are not using the same passes or ordering everywhere. This patch introduces a test-lower-to-arm-sme pipeline mirroring test-lower-to-llvm that provides some sanity when running e2e examples and can be used a reference for targeting ArmSME in MLIR. All the integration tests are updated to use this pipeline. The intention is to productize the pipeline once it becomes more mature. 
--- .../Dialect/Linalg/CPU/ArmSME/fill-2d.mlir | 9 +- .../Linalg/CPU/ArmSME/matmul-transpose-a.mlir | 9 +- .../Dialect/Linalg/CPU/ArmSME/matmul.mlir | 8 +- .../Linalg/CPU/ArmSME/multi-tile-matmul.mlir | 6 +- .../Linalg/CPU/ArmSME/use-too-many-tiles.mlir | 7 +- .../CPU/ArmSME/load-store-128-bit-tile.mlir | 6 +- .../Vector/CPU/ArmSME/test-load-vertical.mlir | 6 +- .../CPU/ArmSME/test-multi-tile-transpose.mlir | 8 +- .../ArmSME/test-outerproduct-f16f16f32.mlir | 10 +- .../CPU/ArmSME/test-outerproduct-f32.mlir | 6 +- .../CPU/ArmSME/test-outerproduct-f64.mlir | 6 +- .../CPU/ArmSME/test-outerproduct-i8i8i32.mlir | 8 +- .../CPU/ArmSME/test-transfer-read-2d.mlir | 6 +- .../CPU/ArmSME/test-transfer-write-2d.mlir | 7 +- .../Vector/CPU/ArmSME/test-transpose.mlir | 6 +- .../Dialect/Vector/CPU/ArmSME/tile_fill.mlir | 6 +- .../Vector/CPU/ArmSME/vector-load-store.mlir | 6 +- .../Dialect/Vector/CPU/ArmSME/vector-ops.mlir | 5 +- mlir/test/lib/Dialect/ArmSME/CMakeLists.txt | 16 +++ .../lib/Dialect/ArmSME/TestLowerToArmSME.cpp | 99 +++++++++++++++++++ mlir/test/lib/Dialect/CMakeLists.txt | 1 + mlir/tools/mlir-opt/CMakeLists.txt | 1 + mlir/tools/mlir-opt/mlir-opt.cpp | 2 + 23 files changed, 141 insertions(+), 103 deletions(-) create mode 100644 mlir/test/lib/Dialect/ArmSME/CMakeLists.txt create mode 100644 mlir/test/lib/Dialect/ArmSME/TestLowerToArmSME.cpp diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir index 44ff1afe76d38..12f13e8dbc4a9 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir @@ -1,13 +1,8 @@ // RUN: mlir-opt %s \ -// RUN: -transform-interpreter \ -// RUN: -test-transform-dialect-erase-schedule \ +// RUN: -transform-interpreter -test-transform-dialect-erase-schedule \ // RUN: -lower-vector-mask \ // RUN: -one-shot-bufferize="bufferize-function-boundaries" \ -// RUN: 
-enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// RUN: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ -// RUN: -allocate-arm-sme-tiles -convert-arm-sme-to-scf \ -// RUN: -convert-arm-sme-to-llvm -cse -canonicalize \ -// RUN: -test-lower-to-llvm | \ +// RUN: -test-lower-to-arm-sme -test-lower-to-llvm | \ // RUN: %mcr_aarch64_cmd \ // RUN: -e=entry -entry-point-result=void \ // RUN: -march=aarch64 -mattr="+sve,+sme" \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir index c781d5e0af846..34c5351c8703d 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul-transpose-a.mlir @@ -1,12 +1,7 @@ // RUN: mlir-opt %s \ // RUN: -transform-interpreter -test-transform-dialect-erase-schedule \ -// RUN: -one-shot-bufferize="bufferize-function-boundaries" -canonicalize \ -// RUN: -convert-vector-to-arm-sme -allocate-arm-sme-tiles -convert-arm-sme-to-scf \ -// RUN: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// RUN: -convert-vector-to-scf -cse -arm-sve-legalize-vector-storage \ -// RUN: -convert-arm-sme-to-llvm \ -// RUN: -convert-vector-to-llvm=enable-arm-sve \ -// RUN: -cse -canonicalize -test-lower-to-llvm | \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ +// RUN: -test-lower-to-arm-sme -test-lower-to-llvm | \ // RUN: %mcr_aarch64_cmd \ // RUN: -e=main -entry-point-result=void \ // RUN: -march=aarch64 -mattr="+sve,+sme" \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir index 31c3202c3fc57..2bfdaa8e8a2be 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir @@ -1,12 +1,6 @@ // RUN: mlir-opt %s \ // RUN: 
-transform-interpreter -test-transform-dialect-erase-schedule \ -// RUN: -canonicalize \ -// RUN: -convert-vector-to-arm-sme -allocate-arm-sme-tiles -convert-arm-sme-to-scf \ -// RUN: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// RUN: -convert-vector-to-scf -cse -arm-sve-legalize-vector-storage \ -// RUN: -convert-arm-sme-to-llvm \ -// RUN: -convert-vector-to-llvm=enable-arm-sve \ -// RUN: -cse -canonicalize -test-lower-to-llvm | \ +// RUN: -test-lower-to-arm-sme -test-lower-to-llvm | \ // RUN: %mcr_aarch64_cmd \ // RUN: -e=main -entry-point-result=void \ // RUN: -march=aarch64 -mattr="+sve,+sme" \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir index d5c35068ccb32..e376bdde24a15 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/multi-tile-matmul.mlir @@ -1,11 +1,7 @@ // RUN: mlir-opt %s \ // RUN: -transform-interpreter -test-transform-dialect-erase-schedule \ // RUN: -one-shot-bufferize="bufferize-function-boundaries" -canonicalize \ -// RUN: -arm-sme-vector-legalization -canonicalize -cse \ -// RUN: -convert-vector-to-arm-sme -allocate-arm-sme-tiles -convert-arm-sme-to-scf \ -// RUN: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// RUN: -convert-vector-to-scf=full-unroll -convert-arm-sme-to-llvm \ -// RUN: -test-lower-to-llvm | \ +// RUN: -test-lower-to-arm-sme -test-lower-to-llvm | \ // RUN: %mcr_aarch64_cmd \ // RUN: -e=main -entry-point-result=void \ // RUN: -march=aarch64 -mattr="+sve,+sme" \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir index 42fe21cccd48a..ee3866de303e0 100644 --- 
a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir @@ -1,10 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ -// RUN: -allocate-arm-sme-tiles -convert-arm-sme-to-scf \ -// RUN: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// RUN: -convert-vector-to-scf -cse -arm-sve-legalize-vector-storage \ -// RUN: -convert-arm-sme-to-llvm -convert-vector-to-llvm=enable-arm-sve -cse \ -// RUN: -canonicalize -test-lower-to-llvm -verify-diagnostics | \ +// RUN: -test-lower-to-arm-sme -test-lower-to-llvm -verify-diagnostics | \ // RUN: %mcr_aarch64_cmd \ // RUN: -e=main -entry-point-result=void \ // RUN: -march=aarch64 -mattr="+sve,+sme" \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir index 59b4a7e6a52f9..06b1c107cb2c1 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir @@ -1,9 +1,5 @@ // DEFINE: %{entry_point} = test_load_store_zaq0 -// DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -allocate-arm-sme-tiles -test-lower-to-llvm +// DEFINE: %{compile} = mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm // DEFINE: %{run} = %mcr_aarch64_cmd \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry_point} -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir index 064141c349241..27be801252b81 100644 
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir @@ -1,9 +1,5 @@ // DEFINE: %{entry_point} = entry -// DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm +// DEFINE: %{compile} = mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm // DEFINE: %{run} = %mcr_aarch64_cmd \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry_point} -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir index 0827d9b7464ad..9d836d93c85bb 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir @@ -1,10 +1,4 @@ -// RUN: mlir-opt %s -arm-sme-vector-legalization -cse -canonicalize \ -// RUN: -convert-vector-to-arm-sme -allocate-arm-sme-tiles -convert-arm-sme-to-scf \ -// RUN: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// RUN: -convert-vector-to-scf -cse -arm-sve-legalize-vector-storage \ -// RUN: -convert-arm-sme-to-llvm \ -// RUN: -convert-vector-to-llvm=enable-arm-sve \ -// RUN: -cse -canonicalize -test-lower-to-llvm | \ +// RUN: mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm | \ // RUN: %mcr_aarch64_cmd \ // RUN: -e=main -entry-point-result=void \ // RUN: -march=aarch64 -mattr="+sve,+sme" \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f16f16f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f16f16f32.mlir index f081838300a9a..a06ad37b054e4 
100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f16f16f32.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f16f16f32.mlir @@ -1,11 +1,7 @@ +// DEFINE: %{opts} = // DEFINE: %{entry} = main -// DEFINE: %{fusion_opts} = -arm-sme-outer-product-fusion // DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -convert-vector-to-arm-sme -convert-arith-to-arm-sme %{fusion_opts} \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// DEFINE: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm -o %t +// DEFINE: -test-lower-to-arm-sme=%{opts} -test-lower-to-llvm -o %t // DEFINE: %{run} = %mcr_aarch64_cmd %t \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry} -entry-point-result=void \ @@ -18,7 +14,7 @@ // Check result is the same when outerproducts are not combined into widening // variant. 
-// REDEFINE: %{fusion_opts} = +// REDEFINE: %{opts} = fuse-outer-products=false // RUN: %{run} | FileCheck %s func.func @main() { diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir index 5f41b37560e76..7e7869d1c957a 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir @@ -1,10 +1,6 @@ // DEFINE: %{entry_point} = test_outerproduct_no_accumulator_4x4xf32 // DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ -// DEFINE: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm -o %t +// DEFINE: -test-lower-to-arm-sme -test-lower-to-llvm -o %t // DEFINE: %{run} = %mcr_aarch64_cmd %t \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry_point} -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir index a1bb9b7d6f80e..46bf799232ae3 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir @@ -1,10 +1,6 @@ // DEFINE: %{entry_point} = test_outerproduct_no_accumulator_2x2xf64 // DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ -// DEFINE: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm -o %t +// DEFINE: -test-lower-to-arm-sme -test-lower-to-llvm 
-o %t // DEFINE: %{run} = %mcr_aarch64_cmd %t \ // DEFINE: -march=aarch64 -mattr=+sve,+sme-f64f64 \ // DEFINE: -e %{entry_point} -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-i8i8i32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-i8i8i32.mlir index 1770e579f0bd6..9a353ec2d2f66 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-i8i8i32.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-i8i8i32.mlir @@ -1,11 +1,5 @@ // DEFINE: %{entry} = main -// DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ -// DEFINE: -arm-sme-outer-product-fusion \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// DEFINE: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm +// DEFINE: %{compile} = mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm // DEFINE: %{run} = %mcr_aarch64_cmd \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry} -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir index 6e028d5fb8361..52f56883cad9c 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir @@ -1,9 +1,5 @@ // DEFINE: %{entry_point} = entry -// DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm +// DEFINE: %{compile} = mlir-opt %s 
-test-lower-to-arm-sme -test-lower-to-llvm // DEFINE: %{run} = %mcr_aarch64_cmd \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry_point} -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir index c0c1f55d7ddd1..710cc6672f005 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir @@ -1,10 +1,5 @@ // DEFINE: %{entry_point} = entry -// DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ -// DEFINE: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm +// DEFINE: %{compile} = mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm // DEFINE: %{run} = %mcr_aarch64_cmd \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry_point} -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir index eee3c56351d81..88bc0d0709d48 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir @@ -1,9 +1,5 @@ // DEFINE: %{entry_point} = entry -// DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm +// DEFINE: %{compile} = mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm // DEFINE: %{run} = 
%mcr_aarch64_cmd \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry_point} -entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir index 223bc8ce74343..e14917486d845 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir @@ -1,8 +1,4 @@ -// RUN: mlir-opt %s -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// RUN: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ -// RUN: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// RUN: -convert-arm-sme-to-llvm -cse -canonicalize \ -// RUN: -test-lower-to-llvm | \ +// RUN: mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm | \ // RUN: %mcr_aarch64_cmd \ // RUN: -march=aarch64 -mattr=+sve,+sme \ // RUN: -e entry -entry-point-result=i32 \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir index 2f151e2ec72fb..b29790db14ddc 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir @@ -1,9 +1,5 @@ // DEFINE: %{entry_point} = za0_d_f64 -// DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ -// DEFINE: -test-lower-to-llvm +// DEFINE: %{compile} = mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm // DEFINE: %{run} = %mcr_aarch64_cmd \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry_point} -entry-point-result=i32 \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-ops.mlir 
b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-ops.mlir index f28bf19b29993..c8c401bed1446 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-ops.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-ops.mlir @@ -1,8 +1,5 @@ // DEFINE: %{entry_point} = entry -// DEFINE: %{compile} = mlir-opt %s -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ -// DEFINE: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ -// DEFINE: -convert-arm-sme-to-llvm -test-lower-to-llvm +// DEFINE: %{compile} = mlir-opt %s -test-lower-to-arm-sme -test-lower-to-llvm // DEFINE: %{run} = %mcr_aarch64_cmd \ // DEFINE: -march=aarch64 -mattr=+sve,+sme \ // DEFINE: -e %{entry_point} -entry-point-result=i32 \ diff --git a/mlir/test/lib/Dialect/ArmSME/CMakeLists.txt b/mlir/test/lib/Dialect/ArmSME/CMakeLists.txt new file mode 100644 index 0000000000000..de4971ff7eb3d --- /dev/null +++ b/mlir/test/lib/Dialect/ArmSME/CMakeLists.txt @@ -0,0 +1,16 @@ +# Exclude tests from libMLIR.so +add_mlir_library(MLIRArmSMETestPasses + TestLowerToArmSME.cpp + + EXCLUDE_FROM_LIBMLIR + + LINK_LIBS PUBLIC + MLIRArithToArmSME + MLIRArmSMEToLLVM + MLIRArmSMEToSCF + MLIRIR + MLIRPass + MLIRTransforms + MLIRVectorToArmSME + MLIRVectorToSCF + ) diff --git a/mlir/test/lib/Dialect/ArmSME/TestLowerToArmSME.cpp b/mlir/test/lib/Dialect/ArmSME/TestLowerToArmSME.cpp new file mode 100644 index 0000000000000..48d4a5859f8a0 --- /dev/null +++ b/mlir/test/lib/Dialect/ArmSME/TestLowerToArmSME.cpp @@ -0,0 +1,99 @@ +//===- TestLowerToArmSME.cpp - Test lowering to ArmSME as a sink pass -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass for testing the lowering to ArmSME as a +// generally usable sink pass. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h" +#include "mlir/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.h" +#include "mlir/Conversion/ArmSMEToSCF/ArmSMEToSCF.h" +#include "mlir/Conversion/VectorToArmSME/VectorToArmSME.h" +#include "mlir/Conversion/VectorToSCF/VectorToSCF.h" +#include "mlir/Dialect/ArmSME/Transforms/Passes.h" +#include "mlir/Dialect/ArmSVE/Transforms/Passes.h" +#include "mlir/IR/DialectRegistry.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassOptions.h" +#include "mlir/Transforms/Passes.h" + +using namespace mlir; + +namespace { +struct TestLowerToArmSMEOptions + : public PassPipelineOptions { + PassOptions::Option fuseOuterProducts{ + *this, "fuse-outer-products", + llvm::cl::desc("Fuse outer product operations via " + "'-arm-sme-outer-product-fusion' pass"), + llvm::cl::init(true)}; +}; + +void buildTestLowerToArmSME(OpPassManager &pm, + const TestLowerToArmSMEOptions &options) { + // Legalize vector operations so they can be converted to ArmSME. + pm.addPass(arm_sme::createVectorLegalizationPass()); + + // Sprinkle some cleanups. + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); + + // Passes that convert operations on vectors to ArmSME operations. + + // Convert Arith to ArmSME. + pm.addPass(createArithToArmSMEConversionPass()); + // Convert Vector to ArmSME. + pm.addPass(createConvertVectorToArmSMEPass()); + + // Fuse outer products. + if (options.fuseOuterProducts) + pm.addPass(arm_sme::createOuterProductFusionPass()); + + // Convert operations on high-level vectors to loops. + + // Convert ArmSME to SCF. 
+ pm.addPass(createConvertArmSMEToSCFPass()); + + // Convert Vector to SCF (with full unroll enabled). + pm.addPass(createConvertVectorToSCFPass( + VectorTransferToSCFOptions().enableFullUnroll())); + + // Allocate tiles for ArmSME operations. + // + // Later passes may create further ArmSME ops that implement the + // ArmSMETileOpInterface, but tiles are allocated for root operations, + // all of which should now exist. + pm.addPass(arm_sme::createTileAllocationPass()); + + // Enable streaming-mode and ZA. + pm.addPass(arm_sme::createEnableArmStreamingPass( + arm_sme::ArmStreamingMode::StreamingLocally, arm_sme::ArmZaMode::NewZA, + /*onlyIfRequiredByOps=*/true)); + + // Convert ArmSME to LLVM. + pm.addPass(createConvertArmSMEToLLVMPass()); + + // Sprinkle some cleanups. + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); +} +} // namespace + +namespace mlir { +namespace test { +void registerTestLowerToArmSME() { + PassPipelineRegistration( + "test-lower-to-arm-sme", + "An example pipeline to lower operations on vectors (arith, vector) to " + "LLVM via ArmSME.", + buildTestLowerToArmSME); +} +} // namespace test +} // namespace mlir diff --git a/mlir/test/lib/Dialect/CMakeLists.txt b/mlir/test/lib/Dialect/CMakeLists.txt index 30a17c201ff76..e20cd4473a358 100644 --- a/mlir/test/lib/Dialect/CMakeLists.txt +++ b/mlir/test/lib/Dialect/CMakeLists.txt @@ -1,5 +1,6 @@ add_subdirectory(Affine) add_subdirectory(Arith) +add_subdirectory(ArmSME) add_subdirectory(Bufferization) add_subdirectory(ControlFlow) add_subdirectory(DLTI) diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt index 68aa6bad5f92c..701fc461b3b4e 100644 --- a/mlir/tools/mlir-opt/CMakeLists.txt +++ b/mlir/tools/mlir-opt/CMakeLists.txt @@ -17,6 +17,7 @@ if(MLIR_INCLUDE_TESTS) MLIRTestFuncToLLVM MLIRAffineTransformsTestPasses MLIRArithTestPasses + MLIRArmSMETestPasses MLIRBufferizationTestPasses MLIRControlFlowTestPasses MLIRDLTITestPasses diff --git 
a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index f11c6b4355fdd..4dfa05cc8ca88 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -109,6 +109,7 @@ void registerTestLoopFusion(); void registerTestCFGLoopInfoPass(); void registerTestLoopMappingPass(); void registerTestLoopUnrollingPass(); +void registerTestLowerToArmSME(); void registerTestLowerToLLVM(); void registerTestMakeIsolatedFromAbovePass(); void registerTestMatchReductionPass(); @@ -233,6 +234,7 @@ void registerTestPasses() { mlir::test::registerTestCFGLoopInfoPass(); mlir::test::registerTestLoopMappingPass(); mlir::test::registerTestLoopUnrollingPass(); + mlir::test::registerTestLowerToArmSME(); mlir::test::registerTestLowerToLLVM(); mlir::test::registerTestMakeIsolatedFromAbovePass(); mlir::test::registerTestMatchReductionPass(); From 78890904c41cc4221839dafb7ae906971a9db51a Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 23 Feb 2024 09:48:58 +0000 Subject: [PATCH 148/546] [mlir][math] Propagate scalability in `convert-math-to-llvm` (#82635) This also generally increases the coverage of scalable vector types in the math-to-llvm tests. 
--- mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp | 18 ++--- .../Conversion/MathToLLVM/math-to-llvm.mlir | 81 +++++++++++++++++++ 2 files changed, 90 insertions(+), 9 deletions(-) diff --git a/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp b/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp index 1b729611a3623..23e957288eb95 100644 --- a/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp +++ b/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp @@ -148,10 +148,10 @@ struct ExpM1OpLowering : public ConvertOpToLLVMPattern { return LLVM::detail::handleMultidimensionalVectors( op.getOperation(), adaptor.getOperands(), *getTypeConverter(), [&](Type llvm1DVectorTy, ValueRange operands) { + auto numElements = LLVM::getVectorNumElements(llvm1DVectorTy); auto splatAttr = SplatElementsAttr::get( - mlir::VectorType::get( - {LLVM::getVectorNumElements(llvm1DVectorTy).getFixedValue()}, - floatType), + mlir::VectorType::get({numElements.getKnownMinValue()}, floatType, + {numElements.isScalable()}), floatOne); auto one = rewriter.create(loc, llvm1DVectorTy, splatAttr); @@ -207,10 +207,10 @@ struct Log1pOpLowering : public ConvertOpToLLVMPattern { return LLVM::detail::handleMultidimensionalVectors( op.getOperation(), adaptor.getOperands(), *getTypeConverter(), [&](Type llvm1DVectorTy, ValueRange operands) { + auto numElements = LLVM::getVectorNumElements(llvm1DVectorTy); auto splatAttr = SplatElementsAttr::get( - mlir::VectorType::get( - {LLVM::getVectorNumElements(llvm1DVectorTy).getFixedValue()}, - floatType), + mlir::VectorType::get({numElements.getKnownMinValue()}, floatType, + {numElements.isScalable()}), floatOne); auto one = rewriter.create(loc, llvm1DVectorTy, splatAttr); @@ -266,10 +266,10 @@ struct RsqrtOpLowering : public ConvertOpToLLVMPattern { return LLVM::detail::handleMultidimensionalVectors( op.getOperation(), adaptor.getOperands(), *getTypeConverter(), [&](Type llvm1DVectorTy, ValueRange operands) { + auto numElements = LLVM::getVectorNumElements(llvm1DVectorTy); auto splatAttr 
= SplatElementsAttr::get( - mlir::VectorType::get( - {LLVM::getVectorNumElements(llvm1DVectorTy).getFixedValue()}, - floatType), + mlir::VectorType::get({numElements.getKnownMinValue()}, floatType, + {numElements.isScalable()}), floatOne); auto one = rewriter.create(loc, llvm1DVectorTy, splatAttr); diff --git a/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir b/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir index 3de2f11d1d12c..56129dbd27889 100644 --- a/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir +++ b/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir @@ -77,6 +77,18 @@ func.func @log1p_2dvector_fmf(%arg0 : vector<4x3xf32>) { // ----- +// CHECK-LABEL: func @log1p_scalable_vector( +// CHECK-SAME: %[[VEC:.*]]: vector<[4]xf32> +func.func @log1p_scalable_vector(%arg0 : vector<[4]xf32>) -> vector<[4]xf32> { + // CHECK: %[[ONE:.*]] = llvm.mlir.constant(dense<1.000000e+00> : vector<[4]xf32>) : vector<[4]xf32> + // CHECK: %[[ADD:.*]] = llvm.fadd %[[ONE]], %[[VEC]] : vector<[4]xf32> + // CHECK: %[[LOG:.*]] = llvm.intr.log(%[[ADD]]) : (vector<[4]xf32>) -> vector<[4]xf32> + %0 = math.log1p %arg0 : vector<[4]xf32> + func.return %0 : vector<[4]xf32> +} + +// ----- + // CHECK-LABEL: func @expm1( // CHECK-SAME: f32 func.func @expm1(%arg0 : f32) { @@ -113,6 +125,18 @@ func.func @expm1_vector(%arg0 : vector<4xf32>) { // ----- +// CHECK-LABEL: func @expm1_scalable_vector( +// CHECK-SAME: %[[VEC:.*]]: vector<[4]xf32> +func.func @expm1_scalable_vector(%arg0 : vector<[4]xf32>) -> vector<[4]xf32> { + // CHECK: %[[ONE:.*]] = llvm.mlir.constant(dense<1.000000e+00> : vector<[4]xf32>) : vector<[4]xf32> + // CHECK: %[[EXP:.*]] = llvm.intr.exp(%[[VEC]]) : (vector<[4]xf32>) -> vector<[4]xf32> + // CHECK: %[[SUB:.*]] = llvm.fsub %[[EXP]], %[[ONE]] : vector<[4]xf32> + %0 = math.expm1 %arg0 : vector<[4]xf32> + func.return %0 : vector<[4]xf32> +} + +// ----- + // CHECK-LABEL: func @expm1_vector_fmf( // CHECK-SAME: vector<4xf32> func.func @expm1_vector_fmf(%arg0 : vector<4xf32>) { @@ 
-177,6 +201,16 @@ func.func @cttz_vec(%arg0 : vector<4xi32>) { // ----- +// CHECK-LABEL: func @cttz_scalable_vec( +// CHECK-SAME: %[[VEC:.*]]: vector<[4]xi32> +func.func @cttz_scalable_vec(%arg0 : vector<[4]xi32>) -> vector<[4]xi32> { + // CHECK: "llvm.intr.cttz"(%[[VEC]]) <{is_zero_poison = false}> : (vector<[4]xi32>) -> vector<[4]xi32> + %0 = math.cttz %arg0 : vector<[4]xi32> + func.return %0 : vector<[4]xi32> +} + +// ----- + // CHECK-LABEL: func @ctpop( // CHECK-SAME: i32 func.func @ctpop(%arg0 : i32) { @@ -197,6 +231,16 @@ func.func @ctpop_vector(%arg0 : vector<3xi32>) { // ----- +// CHECK-LABEL: func @ctpop_scalable_vector( +// CHECK-SAME: %[[VEC:.*]]: vector<[4]xi32> +func.func @ctpop_scalable_vector(%arg0 : vector<[4]xi32>) -> vector<[4]xi32> { + // CHECK: llvm.intr.ctpop(%[[VEC]]) : (vector<[4]xi32>) -> vector<[4]xi32> + %0 = math.ctpop %arg0 : vector<[4]xi32> + func.return %0 : vector<[4]xi32> +} + +// ----- + // CHECK-LABEL: func @rsqrt_double( // CHECK-SAME: f64 func.func @rsqrt_double(%arg0 : f64) { @@ -233,6 +277,18 @@ func.func @rsqrt_vector(%arg0 : vector<4xf32>) { // ----- +// CHECK-LABEL: func @rsqrt_scalable_vector( +// CHECK-SAME: %[[VEC:.*]]: vector<[4]xf32> +func.func @rsqrt_scalable_vector(%arg0 : vector<[4]xf32>) -> vector<[4]xf32>{ + // CHECK: %[[ONE:.*]] = llvm.mlir.constant(dense<1.000000e+00> : vector<[4]xf32>) : vector<[4]xf32> + // CHECK: %[[SQRT:.*]] = llvm.intr.sqrt(%[[VEC]]) : (vector<[4]xf32>) -> vector<[4]xf32> + // CHECK: %[[DIV:.*]] = llvm.fdiv %[[ONE]], %[[SQRT]] : vector<[4]xf32> + %0 = math.rsqrt %arg0 : vector<[4]xf32> + func.return %0 : vector<[4]xf32> +} + +// ----- + // CHECK-LABEL: func @rsqrt_vector_fmf( // CHECK-SAME: vector<4xf32> func.func @rsqrt_vector_fmf(%arg0 : vector<4xf32>) { @@ -245,6 +301,18 @@ func.func @rsqrt_vector_fmf(%arg0 : vector<4xf32>) { // ----- +// CHECK-LABEL: func @rsqrt_scalable_vector_fmf( +// CHECK-SAME: %[[VEC:.*]]: vector<[4]xf32> +func.func @rsqrt_scalable_vector_fmf(%arg0 : 
vector<[4]xf32>) -> vector<[4]xf32> { + // CHECK: %[[ONE:.*]] = llvm.mlir.constant(dense<1.000000e+00> : vector<[4]xf32>) : vector<[4]xf32> + // CHECK: %[[SQRT:.*]] = llvm.intr.sqrt(%[[VEC]]) {fastmathFlags = #llvm.fastmath} : (vector<[4]xf32>) -> vector<[4]xf32> + // CHECK: %[[DIV:.*]] = llvm.fdiv %[[ONE]], %[[SQRT]] {fastmathFlags = #llvm.fastmath} : vector<[4]xf32> + %0 = math.rsqrt %arg0 fastmath : vector<[4]xf32> + func.return %0 : vector<[4]xf32> +} + +// ----- + // CHECK-LABEL: func @rsqrt_multidim_vector( func.func @rsqrt_multidim_vector(%arg0 : vector<4x3xf32>) { // CHECK: %[[EXTRACT:.*]] = llvm.extractvalue %{{.*}}[0] : !llvm.array<4 x vector<3xf32>> @@ -258,6 +326,19 @@ func.func @rsqrt_multidim_vector(%arg0 : vector<4x3xf32>) { // ----- +// CHECK-LABEL: func @rsqrt_multidim_scalable_vector( +func.func @rsqrt_multidim_scalable_vector(%arg0 : vector<4x[4]xf32>) -> vector<4x[4]xf32> { + // CHECK: %[[EXTRACT:.*]] = llvm.extractvalue %{{.*}}[0] : !llvm.array<4 x vector<[4]xf32>> + // CHECK: %[[ONE:.*]] = llvm.mlir.constant(dense<1.000000e+00> : vector<[4]xf32>) : vector<[4]xf32> + // CHECK: %[[SQRT:.*]] = llvm.intr.sqrt(%[[EXTRACT]]) : (vector<[4]xf32>) -> vector<[4]xf32> + // CHECK: %[[DIV:.*]] = llvm.fdiv %[[ONE]], %[[SQRT]] : vector<[4]xf32> + // CHECK: %[[INSERT:.*]] = llvm.insertvalue %[[DIV]], %{{.*}}[0] : !llvm.array<4 x vector<[4]xf32>> + %0 = math.rsqrt %arg0 : vector<4x[4]xf32> + func.return %0 : vector<4x[4]xf32> +} + +// ----- + // CHECK-LABEL: func @fpowi( // CHECK-SAME: f64 func.func @fpowi(%arg0 : f64, %arg1 : i32) { From 13acb3af5ad48e850cf37dcf02270ede3f267bd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 23 Feb 2024 10:52:28 +0100 Subject: [PATCH 149/546] [clang][Interp] Don't diagnose already invalid function decls They have already been diagnosed before. Also improve that test case.
--- clang/lib/AST/Interp/Interp.cpp | 4 ++++ clang/test/SemaCXX/PR68542.cpp | 14 +++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index 82bc1f240cc51..b2fe70dc14f9d 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -462,6 +462,10 @@ bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) { if (S.getLangOpts().CPlusPlus11) { const FunctionDecl *DiagDecl = F->getDecl(); + // Invalid decls have been diagnosed before. + if (DiagDecl->isInvalidDecl()) + return false; + // If this function is not constexpr because it is an inherited // non-constexpr constructor, diagnose that directly. const auto *CD = dyn_cast(DiagDecl); diff --git a/clang/test/SemaCXX/PR68542.cpp b/clang/test/SemaCXX/PR68542.cpp index fc767a78c8b00..e266bf9ba77ab 100644 --- a/clang/test/SemaCXX/PR68542.cpp +++ b/clang/test/SemaCXX/PR68542.cpp @@ -1,20 +1,20 @@ // RUN: %clang_cc1 -verify -std=c++20 -fsyntax-only %s +// RUN: %clang_cc1 -verify -std=c++20 -fsyntax-only %s -fexperimental-new-constant-interpreter -struct S { +struct S { // expected-note {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'int' to 'S &&' for 1st argument}} \ + // expected-note {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to 'const S &' for 1st argument}} int e; }; template consteval int get_format() { - return nullptr; // expected-error{{cannot initialize return object of type 'int' with an rvalue of type 'std::nullptr_t'}} + return nullptr; // expected-error {{cannot initialize return object of type 'int' with an rvalue of type 'std::nullptr_t'}} } template constexpr S f(T) noexcept { - return get_format(); // expected-error{{no viable conversion from returned value of type 'int' to function return type 'S'}} + return get_format(); // expected-error {{no viable conversion from returned 
value of type 'int' to function return type 'S'}} } -constexpr S x = f(0); // expected-error{{constexpr variable 'x' must be initialized by a constant expression}} -// expected-note@-1{{in instantiation of function template specialization 'f' requested here}} -// expected-note@3{{candidate constructor (the implicit move constructor) not viable: no known conversion from 'int' to 'S &&' for 1st argument}} -// expected-note@3{{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to 'const S &' for 1st argument}} +constexpr S x = f(0); // expected-error {{constexpr variable 'x' must be initialized by a constant expression}} \ + // expected-note {{in instantiation of function template specialization 'f' requested here}} From 5f1319bb385342c7ef4124b05b83b89ef8588ee8 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Feb 2024 11:28:05 +0100 Subject: [PATCH 150/546] [mlir][Transforms] Encapsulate dialect conversion options in `ConversionConfig` (#82250) This commit adds a new `ConversionConfig` struct that allows users to customize the dialect conversion. This configuration is similar to `GreedyRewriteConfig` for the greedy pattern rewrite driver. A few existing options are moved to this objects, simplifying the dialect conversion API. --- .../mlir/Transforms/DialectConversion.h | 75 ++++++---- .../Transforms/Utils/DialectConversion.cpp | 134 ++++++++---------- mlir/test/lib/Dialect/Test/TestPatterns.cpp | 14 +- 3 files changed, 118 insertions(+), 105 deletions(-) diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 5c91a9498b35d..7e8e67a9d1782 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -24,6 +24,7 @@ namespace mlir { // Forward declarations. 
class Attribute; class Block; +struct ConversionConfig; class ConversionPatternRewriter; class MLIRContext; class Operation; @@ -770,7 +771,8 @@ class ConversionPatternRewriter final : public PatternRewriter { /// Conversion pattern rewriters must not be used outside of dialect /// conversions. They apply some IR rewrites in a delayed fashion and could /// bring the IR into an inconsistent state when used standalone. - explicit ConversionPatternRewriter(MLIRContext *ctx); + explicit ConversionPatternRewriter(MLIRContext *ctx, + const ConversionConfig &config); // Hide unsupported pattern rewriter API. using OpBuilder::setListener; @@ -1070,6 +1072,30 @@ class PDLConversionConfig final { #endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH +//===----------------------------------------------------------------------===// +// ConversionConfig +//===----------------------------------------------------------------------===// + +/// Dialect conversion configuration. +struct ConversionConfig { + /// An optional callback used to notify about match failure diagnostics during + /// the conversion. Diagnostics reported to this callback may only be + /// available in debug mode. + function_ref notifyCallback = nullptr; + + /// Partial conversion only. All operations that are found not to be + /// legalizable are placed in this set. (Note that if there is an op + /// explicitly marked as illegal, the conversion terminates and the set will + /// not necessarily be complete.) + DenseSet *unlegalizedOps = nullptr; + + /// Analysis conversion only. All operations that are found to be legalizable + /// are placed in this set. Note that no actual rewrites are applied to the + /// IR during an analysis conversion and only pre-existing operations are + /// added to the set. 
+ DenseSet *legalizableOps = nullptr; +}; + //===----------------------------------------------------------------------===// // Op Conversion Entry Points //===----------------------------------------------------------------------===// @@ -1083,19 +1109,16 @@ class PDLConversionConfig final { /// Apply a partial conversion on the given operations and all nested /// operations. This method converts as many operations to the target as /// possible, ignoring operations that failed to legalize. This method only -/// returns failure if there ops explicitly marked as illegal. If an -/// `unconvertedOps` set is provided, all operations that are found not to be -/// legalizable to the given `target` are placed within that set. (Note that if -/// there is an op explicitly marked as illegal, the conversion terminates and -/// the `unconvertedOps` set will not necessarily be complete.) +/// returns failure if there ops explicitly marked as illegal. LogicalResult -applyPartialConversion(ArrayRef ops, const ConversionTarget &target, +applyPartialConversion(ArrayRef ops, + const ConversionTarget &target, const FrozenRewritePatternSet &patterns, - DenseSet *unconvertedOps = nullptr); + ConversionConfig config = ConversionConfig()); LogicalResult applyPartialConversion(Operation *op, const ConversionTarget &target, const FrozenRewritePatternSet &patterns, - DenseSet *unconvertedOps = nullptr); + ConversionConfig config = ConversionConfig()); /// Apply a complete conversion on the given operations, and all nested /// operations. This method returns failure if the conversion of any operation @@ -1103,31 +1126,27 @@ applyPartialConversion(Operation *op, const ConversionTarget &target, /// within 'ops'. 
LogicalResult applyFullConversion(ArrayRef ops, const ConversionTarget &target, - const FrozenRewritePatternSet &patterns); + const FrozenRewritePatternSet &patterns, + ConversionConfig config = ConversionConfig()); LogicalResult applyFullConversion(Operation *op, const ConversionTarget &target, - const FrozenRewritePatternSet &patterns); + const FrozenRewritePatternSet &patterns, + ConversionConfig config = ConversionConfig()); /// Apply an analysis conversion on the given operations, and all nested /// operations. This method analyzes which operations would be successfully /// converted to the target if a conversion was applied. All operations that /// were found to be legalizable to the given 'target' are placed within the -/// provided 'convertedOps' set; note that no actual rewrites are applied to the -/// operations on success and only pre-existing operations are added to the set. -/// This method only returns failure if there are unreachable blocks in any of -/// the regions nested within 'ops'. There's an additional argument -/// `notifyCallback` which is used for collecting match failure diagnostics -/// generated during the conversion. Diagnostics are only reported to this -/// callback may only be available in debug mode. -LogicalResult applyAnalysisConversion( - ArrayRef ops, ConversionTarget &target, - const FrozenRewritePatternSet &patterns, - DenseSet &convertedOps, - function_ref notifyCallback = nullptr); -LogicalResult applyAnalysisConversion( - Operation *op, ConversionTarget &target, - const FrozenRewritePatternSet &patterns, - DenseSet &convertedOps, - function_ref notifyCallback = nullptr); +/// provided 'config.legalizableOps' set; note that no actual rewrites are +/// applied to the operations on success. This method only returns failure if +/// there are unreachable blocks in any of the regions nested within 'ops'. 
+LogicalResult +applyAnalysisConversion(ArrayRef ops, ConversionTarget &target, + const FrozenRewritePatternSet &patterns, + ConversionConfig config = ConversionConfig()); +LogicalResult +applyAnalysisConversion(Operation *op, ConversionTarget &target, + const FrozenRewritePatternSet &patterns, + ConversionConfig config = ConversionConfig()); } // namespace mlir #endif // MLIR_TRANSFORMS_DIALECTCONVERSION_H_ diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 2cdbfb78faf27..508ee7416d55d 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -228,6 +228,8 @@ class IRRewrite { /// Erase the given block (unless it was already erased). void eraseBlock(Block *block); + const ConversionConfig &getConfig() const; + const Kind kind; ConversionPatternRewriterImpl &rewriterImpl; }; @@ -754,9 +756,10 @@ static RewriteTy *findSingleRewrite(R &&rewrites, Block *block) { namespace mlir { namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { - explicit ConversionPatternRewriterImpl(PatternRewriter &rewriter) + explicit ConversionPatternRewriterImpl(PatternRewriter &rewriter, + const ConversionConfig &config) : rewriter(rewriter), eraseRewriter(rewriter.getContext()), - notifyCallback(nullptr) {} + config(config) {} //===--------------------------------------------------------------------===// // State Management @@ -962,14 +965,8 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// converting the arguments of blocks within that region. DenseMap regionToConverter; - /// This allows the user to collect the match failure message. - function_ref notifyCallback; - - /// A set of pre-existing operations. When mode == OpConversionMode::Analysis, - /// this is populated with ops found to be legalizable to the target. 
- /// When mode == OpConversionMode::Partial, this is populated with ops found - /// *not* to be legalizable to the target. - DenseSet *trackedOps = nullptr; + /// Dialect conversion configuration. + const ConversionConfig &config; #ifndef NDEBUG /// A set of operations that have pending updates. This tracking isn't @@ -992,6 +989,10 @@ void IRRewrite::eraseBlock(Block *block) { rewriterImpl.eraseRewriter.eraseBlock(block); } +const ConversionConfig &IRRewrite::getConfig() const { + return rewriterImpl.config; +} + void BlockTypeConversionRewrite::commit() { // Process the remapping for each of the original arguments. for (auto [origArg, info] : @@ -1107,8 +1108,8 @@ void ReplaceOperationRewrite::commit() { if (Value newValue = rewriterImpl.mapping.lookupOrNull(result, result.getType())) result.replaceAllUsesWith(newValue); - if (rewriterImpl.trackedOps) - rewriterImpl.trackedOps->erase(op); + if (getConfig().unlegalizedOps) + getConfig().unlegalizedOps->erase(op); // Do not erase the operation yet. It may still be referenced in `mapping`. 
op->getBlock()->getOperations().remove(op); } @@ -1543,8 +1544,8 @@ void ConversionPatternRewriterImpl::notifyMatchFailure( Diagnostic diag(loc, DiagnosticSeverity::Remark); reasonCallback(diag); logger.startLine() << "** Failure : " << diag.str() << "\n"; - if (notifyCallback) - notifyCallback(diag); + if (config.notifyCallback) + config.notifyCallback(diag); }); } @@ -1552,9 +1553,10 @@ void ConversionPatternRewriterImpl::notifyMatchFailure( // ConversionPatternRewriter //===----------------------------------------------------------------------===// -ConversionPatternRewriter::ConversionPatternRewriter(MLIRContext *ctx) +ConversionPatternRewriter::ConversionPatternRewriter( + MLIRContext *ctx, const ConversionConfig &config) : PatternRewriter(ctx), - impl(new detail::ConversionPatternRewriterImpl(*this)) { + impl(new detail::ConversionPatternRewriterImpl(*this, config)) { setListener(impl.get()); } @@ -2005,12 +2007,12 @@ OperationLegalizer::legalizeWithPattern(Operation *op, assert(rewriterImpl.pendingRootUpdates.empty() && "dangling root updates"); LLVM_DEBUG({ logFailure(rewriterImpl.logger, "pattern failed to match"); - if (rewriterImpl.notifyCallback) { + if (rewriterImpl.config.notifyCallback) { Diagnostic diag(op->getLoc(), DiagnosticSeverity::Remark); diag << "Failed to apply pattern \"" << pattern.getDebugName() << "\" on op:\n" << *op; - rewriterImpl.notifyCallback(diag); + rewriterImpl.config.notifyCallback(diag); } }); rewriterImpl.resetState(curState); @@ -2398,14 +2400,12 @@ namespace mlir { struct OperationConverter { explicit OperationConverter(const ConversionTarget &target, const FrozenRewritePatternSet &patterns, - OpConversionMode mode, - DenseSet *trackedOps = nullptr) - : opLegalizer(target, patterns), mode(mode), trackedOps(trackedOps) {} + const ConversionConfig &config, + OpConversionMode mode) + : opLegalizer(target, patterns), config(config), mode(mode) {} /// Converts the given operations to the conversion target. 
- LogicalResult - convertOperations(ArrayRef ops, - function_ref notifyCallback = nullptr); + LogicalResult convertOperations(ArrayRef ops); private: /// Converts an operation with the given rewriter. @@ -2442,14 +2442,11 @@ struct OperationConverter { /// The legalizer to use when converting operations. OperationLegalizer opLegalizer; + /// Dialect conversion configuration. + ConversionConfig config; + /// The conversion mode to use when legalizing operations. OpConversionMode mode; - - /// A set of pre-existing operations. When mode == OpConversionMode::Analysis, - /// this is populated with ops found to be legalizable to the target. - /// When mode == OpConversionMode::Partial, this is populated with ops found - /// *not* to be legalizable to the target. - DenseSet *trackedOps; }; } // namespace mlir @@ -2463,28 +2460,27 @@ LogicalResult OperationConverter::convert(ConversionPatternRewriter &rewriter, return op->emitError() << "failed to legalize operation '" << op->getName() << "'"; // Partial conversions allow conversions to fail iff the operation was not - // explicitly marked as illegal. If the user provided a nonlegalizableOps - // set, non-legalizable ops are included. + // explicitly marked as illegal. If the user provided a `unlegalizedOps` + // set, non-legalizable ops are added to that set. if (mode == OpConversionMode::Partial) { if (opLegalizer.isIllegal(op)) return op->emitError() << "failed to legalize operation '" << op->getName() << "' that was explicitly marked illegal"; - if (trackedOps) - trackedOps->insert(op); + if (config.unlegalizedOps) + config.unlegalizedOps->insert(op); } } else if (mode == OpConversionMode::Analysis) { // Analysis conversions don't fail if any operations fail to legalize, // they are only interested in the operations that were successfully // legalized. 
- trackedOps->insert(op); + if (config.legalizableOps) + config.legalizableOps->insert(op); } return success(); } -LogicalResult OperationConverter::convertOperations( - ArrayRef ops, - function_ref notifyCallback) { +LogicalResult OperationConverter::convertOperations(ArrayRef ops) { if (ops.empty()) return success(); const ConversionTarget &target = opLegalizer.getTarget(); @@ -2505,10 +2501,8 @@ LogicalResult OperationConverter::convertOperations( } // Convert each operation and discard rewrites on failure. - ConversionPatternRewriter rewriter(ops.front()->getContext()); + ConversionPatternRewriter rewriter(ops.front()->getContext(), config); ConversionPatternRewriterImpl &rewriterImpl = rewriter.getImpl(); - rewriterImpl.notifyCallback = notifyCallback; - rewriterImpl.trackedOps = trackedOps; for (auto *op : toConvert) if (failed(convert(rewriter, op))) @@ -3495,57 +3489,51 @@ void mlir::registerConversionPDLFunctions(RewritePatternSet &patterns) { //===----------------------------------------------------------------------===// // Partial Conversion -LogicalResult -mlir::applyPartialConversion(ArrayRef ops, - const ConversionTarget &target, - const FrozenRewritePatternSet &patterns, - DenseSet *unconvertedOps) { - OperationConverter opConverter(target, patterns, OpConversionMode::Partial, - unconvertedOps); +LogicalResult mlir::applyPartialConversion( + ArrayRef ops, const ConversionTarget &target, + const FrozenRewritePatternSet &patterns, ConversionConfig config) { + OperationConverter opConverter(target, patterns, config, + OpConversionMode::Partial); return opConverter.convertOperations(ops); } LogicalResult mlir::applyPartialConversion(Operation *op, const ConversionTarget &target, const FrozenRewritePatternSet &patterns, - DenseSet *unconvertedOps) { - return applyPartialConversion(llvm::ArrayRef(op), target, patterns, - unconvertedOps); + ConversionConfig config) { + return applyPartialConversion(llvm::ArrayRef(op), target, patterns, config); } 
//===----------------------------------------------------------------------===// // Full Conversion -LogicalResult -mlir::applyFullConversion(ArrayRef ops, - const ConversionTarget &target, - const FrozenRewritePatternSet &patterns) { - OperationConverter opConverter(target, patterns, OpConversionMode::Full); +LogicalResult mlir::applyFullConversion(ArrayRef ops, + const ConversionTarget &target, + const FrozenRewritePatternSet &patterns, + ConversionConfig config) { + OperationConverter opConverter(target, patterns, config, + OpConversionMode::Full); return opConverter.convertOperations(ops); } -LogicalResult -mlir::applyFullConversion(Operation *op, const ConversionTarget &target, - const FrozenRewritePatternSet &patterns) { - return applyFullConversion(llvm::ArrayRef(op), target, patterns); +LogicalResult mlir::applyFullConversion(Operation *op, + const ConversionTarget &target, + const FrozenRewritePatternSet &patterns, + ConversionConfig config) { + return applyFullConversion(llvm::ArrayRef(op), target, patterns, config); } //===----------------------------------------------------------------------===// // Analysis Conversion -LogicalResult -mlir::applyAnalysisConversion(ArrayRef ops, - ConversionTarget &target, - const FrozenRewritePatternSet &patterns, - DenseSet &convertedOps, - function_ref notifyCallback) { - OperationConverter opConverter(target, patterns, OpConversionMode::Analysis, - &convertedOps); - return opConverter.convertOperations(ops, notifyCallback); +LogicalResult mlir::applyAnalysisConversion( + ArrayRef ops, ConversionTarget &target, + const FrozenRewritePatternSet &patterns, ConversionConfig config) { + OperationConverter opConverter(target, patterns, config, + OpConversionMode::Analysis); + return opConverter.convertOperations(ops); } LogicalResult mlir::applyAnalysisConversion(Operation *op, ConversionTarget &target, const FrozenRewritePatternSet &patterns, - DenseSet &convertedOps, - function_ref notifyCallback) { - return 
applyAnalysisConversion(llvm::ArrayRef(op), target, patterns, - convertedOps, notifyCallback); + ConversionConfig config) { + return applyAnalysisConversion(llvm::ArrayRef(op), target, patterns, config); } diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index 108cfe8950ef6..bde4255ee4b36 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -1152,8 +1152,10 @@ struct TestLegalizePatternDriver // Handle a partial conversion. if (mode == ConversionMode::Partial) { DenseSet unlegalizedOps; - if (failed(applyPartialConversion( - getOperation(), target, std::move(patterns), &unlegalizedOps))) { + ConversionConfig config; + config.unlegalizedOps = &unlegalizedOps; + if (failed(applyPartialConversion(getOperation(), target, + std::move(patterns), config))) { getOperation()->emitRemark() << "applyPartialConversion failed"; } // Emit remarks for each legalizable operation. @@ -1181,8 +1183,10 @@ struct TestLegalizePatternDriver // Analyze the convertible operations. DenseSet legalizedOps; + ConversionConfig config; + config.legalizableOps = &legalizedOps; if (failed(applyAnalysisConversion(getOperation(), target, - std::move(patterns), legalizedOps))) + std::move(patterns), config))) return signalPassFailure(); // Emit remarks for each legalizable operation. 
@@ -1806,8 +1810,10 @@ struct TestMergeBlocksPatternDriver }); DenseSet unlegalizedOps; + ConversionConfig config; + config.unlegalizedOps = &unlegalizedOps; (void)applyPartialConversion(getOperation(), target, std::move(patterns), - &unlegalizedOps); + config); for (auto *op : unlegalizedOps) op->emitRemark() << "op '" << op->getName() << "' is not legalizable"; } From 5cb2ebc08f6fa42341409b88466c5c266e5839cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= Date: Fri, 23 Feb 2024 11:37:30 +0100 Subject: [PATCH 151/546] Reland "[clang] Preserve found-decl when constructing VarTemplateIds" (#82612) Update include-cleaner tests. Now that we have proper found-decls set up for VarTemplates, in case of instationtations we point to primary templates and not specializations. To be changed in a follow-up patch. --- .../include-cleaner/unittests/WalkASTTest.cpp | 16 ++++++++-------- clang/include/clang/Sema/Sema.h | 2 +- clang/lib/Sema/SemaTemplate.cpp | 18 ++++++++---------- clang/test/AST/ast-dump-using.cpp | 7 +++++++ 4 files changed, 24 insertions(+), 19 deletions(-) diff --git a/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp b/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp index bdfc24b8edee3..0be5db36b1fc5 100644 --- a/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp +++ b/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp @@ -200,24 +200,24 @@ TEST(WalkAST, VarTemplates) { EXPECT_THAT(testWalk(R"cpp( template T $explicit^Foo = 0;)cpp", "int z = ^Foo;"), - ElementsAre(Decl::VarTemplateSpecialization)); + ElementsAre(Decl::VarTemplate)); EXPECT_THAT(testWalk(R"cpp( - template T Foo = 0; - template<> int $explicit^Foo = 1;)cpp", + template T $explicit^Foo = 0; + template<> int Foo = 1;)cpp", "int x = ^Foo;"), - ElementsAre(Decl::VarTemplateSpecialization)); + ElementsAre(Decl::VarTemplate)); // FIXME: This points at implicit specialization, instead we should point to // explicit partial specializaiton 
pattern. EXPECT_THAT(testWalk(R"cpp( - template T Foo = 0; - template T* $explicit^Foo = nullptr;)cpp", + template T $explicit^Foo = 0; + template T* Foo = nullptr;)cpp", "int *x = ^Foo;"), - ElementsAre(Decl::VarTemplateSpecialization)); + ElementsAre(Decl::VarTemplate)); EXPECT_THAT(testWalk(R"cpp( template T $explicit^Foo = 0; template int Foo;)cpp", "int x = ^Foo;"), - ElementsAre(Decl::VarTemplateSpecialization)); + ElementsAre(Decl::VarTemplate)); } TEST(WalkAST, FunctionTemplates) { // Explicit instantiation and (partial) specialization references primary diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index fcccac10f4733..e457694e4625d 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -8540,7 +8540,7 @@ class Sema final { /// if the arguments are dependent. ExprResult CheckVarTemplateId(const CXXScopeSpec &SS, const DeclarationNameInfo &NameInfo, - VarTemplateDecl *Template, + VarTemplateDecl *Template, NamedDecl *FoundD, SourceLocation TemplateLoc, const TemplateArgumentListInfo *TemplateArgs); diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 1a975a8d0a0df..7d3d665194add 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -4958,11 +4958,10 @@ Sema::CheckVarTemplateId(VarTemplateDecl *Template, SourceLocation TemplateLoc, return Decl; } -ExprResult -Sema::CheckVarTemplateId(const CXXScopeSpec &SS, - const DeclarationNameInfo &NameInfo, - VarTemplateDecl *Template, SourceLocation TemplateLoc, - const TemplateArgumentListInfo *TemplateArgs) { +ExprResult Sema::CheckVarTemplateId( + const CXXScopeSpec &SS, const DeclarationNameInfo &NameInfo, + VarTemplateDecl *Template, NamedDecl *FoundD, SourceLocation TemplateLoc, + const TemplateArgumentListInfo *TemplateArgs) { DeclResult Decl = CheckVarTemplateId(Template, TemplateLoc, NameInfo.getLoc(), *TemplateArgs); @@ -4978,8 +4977,7 @@ Sema::CheckVarTemplateId(const CXXScopeSpec 
&SS, NameInfo.getLoc()); // Build an ordinary singleton decl ref. - return BuildDeclarationNameExpr(SS, NameInfo, Var, - /*FoundD=*/nullptr, TemplateArgs); + return BuildDeclarationNameExpr(SS, NameInfo, Var, FoundD, TemplateArgs); } void Sema::diagnoseMissingTemplateArguments(TemplateName Name, @@ -5066,9 +5064,9 @@ ExprResult Sema::BuildTemplateIdExpr(const CXXScopeSpec &SS, bool KnownDependent = false; // In C++1y, check variable template ids. if (R.getAsSingle()) { - ExprResult Res = CheckVarTemplateId(SS, R.getLookupNameInfo(), - R.getAsSingle(), - TemplateKWLoc, TemplateArgs); + ExprResult Res = CheckVarTemplateId( + SS, R.getLookupNameInfo(), R.getAsSingle(), + R.getRepresentativeDecl(), TemplateKWLoc, TemplateArgs); if (Res.isInvalid() || Res.isUsable()) return Res; // Result is dependent. Carry on to build an UnresolvedLookupEpxr. diff --git a/clang/test/AST/ast-dump-using.cpp b/clang/test/AST/ast-dump-using.cpp index 5a4e910ffb865..8e5c60d3aabf4 100644 --- a/clang/test/AST/ast-dump-using.cpp +++ b/clang/test/AST/ast-dump-using.cpp @@ -2,6 +2,7 @@ namespace a { struct S; +template T x = {}; } namespace b { using a::S; @@ -21,4 +22,10 @@ typedef S e; // check the same UsingType is reused. 
// CHECK-NEXT: `-UsingType [[TYPE_ADDR]] 'a::S' sugar // CHECK-NEXT: |-UsingShadow [[SHADOW_ADDR]] 'S' // CHECK-NEXT: `-RecordType {{.*}} 'a::S' +using a::x; + +void foo() { + x = 3; + // CHECK: DeclRefExpr {{.*}} 'x' {{.*}} (UsingShadow {{.*}} 'x') +} } From 4419b2c27fa45a08bc3892ad0c8c5eb95d96d608 Mon Sep 17 00:00:00 2001 From: Kadir Cetinkaya Date: Fri, 23 Feb 2024 11:38:00 +0100 Subject: [PATCH 152/546] [clangd] Make tidy-rename tests conditional --- .../clangd/unittests/ClangdLSPServerTests.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp b/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp index 555c4c5749981..75a140767035b 100644 --- a/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp +++ b/clang-tools-extra/clangd/unittests/ClangdLSPServerTests.cpp @@ -11,6 +11,7 @@ #include "ClangdServer.h" #include "ConfigProvider.h" #include "Diagnostics.h" +#include "Feature.h" #include "FeatureModule.h" #include "LSPBinder.h" #include "LSPClient.h" @@ -198,6 +199,9 @@ TEST_F(LSPTest, RecordsLatencies) { // clang-tidy's renames are converted to clangd's internal rename functionality, // see clangd#1589 and clangd#741 TEST_F(LSPTest, ClangTidyRename) { + // This test requires clang-tidy checks to be linked in. 
+ if (!CLANGD_TIDY_CHECKS) + return; Annotations Header(R"cpp( void [[foo]](); )cpp"); @@ -214,7 +218,9 @@ TEST_F(LSPTest, ClangTidyRename) { Client.didOpen("foo.hpp", Header.code()); Client.didOpen("foo.cpp", Source.code()); - auto RenameDiag = Client.diagnostics("foo.cpp").value().at(0); + auto Diags = Client.diagnostics("foo.cpp"); + ASSERT_TRUE(Diags && !Diags->empty()); + auto RenameDiag = Diags->front(); auto RenameCommand = (*Client From de04b7d44edbfe8c2357cc291f8806575e6e93f2 Mon Sep 17 00:00:00 2001 From: Daniel Krupp Date: Fri, 23 Feb 2024 11:44:34 +0100 Subject: [PATCH 153/546] [analyzer] Fix core.VLASize checker false positive taint reports (#68140) The checker reported a false positive on this code void testTaintedSanitizedVLASize(void) { int x; scanf("%d", &x); if (x<1) return; int vla[x]; // no-warning } After the fix, the checker only emits tainted warning if the vla size is coming from a tainted source and it cannot prove that it is positive. --- clang/docs/analyzer/checkers.rst | 27 ++++++++++++++++--- .../Checkers/VLASizeChecker.cpp | 16 +++++------ .../test/Analysis/taint-diagnostic-visitor.c | 4 +-- clang/test/Analysis/taint-generic.c | 11 +++++++- 4 files changed, 44 insertions(+), 14 deletions(-) diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index 510629d8a2d48..899622ae283b1 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -213,9 +213,8 @@ Check for undefined results of binary operators. core.VLASize (C) """""""""""""""" -Check for declarations of Variable Length Arrays of undefined or zero size. - - Check for declarations of VLA of undefined or zero size. +Check for declarations of Variable Length Arrays (VLA) of undefined, zero or negative +size. .. code-block:: c @@ -229,6 +228,28 @@ Check for declarations of Variable Length Arrays of undefined or zero size. 
int vla2[x]; // warn: zero size } + +The checker also gives warning if the `TaintPropagation` checker is switched on +and an unbound, attacker controlled (tainted) value is used to define +the size of the VLA. + +.. code-block:: c + + void taintedVLA(void) { + int x; + scanf("%d", &x); + int vla[x]; // Declared variable-length array (VLA) has tainted (attacker controlled) size, that can be 0 or negative + } + + void taintedVerfieidVLA(void) { + int x; + scanf("%d", &x); + if (x<1) + return; + int vla[x]; // no-warning. The analyzer can prove that x must be positive. + } + + .. _core-uninitialized-ArraySubscript: core.uninitialized.ArraySubscript (C) diff --git a/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp index d76fe49918690..87d255eeffc17 100644 --- a/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp @@ -164,12 +164,6 @@ ProgramStateRef VLASizeChecker::checkVLAIndexSize(CheckerContext &C, if (SizeV.isUnknown()) return nullptr; - // Check if the size is tainted. - if (isTainted(State, SizeV)) { - reportTaintBug(SizeE, State, C, SizeV); - return nullptr; - } - // Check if the size is zero. DefinedSVal SizeD = SizeV.castAs(); @@ -192,10 +186,10 @@ ProgramStateRef VLASizeChecker::checkVLAIndexSize(CheckerContext &C, SVal LessThanZeroVal = SVB.evalBinOp(State, BO_LT, SizeD, Zero, SVB.getConditionType()); + ProgramStateRef StatePos, StateNeg; if (std::optional LessThanZeroDVal = LessThanZeroVal.getAs()) { ConstraintManager &CM = C.getConstraintManager(); - ProgramStateRef StatePos, StateNeg; std::tie(StateNeg, StatePos) = CM.assumeDual(State, *LessThanZeroDVal); if (StateNeg && !StatePos) { @@ -205,6 +199,12 @@ ProgramStateRef VLASizeChecker::checkVLAIndexSize(CheckerContext &C, State = StatePos; } + // Check if the size is tainted. 
+ if ((StateNeg || StateZero) && isTainted(State, SizeV)) { + reportTaintBug(SizeE, State, C, SizeV); + return nullptr; + } + return State; } @@ -218,7 +218,7 @@ void VLASizeChecker::reportTaintBug(const Expr *SizeE, ProgramStateRef State, SmallString<256> buf; llvm::raw_svector_ostream os(buf); os << "Declared variable-length array (VLA) "; - os << "has tainted size"; + os << "has tainted (attacker controlled) size that can be 0 or negative"; auto report = std::make_unique(TaintBT, os.str(), N); report->addRange(SizeE->getSourceRange()); diff --git a/clang/test/Analysis/taint-diagnostic-visitor.c b/clang/test/Analysis/taint-diagnostic-visitor.c index a3fa1639bffee..020e9579ac535 100644 --- a/clang/test/Analysis/taint-diagnostic-visitor.c +++ b/clang/test/Analysis/taint-diagnostic-visitor.c @@ -46,8 +46,8 @@ void taintDiagnosticVLA(void) { scanf("%d", &x); // expected-note {{Value assigned to 'x'}} // expected-note@-1 {{Taint originated here}} // expected-note@-2 {{Taint propagated to the 2nd argument}} - int vla[x]; // expected-warning {{Declared variable-length array (VLA) has tainted size}} - // expected-note@-1 {{Declared variable-length array (VLA) has tainted size}} + int vla[x]; // expected-warning {{Declared variable-length array (VLA) has tainted}} + // expected-note@-1 {{Declared variable-length array (VLA) has tainted}} } diff --git a/clang/test/Analysis/taint-generic.c b/clang/test/Analysis/taint-generic.c index 4ff474b2ed40d..e85b4106a5806 100644 --- a/clang/test/Analysis/taint-generic.c +++ b/clang/test/Analysis/taint-generic.c @@ -405,7 +405,16 @@ int testDivByZero(void) { void testTaintedVLASize(void) { int x; scanf("%d", &x); - int vla[x]; // expected-warning{{Declared variable-length array (VLA) has tainted size}} + int vla[x]; // expected-warning{{Declared variable-length array (VLA) has tainted (attacker controlled) size that can be 0 or negative}} +} + +// Tainted-sanitized VLAs. 
+void testTaintedSanitizedVLASize(void) { + int x; + scanf("%d", &x); + if (x<1) + return; + int vla[x]; // no-warning } int testTaintedAllocaMem() { From 9dfb8430509619a4e9d36fd00a11b83a2d5d0c3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= Date: Fri, 23 Feb 2024 11:48:04 +0100 Subject: [PATCH 154/546] [include-cleaner] Use FoundDecl only for using-shadow-decls (#82615) --- .../include-cleaner/lib/WalkAST.cpp | 5 +++ .../include-cleaner/unittests/WalkASTTest.cpp | 34 +++++++++++-------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp index 6c4d9b7862d91..277e6ec5b0890 100644 --- a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp +++ b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp @@ -128,6 +128,11 @@ class ASTWalker : public RecursiveASTVisitor { bool VisitDeclRefExpr(DeclRefExpr *DRE) { auto *FD = DRE->getFoundDecl(); + // Prefer the underlying decl if FoundDecl isn't a shadow decl, e.g: + // - For templates, found-decl is always primary template, but we want the + // specializaiton itself. + if (!llvm::isa(FD)) + FD = DRE->getDecl(); // For refs to non-meber-like decls, use the found decl. 
// For member-like decls, we should have a reference from the qualifier to // the container decl instead, which is preferred as it'll handle diff --git a/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp b/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp index 0be5db36b1fc5..e238dc3d902bb 100644 --- a/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp +++ b/clang-tools-extra/include-cleaner/unittests/WalkASTTest.cpp @@ -200,24 +200,26 @@ TEST(WalkAST, VarTemplates) { EXPECT_THAT(testWalk(R"cpp( template T $explicit^Foo = 0;)cpp", "int z = ^Foo;"), - ElementsAre(Decl::VarTemplate)); + ElementsAre(Decl::VarTemplateSpecialization)); EXPECT_THAT(testWalk(R"cpp( - template T $explicit^Foo = 0; - template<> int Foo = 1;)cpp", + template T Foo = 0; + template<> int $explicit^Foo = 1;)cpp", "int x = ^Foo;"), - ElementsAre(Decl::VarTemplate)); + ElementsAre(Decl::VarTemplateSpecialization)); // FIXME: This points at implicit specialization, instead we should point to // explicit partial specializaiton pattern. EXPECT_THAT(testWalk(R"cpp( - template T $explicit^Foo = 0; - template T* Foo = nullptr;)cpp", + template T Foo = 0; + template T* $explicit^Foo = nullptr;)cpp", "int *x = ^Foo;"), - ElementsAre(Decl::VarTemplate)); + ElementsAre(Decl::VarTemplateSpecialization)); + // Implicit specializations through explicit instantiations has source + // locations pointing at the primary template. EXPECT_THAT(testWalk(R"cpp( template T $explicit^Foo = 0; template int Foo;)cpp", "int x = ^Foo;"), - ElementsAre(Decl::VarTemplate)); + ElementsAre(Decl::VarTemplateSpecialization)); } TEST(WalkAST, FunctionTemplates) { // Explicit instantiation and (partial) specialization references primary @@ -239,18 +241,19 @@ TEST(WalkAST, FunctionTemplates) { EXPECT_THAT(testWalk(R"cpp( template void $explicit^foo() {})cpp", "auto x = []{ ^foo(); };"), - ElementsAre(Decl::FunctionTemplate)); - // FIXME: DeclRefExpr points at primary template, not the specialization. 
+ ElementsAre(Decl::Function)); EXPECT_THAT(testWalk(R"cpp( - template void $explicit^foo() {} - template<> void foo(){})cpp", + template void foo() {} + template<> void $explicit^foo(){})cpp", "auto x = []{ ^foo(); };"), - ElementsAre(Decl::FunctionTemplate)); + ElementsAre(Decl::Function)); + // The decl is actually the specialization, but explicit instantations point + // at the primary template. EXPECT_THAT(testWalk(R"cpp( template void $explicit^foo() {}; template void foo();)cpp", "auto x = [] { ^foo(); };"), - ElementsAre(Decl::FunctionTemplate)); + ElementsAre(Decl::Function)); } TEST(WalkAST, TemplateSpecializationsFromUsingDecl) { // Class templates @@ -548,7 +551,8 @@ TEST(WalkAST, Concepts) { testWalk(Concept, "template void func() requires ^Foo {}"); testWalk(Concept, "void func(^Foo auto x) {}"); // FIXME: Foo should be explicitly referenced. - testWalk("template concept Foo = true;", "void func() { ^Foo auto x = 1; }"); + testWalk("template concept Foo = true;", + "void func() { ^Foo auto x = 1; }"); } TEST(WalkAST, FriendDecl) { From 7bb08ee8260c825eb5af4824bc62f73155b4b592 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Feb 2024 11:55:24 +0100 Subject: [PATCH 155/546] [mlir][Transforms][NFC] Decouple `ConversionPatternRewriterImpl` from `ConversionPatternRewriter` (#82333) `ConversionPatternRewriterImpl` no longer maintains a reference to the respective `ConversionPatternRewriter`. An `MLIRContext` is sufficient. This commit simplifies the internal state of `ConversionPatternRewriterImpl`. 
--- .../Transforms/Utils/DialectConversion.cpp | 44 +++++++++---------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 508ee7416d55d..d015bd5290123 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -756,10 +756,9 @@ static RewriteTy *findSingleRewrite(R &&rewrites, Block *block) { namespace mlir { namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { - explicit ConversionPatternRewriterImpl(PatternRewriter &rewriter, + explicit ConversionPatternRewriterImpl(MLIRContext *ctx, const ConversionConfig &config) - : rewriter(rewriter), eraseRewriter(rewriter.getContext()), - config(config) {} + : eraseRewriter(ctx), config(config) {} //===--------------------------------------------------------------------===// // State Management @@ -854,8 +853,8 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { Type origOutputType, const TypeConverter *converter); - Value buildUnresolvedArgumentMaterialization(PatternRewriter &rewriter, - Location loc, ValueRange inputs, + Value buildUnresolvedArgumentMaterialization(Block *block, Location loc, + ValueRange inputs, Type origOutputType, Type outputType, const TypeConverter *converter); @@ -934,8 +933,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { // State //===--------------------------------------------------------------------===// - PatternRewriter &rewriter; - /// This rewriter must be used for erasing ops/blocks. SingleEraseRewriter eraseRewriter; @@ -1037,8 +1034,12 @@ void BlockTypeConversionRewrite::rollback() { LogicalResult BlockTypeConversionRewrite::materializeLiveConversions( function_ref findLiveUser) { + auto builder = OpBuilder::atBlockBegin(block, /*listener=*/&rewriterImpl); + // Process the remapping for each of the original arguments. 
for (auto it : llvm::enumerate(origBlock->getArguments())) { + OpBuilder::InsertionGuard g(builder); + // If the type of this argument changed and the argument is still live, we // need to materialize a conversion. BlockArgument origArg = it.value(); @@ -1050,14 +1051,12 @@ LogicalResult BlockTypeConversionRewrite::materializeLiveConversions( Value replacementValue = rewriterImpl.mapping.lookupOrDefault(origArg); bool isDroppedArg = replacementValue == origArg; - if (isDroppedArg) - rewriterImpl.rewriter.setInsertionPointToStart(getBlock()); - else - rewriterImpl.rewriter.setInsertionPointAfterValue(replacementValue); + if (!isDroppedArg) + builder.setInsertionPointAfterValue(replacementValue); Value newArg; if (converter) { newArg = converter->materializeSourceConversion( - rewriterImpl.rewriter, origArg.getLoc(), origArg.getType(), + builder, origArg.getLoc(), origArg.getType(), isDroppedArg ? ValueRange() : ValueRange(replacementValue)); assert((!newArg || newArg.getType() == origArg.getType()) && "materialization hook did not provide a value of the expected " @@ -1322,6 +1321,8 @@ LogicalResult ConversionPatternRewriterImpl::convertNonEntryRegionTypes( Block *ConversionPatternRewriterImpl::applySignatureConversion( Block *block, const TypeConverter *converter, TypeConverter::SignatureConversion &signatureConversion) { + MLIRContext *ctx = block->getParentOp()->getContext(); + // If no arguments are being changed or added, there is nothing to do. unsigned origArgCount = block->getNumArguments(); auto convertedTypes = signatureConversion.getConvertedTypes(); @@ -1338,7 +1339,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( // Map all new arguments to the location of the argument they originate from. 
SmallVector newLocs(convertedTypes.size(), - rewriter.getUnknownLoc()); + Builder(ctx).getUnknownLoc()); for (unsigned i = 0; i < origArgCount; ++i) { auto inputMap = signatureConversion.getInputMapping(i); if (!inputMap || inputMap->replacementValue) @@ -1357,8 +1358,6 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( SmallVector, 1> argInfo; argInfo.resize(origArgCount); - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(newBlock); for (unsigned i = 0; i != origArgCount; ++i) { auto inputMap = signatureConversion.getInputMapping(i); if (!inputMap) @@ -1401,7 +1400,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( outputType = legalOutputType; newArg = buildUnresolvedArgumentMaterialization( - rewriter, origArg.getLoc(), replArgs, origOutputType, outputType, + newBlock, origArg.getLoc(), replArgs, origOutputType, outputType, converter); } @@ -1439,12 +1438,11 @@ Value ConversionPatternRewriterImpl::buildUnresolvedMaterialization( return convertOp.getResult(0); } Value ConversionPatternRewriterImpl::buildUnresolvedArgumentMaterialization( - PatternRewriter &rewriter, Location loc, ValueRange inputs, - Type origOutputType, Type outputType, const TypeConverter *converter) { - return buildUnresolvedMaterialization( - MaterializationKind::Argument, rewriter.getInsertionBlock(), - rewriter.getInsertionPoint(), loc, inputs, outputType, origOutputType, - converter); + Block *block, Location loc, ValueRange inputs, Type origOutputType, + Type outputType, const TypeConverter *converter) { + return buildUnresolvedMaterialization(MaterializationKind::Argument, block, + block->begin(), loc, inputs, outputType, + origOutputType, converter); } Value ConversionPatternRewriterImpl::buildUnresolvedTargetMaterialization( Location loc, Value input, Type outputType, @@ -1556,7 +1554,7 @@ void ConversionPatternRewriterImpl::notifyMatchFailure( ConversionPatternRewriter::ConversionPatternRewriter( MLIRContext *ctx, const 
ConversionConfig &config) : PatternRewriter(ctx), - impl(new detail::ConversionPatternRewriterImpl(*this, config)) { + impl(new detail::ConversionPatternRewriterImpl(ctx, config)) { setListener(impl.get()); } From 404854ee2018489c15c3454857d92e3bab7c1672 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 23 Feb 2024 11:19:30 +0100 Subject: [PATCH 156/546] [clang][Interp][NFC] Print global variable initialization state --- clang/lib/AST/Interp/Disasm.cpp | 8 ++++++++ clang/lib/AST/Interp/Program.cpp | 2 +- clang/lib/AST/Interp/Program.h | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp index 3bc9312debeb7..315ddb293044b 100644 --- a/clang/lib/AST/Interp/Disasm.cpp +++ b/clang/lib/AST/Interp/Disasm.cpp @@ -101,6 +101,14 @@ LLVM_DUMP_METHOD void Program::dump(llvm::raw_ostream &OS) const { for (const Global *G : Globals) { const Descriptor *Desc = G->block()->getDescriptor(); OS << GI << ": " << (void *)G->block() << " "; + { + Pointer GP = getPtrGlobal(GI); + ColorScope SC(OS, true, + GP.isInitialized() + ? TerminalColor{llvm::raw_ostream::GREEN, false} + : TerminalColor{llvm::raw_ostream::RED, false}); + OS << (GP.isInitialized() ? 
"initialized " : "uninitialized "); + } Desc->dump(OS); OS << "\n"; ++GI; diff --git a/clang/lib/AST/Interp/Program.cpp b/clang/lib/AST/Interp/Program.cpp index 61293a3fef470..86e18ede63811 100644 --- a/clang/lib/AST/Interp/Program.cpp +++ b/clang/lib/AST/Interp/Program.cpp @@ -102,7 +102,7 @@ unsigned Program::createGlobalString(const StringLiteral *S) { return I; } -Pointer Program::getPtrGlobal(unsigned Idx) { +Pointer Program::getPtrGlobal(unsigned Idx) const { assert(Idx < Globals.size()); return Pointer(Globals[Idx]->block()); } diff --git a/clang/lib/AST/Interp/Program.h b/clang/lib/AST/Interp/Program.h index 7922eafbeb2d0..045bf7ab7745b 100644 --- a/clang/lib/AST/Interp/Program.h +++ b/clang/lib/AST/Interp/Program.h @@ -67,7 +67,7 @@ class Program final { unsigned createGlobalString(const StringLiteral *S); /// Returns a pointer to a global. - Pointer getPtrGlobal(unsigned Idx); + Pointer getPtrGlobal(unsigned Idx) const; /// Returns the value of a global. Block *getGlobal(unsigned Idx) { From e7c60915e61912fb24707dc67e6c4fc919515796 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 23 Feb 2024 12:01:30 +0100 Subject: [PATCH 157/546] Remove duplicated REQUIRES: asserts --- llvm/test/Transforms/LoopVectorize/X86/pr72969.ll | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll index 738f5cbaebea5..f982695983330 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll @@ -1,7 +1,6 @@ ; REQUIRES: asserts ; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -S < %s ; RUN: not --crash opt -mtriple=x86_64 -mattr=-avx,-avx2,-avx512f,+sse,-sse2,-sse3,-sse4.2 -passes=loop-vectorize -force-vector-width=4 -S < %s -; REQUIRES: asserts @h = global i64 0 From 790bcecce6c135476d2551805c09ed670b9f8418 Mon Sep 17 00:00:00 2001 From: Evgenii Kudriashov 
Date: Fri, 23 Feb 2024 12:11:50 +0100 Subject: [PATCH 158/546] [GlobalISel] Fix a check that aligned tail call is lowered (#82016) Despite of a valid tail call opportunity, backends still may not generate a tail call or such lowering is not implemented yet. Check that lowering has happened instead of its possibility when generating G_ASSERT_ALIGN. --- llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 2 +- .../X86/GlobalISel/calllowering-tailcall.ll | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/X86/GlobalISel/calllowering-tailcall.ll diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 3bd1542eeb746..77dc265d795d0 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -187,7 +187,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB, if (!lowerCall(MIRBuilder, Info)) return false; - if (ReturnHintAlignReg && !Info.IsTailCall) { + if (ReturnHintAlignReg && !Info.LoweredTailCall) { MIRBuilder.buildAssertAlign(ResRegs[0], ReturnHintAlignReg, ReturnHintAlign); } diff --git a/llvm/test/CodeGen/X86/GlobalISel/calllowering-tailcall.ll b/llvm/test/CodeGen/X86/GlobalISel/calllowering-tailcall.ll new file mode 100644 index 0000000000000..6a856c32eb261 --- /dev/null +++ b/llvm/test/CodeGen/X86/GlobalISel/calllowering-tailcall.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s | FileCheck %s --check-prefix=X64 +; RUN: llc -mtriple=i686-linux-gnu -global-isel < %s | FileCheck %s --check-prefix=X86 + +declare ptr @foo() + +define ptr @aligned_tailcall() nounwind { +; X64-LABEL: aligned_tailcall: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: callq foo +; X64-NEXT: popq %rcx +; X64-NEXT: retq +; +; X86-LABEL: aligned_tailcall: +; X86: # 
%bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: calll foo +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +entry: + %call = tail call align 8 ptr @foo() + ret ptr %call +} From 22734e15d8f2c437e8543f19632299d2e09b31f3 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 23 Feb 2024 11:31:24 +0000 Subject: [PATCH 159/546] [Clang][AArch64] Fix 'svzero_za' intrinsic to take no arguments. (#82648) We previously defined svzero_za as: void svzero_za(); rather than: void svzero_za(void); Which meant that Clang accepted arguments. Compiling for example `svzero_za()` ended up with incorrect IR and a compiler crash because it couldn't select an instruction for it. --- clang/include/clang/Basic/arm_sme.td | 2 +- clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 2da0e8d2aba9a..1ac6d5170ea28 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -142,7 +142,7 @@ let TargetGuard = "sme" in { def SVZERO_MASK_ZA : SInst<"svzero_mask_za", "vi", "", MergeNone, "aarch64_sme_zero", [IsOverloadNone, IsStreamingCompatible, IsInOutZA], [ImmCheck<0, ImmCheck0_255>]>; - def SVZERO_ZA : SInst<"svzero_za", "v", "", MergeNone, "aarch64_sme_zero", + def SVZERO_ZA : SInst<"svzero_za", "vv", "", MergeNone, "aarch64_sme_zero", [IsOverloadNone, IsStreamingCompatible, IsOutZA]>; } diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c new file mode 100644 index 0000000000000..e0b6c391d9890 --- /dev/null +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fsyntax-only -verify %s + +void test_svzero_args(uint64_t m) { + 
svzero_za(0); // expected-error {{too many arguments to function call, expected 0, have 1}} + svzero_za(m); // expected-error {{too many arguments to function call, expected 0, have 1}} + svzero_mask_za(m); // expected-error {{argument to 'svzero_mask_za' must be a constant integer}} +} From 3c90fce4504e22953ec5586599afaecfb2923a9e Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 23 Feb 2024 11:31:53 +0000 Subject: [PATCH 160/546] [Clang][AArch64] Add missing prototypes for streaming-compatible routines (#82649) --- .../acle_sme_state_funs.c | 59 ++++++++++++++++++- clang/utils/TableGen/SveEmitter.cpp | 6 ++ 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c index dc07efbb81603..e80a965394e7f 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c @@ -28,12 +28,12 @@ bool test_in_streaming_mode(void) __arm_streaming_compatible { // CHECK-LABEL: @test_za_disable( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR4:[0-9]+]] +// CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR3]] // CHECK-NEXT: ret void // // CPP-CHECK-LABEL: @_Z15test_za_disablev( // CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR4:[0-9]+]] +// CPP-CHECK-NEXT: tail call void @__arm_za_disable() #[[ATTR3]] // CPP-CHECK-NEXT: ret void // void test_za_disable(void) __arm_streaming_compatible { @@ -70,3 +70,58 @@ void test_svundef_za(void) __arm_streaming_compatible __arm_out("za") { svundef_za(); } +// CHECK-LABEL: @test_sc_memcpy( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]] +// CHECK-NEXT: ret ptr [[CALL]] +// +// CPP-CHECK-LABEL: @_Z14test_sc_memcpyPvPKvm( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memcpy(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]] +// CPP-CHECK-NEXT: ret ptr [[CALL]] +// +void *test_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_compatible { + return __arm_sc_memcpy(dest, src, n); +} + +// CHECK-LABEL: @test_sc_memmove( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]] +// CHECK-NEXT: ret ptr [[CALL]] +// +// CPP-CHECK-LABEL: @_Z15test_sc_memmovePvPKvm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memmove(ptr noundef [[DEST:%.*]], ptr noundef [[SRC:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]] +// CPP-CHECK-NEXT: ret ptr [[CALL]] +// +void *test_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_compatible { + return __arm_sc_memmove(dest, src, n); +} + +// CHECK-LABEL: @test_sc_memset( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]] +// CHECK-NEXT: ret ptr [[CALL]] +// +// CPP-CHECK-LABEL: @_Z14test_sc_memsetPvim( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memset(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]] +// CPP-CHECK-NEXT: ret ptr [[CALL]] +// +void *test_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible { + return __arm_sc_memset(s, c, n); +} + +// CHECK-LABEL: @test_sc_memchr( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef [[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]] +// CHECK-NEXT: ret ptr [[CALL]] +// +// CPP-CHECK-LABEL: @_Z14test_sc_memchrPvim( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__arm_sc_memchr(ptr noundef 
[[S:%.*]], i32 noundef [[C:%.*]], i64 noundef [[N:%.*]]) #[[ATTR3]] +// CPP-CHECK-NEXT: ret ptr [[CALL]] +// +void *test_sc_memchr(void *s, int c, size_t n) __arm_streaming_compatible { + return __arm_sc_memchr(s, c, n); +} diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index 174304f09007b..131397e3825b0 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -1579,6 +1579,7 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) { OS << "#endif\n"; OS << "#include \n\n"; + OS << "#include \n\n"; OS << "/* Function attributes */\n"; OS << "#define __ai static __inline__ __attribute__((__always_inline__, " @@ -1605,6 +1606,11 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) { OS << " return x0 & 1;\n"; OS << "}\n\n"; + OS << "void *__arm_sc_memcpy(void *dest, const void *src, size_t n) __arm_streaming_compatible;\n"; + OS << "void *__arm_sc_memmove(void *dest, const void *src, size_t n) __arm_streaming_compatible;\n"; + OS << "void *__arm_sc_memset(void *s, int c, size_t n) __arm_streaming_compatible;\n"; + OS << "void *__arm_sc_memchr(void *s, int c, size_t n) __arm_streaming_compatible;\n\n"; + OS << "__ai __attribute__((target(\"sme\"))) void svundef_za(void) " "__arm_streaming_compatible __arm_out(\"za\") " "{ }\n\n"; From 8a164220207b579c31d6aa6552944441c83e9465 Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Fri, 23 Feb 2024 11:37:21 +0000 Subject: [PATCH 161/546] [RemoveDIs] Add DPLabels support [3a/3] (#82633) Patch 2 of 3 to add llvm.dbg.label support to the RemoveDIs project. The patch stack adds the DPLabel class, which is the RemoveDIs llvm.dbg.label equivalent. 1. Add DbgRecord base class for DPValue and the not-yet-added DPLabel class. 2. Add the DPLabel class. -> 3. Add support to passes. The next patch, #82639, will enable conversion between dbg.labels and DPLabels. AssignemntTrackingAnalysis support could have gone two ways: 1. 
Have the analysis store a DPLabel representation in its results - SelectionDAGBuilder reads the analysis results and ignores all DbgRecord kinds. 2. Ignore DPLabels in the analysis - SelectionDAGBuilder reads the analysis results but still needs to iterate over DPLabels from the IR. I went with option 2 because it's less work and is no less correct than 1. It's worth noting that causes labels to sink to the bottom of packs of debug records. e.g., [value, label, value] becomes [value, value, label]. This shouldn't be a problem because labels and variable locations don't have an ordering requirement. The ordering between variable locations is maintained and the label movement is deterministic --- .../include/llvm/IR/DebugProgramInstruction.h | 10 ++--- llvm/include/llvm/IR/IntrinsicInst.h | 3 ++ .../CodeGen/AssignmentTrackingAnalysis.cpp | 9 ++-- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 12 ++++- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 17 ++++++- .../SelectionDAG/SelectionDAGBuilder.cpp | 29 ++++++++---- llvm/lib/IR/AsmWriter.cpp | 4 +- .../Scalar/SpeculativeExecution.cpp | 6 +-- llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 10 ++++- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 45 ++++++++++++------- .../Transforms/Utils/MemoryTaggingSupport.cpp | 3 +- llvm/lib/Transforms/Utils/ValueMapper.cpp | 5 +++ .../SpeculativeExecution/PR46267.ll | 5 +++ 13 files changed, 114 insertions(+), 44 deletions(-) diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h index 1c8619741eb69..84b0f743d3c9b 100644 --- a/llvm/include/llvm/IR/DebugProgramInstruction.h +++ b/llvm/include/llvm/IR/DebugProgramInstruction.h @@ -157,6 +157,11 @@ class DbgRecord : public ilist_node { ~DbgRecord() = default; }; +inline raw_ostream &operator<<(raw_ostream &OS, const DbgRecord &R) { + R.print(OS); + return OS; +} + /// Records a position in IR for a source label (DILabel). Corresponds to the /// llvm.dbg.label intrinsic. 
/// FIXME: Rename DbgLabelRecord when DPValue is renamed to DbgVariableRecord. @@ -536,11 +541,6 @@ inline raw_ostream &operator<<(raw_ostream &OS, const DPMarker &Marker) { return OS; } -inline raw_ostream &operator<<(raw_ostream &OS, const DPValue &Value) { - Value.print(OS); - return OS; -} - /// Inline helper to return a range of DPValues attached to a marker. It needs /// to be inlined as it's frequently called, but also come after the declaration /// of DPMarker. Thus: it's pre-declared by users like Instruction, then an diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index b8d578d0fee08..fbaaef8ea4431 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -531,6 +531,9 @@ class DbgAssignIntrinsic : public DbgValueInst { class DbgLabelInst : public DbgInfoIntrinsic { public: DILabel *getLabel() const { return cast(getRawLabel()); } + void setLabel(DILabel *NewLabel) { + setArgOperand(0, MetadataAsValue::get(getContext(), NewLabel)); + } Metadata *getRawLabel() const { return cast(getArgOperand(0))->getMetadata(); diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp index 7b66a851db252..3b84624c3d4dc 100644 --- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp +++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp @@ -829,11 +829,7 @@ class MemLocFragmentFill { void process(BasicBlock &BB, VarFragMap &LiveSet) { BBInsertBeforeMap[&BB].clear(); for (auto &I : BB) { - for (DbgRecord &DR : I.getDbgValueRange()) { - // FIXME: DPValue::filter usage needs attention in this file; we need - // to make sure dbg.labels are handled correctly in RemoveDIs mode. - // Cast below to ensure this gets fixed when DPLabels are introduced. 
- DPValue &DPV = cast(DR); + for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) { if (const auto *Locs = FnVarLocs->getWedge(&DPV)) { for (const VarLocInfo &Loc : *Locs) { addDef(Loc, &DPV, *I.getParent(), LiveSet); @@ -1919,6 +1915,9 @@ void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) { // attached DPValues, or a non-debug instruction with attached unprocessed // DPValues. if (II != EI && II->hasDbgValues()) { + // Skip over non-variable debug records (i.e., labels). They're going to + // be read from IR (possibly re-ordering them within the debug record + // range) rather than from the analysis results. for (DPValue &DPV : DPValue::filter(II->getDbgValueRange())) { resetInsertionPoint(DPV); processDPValue(DPV, LiveSet); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 7c95cef2eeb76..38bb808dd5bd5 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -3275,7 +3275,17 @@ void IRTranslator::translateDbgDeclareRecord(Value *Address, bool HasArgList, void IRTranslator::translateDbgInfo(const Instruction &Inst, MachineIRBuilder &MIRBuilder) { - for (DPValue &DPV : DPValue::filter(Inst.getDbgValueRange())) { + for (DbgRecord &DR : Inst.getDbgValueRange()) { + if (DPLabel *DPL = dyn_cast(&DR)) { + MIRBuilder.setDebugLoc(DPL->getDebugLoc()); + assert(DPL->getLabel() && "Missing label"); + assert(DPL->getLabel()->isValidLocationForIntrinsic( + MIRBuilder.getDebugLoc()) && + "Expected inlined-at fields to agree"); + MIRBuilder.buildDbgLabel(DPL->getLabel()); + continue; + } + DPValue &DPV = cast(DR); const DILocalVariable *Variable = DPV.getVariable(); const DIExpression *Expression = DPV.getExpression(); Value *V = DPV.getVariableLocationOp(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 5651498dd3f5a..246762dd7ab62 100644 --- 
a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1188,11 +1188,24 @@ void FastISel::handleDbgInfo(const Instruction *II) { MIMD = MIMetadata(); // Reverse order of debug records, because fast-isel walks through backwards. - for (DbgRecord &DPR : llvm::reverse(II->getDbgValueRange())) { + for (DbgRecord &DR : llvm::reverse(II->getDbgValueRange())) { flushLocalValueMap(); recomputeInsertPt(); - DPValue &DPV = cast(DPR); + if (DPLabel *DPL = dyn_cast(&DR)) { + assert(DPL->getLabel() && "Missing label"); + if (!FuncInfo.MF->getMMI().hasDebugInfo()) { + LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DPL << "\n"); + continue; + } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DPL->getDebugLoc(), + TII.get(TargetOpcode::DBG_LABEL)) + .addMetadata(DPL->getLabel()); + continue; + } + + DPValue &DPV = cast(DR); Value *V = nullptr; if (!DPV.hasArgList()) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index e893a5b616d33..ee600d389c2cc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1241,17 +1241,30 @@ void SelectionDAGBuilder::visitDbgInfo(const Instruction &I) { It->Expr, Vals.size() > 1, It->DL, SDNodeOrder); } } - // We must early-exit here to prevent any DPValues from being emitted below, - // as we have just emitted the debug values resulting from assignment - // tracking analysis, making any existing DPValues redundant (and probably - // less correct). - return; } + // We must skip DPValues if they've already been processed above as we + // have just emitted the debug values resulting from assignment tracking + // analysis, making any existing DPValues redundant (and probably less + // correct). We still need to process DPLabels. This does sink DPLabels + // to the bottom of the group of debug records. 
That sholdn't be important + // as it does so deterministcally and ordering between DPLabels and DPValues + // is immaterial (other than for MIR/IR printing). + bool SkipDPValues = DAG.getFunctionVarLocs(); // Is there is any debug-info attached to this instruction, in the form of - // DPValue non-instruction debug-info records. - for (DbgRecord &DPR : I.getDbgValueRange()) { - DPValue &DPV = cast(DPR); + // DbgRecord non-instruction debug-info records. + for (DbgRecord &DR : I.getDbgValueRange()) { + if (DPLabel *DPL = dyn_cast(&DR)) { + assert(DPL->getLabel() && "Missing label"); + SDDbgLabel *SDV = + DAG.getDbgLabel(DPL->getLabel(), DPL->getDebugLoc(), SDNodeOrder); + DAG.AddDbgLabel(SDV); + continue; + } + + if (SkipDPValues) + continue; + DPValue &DPV = cast(DR); DILocalVariable *Variable = DPV.getVariable(); DIExpression *Expression = DPV.getExpression(); dropDanglingDebugInfo(Variable, Expression); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index c2a470c5fc716..fba404c9b027c 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -1141,12 +1141,14 @@ void SlotTracker::processFunctionMetadata(const Function &F) { void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) { if (const DPValue *DPV = dyn_cast(&DR)) { CreateMetadataSlot(DPV->getVariable()); - CreateMetadataSlot(DPV->getDebugLoc()); if (DPV->isDbgAssign()) CreateMetadataSlot(DPV->getAssignID()); + } else if (const DPLabel *DPL = dyn_cast(&DR)) { + CreateMetadataSlot(DPL->getLabel()); } else { llvm_unreachable("unsupported DbgRecord kind"); } + CreateMetadataSlot(DR.getDebugLoc()); } void SlotTracker::processInstructionMetadata(const Instruction &I) { diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index f4f3070d11c7b..260f31b59ed29 100644 --- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -291,9 +291,9 @@ bool 
SpeculativeExecutionPass::considerHoistingFromTo( InstructionCost TotalSpeculationCost = 0; unsigned NotHoistedInstCount = 0; for (const auto &I : FromBlock) { - // Make note of any DPValues that need hoisting. - for (DbgRecord &DR : I.getDbgValueRange()) { - DPValue &DPV = cast(DR); + // Make note of any DPValues that need hoisting. DPLabels + // get left behind just like llvm.dbg.labels. + for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) { if (HasNoUnhoistedInstr(DPV.location_ops())) DPValuesToHoist[DPV.getInstruction()].push_back(&DPV); } diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 7fd6759a61fba..5bb109a04ff17 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -386,7 +386,15 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) { SmallVector ToBeRemoved; SmallDenseSet VariableSet; for (auto &I : reverse(*BB)) { - for (DPValue &DPV : reverse(DPValue::filter(I.getDbgValueRange()))) { + for (DbgRecord &DR : reverse(I.getDbgValueRange())) { + if (isa(DR)) { + // Emulate existing behaviour (see comment below for dbg.declares). + // FIXME: Don't do this. + VariableSet.clear(); + continue; + } + + DPValue &DPV = cast(DR); // Skip declare-type records, as the debug intrinsic method only works // on dbg.value intrinsics. 
if (DPV.getType() == DPValue::LocationType::Declare) { diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 8ebcf0c04fd5a..bab065153f3ef 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1585,8 +1585,30 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, return cast(NewVar); }; - auto UpdateDPValuesOnInst = [&](Instruction &I) -> void { - for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) { + auto UpdateDbgLabel = [&](auto *LabelRecord) { + // Point the label record to a fresh label within the new function if + // the record was not inlined from some other function. + if (LabelRecord->getDebugLoc().getInlinedAt()) + return; + DILabel *OldLabel = LabelRecord->getLabel(); + DINode *&NewLabel = RemappedMetadata[OldLabel]; + if (!NewLabel) { + DILocalScope *NewScope = DILocalScope::cloneScopeForSubprogram( + *OldLabel->getScope(), *NewSP, Ctx, Cache); + NewLabel = DILabel::get(Ctx, NewScope, OldLabel->getName(), + OldLabel->getFile(), OldLabel->getLine()); + } + LabelRecord->setLabel(cast(NewLabel)); + }; + + auto UpdateDbgRecordsOnInst = [&](Instruction &I) -> void { + for (DbgRecord &DR : I.getDbgValueRange()) { + if (DPLabel *DPL = dyn_cast(&DR)) { + UpdateDbgLabel(DPL); + continue; + } + + DPValue &DPV = cast(DR); // Apply the two updates that dbg.values get: invalid operands, and // variable metadata fixup. 
if (any_of(DPV.location_ops(), IsInvalidLocation)) { @@ -1599,13 +1621,11 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, } if (!DPV.getDebugLoc().getInlinedAt()) DPV.setVariable(GetUpdatedDIVariable(DPV.getVariable())); - DPV.setDebugLoc(DebugLoc::replaceInlinedAtSubprogram(DPV.getDebugLoc(), - *NewSP, Ctx, Cache)); } }; for (Instruction &I : instructions(NewFunc)) { - UpdateDPValuesOnInst(I); + UpdateDbgRecordsOnInst(I); auto *DII = dyn_cast(&I); if (!DII) @@ -1614,17 +1634,7 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, // Point the intrinsic to a fresh label within the new function if the // intrinsic was not inlined from some other function. if (auto *DLI = dyn_cast(&I)) { - if (DLI->getDebugLoc().getInlinedAt()) - continue; - DILabel *OldLabel = DLI->getLabel(); - DINode *&NewLabel = RemappedMetadata[OldLabel]; - if (!NewLabel) { - DILocalScope *NewScope = DILocalScope::cloneScopeForSubprogram( - *OldLabel->getScope(), *NewSP, Ctx, Cache); - NewLabel = DILabel::get(Ctx, NewScope, OldLabel->getName(), - OldLabel->getFile(), OldLabel->getLine()); - } - DLI->setArgOperand(0, MetadataAsValue::get(Ctx, NewLabel)); + UpdateDbgLabel(DLI); continue; } @@ -1658,6 +1668,9 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, if (const DebugLoc &DL = I.getDebugLoc()) I.setDebugLoc( DebugLoc::replaceInlinedAtSubprogram(DL, *NewSP, Ctx, Cache)); + for (DbgRecord &DR : I.getDbgValueRange()) + DR.setDebugLoc(DebugLoc::replaceInlinedAtSubprogram(DR.getDebugLoc(), + *NewSP, Ctx, Cache)); // Loop info metadata may contain line locations. Fix them up. 
auto updateLoopInfoLoc = [&Ctx, &Cache, NewSP](Metadata *MD) -> Metadata * { diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index 08fdd3b75ffcb..2ff7c01510767 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -111,8 +111,7 @@ Instruction *getUntagLocationIfFunctionExit(Instruction &Inst) { void StackInfoBuilder::visit(Instruction &Inst) { // Visit non-intrinsic debug-info records attached to Inst. - for (DbgRecord &DR : Inst.getDbgValueRange()) { - DPValue &DPV = cast(DR); + for (DPValue &DPV : DPValue::filter(Inst.getDbgValueRange())) { auto AddIfInteresting = [&](Value *V) { if (auto *AI = dyn_cast_or_null(V)) { if (!isInterestingAlloca(*AI)) diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp index 6e46469f5a601..91ab2795a4b9d 100644 --- a/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -538,6 +538,11 @@ Value *Mapper::mapValue(const Value *V) { } void Mapper::remapDPValue(DbgRecord &DR) { + if (DPLabel *DPL = dyn_cast(&DR)) { + DPL->setLabel(cast(mapMetadata(DPL->getLabel()))); + return; + } + DPValue &V = cast(DR); // Remap variables and DILocations. 
auto *MappedVar = mapMetadata(V.getVariable()); diff --git a/llvm/test/Transforms/SpeculativeExecution/PR46267.ll b/llvm/test/Transforms/SpeculativeExecution/PR46267.ll index c27b492b4b876..d940ee6a7863d 100644 --- a/llvm/test/Transforms/SpeculativeExecution/PR46267.ll +++ b/llvm/test/Transforms/SpeculativeExecution/PR46267.ll @@ -41,12 +41,16 @@ land.rhs: ; preds = %entry ; CHECK-NEXT: call void @llvm.dbg.declare(metadata ptr %y ; CHECK-NEXT: %a0 = load i32, ptr undef, align 1 ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 %a0 +; CHECK-NEXT: call void @llvm.dbg.label call void @llvm.dbg.label(metadata !11), !dbg !10 %y = alloca i32, align 4 call void @llvm.dbg.declare(metadata ptr %y, metadata !14, metadata !DIExpression()), !dbg !10 %a0 = load i32, ptr undef, align 1 call void @llvm.dbg.value(metadata i32 %a0, metadata !9, metadata !DIExpression()), !dbg !10 + ;; RemoveDIs: Check a label that is attached to a hoisted instruction + ;; gets left behind (match intrinsic-style debug info behaviour). 
+ call void @llvm.dbg.label(metadata !15), !dbg !10 %a2 = add i32 %i, 0 call void @llvm.dbg.value(metadata i32 %a2, metadata !13, metadata !DIExpression()), !dbg !10 @@ -82,3 +86,4 @@ attributes #1 = { nounwind readnone speculatable willreturn } !12 = !DILocalVariable(name: "x", scope: !6, file: !1, line: 3, type: !4) !13 = !DILocalVariable(name: "a2", scope: !6, file: !1, line: 3, type: !4) !14 = !DILocalVariable(name: "y", scope: !6, file: !1, line: 3, type: !4) +!15 = !DILabel(scope: !6, name: "label2", file: !1, line: 2) From cdf19d13bf39f0679c3636eada87a5645f9a4c84 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 23 Feb 2024 11:43:28 +0000 Subject: [PATCH 162/546] [Clang] Fix acle_sme_zero.c (missing aarch64-registered-target) This test was added in #82648 --- clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c index e0b6c391d9890..8ea80bc6568fe 100644 --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c @@ -1,3 +1,4 @@ +// REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fsyntax-only -verify %s void test_svzero_args(uint64_t m) { From e1326434742980b03433464dd9435ea66ad5be47 Mon Sep 17 00:00:00 2001 From: tsitdikov Date: Fri, 23 Feb 2024 11:47:40 +0000 Subject: [PATCH 163/546] Add build rule for MLIRArmSMETestPasses MLIRArmSMETestPasses was added in https://github.com/llvm/llvm-project/commit/b39f5660a408b47307e57a0882eb8af85d72e283, we need to add a build rule for it as well. 
--- .../llvm-project-overlay/mlir/test/BUILD.bazel | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index c3bc3f196c55d..497256573dfc5 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -905,6 +905,23 @@ cc_library( ], ) +cc_library( + name = "TestArmSME", + srcs = glob(["lib/Dialect/ArmSME/*.cpp"]), + defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], + includes = ["lib/Dialect/Test"], + deps = [ + "//mlir:ArithToArmSME", + "//mlir:ArmSMEToLLVM", + "//mlir:ArmSMEToSCF", + "//mlir:IR", + "//mlir:Pass", + "//mlir:Transforms", + "//mlir:VectorToArmSME", + "//mlir:VectorToSCF", + ], +) + cc_library( name = "TestBufferization", srcs = glob(["lib/Dialect/Bufferization/*.cpp"]), From 3dfca24dda1b3596685d02109185ea2885cc0124 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Fri, 23 Feb 2024 03:50:00 -0800 Subject: [PATCH 164/546] [AMDGPU] Fix encoding of VOP3P dpp on GFX11 and GFX12 (#82710) The bug affects dpp forms of v_dot2_f32_f16. The encoding does not match SP3 and does not set op_sel_hi bits properly. 
--- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 2 ++ llvm/lib/Target/AMDGPU/VOPInstructions.td | 1 + llvm/test/MC/AMDGPU/gfx11-promotions.s | 8 ++++---- llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s | 4 ++-- llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s | 2 +- llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s | 4 ++-- llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s | 2 +- .../MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt | 8 ++++---- .../test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt | 4 ++-- .../MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt | 8 ++++---- .../test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt | 4 ++-- 11 files changed, 25 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index cf76de40aef41..ac3c8f95306bc 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1353,6 +1353,7 @@ class VOP3P_DPP16 op, VOP_DPP_Pseudo ps, int subtarget, let AssemblerPredicate = HasDPP16; let SubtargetPredicate = HasDPP16; let OtherPredicates = ps.OtherPredicates; + let IsPacked = ps.IsPacked; } class VOP3P_DPP8_Base op, VOP_Pseudo ps, string opName = ps.OpName> @@ -1362,6 +1363,7 @@ class VOP3P_DPP8_Base op, VOP_Pseudo ps, string opName = ps.OpName> let SchedRW = ps.SchedRW; let Uses = ps.Uses; let OtherPredicates = ps.OtherPredicates; + let IsPacked = ps.IsPacked; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 2989d05e968ef..80d7d96a5e3cc 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -818,6 +818,7 @@ class VOP_DPP_Pseudo pattern=[], let VALU = 1; let DPP = 1; let Size = 8; + let IsPacked = P.IsPacked; let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP); diff --git a/llvm/test/MC/AMDGPU/gfx11-promotions.s 
b/llvm/test/MC/AMDGPU/gfx11-promotions.s index 0bd90266457ee..67e7beaa262f4 100644 --- a/llvm/test/MC/AMDGPU/gfx11-promotions.s +++ b/llvm/test/MC/AMDGPU/gfx11-promotions.s @@ -337,17 +337,17 @@ v_dot2_f32_f16_e64 v0, v1, v2, v3 //===----------------------------------------------------------------------===// v_dot2_f32_f16 v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x00,0x13,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x40,0x13,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x77,0x39,0x05] v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x00,0x13,0xcc,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x40,0x13,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x77,0x39,0x05] //===----------------------------------------------------------------------===// // VOP3P.DPP16. 
//===----------------------------------------------------------------------===// v_dot2_f32_f16 v0, v1, v2, v3 quad_perm:[1,2,3,0] -// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x39,0x00,0xff] +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x39,0x00,0xff] v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] -// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x39,0x00,0xff] +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x39,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s index 2cfb8abd4e979..3ff4ed27f1b25 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp16.s @@ -2,10 +2,10 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[0,0,0] quad_perm:[2,2,3,1] bound_ctrl:0 fi:1 -// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x04,0xff] +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x04,0xff] v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] bank_mask:0xe -// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: 
[0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe] +// GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe] v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,0] row_ror:7 bank_mask:0x1 bound_ctrl:0 // GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x01,0xf1] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s index 2656ba0cf1807..3fb993dc8bec4 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3p_dpp8.s @@ -15,4 +15,4 @@ v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8: // GFX11: encoding: [0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92] v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05] +// GFX11: encoding: [0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s index 75bd1696e10bb..a6360684f1d0e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp16.s @@ -2,10 +2,10 @@ // RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX12 %s v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[0,0,0] neg_hi:[0,0,0] quad_perm:[2,2,3,1] bound_ctrl:0 fi:1 -// GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x04,0xff] +// GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf fi:1 ; encoding: 
[0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x04,0xff] v_dot2_f32_f16 v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] bank_mask:0xe -// GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe] +// GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe] v_fma_mix_f32 v0, v1, v2, v3 op_sel:[0,0,0] row_ror:7 bank_mask:0x1 bound_ctrl:0 // GFX12: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x01,0xf1] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s index 14cf169d4b424..299339339e8c6 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3p_dpp8.s @@ -15,7 +15,7 @@ v_fma_mixlo_f16 v0, abs(v1), -v2, abs(v3) op_sel:[1,0,0] op_sel_hi:[1,0,0] dpp8: // GFX12: encoding: [0x00,0x0d,0x21,0xcc,0xe9,0x04,0x0e,0x4c,0x01,0x92,0x44,0x92] v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05] +// GFX12: encoding: [0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05] v_dot4_f32_fp8_bf8 v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] // GFX12: v_dot4_f32_fp8_bf8_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x00,0x40,0x24,0xcc,0xe9,0x04,0x0e,0x1c,0x01,0x88,0xc6,0xfa] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt index 6b230367c8313..ceca6d9fc3faa 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp16.txt @@ -1,11 
+1,11 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s -# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe] -0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe +# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe] +0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe -# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff] -0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff +# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x0c,0xff] +0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x0c,0xff # GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 bound_ctrl:1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1] 0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt index 89c9b54d7cfee..57c96170eadce 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3p_dpp8.txt @@ -1,8 +1,8 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 
-mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s -# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05] -0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05 +# GFX11: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05] +0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05 # GFX11: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[2,2,2,2,4,4,4,4] fi:1 ; encoding: [0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92] 0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt index 52fd0530681cf..10f438465d65e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp16.txt @@ -1,11 +1,11 @@ # RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s # RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s -# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe] -0x00,0x05,0x13,0xcc,0xfa,0x04,0x0e,0x64,0x01,0x1b,0x00,0xfe +# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[1,1,0] neg_hi:[1,0,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xe ; encoding: [0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe] +0x00,0x45,0x13,0xcc,0xfa,0x04,0x0e,0x7c,0x01,0x1b,0x00,0xfe -# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf 
bound_ctrl:1 fi:1 ; encoding: [0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff] -0x00,0x00,0x13,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x7a,0x0c,0xff +# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 quad_perm:[2,2,3,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; encoding: [0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x0c,0xff] +0x00,0x40,0x13,0xcc,0xfa,0x04,0x0e,0x1c,0x01,0x7a,0x0c,0xff # GFX12: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 row_ror:7 row_mask:0xf bank_mask:0x1 bound_ctrl:1 ; encoding: [0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1] 0x00,0x00,0x20,0xcc,0xfa,0x04,0x0e,0x04,0x01,0x27,0x09,0xf1 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt index 688212e51c427..2fb9c23ed5ec5 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3p_dpp8.txt @@ -1,8 +1,8 @@ # RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s # RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX12 %s -# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05] -0x00,0x05,0x13,0xcc,0xe9,0x04,0x0e,0xc4,0x01,0x77,0x39,0x05 +# GFX12: v_dot2_f32_f16_e64_dpp v0, v1, v2, v3 neg_lo:[0,1,1] neg_hi:[1,0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05] +0x00,0x45,0x13,0xcc,0xe9,0x04,0x0e,0xdc,0x01,0x77,0x39,0x05 # GFX12: v_fma_mix_f32_e64_dpp v0, v1, v2, v3 clamp dpp8:[2,2,2,2,4,4,4,4] fi:1 ; encoding: [0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92] 0x00,0x80,0x20,0xcc,0xea,0x04,0x0e,0x04,0x01,0x92,0x44,0x92 From d9e4309b451c1b24d4e0a6304057663b877e5266 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Markus=20B=C3=B6ck?= Date: Fri, 23 Feb 2024 12:50:20 +0100 Subject: [PATCH 165/546] [mlir][NFC] Fix format specifier warning on Windows `%ld` specifier is defined to work on values of type `long`. The parameter given to `fprintf` is of type `intptr_t` whose actual underlying integer type is unspecified. On Unix systems it happens to commonly be `long` but on 64-bit Windows it is defined as `long long`. The cross-platform way to print a `intptr_t` is to use `PRIdPTR` which expands to the correct format specifier for `intptr_t`. This avoids any undefined behaviour and compiler warnings. --- mlir/test/CAPI/llvm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlir/test/CAPI/llvm.c b/mlir/test/CAPI/llvm.c index 5a78fac91a509..1817988dd67dd 100644 --- a/mlir/test/CAPI/llvm.c +++ b/mlir/test/CAPI/llvm.c @@ -15,6 +15,7 @@ #include "mlir-c/Support.h" #include +#include #include #include #include @@ -105,7 +106,7 @@ static int testStructTypeCreation(MlirContext ctx) { // CHECK: i8 // CHECK: i32 // CHECK: i64 - fprintf(stderr, "num elements: %ld\n", + fprintf(stderr, "num elements: %" PRIdPTR "\n", mlirLLVMStructTypeGetNumElementTypes(literal)); for (intptr_t i = 0; i < 3; ++i) { mlirTypeDump(mlirLLVMStructTypeGetElementType(literal, i)); From 6ac2c0488f0e06036fc2bd7a94bea71fb930b363 Mon Sep 17 00:00:00 2001 From: tsitdikov Date: Fri, 23 Feb 2024 11:57:14 +0000 Subject: [PATCH 166/546] Add TestArmSME dependency to mlir-opt library. TestArmSME was added in https://github.com/llvm/llvm-project/commit/e1326434742980b03433464dd9435ea66ad5be47, now we need to add dependency on it. 
--- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index a34874efa5b19..853d136d9478f 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -9170,6 +9170,7 @@ cc_binary( "//mlir/test:TestAffine", "//mlir/test:TestAnalysis", "//mlir/test:TestArith", + "//mlir/test:TestArmSME", "//mlir/test:TestBufferization", "//mlir/test:TestControlFlow", "//mlir/test:TestDLTI", From f1e0392b822e06f39c49df3ba594f4c98f608ba0 Mon Sep 17 00:00:00 2001 From: Garvit Gupta <152526799+quic-garvgupt@users.noreply.github.com> Date: Fri, 23 Feb 2024 17:31:58 +0530 Subject: [PATCH 167/546] [RISCV] Disable generation of asynchronous unwind tables for RISCV baremetal (#81727) The below culprit patch enabled the generation of asynchronous unwind tables (-funwind-tables=2) by default for RISCV for both linux and RISCVToolChain baremetal object. However, since there are 2 baremetal toolchain objects for RISCV, this created a discrepancy between their behavior. Moreover, enabling the generation of asynchronous unwind tables based on whether `-gcc-toolchain` option is present or not doesn't seem to be the best criteria to decide on the same. This patch make the behavior consistent by disabling the unwind tables in RISCVToolChain Baremetal object. 
Culprit Patch - https://reviews.llvm.org/D145164 --- clang/lib/Driver/ToolChains/RISCVToolchain.cpp | 5 +++++ clang/lib/Driver/ToolChains/RISCVToolchain.h | 2 ++ clang/test/Driver/riscv-features.c | 8 ++++++++ 3 files changed, 15 insertions(+) diff --git a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp index 85beb945cbf6f..624099d21ae12 100644 --- a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp +++ b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp @@ -86,6 +86,11 @@ RISCVToolChain::GetUnwindLibType(const llvm::opt::ArgList &Args) const { return ToolChain::UNW_None; } +ToolChain::UnwindTableLevel RISCVToolChain::getDefaultUnwindTableLevel( + const llvm::opt::ArgList &Args) const { + return UnwindTableLevel::None; +} + void RISCVToolChain::addClangTargetOptions( const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, diff --git a/clang/lib/Driver/ToolChains/RISCVToolchain.h b/clang/lib/Driver/ToolChains/RISCVToolchain.h index cec817ef7190b..fa0aa265d842b 100644 --- a/clang/lib/Driver/ToolChains/RISCVToolchain.h +++ b/clang/lib/Driver/ToolChains/RISCVToolchain.h @@ -28,6 +28,8 @@ class LLVM_LIBRARY_VISIBILITY RISCVToolChain : public Generic_ELF { RuntimeLibType GetDefaultRuntimeLibType() const override; UnwindLibType GetUnwindLibType(const llvm::opt::ArgList &Args) const override; + UnwindTableLevel + getDefaultUnwindTableLevel(const llvm::opt::ArgList &Args) const override; void AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; diff --git a/clang/test/Driver/riscv-features.c b/clang/test/Driver/riscv-features.c index a108383e29fb6..fc5fb0f27e3af 100644 --- a/clang/test/Driver/riscv-features.c +++ b/clang/test/Driver/riscv-features.c @@ -41,6 +41,14 @@ // FAST-UNALIGNED-ACCESS: "-target-feature" "+fast-unaligned-access" // NO-FAST-UNALIGNED-ACCESS: "-target-feature" "-fast-unaligned-access" +// RUN: %clang --target=riscv32-unknown-elf 
--gcc-toolchain="" -### %s 2>&1 | FileCheck %s -check-prefix=NOUWTABLE +// RUN: %clang --target=riscv32-unknown-elf --gcc-toolchain="" -fasynchronous-unwind-tables -### %s 2>&1 | FileCheck %s -check-prefix=UWTABLE +// RUN: %clang --target=riscv64-unknown-elf --gcc-toolchain="" -### %s 2>&1 | FileCheck %s -check-prefix=NOUWTABLE +// RUN: %clang --target=riscv64-unknown-elf --gcc-toolchain="" -fasynchronous-unwind-tables -### %s 2>&1 | FileCheck %s -check-prefix=UWTABLE +// +// UWTABLE: "-funwind-tables=2" +// NOUWTABLE-NOT: "-funwind-tables=2" + // RUN: %clang --target=riscv32-linux -### %s -fsyntax-only 2>&1 \ // RUN: | FileCheck %s -check-prefix=DEFAULT-LINUX // RUN: %clang --target=riscv64-linux -### %s -fsyntax-only 2>&1 \ From 3b3d0978c334702114131e4dab549aa25b9f0ad4 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 23 Feb 2024 12:12:50 +0000 Subject: [PATCH 168/546] [Clang] Fix acle_sme_zero.c once more. --- clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c index 8ea80bc6568fe..a852ffa09c60e 100644 --- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c +++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_zero.c @@ -1,6 +1,8 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fsyntax-only -verify %s +#include + void test_svzero_args(uint64_t m) { svzero_za(0); // expected-error {{too many arguments to function call, expected 0, have 1}} svzero_za(m); // expected-error {{too many arguments to function call, expected 0, have 1}} From bcf9826a5392f40063869c3d2b72a5cd1b87d14b Mon Sep 17 00:00:00 2001 From: Johannes Reifferscheid Date: Fri, 23 Feb 2024 13:15:08 +0100 Subject: [PATCH 169/546] [MLIR] Expose approximation patterns for tanh/erf. 
(#82750) These patterns can already be used via populateMathPolynomialApproximationPatterns, but that includes a number of other patterns that may not be needed. There are already similar functions for expansion. For now only adding tanh and erf since I have a concrete use case for these two. --- mlir/include/mlir/Dialect/Math/Transforms/Passes.h | 3 +++ .../Math/Transforms/PolynomialApproximation.cpp | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/mlir/include/mlir/Dialect/Math/Transforms/Passes.h b/mlir/include/mlir/Dialect/Math/Transforms/Passes.h index 010dde5ea7384..11b2c7a7afa2f 100644 --- a/mlir/include/mlir/Dialect/Math/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Math/Transforms/Passes.h @@ -45,6 +45,9 @@ struct MathPolynomialApproximationOptions { bool enableAvx2 = false; }; +void populatePolynomialApproximateTanhPattern(RewritePatternSet &patterns); +void populatePolynomialApproximateErfPattern(RewritePatternSet &patterns); + void populateMathPolynomialApproximationPatterns( RewritePatternSet &patterns, const MathPolynomialApproximationOptions &options = {}); diff --git a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp index 71e4e13103f51..962cb28b7c2ab 100644 --- a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp +++ b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp @@ -1471,6 +1471,16 @@ RsqrtApproximation::matchAndRewrite(math::RsqrtOp op, //----------------------------------------------------------------------------// +void mlir::populatePolynomialApproximateTanhPattern( + RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} + +void mlir::populatePolynomialApproximateErfPattern( + RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} + void mlir::populateMathPolynomialApproximationPatterns( RewritePatternSet &patterns, const MathPolynomialApproximationOptions &options) { From 
ddb4450a468072b5c066c29f4821edec4689d500 Mon Sep 17 00:00:00 2001 From: r4nt Date: Fri, 23 Feb 2024 13:18:00 +0100 Subject: [PATCH 170/546] [ClangFormat] Fix indent in child lines within a macro argument. (#82523) When reconstructing lines from a macro expansion, make sure that lines at different levels in the expanded code get indented correctly as part of the macro argument. --- clang/lib/Format/MacroCallReconstructor.cpp | 68 +++++---- clang/lib/Format/Macros.h | 10 +- clang/lib/Format/UnwrappedLineParser.cpp | 6 + clang/lib/Format/UnwrappedLineParser.h | 2 + .../Format/FormatTestMacroExpansion.cpp | 21 ++- .../Format/MacroCallReconstructorTest.cpp | 129 ++++++++++++------ 6 files changed, 163 insertions(+), 73 deletions(-) diff --git a/clang/lib/Format/MacroCallReconstructor.cpp b/clang/lib/Format/MacroCallReconstructor.cpp index cbdd1683c54d1..101acefdfe7a3 100644 --- a/clang/lib/Format/MacroCallReconstructor.cpp +++ b/clang/lib/Format/MacroCallReconstructor.cpp @@ -33,7 +33,7 @@ void forEachToken(const UnwrappedLine &Line, const T &Call, FormatToken *Parent = nullptr) { bool First = true; for (const auto &N : Line.Tokens) { - Call(N.Tok, Parent, First); + Call(N.Tok, Parent, First, Line.Level); First = false; for (const auto &Child : N.Children) forEachToken(Child, Call, N.Tok); @@ -44,7 +44,7 @@ MacroCallReconstructor::MacroCallReconstructor( unsigned Level, const llvm::DenseMap> &ActiveExpansions) - : Level(Level), IdToReconstructed(ActiveExpansions) { + : Result(Level), IdToReconstructed(ActiveExpansions) { Result.Tokens.push_back(std::make_unique()); ActiveReconstructedLines.push_back(&Result); } @@ -52,9 +52,8 @@ MacroCallReconstructor::MacroCallReconstructor( void MacroCallReconstructor::addLine(const UnwrappedLine &Line) { assert(State != Finalized); LLVM_DEBUG(llvm::dbgs() << "MCR: new line...\n"); - forEachToken(Line, [&](FormatToken *Token, FormatToken *Parent, bool First) { - add(Token, Parent, First); - }); + forEachToken(Line, [&](FormatToken 
*Token, FormatToken *Parent, bool First, + unsigned Level) { add(Token, Parent, First, Level); }); assert(InProgress || finished()); } @@ -62,8 +61,8 @@ UnwrappedLine MacroCallReconstructor::takeResult() && { finalize(); assert(Result.Tokens.size() == 1 && Result.Tokens.front()->Children.size() == 1); - UnwrappedLine Final = - createUnwrappedLine(*Result.Tokens.front()->Children.front(), Level); + UnwrappedLine Final = createUnwrappedLine( + *Result.Tokens.front()->Children.front(), Result.Level); assert(!Final.Tokens.empty()); return Final; } @@ -72,7 +71,8 @@ UnwrappedLine MacroCallReconstructor::takeResult() && { // ExpandedParent in the incoming unwrapped line. \p First specifies whether it // is the first token in a given unwrapped line. void MacroCallReconstructor::add(FormatToken *Token, - FormatToken *ExpandedParent, bool First) { + FormatToken *ExpandedParent, bool First, + unsigned Level) { LLVM_DEBUG( llvm::dbgs() << "MCR: Token: " << Token->TokenText << ", Parent: " << (ExpandedParent ? ExpandedParent->TokenText : "") @@ -102,7 +102,7 @@ void MacroCallReconstructor::add(FormatToken *Token, First = true; } - prepareParent(ExpandedParent, First); + prepareParent(ExpandedParent, First, Level); if (Token->MacroCtx) { // If this token was generated by a macro call, add the reconstructed @@ -129,7 +129,7 @@ void MacroCallReconstructor::add(FormatToken *Token, // is the parent of ActiveReconstructedLines.back() in the reconstructed // unwrapped line. 
void MacroCallReconstructor::prepareParent(FormatToken *ExpandedParent, - bool NewLine) { + bool NewLine, unsigned Level) { LLVM_DEBUG({ llvm::dbgs() << "ParentMap:\n"; debugParentMap(); @@ -172,7 +172,7 @@ void MacroCallReconstructor::prepareParent(FormatToken *ExpandedParent, } assert(!ActiveReconstructedLines.empty()); ActiveReconstructedLines.back()->Tokens.back()->Children.push_back( - std::make_unique()); + std::make_unique(Level)); ActiveReconstructedLines.push_back( &*ActiveReconstructedLines.back()->Tokens.back()->Children.back()); } else if (parentLine().Tokens.back()->Tok != Parent) { @@ -424,7 +424,8 @@ bool MacroCallReconstructor::processNextReconstructed() { SpelledParentToReconstructedParent[MacroCallStructure.back() .ParentLastToken] = Token; appendToken(Token); - prepareParent(Token, /*NewLine=*/true); + prepareParent(Token, /*NewLine=*/true, + MacroCallStructure.back().Line->Level); Token->MacroParent = true; return false; } @@ -435,7 +436,8 @@ bool MacroCallReconstructor::processNextReconstructed() { [MacroCallStructure.back().Line->Tokens.back()->Tok] = Token; Token->MacroParent = true; appendToken(Token, MacroCallStructure.back().Line); - prepareParent(Token, /*NewLine=*/true); + prepareParent(Token, /*NewLine=*/true, + MacroCallStructure.back().Line->Level); return true; } if (Token->is(tok::r_paren)) { @@ -509,16 +511,36 @@ MacroCallReconstructor::createUnwrappedLine(const ReconstructedLine &Line, for (const auto &N : Line.Tokens) { Result.Tokens.push_back(N->Tok); UnwrappedLineNode &Current = Result.Tokens.back(); - for (const auto &Child : N->Children) { - if (Child->Tokens.empty()) - continue; - Current.Children.push_back(createUnwrappedLine(*Child, Level + 1)); - } - if (Current.Children.size() == 1 && - Current.Tok->isOneOf(tok::l_paren, tok::comma)) { - Result.Tokens.splice(Result.Tokens.end(), - Current.Children.front().Tokens); - Current.Children.clear(); + auto NumChildren = + std::count_if(N->Children.begin(), N->Children.end(), + 
[](const auto &Child) { return !Child->Tokens.empty(); }); + if (NumChildren == 1 && Current.Tok->isOneOf(tok::l_paren, tok::comma)) { + // If we only have one child, and the child is due to a macro expansion + // (either attached to a left parenthesis or comma), merge the child into + // the current line to prevent forced breaks for macro arguments. + auto *Child = std::find_if( + N->Children.begin(), N->Children.end(), + [](const auto &Child) { return !Child->Tokens.empty(); }); + auto Line = createUnwrappedLine(**Child, Level); + Result.Tokens.splice(Result.Tokens.end(), Line.Tokens); + } else if (NumChildren > 0) { + // When there are multiple children with different indent, make sure that + // we indent them: + // 1. One level below the current line's level. + // 2. At the correct level relative to each other. + unsigned MinChildLevel = + std::min_element(N->Children.begin(), N->Children.end(), + [](const auto &E1, const auto &E2) { + return E1->Level < E2->Level; + }) + ->get() + ->Level; + for (const auto &Child : N->Children) { + if (Child->Tokens.empty()) + continue; + Current.Children.push_back(createUnwrappedLine( + *Child, Level + 1 + (Child->Level - MinChildLevel))); + } } } return Result; diff --git a/clang/lib/Format/Macros.h b/clang/lib/Format/Macros.h index 1964624e828ce..d2f7fe502364c 100644 --- a/clang/lib/Format/Macros.h +++ b/clang/lib/Format/Macros.h @@ -231,8 +231,9 @@ class MacroCallReconstructor { UnwrappedLine takeResult() &&; private: - void add(FormatToken *Token, FormatToken *ExpandedParent, bool First); - void prepareParent(FormatToken *ExpandedParent, bool First); + void add(FormatToken *Token, FormatToken *ExpandedParent, bool First, + unsigned Level); + void prepareParent(FormatToken *ExpandedParent, bool First, unsigned Level); FormatToken *getParentInResult(FormatToken *Parent); void reconstruct(FormatToken *Token); void startReconstruction(FormatToken *Token); @@ -272,6 +273,8 @@ class MacroCallReconstructor { // FIXME: 
Investigate changing UnwrappedLine to a pointer type and using it // instead of rolling our own type. struct ReconstructedLine { + explicit ReconstructedLine(unsigned Level) : Level(Level) {} + unsigned Level; llvm::SmallVector> Tokens; }; @@ -373,9 +376,6 @@ class MacroCallReconstructor { // \- ) llvm::SmallVector MacroCallStructure; - // Level the generated UnwrappedLine will be at. - const unsigned Level; - // Maps from identifier of the macro call to an unwrapped line containing // all tokens of the macro call. const llvm::DenseMap> diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 8f6453a25d9d4..3a424bdcde793 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -90,6 +90,12 @@ class ScopedDeclarationState { } // end anonymous namespace +std::ostream &operator<<(std::ostream &Stream, const UnwrappedLine &Line) { + llvm::raw_os_ostream OS(Stream); + printLine(OS, Line); + return Stream; +} + class ScopedLineState { public: ScopedLineState(UnwrappedLineParser &Parser, diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h index 739298690bbd7..1403533a2d0ef 100644 --- a/clang/lib/Format/UnwrappedLineParser.h +++ b/clang/lib/Format/UnwrappedLineParser.h @@ -420,6 +420,8 @@ struct UnwrappedLineNode { SmallVector Children; }; +std::ostream &operator<<(std::ostream &Stream, const UnwrappedLine &Line); + } // end namespace format } // end namespace clang diff --git a/clang/unittests/Format/FormatTestMacroExpansion.cpp b/clang/unittests/Format/FormatTestMacroExpansion.cpp index 653ec2a94c64d..85ab6ea3794e8 100644 --- a/clang/unittests/Format/FormatTestMacroExpansion.cpp +++ b/clang/unittests/Format/FormatTestMacroExpansion.cpp @@ -48,7 +48,7 @@ TEST_F(FormatTestMacroExpansion, UnexpandConfiguredMacros) { )", Style); verifyIncompleteFormat("ID3({, ID(a *b),\n" - " ;\n" + " ;\n" " });", Style); @@ -131,9 +131,9 @@ ID(CALL(CALL(a * 
b))); EXPECT_EQ(R"( ID3( { - CLASS - a *b; - }; + CLASS + a *b; + }; }, ID(x *y); , @@ -287,6 +287,19 @@ TEST_F(FormatTestMacroExpansion, Style); } +TEST_F(FormatTestMacroExpansion, IndentChildrenWithinMacroCall) { + FormatStyle Style = getGoogleStyleWithColumns(22); + Style.Macros.push_back("MACRO(a, b)=a=(b)"); + verifyFormat("void f() {\n" + " MACRO(a b, call([] {\n" + " if (expr) {\n" + " indent();\n" + " }\n" + " }));\n" + "}", + Style); +} + } // namespace } // namespace test } // namespace format diff --git a/clang/unittests/Format/MacroCallReconstructorTest.cpp b/clang/unittests/Format/MacroCallReconstructorTest.cpp index 6e6900577d165..9df21eae70cb7 100644 --- a/clang/unittests/Format/MacroCallReconstructorTest.cpp +++ b/clang/unittests/Format/MacroCallReconstructorTest.cpp @@ -151,17 +151,21 @@ class MacroCallReconstructorTest : public ::testing::Test { Lex.Allocator, Lex.IdentTable); } - UnwrappedLine line(llvm::ArrayRef Tokens) { + UnwrappedLine line(llvm::ArrayRef Tokens, unsigned Level = 0) { UnwrappedLine Result; + Result.Level = Level; for (FormatToken *Tok : Tokens) Result.Tokens.push_back(UnwrappedLineNode(Tok)); return Result; } - UnwrappedLine line(llvm::StringRef Text) { return line({lex(Text)}); } + UnwrappedLine line(llvm::StringRef Text, unsigned Level = 0) { + return line({lex(Text)}, Level); + } - UnwrappedLine line(llvm::ArrayRef Chunks) { + UnwrappedLine line(llvm::ArrayRef Chunks, unsigned Level = 0) { UnwrappedLine Result; + Result.Level = Level; for (const Chunk &Chunk : Chunks) { Result.Tokens.insert(Result.Tokens.end(), Chunk.Tokens.begin(), Chunk.Tokens.end()); @@ -186,6 +190,8 @@ class MacroCallReconstructorTest : public ::testing::Test { }; bool matchesTokens(const UnwrappedLine &L1, const UnwrappedLine &L2) { + if (L1.Level != L2.Level) + return false; if (L1.Tokens.size() != L2.Tokens.size()) return false; for (auto L1It = L1.Tokens.begin(), L2It = L2.Tokens.begin(); @@ -288,7 +294,8 @@ TEST_F(MacroCallReconstructorTest, 
StatementSequence) { matchesLine(line( {U1.consume("SEMI"), children({line({U2.consume("SEMI"), - children({line(U3.consume("SEMI"))})})})}))); + children({line(U3.consume("SEMI"), 2)})}, + 1)})}))); } TEST_F(MacroCallReconstructorTest, NestedBlock) { @@ -337,9 +344,9 @@ TEST_F(MacroCallReconstructorTest, NestedBlock) { auto Expected = line({Chunk2Start, children({ - line(Chunk2LBrace), - line({Chunk1, Chunk2Mid}), - line(Chunk2RBrace), + line(Chunk2LBrace, 1), + line({Chunk1, Chunk2Mid}, 1), + line(Chunk2RBrace, 1), }), Chunk2End}); EXPECT_THAT(std::move(Unexp).takeResult(), matchesLine(Expected)); @@ -379,9 +386,11 @@ TEST_F(MacroCallReconstructorTest, NestedChildBlocks) { Unexp.addLine( line({E.consume("f([] {"), children({line({E.consume("f([] {"), - children({line(E.consume("return a * b;"))}), - E.consume("})")})}), - E.consume("})")})); + children({line(E.consume("return a * b;"), 3)}), + E.consume("})")}, + 2)}), + E.consume("})")}, + 1)); Unexp.addLine(line(E.consume("}"))); EXPECT_TRUE(Unexp.finished()); @@ -407,13 +416,15 @@ TEST_F(MacroCallReconstructorTest, NestedChildBlocks) { auto Expected = line({ Chunk3Start, children({ - line(Chunk3LBrace), - line({ - Chunk2Start, - Chunk1, - Chunk2End, - }), - line(Chunk3RBrace), + line(Chunk3LBrace, 1), + line( + { + Chunk2Start, + Chunk1, + Chunk2End, + }, + 2), + line(Chunk3RBrace, 1), }), Chunk3End, }); @@ -469,8 +480,8 @@ TEST_F(MacroCallReconstructorTest, MultipleToplevelUnwrappedLines) { auto Expected = line({ U.consume("ID("), children({ - line(U.consume("x;")), - line(U.consume("x")), + line(U.consume("x;"), 1), + line(U.consume("x"), 1), }), U.consume(", y)"), }); @@ -524,9 +535,9 @@ TEST_F(MacroCallReconstructorTest, NestedCallsMultipleLines) { auto Expected = line({ Chunk2Start, children({ - line({Chunk2LBrace}), - line({Chunk1, Chunk2Semi}), - line({Chunk2RBrace}), + line({Chunk2LBrace}, 1), + line({Chunk1, Chunk2Semi}, 1), + line({Chunk2RBrace}, 1), }), Chunk2End, }); @@ -556,15 +567,17 @@ 
TEST_F(MacroCallReconstructorTest, ParentOutsideMacroCall) { auto Expected = line({ Prefix, children({ - line({ - U.consume("ID("), - children({ - line(U.consume("x;")), - line(U.consume("y;")), - line(U.consume("z;")), - }), - U.consume(")"), - }), + line( + { + U.consume("ID("), + children({ + line(U.consume("x;"), 2), + line(U.consume("y;"), 2), + line(U.consume("z;"), 2), + }), + U.consume(")"), + }, + 1), }), Postfix, }); @@ -590,7 +603,7 @@ TEST_F(MacroCallReconstructorTest, ChildrenSplitAcrossArguments) { Matcher U(Call, Lex); auto Expected = line({ U.consume("CALL({"), - children(line(U.consume("a;"))), + children(line(U.consume("a;"), 1)), U.consume(", b; })"), }); EXPECT_THAT(std::move(Unexp).takeResult(), matchesLine(Expected)); @@ -620,16 +633,20 @@ TEST_F(MacroCallReconstructorTest, ChildrenAfterMacroCall) { Matcher U(Call, Lex); auto Expected = line({ U.consume("CALL({"), - children(line(U.consume("a"))), + children(line(U.consume("a"), 1)), U.consume(", b)"), Semi, - children(line({ - SecondLine, - children(line({ - ThirdLine, - Postfix, - })), - })), + children(line( + { + SecondLine, + children(line( + { + ThirdLine, + Postfix, + }, + 2)), + }, + 1)), }); EXPECT_THAT(std::move(Unexp).takeResult(), matchesLine(Expected)); } @@ -655,7 +672,37 @@ TEST_F(MacroCallReconstructorTest, InvalidCodeSplittingBracesAcrossArgs) { Matcher U(Call, Lex); auto Expected = line({ Prefix, - children({line(U.consume("M({,x,)"))}), + children({line(U.consume("M({,x,)"), 1)}), + }); + EXPECT_THAT(std::move(Unexp).takeResult(), matchesLine(Expected)); +} + +TEST_F(MacroCallReconstructorTest, IndentLevelInExpandedCode) { + auto Macros = createExpander({"ID(a)=a"}); + Expansion Exp(Lex, *Macros); + TokenList Call = Exp.expand("ID", {std::string("[] { { x; } }")}); + + MacroCallReconstructor Unexp(0, Exp.getUnexpanded()); + Matcher E(Exp.getTokens(), Lex); + Unexp.addLine(line({ + E.consume("[] {"), + children({ + line(E.consume("{"), 1), + line(E.consume("x;"), 2), + 
line(E.consume("}"), 1), + }), + E.consume("}"), + })); + EXPECT_TRUE(Unexp.finished()); + Matcher U(Call, Lex); + auto Expected = line({ + U.consume("ID([] {"), + children({ + line(U.consume("{"), 1), + line(U.consume("x;"), 2), + line(U.consume("}"), 1), + }), + U.consume("})"), }); EXPECT_THAT(std::move(Unexp).takeResult(), matchesLine(Expected)); } From e09e0d52a03c7141a7d62fb4adf4d9fee32bebb8 Mon Sep 17 00:00:00 2001 From: tsitdikov Date: Fri, 23 Feb 2024 12:54:15 +0000 Subject: [PATCH 171/546] Users/tsitdikov (#82757) Fix Test ARM SME library and build rule. --- mlir/test/lib/Dialect/ArmSME/CMakeLists.txt | 2 ++ utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel | 2 ++ 2 files changed, 4 insertions(+) diff --git a/mlir/test/lib/Dialect/ArmSME/CMakeLists.txt b/mlir/test/lib/Dialect/ArmSME/CMakeLists.txt index de4971ff7eb3d..e942c7b8ac058 100644 --- a/mlir/test/lib/Dialect/ArmSME/CMakeLists.txt +++ b/mlir/test/lib/Dialect/ArmSME/CMakeLists.txt @@ -8,6 +8,8 @@ add_mlir_library(MLIRArmSMETestPasses MLIRArithToArmSME MLIRArmSMEToLLVM MLIRArmSMEToSCF + MLIRArmSMETransforms + MLIRArmSVETransforms MLIRIR MLIRPass MLIRTransforms diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index 497256573dfc5..68d9b23fd5643 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -914,6 +914,8 @@ cc_library( "//mlir:ArithToArmSME", "//mlir:ArmSMEToLLVM", "//mlir:ArmSMEToSCF", + "//mlir:ArmSMETransforms", + "//mlir:ArmSVETransforms", "//mlir:IR", "//mlir:Pass", "//mlir:Transforms", From 3b70387c5486a057fe0b7d52c79f9decf9c9c95f Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 23 Feb 2024 20:57:56 +0800 Subject: [PATCH 172/546] [ValueTracking] Handle more integer intrinsics in `propagatesPoison` (#82749) This patch extends `propagatesPoison` to handle more integer intrinsics. 
It will turn more logical ands/ors into bitwise ands/ors. See also https://reviews.llvm.org/D99671. --- llvm/lib/Analysis/ValueTracking.cpp | 15 +++++ .../ScalarEvolution/exit-count-select-safe.ll | 4 +- llvm/unittests/Analysis/ValueTrackingTest.cpp | 59 ++++++++----------- 3 files changed, 40 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 04f317228b3ea..653b3d4ffd988 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7194,6 +7194,21 @@ bool llvm::propagatesPoison(const Use &PoisonOp) { // corresponding lanes are poison. return true; case Intrinsic::ctpop: + case Intrinsic::ctlz: + case Intrinsic::cttz: + case Intrinsic::abs: + case Intrinsic::smax: + case Intrinsic::smin: + case Intrinsic::umax: + case Intrinsic::umin: + case Intrinsic::bitreverse: + case Intrinsic::bswap: + case Intrinsic::sadd_sat: + case Intrinsic::ssub_sat: + case Intrinsic::sshl_sat: + case Intrinsic::uadd_sat: + case Intrinsic::usub_sat: + case Intrinsic::ushl_sat: return true; } } diff --git a/llvm/test/Analysis/ScalarEvolution/exit-count-select-safe.ll b/llvm/test/Analysis/ScalarEvolution/exit-count-select-safe.ll index 2af1309cab7e4..d3cec77982af9 100644 --- a/llvm/test/Analysis/ScalarEvolution/exit-count-select-safe.ll +++ b/llvm/test/Analysis/ScalarEvolution/exit-count-select-safe.ll @@ -177,9 +177,9 @@ define i32 @logical_or_3ops_redundant_uminseq_operand(i32 %n, i32 %m, i32 %k) { ; CHECK-NEXT: %umin = call i32 @llvm.umin.i32(i32 %n, i32 %m) ; CHECK-NEXT: --> (%n umin %m) U: full-set S: full-set Exits: (%n umin %m) LoopDispositions: { %loop: Invariant } ; CHECK-NEXT: %cond_p3 = select i1 %cond_p0, i1 true, i1 %cond_p1 -; CHECK-NEXT: --> (true + ((true + %cond_p0) umin_seq (true + %cond_p1))) U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Variant } +; CHECK-NEXT: --> (true + ((true + %cond_p0) umin (true + %cond_p1))) U: full-set S: full-set Exits: <> 
LoopDispositions: { %loop: Variant } ; CHECK-NEXT: %cond = select i1 %cond_p3, i1 true, i1 %cond_p2 -; CHECK-NEXT: --> (true + ((true + %cond_p0) umin_seq (true + %cond_p1) umin_seq (true + %cond_p2))) U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Variant } +; CHECK-NEXT: --> (true + (((true + %cond_p0) umin (true + %cond_p1)) umin_seq (true + %cond_p2))) U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Variant } ; CHECK-NEXT: Determining loop execution counts for: @logical_or_3ops_redundant_uminseq_operand ; CHECK-NEXT: Loop %loop: backedge-taken count is ((%n umin %m) umin_seq %k) ; CHECK-NEXT: Loop %loop: constant max backedge-taken count is -1 diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp index 8104a32909eac..9e0abe7a16df9 100644 --- a/llvm/unittests/Analysis/ValueTrackingTest.cpp +++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp @@ -824,42 +824,7 @@ TEST_F(ValueTrackingTest, ComputeNumSignBits_Shuffle_Pointers) { TEST(ValueTracking, propagatesPoison) { std::string AsmHead = "declare i32 @g(i32)\n" - "declare {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)\n" - "declare {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)\n" - "declare {i32, i1} @llvm.smul.with.overflow.i32(i32 %a, i32 %b)\n" - "declare {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)\n" - "declare {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)\n" - "declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)\n" - "declare float @llvm.sqrt.f32(float)\n" - "declare float @llvm.powi.f32.i32(float, i32)\n" - "declare float @llvm.sin.f32(float)\n" - "declare float @llvm.cos.f32(float)\n" - "declare float @llvm.pow.f32(float, float)\n" - "declare float @llvm.exp.f32(float)\n" - "declare float @llvm.exp2.f32(float)\n" - "declare float @llvm.log.f32(float)\n" - "declare float @llvm.log10.f32(float)\n" - "declare float @llvm.log2.f32(float)\n" - "declare float @llvm.fma.f32(float, float, 
float)\n" - "declare float @llvm.fabs.f32(float)\n" - "declare float @llvm.minnum.f32(float, float)\n" - "declare float @llvm.maxnum.f32(float, float)\n" - "declare float @llvm.minimum.f32(float, float)\n" - "declare float @llvm.maximum.f32(float, float)\n" - "declare float @llvm.copysign.f32(float, float)\n" - "declare float @llvm.floor.f32(float)\n" - "declare float @llvm.ceil.f32(float)\n" - "declare float @llvm.trunc.f32(float)\n" - "declare float @llvm.rint.f32(float)\n" - "declare float @llvm.nearbyint.f32(float)\n" - "declare float @llvm.round.f32(float)\n" - "declare float @llvm.roundeven.f32(float)\n" - "declare i32 @llvm.lround.f32(float)\n" - "declare i64 @llvm.llround.f32(float)\n" - "declare i32 @llvm.lrint.f32(float)\n" - "declare i64 @llvm.llrint.f32(float)\n" - "declare float @llvm.fmuladd.f32(float, float, float)\n" - "define void @f(i32 %x, i32 %y, float %fx, float %fy, " + "define void @f(i32 %x, i32 %y, i32 %shamt, float %fx, float %fy, " "i1 %cond, ptr %p) {\n"; std::string AsmTail = " ret void\n}"; // (propagates poison?, IR instruction) @@ -912,6 +877,28 @@ TEST(ValueTracking, propagatesPoison) { {true, "call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)", 0}, {true, "call {i32, i1} @llvm.usub.with.overflow.i32(i32 %x, i32 %y)", 0}, {true, "call {i32, i1} @llvm.umul.with.overflow.i32(i32 %x, i32 %y)", 0}, + {true, "call i32 @llvm.sadd.sat.i32(i32 %x, i32 %y)", 0}, + {true, "call i32 @llvm.ssub.sat.i32(i32 %x, i32 %y)", 0}, + {true, "call i32 @llvm.sshl.sat.i32(i32 %x, i32 %y)", 0}, + {true, "call i32 @llvm.uadd.sat.i32(i32 %x, i32 %y)", 0}, + {true, "call i32 @llvm.usub.sat.i32(i32 %x, i32 %y)", 0}, + {true, "call i32 @llvm.ushl.sat.i32(i32 %x, i32 %y)", 0}, + {true, "call i32 @llvm.ctpop.i32(i32 %x)", 0}, + {true, "call i32 @llvm.ctlz.i32(i32 %x, i1 true)", 0}, + {true, "call i32 @llvm.cttz.i32(i32 %x, i1 true)", 0}, + {true, "call i32 @llvm.abs.i32(i32 %x, i1 true)", 0}, + {true, "call i32 @llvm.smax.i32(i32 %x, i32 %y)", 0}, + 
{true, "call i32 @llvm.smin.i32(i32 %x, i32 %y)", 0}, + {true, "call i32 @llvm.umax.i32(i32 %x, i32 %y)", 0}, + {true, "call i32 @llvm.umin.i32(i32 %x, i32 %y)", 0}, + {true, "call i32 @llvm.bitreverse.i32(i32 %x)", 0}, + {true, "call i32 @llvm.bswap.i32(i32 %x)", 0}, + {false, "call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %shamt)", 0}, + {false, "call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %shamt)", 1}, + {false, "call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %shamt)", 2}, + {false, "call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %shamt)", 0}, + {false, "call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %shamt)", 1}, + {false, "call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %shamt)", 2}, {false, "call float @llvm.sqrt.f32(float %fx)", 0}, {false, "call float @llvm.powi.f32.i32(float %fx, i32 %x)", 0}, {false, "call float @llvm.sin.f32(float %fx)", 0}, From 1197fcabc4b5f39dbe8a94b1ab6e92354f3f0dd2 Mon Sep 17 00:00:00 2001 From: Abhina Sree <69635948+abhina-sree@users.noreply.github.com> Date: Fri, 23 Feb 2024 08:01:56 -0500 Subject: [PATCH 173/546] [libcxx][test] Change UNSUPPORTED to XFAIL for target-related failures (#81513) This is a followup from this discussion https://github.com/llvm/llvm-project/pull/80735#discussion_r1486586017 to mark targets that were initially marked as UNSUPPORTED with an XFAIL instead. 
--- .../directory_entry.mods/last_write_time.pass.cpp | 2 +- .../support.dynamic/libcpp_deallocate.sh.cpp | 2 +- .../support.dynamic/new_faligned_allocation.pass.cpp | 2 +- .../PR30202_notify_from_pthread_created_thread.pass.cpp | 2 +- .../thread.threads/thread.thread.this/sleep_for.pass.cpp | 2 +- .../fs.op.funcs/fs.op.remove_all/toctou.pass.cpp | 2 +- .../cmp/cmp.alg/strong_order_long_double.verify.cpp | 8 ++++---- .../language.support/support.dynamic/align_val_t.pass.cpp | 2 +- .../new.delete.array/delete_align_val_t_replace.pass.cpp | 2 +- .../new.delete/new.delete.array/new.size_align.pass.cpp | 2 +- .../new.size_align.replace.indirect.pass.cpp | 2 +- .../new.delete.array/new.size_align.replace.pass.cpp | 2 +- .../new.delete.array/new.size_align_nothrow.pass.cpp | 2 +- .../new.size_align_nothrow.replace.indirect.pass.cpp | 2 +- .../new.size_align_nothrow.replace.pass.cpp | 2 +- .../new.delete/new.delete.array/nodiscard.verify.cpp | 2 +- .../new.delete.single/delete_align_val_t_replace.pass.cpp | 2 +- .../new.delete/new.delete.single/new.size_align.pass.cpp | 2 +- .../new.delete.single/new.size_align.replace.pass.cpp | 2 +- .../new.delete.single/new.size_align_nothrow.pass.cpp | 2 +- .../new.size_align_nothrow.replace.indirect.pass.cpp | 2 +- .../new.size_align_nothrow.replace.pass.cpp | 2 +- .../new.delete/new.delete.single/nodiscard.verify.cpp | 2 +- .../support.rtti/type.info/type_info.equal.pass.cpp | 4 ---- .../locale.codecvt.members/char16_t_in.pass.cpp | 2 +- .../locale.codecvt.members/char16_t_length.pass.cpp | 2 +- .../locale.codecvt.members/char16_t_out.pass.cpp | 2 +- .../locale.codecvt.members/char32_t_in.pass.cpp | 2 +- .../locale.codecvt.members/char32_t_length.pass.cpp | 2 +- .../locale.codecvt.members/char32_t_out.pass.cpp | 2 +- .../format.functions/escaped_output.unicode.pass.cpp | 3 --- .../format.range/format.range.fmtstr/format.pass.cpp | 3 --- .../format.range/format.range.fmtstr/parse.pass.cpp | 3 --- 
.../memory/temporary.buffer/overaligned.pass.cpp | 2 +- 34 files changed, 33 insertions(+), 46 deletions(-) diff --git a/libcxx/test/libcxx/input.output/filesystems/class.directory_entry/directory_entry.mods/last_write_time.pass.cpp b/libcxx/test/libcxx/input.output/filesystems/class.directory_entry/directory_entry.mods/last_write_time.pass.cpp index 0d1bb77583bed..26703f748d874 100644 --- a/libcxx/test/libcxx/input.output/filesystems/class.directory_entry/directory_entry.mods/last_write_time.pass.cpp +++ b/libcxx/test/libcxx/input.output/filesystems/class.directory_entry/directory_entry.mods/last_write_time.pass.cpp @@ -18,7 +18,7 @@ // UNSUPPORTED: windows // This test assumes that time is stored as a 64 bit value when on MVS it is stored as 32 bit -// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} // diff --git a/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp b/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp index 267f87bd3f6f8..f94ceaf57dbae 100644 --- a/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp +++ b/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp @@ -11,7 +11,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. 
-// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} // XFAIL: sanitizer-new-delete && !hwasan diff --git a/libcxx/test/libcxx/language.support/support.dynamic/new_faligned_allocation.pass.cpp b/libcxx/test/libcxx/language.support/support.dynamic/new_faligned_allocation.pass.cpp index 9cf1b275abc55..69c46f00fb65d 100644 --- a/libcxx/test/libcxx/language.support/support.dynamic/new_faligned_allocation.pass.cpp +++ b/libcxx/test/libcxx/language.support/support.dynamic/new_faligned_allocation.pass.cpp @@ -11,7 +11,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. -// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} // REQUIRES: -faligned-allocation // ADDITIONAL_COMPILE_FLAGS: -faligned-allocation diff --git a/libcxx/test/libcxx/thread/thread.condition/PR30202_notify_from_pthread_created_thread.pass.cpp b/libcxx/test/libcxx/thread/thread.condition/PR30202_notify_from_pthread_created_thread.pass.cpp index fd863fb2d3fd4..b1a3f86e86437 100644 --- a/libcxx/test/libcxx/thread/thread.condition/PR30202_notify_from_pthread_created_thread.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.condition/PR30202_notify_from_pthread_created_thread.pass.cpp @@ -14,7 +14,7 @@ // UNSUPPORTED: c++03 // PR30202 was fixed starting in macosx10.13. 
-// UNSUPPORTED: stdlib=apple-libc++ && target={{.+}}-apple-macosx10.{{9|10|11|12}} +// XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx10.{{9|10|11|12}} // diff --git a/libcxx/test/libcxx/thread/thread.threads/thread.thread.this/sleep_for.pass.cpp b/libcxx/test/libcxx/thread/thread.threads/thread.thread.this/sleep_for.pass.cpp index 9031359478967..47741d0851e89 100644 --- a/libcxx/test/libcxx/thread/thread.threads/thread.thread.this/sleep_for.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.threads/thread.thread.this/sleep_for.pass.cpp @@ -11,7 +11,7 @@ // Until 58a0a70fb2f1, this_thread::sleep_for could sometimes get interrupted // by signals and this test would fail spuriously. Disable the test on the // corresponding system libraries. -// UNSUPPORTED: stdlib=apple-libc++ && target={{.+}}-apple-macosx10.{{9|10|11}} +// XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx10.{{9|10|11}} // ALLOW_RETRIES: 3 diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove_all/toctou.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove_all/toctou.pass.cpp index 637139b81d546..5248ba24b4038 100644 --- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove_all/toctou.pass.cpp +++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.remove_all/toctou.pass.cpp @@ -36,7 +36,7 @@ // UNSUPPORTED: stdlib=apple-libc++ && target={{.+}}-apple-macosx12.{{0|1|2}} // Windows doesn't support the necessary APIs to mitigate this issue. 
-// UNSUPPORTED: target={{.+}}-windows-{{.+}} +// XFAIL: target={{.+}}-windows-{{.+}} #include #include diff --git a/libcxx/test/std/language.support/cmp/cmp.alg/strong_order_long_double.verify.cpp b/libcxx/test/std/language.support/cmp/cmp.alg/strong_order_long_double.verify.cpp index fd16afeefb033..c9c2ba2002149 100644 --- a/libcxx/test/std/language.support/cmp/cmp.alg/strong_order_long_double.verify.cpp +++ b/libcxx/test/std/language.support/cmp/cmp.alg/strong_order_long_double.verify.cpp @@ -10,18 +10,18 @@ // The following platforms have sizeof(long double) == sizeof(double), so this test doesn't apply to them. // This test does apply to aarch64 where Arm's AAPCS64 is followed. There they are different sizes. -// UNSUPPORTED: target={{arm64|arm64e|armv(7|8)(l|m)?|powerpc|powerpc64}}-{{.+}} +// XFAIL: target={{arm64|arm64e|armv(7|8)(l|m)?|powerpc|powerpc64}}-{{.+}} // MSVC configurations have long double equal to regular double on all // architectures. -// UNSUPPORTED: target={{.+}}-pc-windows-msvc +// XFAIL: target={{.+}}-pc-windows-msvc // ARM/AArch64 MinGW also has got long double equal to regular double, just // like MSVC (thus match both MinGW and MSVC here, for those architectures). -// UNSUPPORTED: target={{aarch64|armv7}}-{{.*}}-windows-{{.+}} +// XFAIL: target={{aarch64|armv7}}-{{.*}}-windows-{{.+}} // Android's 32-bit x86 target has long double equal to regular double. 
-// UNSUPPORTED: target=i686-{{.+}}-android{{.*}} +// XFAIL: target=i686-{{.+}}-android{{.*}} // diff --git a/libcxx/test/std/language.support/support.dynamic/align_val_t.pass.cpp b/libcxx/test/std/language.support/support.dynamic/align_val_t.pass.cpp index c1f9f2f6cc6fe..28c72f0be7aed 100644 --- a/libcxx/test/std/language.support/support.dynamic/align_val_t.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/align_val_t.pass.cpp @@ -12,7 +12,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. -// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/delete_align_val_t_replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/delete_align_val_t_replace.pass.cpp index 33d5225586b96..60b88ec79248c 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/delete_align_val_t_replace.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/delete_align_val_t_replace.pass.cpp @@ -12,7 +12,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. 
-// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp index c903c63f68c4a..dd8090aca5b28 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp @@ -15,7 +15,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. -// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.indirect.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.indirect.pass.cpp index 66cbb4b9c8eb6..0b540e09bab3c 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.indirect.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.indirect.pass.cpp @@ -17,7 +17,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. 
-// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.pass.cpp index 4619a71629a7b..2d021ecb30e79 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.replace.pass.cpp @@ -15,7 +15,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. -// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.pass.cpp index 0343d51f184d9..6ae8ceaf534e4 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.pass.cpp @@ -15,7 +15,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. 
-// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp index f6959172ea24e..227b20f0b1e18 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp @@ -22,7 +22,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. -// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.pass.cpp index b984e8cf0a43a..17d38e3de8cd8 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.pass.cpp @@ -15,7 +15,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. 
-// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/nodiscard.verify.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/nodiscard.verify.cpp index 0f30cf0135a41..509fa98446f80 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/nodiscard.verify.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/nodiscard.verify.cpp @@ -19,7 +19,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. -// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/delete_align_val_t_replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/delete_align_val_t_replace.pass.cpp index c2021c5d84e9c..c346c42f157b0 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/delete_align_val_t_replace.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/delete_align_val_t_replace.pass.cpp @@ -12,7 +12,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. 
-// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp index 93edb32130c3e..dbb10a76ad9e9 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp @@ -15,7 +15,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. -// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.replace.pass.cpp index 87d061a9d1a3c..e5ef5f1669752 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.replace.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.replace.pass.cpp @@ -15,7 +15,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. 
-// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.pass.cpp index 1c575729678d5..b9d8ea2f4e494 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.pass.cpp @@ -15,7 +15,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. -// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp index 2e7fa132890b8..7eab0729f9ef1 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp @@ -21,7 +21,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. 
-// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.pass.cpp index 3c9e17b0b02be..9a5b53e039025 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.pass.cpp @@ -15,7 +15,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. -// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/nodiscard.verify.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/nodiscard.verify.cpp index 16d6a223eb535..3cda8ad9a1e69 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/nodiscard.verify.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/nodiscard.verify.cpp @@ -19,7 +19,7 @@ // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. 
-// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include diff --git a/libcxx/test/std/language.support/support.rtti/type.info/type_info.equal.pass.cpp b/libcxx/test/std/language.support/support.rtti/type.info/type_info.equal.pass.cpp index 3f5dd96261923..8092f7c5cd01f 100644 --- a/libcxx/test/std/language.support/support.rtti/type.info/type_info.equal.pass.cpp +++ b/libcxx/test/std/language.support/support.rtti/type.info/type_info.equal.pass.cpp @@ -12,10 +12,6 @@ // UNSUPPORTED: no-rtti -// When we build for Windows on top of the VC runtime, `typeinfo::operator==` may not -// be `constexpr` (depending on the version of the VC runtime). So this test can fail. -// UNSUPPORTED: target={{.+}}-windows-msvc && !libcpp-no-vcruntime - #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_in.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_in.pass.cpp index 9204ea26b9c9f..9e1d0a1c7bac7 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_in.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_in.pass.cpp @@ -20,7 +20,7 @@ // Test is intended to convert between UTF8 and UTF16/32, it will fail on // z/OS since at default char type on z/OS is EBCDIC character which has // value different from ASCII character. 
-// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_length.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_length.pass.cpp index 98c5824a3d4a0..eeef2a80c20bc 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_length.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_length.pass.cpp @@ -18,7 +18,7 @@ // Test is intended to convert between UTF8 and UTF16/32, it will fail on // z/OS since at default char type on z/OS is EBCDIC character which has // value different from ASCII character. -// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_out.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_out.pass.cpp index a8f16fcc29a14..2db95b5620810 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_out.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_out.pass.cpp @@ -20,7 +20,7 @@ // Test is intended to convert between UTF8 and UTF16/32, it will fail on // z/OS since at default char type on z/OS is EBCDIC character which has // value different from ASCII character. 
-// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_in.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_in.pass.cpp index 74cc4793aaf9a..94602128f8cd3 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_in.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_in.pass.cpp @@ -20,7 +20,7 @@ // Test is intended to convert between UTF8 and UTF16/32, it will fail on // z/OS since at default char type on z/OS is EBCDIC character which has // value different from ASCII character. -// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_length.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_length.pass.cpp index f51a9db212a29..03d35830f4247 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_length.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_length.pass.cpp @@ -18,7 +18,7 @@ // Test is intended to convert between UTF8 and UTF16/32, it will fail on // z/OS since at default char type on z/OS is EBCDIC character which has // value different from ASCII character. 
-// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_out.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_out.pass.cpp index 379b607a80eea..df58cc9f518d1 100644 --- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_out.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_out.pass.cpp @@ -20,7 +20,7 @@ // Test is intended to convert between UTF8 and UTF16/32, it will fail on // z/OS since at default char type on z/OS is EBCDIC character which has // value different from ASCII character. -// UNSUPPORTED: target={{.+}}-zos{{.*}} +// XFAIL: target={{.+}}-zos{{.*}} #include #include diff --git a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp index 23b26722d7be7..bf5c0a51f944a 100644 --- a/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp +++ b/libcxx/test/std/utilities/format/format.functions/escaped_output.unicode.pass.cpp @@ -11,9 +11,6 @@ // This version runs the test when the platform has Unicode support. // UNSUPPORTED: libcpp-has-no-unicode -// TODO FMT Investigate Windows issues. 
-// UNSUPPORTED: msvc, target={{.+}}-windows-gnu - // TODO FMT This test should not require std::to_chars(floating-point) // XFAIL: availability-fp_to_chars-missing diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.pass.cpp index fc993efd0db35..675a5e896ff57 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/format.pass.cpp @@ -7,9 +7,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 -// TODO FMT Investigate why this fails. -// UNSUPPORTED: stdlib=apple-libc++ && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0|12.0}} - // // template diff --git a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp index a24fc06fae562..7acee9cb9dc51 100644 --- a/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp +++ b/libcxx/test/std/utilities/format/format.range/format.range.fmtstr/parse.pass.cpp @@ -7,9 +7,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 -// TODO FMT Investigate why this fails. -// UNSUPPORTED: stdlib=apple-libc++ && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0|12.0}} - // // template diff --git a/libcxx/test/std/utilities/memory/temporary.buffer/overaligned.pass.cpp b/libcxx/test/std/utilities/memory/temporary.buffer/overaligned.pass.cpp index 019a6aa038537..c928ba24f1bfe 100644 --- a/libcxx/test/std/utilities/memory/temporary.buffer/overaligned.pass.cpp +++ b/libcxx/test/std/utilities/memory/temporary.buffer/overaligned.pass.cpp @@ -11,7 +11,7 @@ // Aligned allocations are not supported on macOS < 10.13 // Note: use 'unsupported' instead of 'xfail' to ensure // we won't pass prior to c++17. 
-// UNSUPPORTED: stdlib=apple-libc++ && target={{.+}}-apple-macosx10.{{9|10|11|12}} +// XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx10.{{9|10|11|12}} // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS From ad49fe3e89c3b3950956548f14cdb5c159ba0aec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Fri, 23 Feb 2024 11:58:27 +0100 Subject: [PATCH 174/546] [clang][Interp] Don't return success for already failed global variables We might be visiting them more than once. We used to return true for second and subsequent cases, just because we had already visited it before. --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 6 ++++++ .../PR20334-std_initializer_list_diagnosis_assertion.cpp | 3 +++ 2 files changed, 9 insertions(+) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 27e0986192165..7f97d8ce9fb80 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -2510,6 +2510,12 @@ template bool ByteCodeExprGen::visitDecl(const VarDecl *VD) { assert(!VD->isInvalidDecl() && "Trying to constant evaluate an invalid decl"); + // Global variable we've already seen but that's uninitialized means + // evaluating the initializer failed. Just return failure. + if (std::optional Index = P.getGlobal(VD); + Index && !P.getPtrGlobal(*Index).isInitialized()) + return false; + // Create and initialize the variable. 
if (!this->visitVarDecl(VD)) return false; diff --git a/clang/test/SemaCXX/PR20334-std_initializer_list_diagnosis_assertion.cpp b/clang/test/SemaCXX/PR20334-std_initializer_list_diagnosis_assertion.cpp index ec672089b84aa..fb1feee01b29f 100644 --- a/clang/test/SemaCXX/PR20334-std_initializer_list_diagnosis_assertion.cpp +++ b/clang/test/SemaCXX/PR20334-std_initializer_list_diagnosis_assertion.cpp @@ -1,5 +1,8 @@ // RUN: %clang_cc1 -std=c++11 -verify -emit-llvm-only %s // RUN: %clang_cc1 -std=c++98 -fsyntax-only -verify %s -DCPP98 +// RUN: %clang_cc1 -std=c++11 -verify -emit-llvm-only %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++98 -fsyntax-only -verify %s -DCPP98 -fexperimental-new-constant-interpreter + namespace std { template From 492e8ba0384b038596e6b4a97313b7bdced5e868 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Feb 2024 14:28:57 +0100 Subject: [PATCH 175/546] [mlir] Fix memory leaks after #81759 (#82762) This commit fixes memory leaks that were introduced by #81759. The way ops and blocks are erased changed slightly. The leaks were caused by an incorrect implementation of op builders: blocks must be created with the supplied builder object. Otherwise, they are not properly tracked by the dialect conversion and can leak during rollback. --- mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 11 ++++++----- mlir/lib/Dialect/SCF/IR/SCF.cpp | 15 ++++++++------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index 30b6cd74147e6..33ce5c159db4f 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -648,6 +648,8 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result, TypeRange workgroupAttributions, TypeRange privateAttributions, Value clusterSizeX, Value clusterSizeY, Value clusterSizeZ) { + OpBuilder::InsertionGuard g(builder); + // Add a WorkGroup attribution attribute. 
This attribute is required to // identify private attributions in the list of block argguments. result.addAttribute(getNumWorkgroupAttributionsAttrName(), @@ -674,7 +676,7 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result, // attributions, where the first kNumConfigRegionAttributes arguments have // `index` type and the rest have the same types as the data operands. Region *kernelRegion = result.addRegion(); - Block *body = new Block(); + Block *body = builder.createBlock(kernelRegion); // TODO: Allow passing in proper locations here. for (unsigned i = 0; i < kNumConfigRegionAttributes; ++i) body->addArgument(builder.getIndexType(), result.location); @@ -683,7 +685,6 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result, body->addArgument(argTy, result.location); for (Type argTy : privateAttributions) body->addArgument(argTy, result.location); - kernelRegion->push_back(body); // Fill OperandSegmentSize Attribute. SmallVector segmentSizes(11, 1); segmentSizes.front() = asyncDependencies.size(); @@ -1325,6 +1326,8 @@ void GPUFuncOp::build(OpBuilder &builder, OperationState &result, TypeRange workgroupAttributions, TypeRange privateAttributions, ArrayRef attrs) { + OpBuilder::InsertionGuard g(builder); + result.addAttribute(SymbolTable::getSymbolAttrName(), builder.getStringAttr(name)); result.addAttribute(getFunctionTypeAttrName(result.name), @@ -1333,7 +1336,7 @@ void GPUFuncOp::build(OpBuilder &builder, OperationState &result, builder.getI64IntegerAttr(workgroupAttributions.size())); result.addAttributes(attrs); Region *body = result.addRegion(); - Block *entryBlock = new Block; + Block *entryBlock = builder.createBlock(body); // TODO: Allow passing in proper locations here. 
for (Type argTy : type.getInputs()) @@ -1342,8 +1345,6 @@ void GPUFuncOp::build(OpBuilder &builder, OperationState &result, entryBlock->addArgument(argTy, result.location); for (Type argTy : privateAttributions) entryBlock->addArgument(argTy, result.location); - - body->getBlocks().push_back(entryBlock); } /// Parses a GPU function memory attribution. diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index 119df9acd9e9e..233e702dbb229 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -306,17 +306,18 @@ void ConditionOp::getSuccessorRegions( void ForOp::build(OpBuilder &builder, OperationState &result, Value lb, Value ub, Value step, ValueRange iterArgs, BodyBuilderFn bodyBuilder) { + OpBuilder::InsertionGuard guard(builder); + result.addOperands({lb, ub, step}); result.addOperands(iterArgs); for (Value v : iterArgs) result.addTypes(v.getType()); Type t = lb.getType(); Region *bodyRegion = result.addRegion(); - bodyRegion->push_back(new Block); - Block &bodyBlock = bodyRegion->front(); - bodyBlock.addArgument(t, result.location); + Block *bodyBlock = builder.createBlock(bodyRegion); + bodyBlock->addArgument(t, result.location); for (Value v : iterArgs) - bodyBlock.addArgument(v.getType(), v.getLoc()); + bodyBlock->addArgument(v.getType(), v.getLoc()); // Create the default terminator if the builder is not provided and if the // iteration arguments are not provided. 
Otherwise, leave this to the caller @@ -325,9 +326,9 @@ void ForOp::build(OpBuilder &builder, OperationState &result, Value lb, ForOp::ensureTerminator(*bodyRegion, builder, result.location); } else if (bodyBuilder) { OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(&bodyBlock); - bodyBuilder(builder, result.location, bodyBlock.getArgument(0), - bodyBlock.getArguments().drop_front()); + builder.setInsertionPointToStart(bodyBlock); + bodyBuilder(builder, result.location, bodyBlock->getArgument(0), + bodyBlock->getArguments().drop_front()); } } From b8a7d8131e5ad2c21238e192e6f9c5b69512abe3 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 23 Feb 2024 07:30:53 -0600 Subject: [PATCH 176/546] [LLVM] Fix incorrect GPU triple detection for runtimes builds Summary: This block of code is used to prevent a GPU-based cross compiling build from taking incompatible arguments. However this incorrectly used the LLVM default triple instead of the runtimes target. Fix that so the bots can continue to default the triple to NVPTX. --- llvm/cmake/modules/HandleLLVMOptions.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 40316b11ceed9..08ff49ded57a1 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -120,10 +120,10 @@ if( LLVM_ENABLE_ASSERTIONS ) endif() endif() -# If we are targeting a GPU architecture we want to ignore all the standard -# flag handling. -if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn" OR - "${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^nvptx64") +# If we are targeting a GPU architecture in a runtimes build we want to ignore +# all the standard flag handling. 
+if("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn" OR + "${LLVM_RUNTIMES_TARGET}" MATCHES "^nvptx64") return() endif() From c747b24262205aeaa112e5c0de3f786d960427ae Mon Sep 17 00:00:00 2001 From: hev Date: Fri, 23 Feb 2024 21:43:53 +0800 Subject: [PATCH 177/546] [NFC] Precommit a memcpy test for isOrEquivalentToAdd (#82758) --- .../CodeGen/LoongArch/intrinsic-memcpy.ll | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll new file mode 100644 index 0000000000000..09453004dcef7 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s + +%Box = type [6 x i64] + +define void @box(ptr noalias nocapture noundef writeonly sret(%Box) align 16 dereferenceable(48) %b, i64 %i) { +; CHECK-LABEL: box: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: slli.d $a2, $a1, 5 +; CHECK-NEXT: alsl.d $a1, $a1, $a2, 4 +; CHECK-NEXT: addi.d $a2, $sp, 0 +; CHECK-NEXT: add.d $a3, $a2, $a1 +; CHECK-NEXT: ldx.d $a1, $a1, $a2 +; CHECK-NEXT: st.d $a1, $a0, 0 +; CHECK-NEXT: ld.d $a1, $a3, 40 +; CHECK-NEXT: st.d $a1, $a0, 40 +; CHECK-NEXT: ld.d $a1, $a3, 32 +; CHECK-NEXT: st.d $a1, $a0, 32 +; CHECK-NEXT: ld.d $a1, $a3, 24 +; CHECK-NEXT: st.d $a1, $a0, 24 +; CHECK-NEXT: ld.d $a1, $a3, 16 +; CHECK-NEXT: st.d $a1, $a0, 16 +; CHECK-NEXT: ori $a1, $a3, 8 +; CHECK-NEXT: ld.d $a1, $a1, 0 +; CHECK-NEXT: st.d $a1, $a0, 8 +; CHECK-NEXT: addi.d $sp, $sp, 96 +; CHECK-NEXT: ret + %1 = alloca [2 x %Box], align 16 + %2 = getelementptr inbounds [2 x %Box], ptr %1, i64 0, i64 %i + call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) %b, ptr noundef nonnull align 16 dereferenceable(48) %2, i64 
48, i1 false) + ret void +} From 71d47a0b00e9f48dc740556d7f452ffadf308731 Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Fri, 23 Feb 2024 13:46:57 +0000 Subject: [PATCH 178/546] [RemoveDIs] Enable DPLabels conversion [3b/3] (#82639) Enables conversion between llvm.dbg.label and DPLabel. --- .../include/llvm/IR/DebugProgramInstruction.h | 10 ++++++++ llvm/lib/IR/BasicBlock.cpp | 18 +++++++------ llvm/lib/IR/DebugProgramInstruction.cpp | 25 +++++++++++++++++++ 3 files changed, 45 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h index 84b0f743d3c9b..97089098ee535 100644 --- a/llvm/include/llvm/IR/DebugProgramInstruction.h +++ b/llvm/include/llvm/IR/DebugProgramInstruction.h @@ -62,6 +62,8 @@ class BasicBlock; class MDNode; class Module; class DbgVariableIntrinsic; +class DbgInfoIntrinsic; +class DbgLabelInst; class DIAssignID; class DPMarker; class DPValue; @@ -80,6 +82,7 @@ class raw_ostream; /// clone /// isIdenticalToWhenDefined /// both print methods +/// createDebugIntrinsic class DbgRecord : public ilist_node { public: /// Marker that this DbgRecord is linked into. @@ -103,6 +106,11 @@ class DbgRecord : public ilist_node { void print(raw_ostream &O, bool IsForDebug = false) const; void print(raw_ostream &O, ModuleSlotTracker &MST, bool IsForDebug) const; bool isIdenticalToWhenDefined(const DbgRecord &R) const; + /// Convert this DbgRecord back into an appropriate llvm.dbg.* intrinsic. + /// \p InsertBefore Optional position to insert this intrinsic. + /// \returns A new llvm.dbg.* intrinsic representiung this DbgRecord. + DbgInfoIntrinsic *createDebugIntrinsic(Module *M, + Instruction *InsertBefore) const; ///@} /// Same as isIdenticalToWhenDefined but checks DebugLoc too. 
@@ -177,6 +185,8 @@ class DPLabel : public DbgRecord { DPLabel *clone() const; void print(raw_ostream &O, bool IsForDebug = false) const; void print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const; + DbgLabelInst *createDebugIntrinsic(Module *M, + Instruction *InsertBefore) const; void setLabel(DILabel *NewLabel) { Label = NewLabel; } DILabel *getLabel() const { return Label; } diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 0680754444f17..6ea876fde5ec6 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -81,6 +81,12 @@ void BasicBlock::convertToNewDbgValues() { continue; } + if (DbgLabelInst *DLI = dyn_cast(&I)) { + DPVals.push_back(new DPLabel(DLI->getLabel(), DLI->getDebugLoc())); + DLI->eraseFromParent(); + continue; + } + if (DPVals.empty()) continue; @@ -107,16 +113,12 @@ void BasicBlock::convertFromNewDbgValues() { continue; DPMarker &Marker = *Inst.DbgMarker; - for (DbgRecord &DR : Marker.getDbgValueRange()) { - if (auto *DPV = dyn_cast(&DR)) - InstList.insert(Inst.getIterator(), - DPV->createDebugIntrinsic(getModule(), nullptr)); - else - llvm_unreachable("unsupported DbgRecord kind"); - } + for (DbgRecord &DR : Marker.getDbgValueRange()) + InstList.insert(Inst.getIterator(), + DR.createDebugIntrinsic(getModule(), nullptr)); Marker.eraseFromParent(); - }; + } // Assume no trailing DPValues: we could technically create them at the end // of the block, after a terminator, but this would be non-cannonical and diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp index 2ca4533afa96c..389bac4de6a1c 100644 --- a/llvm/lib/IR/DebugProgramInstruction.cpp +++ b/llvm/lib/IR/DebugProgramInstruction.cpp @@ -112,6 +112,17 @@ bool DbgRecord::isEquivalentTo(const DbgRecord &R) const { return getDebugLoc() == R.getDebugLoc() && isIdenticalToWhenDefined(R); } +DbgInfoIntrinsic * +DbgRecord::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const { + switch 
(RecordKind) { + case ValueKind: + return cast(this)->createDebugIntrinsic(M, InsertBefore); + case LabelKind: + return cast(this)->createDebugIntrinsic(M, InsertBefore); + }; + llvm_unreachable("unsupported DbgRecord kind"); +} + DPValue *DPValue::createDPValue(Value *Location, DILocalVariable *DV, DIExpression *Expr, const DILocation *DI) { return new DPValue(ValueAsMetadata::get(Location), DV, Expr, DI, @@ -377,6 +388,20 @@ DPValue::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const { return DVI; } +DbgLabelInst *DPLabel::createDebugIntrinsic(Module *M, + Instruction *InsertBefore) const { + auto *LabelFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_label); + Value *Args[] = { + MetadataAsValue::get(getDebugLoc()->getContext(), getLabel())}; + DbgLabelInst *DbgLabel = cast( + CallInst::Create(LabelFn->getFunctionType(), LabelFn, Args)); + DbgLabel->setTailCall(); + DbgLabel->setDebugLoc(getDebugLoc()); + if (InsertBefore) + DbgLabel->insertBefore(InsertBefore); + return DbgLabel; +} + Value *DPValue::getAddress() const { auto *MD = getRawAddress(); if (auto *V = dyn_cast(MD)) From 52ada07ef5df2829e90ca2dd48305465a55e8121 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Amiaux?= Date: Fri, 23 Feb 2024 14:49:57 +0100 Subject: [PATCH 179/546] build_llvm_release.bat: add tarball export to x64 release (#79840) Like linux releases, export a tar.xz files containing most llvm tools, including non toolchain utilities, llvm-config, llvm-link and others. We do this by reconfiguring cmake one last time at the last step, running the install target so we do not need to recompile anything. 
Fix #51192 Fix #53052 --- llvm/utils/release/build_llvm_release.bat | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat index 67bb22de7606d..dc63fdac1e4ad 100755 --- a/llvm/utils/release/build_llvm_release.bat +++ b/llvm/utils/release/build_llvm_release.bat @@ -287,7 +287,16 @@ ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1 ninja package || exit /b 1 + +:: generate tarball with install toolchain only off +set filename=clang+llvm-%version%-x86_64-pc-windows-msvc +cmake -GNinja %cmake_flags% %cmake_profile_flags% -DLLVM_INSTALL_TOOLCHAIN_ONLY=OFF ^ + -DCMAKE_INSTALL_PREFIX=%build_dir%/%filename% ..\llvm-project\llvm || exit /b 1 +ninja install || exit /b 1 +:: check llvm_config is present & returns something +%build_dir%/%filename%/bin/llvm-config.exe --bindir || exit /b 1 cd .. +7z a -ttar -so %filename%.tar %filename% | 7z a -txz -si %filename%.tar.xz exit /b 0 ::============================================================================== From be083dba95dfbbb0286d798cc06fbe021715bc03 Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Fri, 23 Feb 2024 09:15:48 -0500 Subject: [PATCH 180/546] [RISCV][NFC] Allow SchedVar to be a def inside our scheduler model files. (#82634) All SchedModel files have a line that looks like: ``` def SomeModel : SchedMachineModel; let SchedModel = SomeModel in { ... } ``` TableGen requires that all records defined within the top level `let` must have a field `SchedModel` somewhere in their nested record hierarchy (i.e. the record has a field `SchedModel : SchedMachineModel` or recursively, one of its members has a field `SchedModel : SchedMachineModel`). 
Classes such as `SchedPredicate` have added a field `SchedModel : SchedMachineModel`, even though the field is never used, just to supress **errors** (not warnings) caused from having the top level let in the model files. This decision was made to avoid having hundreds of the same `let` statement littered in every scheduler model file. The reason we have never seen an error for `SchedVar` before is because `SchedVar` is never instantiated with a `def`. Instead, it is only created as a value that is consumed by `SchedWriteVariant`: ``` ... : SchedWriteVariant<[SchedVar<...>, SchedVar<...>]>; ``` There is a problem with this style of instantiation. In particular, the problem arises as we try to take a class based approach to building scheduler models. I will describe the problem from the bottom up. The `LMULWriteResMXVariant` multiclass takes in a `SchedPredicateBase Pred`. Today, the RISCVSchedSiFive7.td file defines `VLDSX0Pred` outside the scope of any class. That means that `VLDSX0Pred` exists before `LMULWriteResMXVariant` multiclass is instantiated. With this approach, there is no error since the predicate is instantated in entirety before the variant multiclass is instantiated. However, I have the intention to move the definition of both the predicate and the variant multiclass records inside a multiclass to factor out common parts between multiple scheduler models. I plan to have something like: ``` multiclass SiFive7Base { def VLDSX0Pred : ...; // Need defvar since record is prefixed with NAME. defvar VLDSX0Pred = !cast<...>(NAME # VLDSX0Pred); defm SiFive7 : LMULWriteResMXVariant; } defm "SiFive7Version1" : SiFive7Base>; defm "SiFive7Version2" : SiFive7Base>; ``` In this scheme, VLDSX0Pred is defined within the same multiclass transaction that the `LMULWriteResMXVariant` is defined in. For some reason, TableGen does not allow `Values` to reference records that were created in the same parent record construction. 
If the `SchedVar` is not a `def`, then it will not be able to find the record `NAME # VLDSX0Pred`. Making it a def, allows TableGen to find `NAME # VLDSX0Pred` in scope. The simplest example of this is: ``` class A {} class B { A x = a;} class C { B y = b;} multiclass D { def MyA : A; defvar aa = !cast(NAME # MyA); // This works def : B; // This does not work because constructing B by value cannot find `NAME # MyA` // error: Undefined reference to record: 'MyA' def : C>; // To fix it, define it like such: def MyB : B; defvar bb = !cast(NAME # MyB); def : C; } defm "" : D; ``` In summary, in order to use a class based approach to creating scheduler resources to promote resusability, `SchedVar`s must be created using defs instead of being instantiated by value so that it can resolve records that were part of the instantiation of the parent record being created. In order to do this without refactoring the top level `let` statement that all scheduler model files use, we add an unused field `SchedModel : SchedMachineModel` to `SchedVar`, similiar to what has been done in `SchedPredicate`. --- llvm/include/llvm/Target/TargetSchedule.td | 2 ++ llvm/lib/Target/RISCV/RISCVScheduleV.td | 21 +++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td index e2781a5d1ea54..48c9387977c07 100644 --- a/llvm/include/llvm/Target/TargetSchedule.td +++ b/llvm/include/llvm/Target/TargetSchedule.td @@ -399,6 +399,8 @@ def NoSchedPred : MCSchedPredicate; class SchedVar selected> { SchedPredicateBase Predicate = pred; list Selected = selected; + // SchedModel silences warnings but is ignored. + SchedMachineModel SchedModel = ?; } // SchedModel silences warnings but is ignored. 
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td index d15cb611ae665..0be681de3daf6 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleV.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td @@ -88,20 +88,25 @@ multiclass LMULWriteResMXVariant(NAME # nameMX # "_Pred")]>; + def nameMX # NoPredSchedVar + : SchedVar(NAME # nameMX #"_NoPred")]>; + // Allow multiclass to refer to SchedVars -- need to have NAME prefix. + defvar PredSchedVar = !cast(NAME # nameMX # PredSchedVar); + defvar NoPredSchedVar = !cast(NAME # nameMX # NoPredSchedVar); + // Tie behavior to predicate - def NAME # nameMX # "_Variant" : SchedWriteVariant<[ - SchedVar(NAME # nameMX # "_Pred")]>, - SchedVar(NAME # nameMX # "_NoPred")]> - ]>; + def NAME # nameMX # "_Variant" + : SchedWriteVariant<[PredSchedVar, NoPredSchedVar]>; def : SchedAlias< !cast(nameMX), !cast(NAME # nameMX # "_Variant")>; if IsWorstCase then { - def NAME # name # "_WorstCase_Variant" : SchedWriteVariant<[ - SchedVar(NAME # nameMX # "_Pred")]>, - SchedVar(NAME # nameMX # "_NoPred")]> - ]>; + def NAME # name # "_WorstCase_Variant" + : SchedWriteVariant<[PredSchedVar, NoPredSchedVar]>; def : SchedAlias< !cast(name # "_WorstCase"), !cast(NAME # name # "_WorstCase_Variant")>; From 3b232f066d40a3e91ac27e421a3baeaca0cd59ec Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Feb 2024 15:52:26 +0100 Subject: [PATCH 181/546] [mlir][linalg] `LinalgOp`: Disallow mixed tensor/buffer semantics (#80660) Related discussion: https://github.com/llvm/llvm-project/pull/73908/files#r1414913030. This change fixes #73547. 
--- .../Dialect/Linalg/IR/LinalgInterfaces.cpp | 5 ++ mlir/test/Dialect/Linalg/canonicalize.mlir | 55 +++++-------------- .../Linalg/fusion-elementwise-ops.mlir | 40 -------------- mlir/test/Dialect/Linalg/invalid.mlir | 10 ++++ 4 files changed, 29 insertions(+), 81 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp index 7eed7928456d5..3627ff6617eda 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp @@ -1041,6 +1041,11 @@ int64_t LinalgOp::getIndexingMapIndex(OpOperand *opOperand) { LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { LinalgOp linalgOp = cast(op); + // Mixed tensor/buffer operands are not allowed. + if (!linalgOp.hasPureTensorSemantics() && + !linalgOp.hasPureBufferSemantics() && op->getNumOperands() > 0) + return op->emitOpError("expected to have pure tensor or buffer semantics"); + // Before checking indexing maps, we need to make sure the attributes // referenced by it are valid. 
if (linalgOp.hasDynamicIndexingMaps()) diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir index 7adde3117deea..206d7e9f1ce8d 100644 --- a/mlir/test/Dialect/Linalg/canonicalize.mlir +++ b/mlir/test/Dialect/Linalg/canonicalize.mlir @@ -102,17 +102,16 @@ func.func @tensor.cast.unranked(%a : tensor<*xf32>, %b : tensor<*xf32>, %c : ten // ----- // CHECK-LABEL: func @linalg_effects( -// CHECK-SAME: %[[A:[a-z0-9]*]]: tensor -// CHECK-SAME: %[[B:[a-z0-9]*]]: memref -// CHECK-SAME: %[[C:[a-z0-9]*]]: tensor -func.func @linalg_effects(%a : tensor, %b : memref, %c : tensor) { +func.func @linalg_effects( + %a : tensor, %b : tensor, %c : tensor, + %d : memref, %e : memref, %f : memref) { // CHECK-NOT: %{{.*}} = linalg.matmul - %t = linalg.matmul ins(%a, %b : tensor, memref) + %t = linalg.matmul ins(%a, %b : tensor, tensor) outs(%c : tensor) -> tensor // CHECK: linalg.matmul - linalg.matmul ins(%a, %c : tensor, tensor) - outs(%b : memref) + linalg.matmul ins(%d, %e : memref, memref) + outs(%f : memref) return } @@ -889,11 +888,11 @@ func.func @fold_multi_use_generic_op_with_consumer(%arg0 : tensor) -> // ----- #map = affine_map<(d0) -> (d0)> -func.func @identity_mixed(%arg0 : tensor, %arg1: memref) { +func.func @identity_buffer(%arg0 : memref, %arg1: memref) { linalg.generic { indexing_maps = [#map, #map], iterator_types = ["parallel"] - } ins(%arg0 : tensor) + } ins(%arg0 : memref) outs(%arg1 : memref) { ^bb0(%arg2 : f32, %arg3 : f32): linalg.yield %arg2 : f32 @@ -901,14 +900,13 @@ func.func @identity_mixed(%arg0 : tensor, %arg1: memref) { return } -// There was a crash in EraseIdentityGenericOp for generic with mixed semantics. -// For now, check generic remained unchanged. -// CHECK-LABEL: func @identity_mixed -// CHECK-SAME: (%[[ARG1:.*]]: tensor, %[[ARG2:.*]]: memref) +// Do not erase ops with buffer semantics. 
+// CHECK-LABEL: func @identity_buffer +// CHECK-SAME: (%[[ARG1:.*]]: memref, %[[ARG2:.*]]: memref) // CHECK: linalg.generic { // CHECK-SAME: indexing_maps = [#map, #map], // CHECK-SAME: iterator_types = ["parallel"] -// CHECK-SAME: } ins(%[[ARG1]] : tensor) +// CHECK-SAME: } ins(%[[ARG1]] : memref) // CHECK-SAME: outs(%[[ARG2]] : memref) { // ----- @@ -916,12 +914,12 @@ func.func @identity_mixed(%arg0 : tensor, %arg1: memref) { // Just make sure that we don't crash. // CHECK-LABEL: func @dedeplicate_regression_test -func.func @dedeplicate_regression_test(%0: tensor<4xf32>, %1: memref<4xf32>) { +func.func @dedeplicate_regression_test(%0: tensor<4xf32>, %1: tensor<4xf32>) { %36 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} - ins(%1, %1 : memref<4xf32>, memref<4xf32>) + ins(%1, %1 : tensor<4xf32>, tensor<4xf32>) outs(%0 : tensor<4xf32>) { ^bb0(%in: f32, %in_24: f32, %out: f32): linalg.yield %in : f32 @@ -937,31 +935,6 @@ func.func @dedeplicate_regression_test(%0: tensor<4xf32>, %1: memref<4xf32>) { // ----- -#map = affine_map<(d0) -> (d0)> -func.func @cast_producer_mixed(%arg0 : tensor<5xf32>, %arg1: memref) { - %0 = tensor.cast %arg0 : tensor<5xf32> to tensor - linalg.generic { - indexing_maps = [#map, #map], - iterator_types = ["parallel"] - } ins(%0 : tensor) - outs(%arg1 : memref) { - ^bb0(%arg2 : f32, %arg3 : f32): - linalg.yield %arg2 : f32 - } - return -} - -// We need a mixed linalg as a bridge between tensor and memref worlds. 
-// CHECK-LABEL: func @cast_producer_mixed -// CHECK-SAME: (%[[ARG1:.*]]: tensor<5xf32>, %[[ARG2:.*]]: memref) -// CHECK: linalg.generic { -// CHECK-SAME: indexing_maps = [#map, #map], -// CHECK-SAME: iterator_types = ["parallel"] -// CHECK-SAME: } ins(%[[ARG1]] : tensor<5xf32>) -// CHECK-SAME: outs(%[[ARG2]] : memref) { - -// ----- - // CHECK-LABEL: dead_softmax func.func @dead_softmax(%arg0: tensor<16x64x256xf32>) -> tensor<16x64x256xf32> { %0 = tensor.empty() : tensor<16x64x256xf32> diff --git a/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir b/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir index 9d8421cbab49d..15a4f6cdd3bbe 100644 --- a/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir +++ b/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir @@ -1110,43 +1110,3 @@ module { // CHECK-DAG: %[[T3:.+]] = arith.addf %[[T2]], %[[B1]] // CHECK: linalg.yield %[[T3]] : f32 // CHECK: return %[[GENERIC]] - -// ----- - -// CHECK-DAG: [[$MAP0:#[a-zA-Z0-9_]*]] = affine_map<(d0, d1) -> (d0, d1)> -#map0 = affine_map<(d0, d1) -> (d0, d1)> - -// CHECK-LABEL: @mixed_fusion -func.func @mixed_fusion(%arg0: tensor, %arg1 : tensor, %arg2 : tensor, %arg8 : memref) -{ - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %0 = tensor.dim %arg0, %c0 : tensor - %1 = tensor.dim %arg0, %c1 : tensor - %2 = tensor.empty(%0, %1) : tensor - %3 = linalg.generic {indexing_maps = [#map0, #map0, #map0], iterator_types = ["parallel", "parallel"]} - ins(%arg0, %arg1 : tensor, tensor) - outs(%2 : tensor) { - ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): - %4 = arith.addf %arg3, %arg4 : f32 - linalg.yield %4 : f32 - } -> tensor - // CHECK: linalg.generic { - // CHECK-SAME: indexing_maps = {{\[}}[[$MAP0]], [[$MAP0]], [[$MAP0]], [[$MAP0]]{{\]}} - linalg.generic {indexing_maps = [#map0, #map0, #map0], iterator_types = ["parallel", "parallel"]} - ins(%3, %arg2 : tensor, tensor) - outs(%arg8 : memref) { - // CHECK: ^{{[a-zA-Z0-9_]*}} - // CHECK-SAME: [[ARG0:%[a-zA-Z0-9_]*]] - // 
CHECK-SAME: [[ARG1:%[a-zA-Z0-9_]*]] - // CHECK-SAME: [[ARG2:%[a-zA-Z0-9_]*]] - ^bb0(%arg5: f32, %arg6: f32, %arg7: f32): - // CHECK: [[T1:%[a-zA-Z0-9_]*]] = arith.addf [[ARG0]], [[ARG1]] - // CHECK-NOT: linalg.yield - // CHECK: arith.mulf [[T1]], [[ARG2]] - // CHECK: linalg.yield - %5 = arith.mulf %arg5, %arg6 : f32 - linalg.yield %5 : f32 - } - return -} diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir index 916c04f33e9c6..44c81c31ace0f 100644 --- a/mlir/test/Dialect/Linalg/invalid.mlir +++ b/mlir/test/Dialect/Linalg/invalid.mlir @@ -770,3 +770,13 @@ func.func @mmt4d_rank_mismatch(%A: tensor<16x16x8x1xf32>, -> tensor<8x8xf32> return %res : tensor<8x8xf32> } + +// ----- + +func.func @mixed_semantics(%a: tensor, %b: tensor, %c: memref) { + // expected-error @+1 {{expected to have pure tensor or buffer semantics}} + linalg.matmul ins(%a, %b: tensor, tensor) + outs(%c: memref) + return +} + From 08cb1a62f6f401d66513a20e8689c1ef9059fc63 Mon Sep 17 00:00:00 2001 From: Lukacma Date: Fri, 23 Feb 2024 15:40:44 +0000 Subject: [PATCH 182/546] [AArch64][SVE] Add intrinsincs to assembly mapping for svpmov (#81861) This patch enables translation of svpmov intrinsic to the correct assembly instruction, instead of function call. --- llvm/include/llvm/IR/IntrinsicsAArch64.td | 46 ++++++----- .../AArch64/sve2p1-intrinsics-pmov-to-pred.ll | 82 +++---------------- .../sve2p1-intrinsics-pmov-to-vector.ll | 45 ++-------- 3 files changed, 47 insertions(+), 126 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 921e5b95ae03e..6b045e412cd51 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1367,6 +1367,27 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; + + class SVE2_1VectorArg_Pred_Intrinsic + : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [llvm_anyvector_ty], + [IntrNoMem]>; + + class SVE2_1VectorArgIndexed_Pred_Intrinsic + : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [llvm_anyvector_ty, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + + class SVE2_Pred_1VectorArgIndexed_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty], + [IntrNoMem, ImmArg>]>; + + class SVE2_Pred_1VectorArg_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [IntrNoMem]>; // NOTE: There is no relationship between these intrinsics beyond an attempt // to reuse currently identical class definitions. @@ -3610,23 +3631,10 @@ def int_aarch64_sve_extq : AdvSIMD_2VectorArgIndexed_Intrinsic; // // SVE2.1 - Move predicate to/from vector // -def int_aarch64_sve_pmov_to_pred_lane : - DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [llvm_anyvector_ty, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; - -def int_aarch64_sve_pmov_to_pred_lane_zero : - DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [llvm_anyvector_ty], - [IntrNoMem]>; - -def int_aarch64_sve_pmov_to_vector_lane_merging : - DefaultAttrsIntrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty], - [IntrNoMem, ImmArg>]>; +def int_aarch64_sve_pmov_to_pred_lane : SVE2_1VectorArgIndexed_Pred_Intrinsic; + +def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic; -def int_aarch64_sve_pmov_to_vector_lane_zeroing : - DefaultAttrsIntrinsic<[llvm_anyvector_ty], - [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], - [IntrNoMem]>; +def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic; + +def 
int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic; \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll index 7cae1d2c216b6..a592dcd4b8ce9 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll @@ -4,12 +4,7 @@ define @test_pmov_to_pred_i8( %zn) { ; CHECK-LABEL: test_pmov_to_pred_i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: pmov p0.b, z0 ; CHECK-NEXT: ret entry: %res = call @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8( %zn, i32 0) @@ -19,27 +14,10 @@ define @test_pmov_to_pred_i8( %zn) { define @test_pmov_to_pred_i16( %zn) { ; CHECK-LABEL: test_pmov_to_pred_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: mov z8.d, z0.d -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16 -; CHECK-NEXT: mov z0.d, z8.d -; CHECK-NEXT: mov w0, #1 // =0x1 -; CHECK-NEXT: mov p4.b, p0.b -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16 -; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: pmov p1.h, z0[0] +; CHECK-NEXT: pmov p2.h, z0[1] +; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b ; CHECK-NEXT: ret entry: %res1 = call @llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16( %zn, i32 0) @@ -52,27 +30,10 @@ define @test_pmov_to_pred_i16( %zn) { define @test_pmov_to_pred_i32( %zn) { ; CHECK-LABEL: test_pmov_to_pred_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: mov z8.d, z0.d -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32 -; CHECK-NEXT: mov z0.d, z8.d -; CHECK-NEXT: mov w0, #3 // =0x3 -; CHECK-NEXT: mov p4.b, p0.b -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32 -; CHECK-NEXT: ptrue p1.s -; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: pmov p1.s, z0[0] +; CHECK-NEXT: pmov p2.s, z0[3] +; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b ; CHECK-NEXT: ret entry: %res1 = call @llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32( %zn, i32 0) @@ -85,27 +46,10 @@ define @test_pmov_to_pred_i32( %zn) { define @test_pmov_to_pred_i64( %zn) { ; CHECK-LABEL: test_pmov_to_pred_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: mov z8.d, z0.d -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64 -; CHECK-NEXT: mov z0.d, z8.d -; CHECK-NEXT: mov w0, #7 // =0x7 -; CHECK-NEXT: mov p4.b, p0.b -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64 -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: pmov p1.d, z0[0] +; CHECK-NEXT: pmov p2.d, z0[7] +; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b ; CHECK-NEXT: ret entry: %res1 = call @llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64( %zn, i32 0) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll index 58b240b0fbd68..b7f36c6510233 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll @@ -6,12 +6,7 @@ define @test_pmov_to_vector_i16( %zn, %pn) { ; CHECK-LABEL: test_pmov_to_vector_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov w0, #1 // =0x1 -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: pmov z0[1], p0.h ; CHECK-NEXT: ret entry: %res = call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16( %zn, %pn, i32 1) @@ -21,12 +16,7 @@ define @test_pmov_to_vector_i16( %zn, @test_pmov_to_vector_i32( %zn, %pn) { ; CHECK-LABEL: test_pmov_to_vector_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov w0, #3 // =0x3 -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: pmov z0[3], p0.s ; CHECK-NEXT: ret entry: %res = call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32( %zn, %pn, i32 3) @@ -36,12 +26,7 @@ define @test_pmov_to_vector_i32( %zn, @test_pmov_to_vector_i64( %zn, %pn) { ; CHECK-LABEL: test_pmov_to_vector_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov w0, #7 // =0x7 -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: pmov z0[7], p0.d ; CHECK-NEXT: ret entry: %res = call @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64( %zn, %pn, i32 7) @@ -54,11 +39,7 @@ define @test_pmov_to_vector_i64( %zn, @test_pmov_to_vector_zero_i8( %pn) { ; CHECK-LABEL: test_pmov_to_vector_zero_i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: pmov z0, p0.b ; CHECK-NEXT: ret entry: %res = call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8( %pn) @@ -68,11 +49,7 @@ define @test_pmov_to_vector_zero_i8( %pn) { define @test_pmov_to_vector_zero_i16( %pn) { ; CHECK-LABEL: test_pmov_to_vector_zero_i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: pmov z0[0], p0.h ; CHECK-NEXT: ret entry: %res = call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16( %pn) @@ -82,11 +59,7 @@ define @test_pmov_to_vector_zero_i16( %pn) { define @test_pmov_to_vector_zero_i32( %pn) { ; CHECK-LABEL: test_pmov_to_vector_zero_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: pmov z0[0], p0.s ; CHECK-NEXT: ret entry: %res = call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32( %pn) @@ -96,11 +69,7 @@ define @test_pmov_to_vector_zero_i32( %pn) { define @test_pmov_to_vector_zero_i64( %pn) { ; CHECK-LABEL: test_pmov_to_vector_zero_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: pmov z0[0], p0.d ; CHECK-NEXT: ret entry: %res = call @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64( %pn) From 55bc0488af077acb47be70542718d1bc17f3de4f Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 23 Feb 2024 08:00:58 -0800 Subject: [PATCH 183/546] Improve and modernize logging for Process::CompleteAttach() (#82717) Target::SetArchitecture() does not necessarily set the triple that is being passed in, and will unconditionally log the real architecture to the log channel. By flipping the order between the log outputs, the resulting combined log makes a lot more sense to read. --- lldb/source/Target/Process.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 23a8a66645c02..137795cb8cec9 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -2937,14 +2937,11 @@ void Process::CompleteAttach() { DidAttach(process_arch); if (process_arch.IsValid()) { + LLDB_LOG(log, + "Process::{0} replacing process architecture with DidAttach() " + "architecture: \"{1}\"", + __FUNCTION__, process_arch.GetTriple().getTriple()); GetTarget().SetArchitecture(process_arch); - if (log) { - const char *triple_str = process_arch.GetTriple().getTriple().c_str(); - LLDB_LOGF(log, - "Process::%s replacing process architecture with DidAttach() " - "architecture: %s", - __FUNCTION__, triple_str ? triple_str : ""); - } } // We just attached. 
If we have a platform, ask it for the process From 5840aa95e3c2d93f400e638e7cbf167a693c75f5 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 23 Feb 2024 17:26:39 +0100 Subject: [PATCH 184/546] [mlir][Transforms] Fix crash in dialect conversion (#82783) This is a follow-up to #82333. It is possible that the target block of a `BlockTypeConversionRewrite` is detached, so the `MLIRContext` cannot be taken from the block. --- mlir/lib/Transforms/Utils/DialectConversion.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index d015bd5290123..857b601acbc35 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -1034,15 +1034,15 @@ void BlockTypeConversionRewrite::rollback() { LogicalResult BlockTypeConversionRewrite::materializeLiveConversions( function_ref findLiveUser) { - auto builder = OpBuilder::atBlockBegin(block, /*listener=*/&rewriterImpl); - // Process the remapping for each of the original arguments. for (auto it : llvm::enumerate(origBlock->getArguments())) { - OpBuilder::InsertionGuard g(builder); + BlockArgument origArg = it.value(); + // Note: `block` may be detached, so OpBuilder::atBlockBegin cannot be used. + OpBuilder builder(it.value().getContext(), /*listener=*/&rewriterImpl); + builder.setInsertionPointToStart(block); // If the type of this argument changed and the argument is still live, we // need to materialize a conversion. 
- BlockArgument origArg = it.value(); if (rewriterImpl.mapping.lookupOrNull(origArg, origArg.getType())) continue; Operation *liveUser = findLiveUser(origArg); @@ -1321,7 +1321,7 @@ LogicalResult ConversionPatternRewriterImpl::convertNonEntryRegionTypes( Block *ConversionPatternRewriterImpl::applySignatureConversion( Block *block, const TypeConverter *converter, TypeConverter::SignatureConversion &signatureConversion) { - MLIRContext *ctx = block->getParentOp()->getContext(); + MLIRContext *ctx = eraseRewriter.getContext(); // If no arguments are being changed or added, there is nothing to do. unsigned origArgCount = block->getNumArguments(); From 0b01320d28235ff54a98681414c7dd6024d348a7 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 23 Feb 2024 16:54:28 +0000 Subject: [PATCH 185/546] [VPlan] Remove unused VPTransformState::CanonicalIV (NFCI). Clean up unused member variable. --- llvm/lib/Transforms/Vectorize/VPlan.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 240d4bd628b05..a2a203c420051 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -385,9 +385,6 @@ struct VPTransformState { VPValue2ValueTy VPValue2Value; - /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF). - Value *CanonicalIV = nullptr; - /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods. 
InnerLoopVectorizer *ILV; From 1408667fdd890edf7507ae2052360de20d81c19f Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Fri, 23 Feb 2024 16:42:17 +0000 Subject: [PATCH 186/546] [mlir][ArmSME] Follow MLIR constant style in VectorLegalization.cpp (NFC) --- .../ArmSME/Transforms/VectorLegalization.cpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp index e88f82c92eba7..26dfb38263372 100644 --- a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp +++ b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp @@ -40,12 +40,12 @@ namespace { //===----------------------------------------------------------------------===// // Common match failure reasons. -static constexpr StringLiteral MATCH_FAILURE_NOT_SME_TILE_TYPE_MULTIPLE( +static constexpr StringLiteral kMatchFailureNotSMETileTypeMultiple( "op vector size is not multiple of SME tiles"); -static constexpr StringLiteral MATCH_FAILURE_UNSUPPORTED_MASK_OP( +static constexpr StringLiteral kMatchFailureUnsupportedMaskOp( "op mask is unsupported for legalization/decomposition"); static constexpr StringLiteral - MATCH_FAILURE_NON_PERMUTATION_MAP("op affine map is not a permutation"); + kMatchFailureNonPermutationMap("op affine map is not a permutation"); /// An SMESubTile represents a single SME-sized sub-tile from decomposing a /// larger vector type. 
The (`row`, `col`) are the position of the tile in the @@ -174,8 +174,8 @@ struct LegalizeVectorOuterProductOpsByDecomposition OneToNPatternRewriter &rewriter) const override { auto vectorType = outerProductOp.getResultVectorType(); if (!isMultipleOfSMETileVectorType(vectorType)) - return rewriter.notifyMatchFailure( - outerProductOp, MATCH_FAILURE_NOT_SME_TILE_TYPE_MULTIPLE); + return rewriter.notifyMatchFailure(outerProductOp, + kMatchFailureNotSMETileTypeMultiple); Value mask; Operation *rootOp = outerProductOp; @@ -188,7 +188,7 @@ struct LegalizeVectorOuterProductOpsByDecomposition if (!isSupportedMaskOp(mask)) return rewriter.notifyMatchFailure(outerProductOp, - MATCH_FAILURE_UNSUPPORTED_MASK_OP); + kMatchFailureUnsupportedMaskOp); ValueRange accSMETiles = adaptor.getAcc(); auto smeTileType = getSMETileTypeForElement(vectorType.getElementType()); @@ -252,18 +252,18 @@ struct LegalizeTransferReadOpsByDecomposition OneToNPatternRewriter &rewriter) const override { auto vectorType = readOp.getVectorType(); if (!isMultipleOfSMETileVectorType(vectorType)) - return rewriter.notifyMatchFailure( - readOp, MATCH_FAILURE_NOT_SME_TILE_TYPE_MULTIPLE); + return rewriter.notifyMatchFailure(readOp, + kMatchFailureNotSMETileTypeMultiple); auto mask = readOp.getMask(); if (!isSupportedMaskOp(mask)) return rewriter.notifyMatchFailure(readOp, - MATCH_FAILURE_UNSUPPORTED_MASK_OP); + kMatchFailureUnsupportedMaskOp); auto permutationMap = readOp.getPermutationMap(); if (!permutationMap.isPermutation()) return rewriter.notifyMatchFailure(readOp, - MATCH_FAILURE_NON_PERMUTATION_MAP); + kMatchFailureNonPermutationMap); // Note: For 2D vector types the only non-identity permutation is a simple // tranpose [1, 0]. 
@@ -300,18 +300,18 @@ struct LegalizeTransferWriteOpsByDecomposition OneToNPatternRewriter &rewriter) const override { auto vectorType = writeOp.getVectorType(); if (!isMultipleOfSMETileVectorType(vectorType)) - return rewriter.notifyMatchFailure( - writeOp, MATCH_FAILURE_NOT_SME_TILE_TYPE_MULTIPLE); + return rewriter.notifyMatchFailure(writeOp, + kMatchFailureNotSMETileTypeMultiple); auto mask = writeOp.getMask(); if (!isSupportedMaskOp(mask)) return rewriter.notifyMatchFailure(writeOp, - MATCH_FAILURE_UNSUPPORTED_MASK_OP); + kMatchFailureUnsupportedMaskOp); auto permutationMap = writeOp.getPermutationMap(); if (!permutationMap.isPermutation()) return rewriter.notifyMatchFailure(writeOp, - MATCH_FAILURE_NON_PERMUTATION_MAP); + kMatchFailureNonPermutationMap); // Note: For 2D vector types the only non-identity permutation is a simple // tranpose [1, 0]. From 24e7be426efe142c49bfab5cb278ffa313424176 Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Fri, 23 Feb 2024 09:28:17 -0800 Subject: [PATCH 187/546] [NFC] clean up memtag-stack code (#80906) we would replace the alloca with tagp for debug instructions, then replace it back with the original alloca. it's easier to just skip the replacement. 
--- llvm/lib/Target/AArch64/AArch64StackTagging.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index 961dded132343..ef7c517732ef3 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -21,7 +21,6 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/StackSafetyAnalysis.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" @@ -520,7 +519,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { for (auto &I : SInfo.AllocasToInstrument) { memtag::AllocaInfo &Info = I.second; assert(Info.AI && SIB.isInterestingAlloca(*Info.AI)); - TrackingVH OldAI = Info.AI; memtag::alignAndPadAlloca(Info, kTagGranuleSize); AllocaInst *AI = Info.AI; int Tag = NextTag; @@ -534,7 +532,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { ConstantInt::get(IRB.getInt64Ty(), Tag)}); if (Info.AI->hasName()) TagPCall->setName(Info.AI->getName() + ".tag"); - Info.AI->replaceAllUsesWith(TagPCall); + // Does not replace metadata, so we don't have to handle DPValues. + Info.AI->replaceNonMetadataUsesWith(TagPCall); TagPCall->setOperand(0, Info.AI); // Calls to functions that may return twice (e.g. setjmp) confuse the @@ -574,12 +573,6 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { for (auto *II : Info.LifetimeEnd) II->eraseFromParent(); } - - // Fixup debug intrinsics to point to the new alloca. 
- for (auto *DVI : Info.DbgVariableIntrinsics) - DVI->replaceVariableLocationOp(OldAI, Info.AI); - for (auto *DPV : Info.DbgVariableRecords) - DPV->replaceVariableLocationOp(OldAI, Info.AI); } // If we have instrumented at least one alloca, all unrecognized lifetime From dfa1d9b027e677cf1379dffee0059261a34f3481 Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Fri, 23 Feb 2024 19:34:55 +0200 Subject: [PATCH 188/546] [AMDGPU][NFC] Have helpers to deal with encoding fields. (#82772) These are hoped to provide more convenient and less error prone facilities to encode and decode fields than manually defined constants and functions. --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 5 +-- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 6 +-- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 2 +- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 9 +--- llvm/lib/Target/AMDGPU/SIDefines.h | 21 --------- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 15 +++---- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 25 +++++------ llvm/lib/Target/AMDGPU/SIModeRegister.cpp | 15 +++---- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 20 ++------- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 45 ++++++++++++++++--- 10 files changed, 74 insertions(+), 89 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 0d3b158a0df73..13d7510729139 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4824,9 +4824,8 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, return true; } -static const unsigned SPDenormModeBitField = - AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | - (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); +static constexpr unsigned SPDenormModeBitField = + AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2); // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions // to enable denorm mode. 
When 'Enable' is false, disable denorm mode. diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 5b32b34079f44..b7b471d8dc7b3 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -7272,11 +7272,11 @@ ParseStatus AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { if (trySkipId("hwreg", AsmToken::LParen)) { OperandInfoTy HwReg(OPR_ID_UNKNOWN); - OperandInfoTy Offset(OFFSET_DEFAULT_); - OperandInfoTy Width(WIDTH_DEFAULT_); + OperandInfoTy Offset(HwregOffset::Default); + OperandInfoTy Width(HwregSize::Default); if (parseHwregBody(HwReg, Offset, Width) && validateHwreg(HwReg, Offset, Width)) { - ImmVal = encodeHwreg(HwReg.Id, Offset.Id, Width.Id); + ImmVal = HwregEncoding::encode(HwReg.Id, Offset.Id, Width.Id); } else { return ParseStatus::Failure; } diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index a727134b53dec..00fa93cc1923a 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -177,7 +177,7 @@ static bool isLdsDma(const MachineInstr &MI) { static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16); - return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_; + return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm())); } ScheduleHazardRecognizer::HazardType diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index a45fea6701f35..a32be1e50a605 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1778,13 +1778,9 @@ void AMDGPUInstPrinter::printSDelayALU(const MCInst *MI, unsigned OpNo, void 
AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - unsigned Id; - unsigned Offset; - unsigned Width; - using namespace llvm::AMDGPU::Hwreg; unsigned Val = MI->getOperand(OpNo).getImm(); - decodeHwreg(Val, Id, Offset, Width); + auto [Id, Offset, Width] = HwregEncoding::decode(Val); StringRef HwRegName = getHwreg(Id, STI); O << "hwreg("; @@ -1793,9 +1789,8 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo, } else { O << Id; } - if (Width != WIDTH_DEFAULT_ || Offset != OFFSET_DEFAULT_) { + if (Width != HwregSize::Default || Offset != HwregOffset::Default) O << ", " << Offset << ", " << Width; - } O << ')'; } diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 98310c3f70c4a..0b516bfffb9b6 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -549,33 +549,12 @@ enum Id { // HwRegCode, (6) [5:0] ID_SQ_PERF_SNAPSHOT_DATA1 = 22, ID_SQ_PERF_SNAPSHOT_PC_LO = 23, ID_SQ_PERF_SNAPSHOT_PC_HI = 24, - - ID_SHIFT_ = 0, - ID_WIDTH_ = 6, - ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) }; enum Offset : unsigned { // Offset, (5) [10:6] - OFFSET_DEFAULT_ = 0, - OFFSET_SHIFT_ = 6, - OFFSET_WIDTH_ = 5, - OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_), - OFFSET_MEM_VIOL = 8, }; -enum WidthMinusOne : unsigned { // WidthMinusOne, (5) [15:11] - WIDTH_M1_DEFAULT_ = 31, - WIDTH_M1_SHIFT_ = 11, - WIDTH_M1_WIDTH_ = 5, - WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_), -}; - -// Some values from WidthMinusOne mapped into Width domain. 
-enum Width : unsigned { - WIDTH_DEFAULT_ = WIDTH_M1_DEFAULT_ + 1, -}; - enum ModeRegisterMasks : uint32_t { FP_ROUND_MASK = 0xf << 0, // Bits 0..3 FP_DENORM_MASK = 0xf << 4, // Bits 4..7 diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index d02aee71870ec..4f106bf0dfb11 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -478,14 +478,13 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit( .addImm(0); Addc->getOperand(3).setIsDead(); // Mark SCC as dead. - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). - addReg(FlatScrInitLo). - addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO | - (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)). - addReg(FlatScrInitHi). - addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI | - (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_))); + using namespace AMDGPU::Hwreg; + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)) + .addReg(FlatScrInitLo) + .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32))); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)) + .addReg(FlatScrInitHi) + .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32))); return; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 257dff6ef6839..d8f528d866118 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3960,7 +3960,7 @@ SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op, assert(Op.getValueType() == MVT::i32); uint32_t BothRoundHwReg = - AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_MODE, 0, 4); + AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4); SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32); SDValue IntrinID = @@ -4195,8 +4195,8 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock::iterator I = LoopBB->end(); - const 
unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg( - AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1); + const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode( + AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1); // Clear TRAP_STS.MEM_VIOL BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32)) @@ -4999,18 +4999,16 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( // Otherwise there was overflow and the result is hi2:0. In both cases the // result should represent the actual time at some point during the sequence // of three getregs. + using namespace AMDGPU::Hwreg; Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1) - .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI, - 0, 32)); + .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32)); Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1) - .addImm( - AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES, 0, 32)); + .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32)); Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2) - .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI, - 0, 32)); + .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32)); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) .addReg(RegHi1) .addReg(RegHi2); @@ -5207,8 +5205,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( // FIXME: This could be predicates on the immediate, but tablegen doesn't // allow you to have a no side effect instruction in the output of a // sideeffecting pattern. 
- unsigned ID, Offset, Width; - AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width); + auto [ID, Offset, Width] = + AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm()); if (ID != AMDGPU::Hwreg::ID_MODE) return BB; @@ -10495,9 +10493,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags); - const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE | - (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | - (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); + using namespace AMDGPU::Hwreg; + const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2); const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32); const MachineFunction &MF = DAG.getMachineFunction(); diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp index e62ad026dc5c8..c01b1266a5530 100644 --- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp @@ -225,11 +225,10 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI, unsigned Offset = llvm::countr_zero(InstrMode.Mask); unsigned Width = llvm::countr_one(InstrMode.Mask >> Offset); unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1); + using namespace AMDGPU::Hwreg; BuildMI(MBB, MI, nullptr, TII->get(AMDGPU::S_SETREG_IMM32_B32)) .addImm(Value) - .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) | - (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) | - (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_)); + .addImm(HwregEncoding::encode(ID_MODE, Offset, Width)); ++NumSetregInserted; Changed = true; InstrMode.Mask &= ~(((1 << Width) - 1) << Offset); @@ -276,15 +275,11 @@ void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB, // as we assume it has been inserted by a higher authority (this is // likely to be a very rare occurrence). 
unsigned Dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); - if (((Dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) != - AMDGPU::Hwreg::ID_MODE) + using namespace AMDGPU::Hwreg; + auto [Id, Offset, Width] = HwregEncoding::decode(Dst); + if (Id != ID_MODE) continue; - unsigned Width = ((Dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >> - AMDGPU::Hwreg::WIDTH_M1_SHIFT_) + - 1; - unsigned Offset = - (Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_; unsigned Mask = maskTrailingOnes(Width) << Offset; // If an InsertionPoint is set we will insert a setreg there. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index dacdf7b5cd9a8..ce91e05e5cc81 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1698,22 +1698,14 @@ int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI) { return (Idx < 0) ? Idx : Opr[Idx].Encoding; } -bool isValidHwreg(int64_t Id) { - return 0 <= Id && isUInt(Id); -} +bool isValidHwreg(int64_t Id) { return 0 <= Id && isUInt(Id); } bool isValidHwregOffset(int64_t Offset) { - return 0 <= Offset && isUInt(Offset); + return 0 <= Offset && isUInt(Offset); } bool isValidHwregWidth(int64_t Width) { - return 0 <= (Width - 1) && isUInt(Width - 1); -} - -uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) { - return (Id << ID_SHIFT_) | - (Offset << OFFSET_SHIFT_) | - ((Width - 1) << WIDTH_M1_SHIFT_); + return 0 <= (Width - 1) && isUInt(Width - 1); } StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) { @@ -1721,12 +1713,6 @@ StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) { return (Idx < 0) ? 
"" : Opr[Idx].Name; } -void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width) { - Id = (Val & ID_MASK_) >> ID_SHIFT_; - Offset = (Val & OFFSET_MASK_) >> OFFSET_SHIFT_; - Width = ((Val & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1; -} - } // namespace Hwreg //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index b38016a581603..6826cd2731950 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -322,6 +322,35 @@ getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs, } // end namespace IsaInfo +// Represents a field in an encoded value. +template +struct EncodingField { + static_assert(HighBit >= LowBit, "Invalid bit range!"); + static constexpr unsigned Offset = LowBit; + static constexpr unsigned Width = HighBit - LowBit + 1; + + using ValueType = unsigned; + static constexpr ValueType Default = D; + + ValueType Value; + constexpr EncodingField(ValueType Value) : Value(Value) {} + + constexpr uint64_t encode() const { return Value; } + static ValueType decode(uint64_t Encoded) { return Encoded; } +}; + +// A helper for encoding and decoding multiple fields. +template struct EncodingFields { + static constexpr uint64_t encode(Fields... 
Values) { + return ((Values.encode() << Values.Offset) | ...); + } + + static std::tuple decode(uint64_t Encoded) { + return {Fields::decode((Encoded >> Fields::Offset) & + maxUIntN(Fields::Width))...}; + } +}; + LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx); @@ -1021,6 +1050,17 @@ unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded); namespace Hwreg { +using HwregId = EncodingField<5, 0>; +using HwregOffset = EncodingField<10, 6>; + +struct HwregSize : EncodingField<15, 11, 32> { + using EncodingField::EncodingField; + constexpr uint64_t encode() const { return Value - 1; } + static ValueType decode(uint64_t Encoded) { return Encoded + 1; } +}; + +using HwregEncoding = EncodingFields; + LLVM_READONLY int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI); @@ -1033,14 +1073,9 @@ bool isValidHwregOffset(int64_t Offset); LLVM_READNONE bool isValidHwregWidth(int64_t Width); -LLVM_READNONE -uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width); - LLVM_READNONE StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI); -void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width); - } // namespace Hwreg namespace DepCtr { From 0673fb6e773b0a37802208be4f666cef1f6b3470 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Fri, 23 Feb 2024 09:53:55 -0800 Subject: [PATCH 189/546] [hwasan] Add missing printf parameter in __hwasan_handle_longjmp (#82559) The diagnostic message had four format specifiers but only three parameters. This patch adds what I assume to be the missing parameter. 
--- compiler-rt/lib/hwasan/hwasan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/hwasan/hwasan.cpp b/compiler-rt/lib/hwasan/hwasan.cpp index 52780becbdb26..ccdc0b4bc21bd 100644 --- a/compiler-rt/lib/hwasan/hwasan.cpp +++ b/compiler-rt/lib/hwasan/hwasan.cpp @@ -692,7 +692,7 @@ void __hwasan_handle_longjmp(const void *sp_dst) { "WARNING: HWASan is ignoring requested __hwasan_handle_longjmp: " "stack top: %p; target %p; distance: %p (%zd)\n" "False positive error reports may follow\n", - (void *)sp, (void *)dst, dst - sp); + (void *)sp, (void *)dst, dst - sp, dst - sp); return; } TagMemory(sp, dst - sp, 0); From 0352d5eee06c214681696395a0442006e6d16656 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 23 Feb 2024 11:59:46 -0600 Subject: [PATCH 190/546] [libc][NFC] Remove redundant external clock symbol for AMDGPU (#82794) Summary: The AMDGPU target needs an external clock symbol so the driver can set the frequency with the correct value. This was left over from the previous implementation and I forgot to remove it when actually implementing the timing utilities. --- libc/startup/gpu/amdgpu/start.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp index 9d7f04c10b488..dcef719d169d0 100644 --- a/libc/startup/gpu/amdgpu/start.cpp +++ b/libc/startup/gpu/amdgpu/start.cpp @@ -15,12 +15,6 @@ extern "C" int main(int argc, char **argv, char **envp); namespace LIBC_NAMESPACE { -// The AMDGPU architecture provides a fixed frequency clock used for obtaining -// real time. However, the frequency of this clock varies between cards and can -// only be obtained via the driver. The loader will set this so we can use it. 
-extern "C" [[gnu::visibility("protected")]] uint64_t - [[clang::address_space(4)]] __llvm_libc_clock_freq = 0; - extern "C" uintptr_t __init_array_start[]; extern "C" uintptr_t __init_array_end[]; extern "C" uintptr_t __fini_array_start[]; From 640ba3f8d1dcf25d8b34ce463fb6a7d58e7dc998 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 23 Feb 2024 12:00:01 -0600 Subject: [PATCH 191/546] [libc] Fix standard cross build targeting the GPU (#82724) Summary: The GPU target has recently been changed to support standard `libc` build rules. This means we should be able to build for it both in `LLVM_ENABLE_PROJECTS` mode, or targeting the runtimes directory directly as in the LLVM `libc` documentation. Previously this failed because the version check on the compiler was too strict and the `--target=` options were not being set on the link jobs unless in CMake cross compiliation mode. This patch fixes those so the following config should work now to build the GPU target directly if using NVPTX. 
``` cmake ../runtimes -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang \ -DLLVM_ENABLE_RUNTIMES=libc -DLLVM_RUNTIMES_TARGET=nvptx64-nvidia-cuda \ -DLLVM_DEFAULT_TARGET_TRIPLE=nvptx64-nvidia-cuda \ -DLIBC_HDRGEN_EXE=/path/to/hdrgen/libc-hdrgen \ -DLLVM_LIBC_FULL_BUILD=ON -GNinja ``` --- libc/cmake/modules/LLVMLibCTestRules.cmake | 4 ++++ libc/cmake/modules/prepare_libc_gpu_build.cmake | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 1166c26e4d8a5..9d1e426a5b7b3 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -463,6 +463,7 @@ function(add_integration_test test_name) if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) target_link_options(${fq_build_target_name} PRIVATE + ${LIBC_COMPILE_OPTIONS_DEFAULT} -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}") @@ -470,6 +471,7 @@ function(add_integration_test test_name) # We need to use the internal object versions for NVPTX. set(internal_suffix ".__internal__") target_link_options(${fq_build_target_name} PRIVATE + ${LIBC_COMPILE_OPTIONS_DEFAULT} "-Wl,--suppress-stack-size-warning" -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static "--cuda-path=${LIBC_CUDA_ROOT}") @@ -644,6 +646,7 @@ function(add_libc_hermetic_test test_name) if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) target_link_options(${fq_build_target_name} PRIVATE + ${LIBC_COMPILE_OPTIONS_DEFAULT} -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}") @@ -651,6 +654,7 @@ function(add_libc_hermetic_test test_name) # We need to use the internal object versions for NVPTX. 
set(internal_suffix ".__internal__") target_link_options(${fq_build_target_name} PRIVATE + ${LIBC_COMPILE_OPTIONS_DEFAULT} "-Wl,--suppress-stack-size-warning" -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static "--cuda-path=${LIBC_CUDA_ROOT}") diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake index 752182f67cc01..548990a1c18b8 100644 --- a/libc/cmake/modules/prepare_libc_gpu_build.cmake +++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake @@ -5,8 +5,8 @@ endif() # Ensure the compiler is a valid clang when building the GPU target. set(req_ver "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}") -if(NOT (CMAKE_CXX_COMPILER_ID MATCHES "[Cc]lang" AND - ${CMAKE_CXX_COMPILER_VERSION} VERSION_EQUAL "${req_ver}")) +if(LLVM_VERSION_MAJOR AND NOT (CMAKE_CXX_COMPILER_ID MATCHES "[Cc]lang" AND + ${CMAKE_CXX_COMPILER_VERSION} VERSION_EQUAL "${req_ver}")) message(FATAL_ERROR "Cannot build libc for GPU. CMake compiler " "'${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}' " " is not 'Clang ${req_ver}'.") From 3e9e5e277129041fe781f1f2bb04f69269d3fa1f Mon Sep 17 00:00:00 2001 From: "Kevin P. Neal" Date: Fri, 23 Feb 2024 12:58:34 -0500 Subject: [PATCH 192/546] [FPEnv][SystemZ] Correct strictfp test. Correct llvm-reduce strictfp test to follow the rules documented in the LangRef: https://llvm.org/docs/LangRef.html#constrained-floating-point-intrinsics This test needed the strictfp attribute added to function definitions. Test changes verified with D146845. 
--- .../test/CodeGen/SystemZ/fp-strict-conv-17.ll | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll index eb0ff4b825609..3ff63242a6d82 100644 --- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll +++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll @@ -20,7 +20,7 @@ declare i128 @llvm.experimental.constrained.fptoui.i128.f64(double, metadata) declare i128 @llvm.experimental.constrained.fptoui.i128.f32(float, metadata) ; Test signed i128->f128. -define fp128 @f1(i128 %i) { +define fp128 @f1(i128 %i) #0 { ; CHECK-LABEL: f1: ; CHECK: brasl %r14, __floattitf@PLT ; CHECK: br %r14 @@ -31,7 +31,7 @@ define fp128 @f1(i128 %i) { } ; Test signed i128->f64. -define double @f2(i128 %i) { +define double @f2(i128 %i) #0 { ; CHECK-LABEL: f2: ; CHECK: brasl %r14, __floattidf@PLT ; CHECK: br %r14 @@ -42,7 +42,7 @@ define double @f2(i128 %i) { } ; Test signed i128->f32. -define float @f3(i128 %i) { +define float @f3(i128 %i) #0 { ; CHECK-LABEL: f3: ; CHECK: brasl %r14, __floattisf@PLT ; CHECK: br %r14 @@ -53,7 +53,7 @@ define float @f3(i128 %i) { } ; Test unsigned i128->f128. -define fp128 @f4(i128 %i) { +define fp128 @f4(i128 %i) #0 { ; CHECK-LABEL: f4: ; CHECK: brasl %r14, __floatuntitf@PLT ; CHECK: br %r14 @@ -64,7 +64,7 @@ define fp128 @f4(i128 %i) { } ; Test unsigned i128->f64. -define double @f5(i128 %i) { +define double @f5(i128 %i) #0 { ; CHECK-LABEL: f5: ; CHECK: brasl %r14, __floatuntidf@PLT ; CHECK: br %r14 @@ -75,7 +75,7 @@ define double @f5(i128 %i) { } ; Test unsigned i128->f32. -define float @f6(i128 %i) { +define float @f6(i128 %i) #0 { ; CHECK-LABEL: f6: ; CHECK: brasl %r14, __floatuntisf@PLT ; CHECK: br %r14 @@ -86,7 +86,7 @@ define float @f6(i128 %i) { } ; Test signed f128->i128. 
-define i128 @f7(fp128 %f) { +define i128 @f7(fp128 %f) #0 { ; CHECK-LABEL: f7: ; CHECK: brasl %r14, __fixtfti@PLT ; CHECK: br %r14 @@ -96,7 +96,7 @@ define i128 @f7(fp128 %f) { } ; Test signed f64->i128. -define i128 @f8(double %f) { +define i128 @f8(double %f) #0 { ; CHECK-LABEL: f8: ; CHECK: brasl %r14, __fixdfti@PLT ; CHECK: br %r14 @@ -106,7 +106,7 @@ define i128 @f8(double %f) { } ; Test signed f9->i128. -define i128 @f9(float %f) { +define i128 @f9(float %f) #0 { ; CHECK-LABEL: f9: ; CHECK: brasl %r14, __fixsfti@PLT ; CHECK: br %r14 @@ -116,7 +116,7 @@ define i128 @f9(float %f) { } ; Test unsigned f128->i128. -define i128 @f10(fp128 %f) { +define i128 @f10(fp128 %f) #0 { ; CHECK-LABEL: f10: ; CHECK: brasl %r14, __fixunstfti@PLT ; CHECK: br %r14 @@ -126,7 +126,7 @@ define i128 @f10(fp128 %f) { } ; Test unsigned f64->i128. -define i128 @f11(double %f) { +define i128 @f11(double %f) #0 { ; CHECK-LABEL: f11: ; CHECK: brasl %r14, __fixunsdfti@PLT ; CHECK: br %r14 @@ -136,7 +136,7 @@ define i128 @f11(double %f) { } ; Test unsigned f32->i128. -define i128 @f12(float %f) { +define i128 @f12(float %f) #0 { ; CHECK-LABEL: f12: ; CHECK: brasl %r14, __fixunssfti@PLT ; CHECK: br %r14 From 8fe4487e23e543568745ef461660b1d288805b81 Mon Sep 17 00:00:00 2001 From: erichkeane Date: Fri, 23 Feb 2024 10:01:40 -0800 Subject: [PATCH 193/546] [OpenACC] Fix branch-in/out to not refer to a 'region' 'region' is not a term of art in OpenACC, so switch it to refer to 'Compute Construct', which is accurate/reflects the standard. 
--- clang/include/clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/test/SemaOpenACC/no-branch-in-out.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index ebda201361fb0..a7f2858477bee 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12204,5 +12204,5 @@ def err_acc_construct_appertainment : Error<"OpenACC construct '%0' cannot be used here; it can only " "be used in a statement context">; def err_acc_branch_in_out - : Error<"invalid branch %select{out of|into}0 OpenACC region">; + : Error<"invalid branch %select{out of|into}0 OpenACC Compute Construct">; } // end of sema component. diff --git a/clang/test/SemaOpenACC/no-branch-in-out.c b/clang/test/SemaOpenACC/no-branch-in-out.c index 622cf55f48473..33a171f1b68d5 100644 --- a/clang/test/SemaOpenACC/no-branch-in-out.c +++ b/clang/test/SemaOpenACC/no-branch-in-out.c @@ -14,7 +14,7 @@ void BreakContinue() { if (i == 2) continue; - break; // expected-error{{invalid branch out of OpenACC region}} + break; // expected-error{{invalid branch out of OpenACC Compute Construct}} } int j; @@ -22,7 +22,7 @@ void BreakContinue() { case 0: #pragma acc parallel { - break; // expected-error{{invalid branch out of OpenACC region}} + break; // expected-error{{invalid branch out of OpenACC Compute Construct}} } case 1: #pragma acc parallel @@ -34,7 +34,7 @@ void BreakContinue() { #pragma acc parallel for(int i = 0; i < 5; ++i) { if (i > 1) - break; // expected-error{{invalid branch out of OpenACC region}} + break; // expected-error{{invalid branch out of OpenACC Compute Construct}} } #pragma acc parallel @@ -54,7 +54,7 @@ void BreakContinue() { for (int i =0; i < 5; ++i) { #pragma acc parallel { - continue; // expected-error{{invalid branch out of OpenACC region}} + continue; // expected-error{{invalid branch out of OpenACC Compute 
Construct}} } } @@ -73,7 +73,7 @@ void BreakContinue() { for (int i =0; i < 5; ++i) { #pragma acc parallel { - break; // expected-error{{invalid branch out of OpenACC region}} + break; // expected-error{{invalid branch out of OpenACC Compute Construct}} } } @@ -81,14 +81,14 @@ void BreakContinue() { while (j) { --j; if (j > 4) - break; // expected-error{{invalid branch out of OpenACC region}} + break; // expected-error{{invalid branch out of OpenACC Compute Construct}} } #pragma acc parallel do { --j; if (j > 4) - break; // expected-error{{invalid branch out of OpenACC region}} + break; // expected-error{{invalid branch out of OpenACC Compute Construct}} } while (j ); } From 962a6970f2827bcdda574426701c7c57f79a1ccf Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 23 Feb 2024 10:15:49 -0800 Subject: [PATCH 194/546] [SelectionDAG] Remove unused VP strided load/store creation functions that build an MMO. (#82676) The base case of these call InferPtrInfo. This is dangerous due to #82657, but it turns out none of these are used. It seemed best to reduce the surface area until these are needed. 
--- llvm/include/llvm/CodeGen/SelectionDAG.h | 42 ---------- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 79 ++----------------- 2 files changed, 8 insertions(+), 113 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 7bb12d8f065c9..2fc1ceafa927f 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1476,49 +1476,14 @@ class SelectionDAG { SDValue getIndexedStoreVP(SDValue OrigStore, const SDLoc &dl, SDValue Base, SDValue Offset, ISD::MemIndexedMode AM); - SDValue getStridedLoadVP(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, - EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, - SDValue Offset, SDValue Stride, SDValue Mask, - SDValue EVL, MachinePointerInfo PtrInfo, EVT MemVT, - Align Alignment, MachineMemOperand::Flags MMOFlags, - const AAMDNodes &AAInfo, - const MDNode *Ranges = nullptr, - bool IsExpanding = false); - inline SDValue getStridedLoadVP( - ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &DL, - SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Stride, SDValue Mask, - SDValue EVL, MachinePointerInfo PtrInfo, EVT MemVT, - MaybeAlign Alignment = MaybeAlign(), - MachineMemOperand::Flags MMOFlags = MachineMemOperand::MONone, - const AAMDNodes &AAInfo = AAMDNodes(), const MDNode *Ranges = nullptr, - bool IsExpanding = false) { - // Ensures that codegen never sees a None Alignment. 
- return getStridedLoadVP(AM, ExtType, VT, DL, Chain, Ptr, Offset, Stride, - Mask, EVL, PtrInfo, MemVT, - Alignment.value_or(getEVTAlign(MemVT)), MMOFlags, - AAInfo, Ranges, IsExpanding); - } SDValue getStridedLoadVP(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Stride, SDValue Mask, SDValue EVL, EVT MemVT, MachineMemOperand *MMO, bool IsExpanding = false); - SDValue getStridedLoadVP(EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, - SDValue Stride, SDValue Mask, SDValue EVL, - MachinePointerInfo PtrInfo, MaybeAlign Alignment, - MachineMemOperand::Flags MMOFlags, - const AAMDNodes &AAInfo, - const MDNode *Ranges = nullptr, - bool IsExpanding = false); SDValue getStridedLoadVP(EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL, MachineMemOperand *MMO, bool IsExpanding = false); - SDValue - getExtStridedLoadVP(ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT, - SDValue Chain, SDValue Ptr, SDValue Stride, SDValue Mask, - SDValue EVL, MachinePointerInfo PtrInfo, EVT MemVT, - MaybeAlign Alignment, MachineMemOperand::Flags MMOFlags, - const AAMDNodes &AAInfo, bool IsExpanding = false); SDValue getExtStridedLoadVP(ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT, SDValue Chain, SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL, EVT MemVT, @@ -1532,13 +1497,6 @@ class SelectionDAG { MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating = false, bool IsCompressing = false); - SDValue getTruncStridedStoreVP(SDValue Chain, const SDLoc &DL, SDValue Val, - SDValue Ptr, SDValue Stride, SDValue Mask, - SDValue EVL, MachinePointerInfo PtrInfo, - EVT SVT, Align Alignment, - MachineMemOperand::Flags MMOFlags, - const AAMDNodes &AAInfo, - bool IsCompressing = false); SDValue getTruncStridedStoreVP(SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL, EVT SVT, MachineMemOperand *MMO, diff 
--git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index add92cf8b31e4..0ceda27d40665 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -9041,29 +9041,6 @@ SDValue SelectionDAG::getIndexedStoreVP(SDValue OrigStore, const SDLoc &dl, return V; } -SDValue SelectionDAG::getStridedLoadVP( - ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &DL, - SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Stride, SDValue Mask, - SDValue EVL, MachinePointerInfo PtrInfo, EVT MemVT, Align Alignment, - MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo, - const MDNode *Ranges, bool IsExpanding) { - assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); - - MMOFlags |= MachineMemOperand::MOLoad; - assert((MMOFlags & MachineMemOperand::MOStore) == 0); - // If we don't have a PtrInfo, infer the trivial frame index case to simplify - // clients. - if (PtrInfo.V.isNull()) - PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset); - - uint64_t Size = MemoryLocation::UnknownSize; - MachineFunction &MF = getMachineFunction(); - MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MMOFlags, Size, - Alignment, AAInfo, Ranges); - return getStridedLoadVP(AM, ExtType, VT, DL, Chain, Ptr, Offset, Stride, Mask, - EVL, MemVT, MMO, IsExpanding); -} - SDValue SelectionDAG::getStridedLoadVP( ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Stride, SDValue Mask, @@ -9098,17 +9075,6 @@ SDValue SelectionDAG::getStridedLoadVP( return V; } -SDValue SelectionDAG::getStridedLoadVP( - EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, SDValue Stride, - SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo, MaybeAlign Alignment, - MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo, - const MDNode *Ranges, bool IsExpanding) { - SDValue Undef = 
getUNDEF(Ptr.getValueType()); - return getStridedLoadVP(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, DL, Chain, Ptr, - Undef, Stride, Mask, EVL, PtrInfo, VT, Alignment, - MMOFlags, AAInfo, Ranges, IsExpanding); -} - SDValue SelectionDAG::getStridedLoadVP(EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL, @@ -9119,18 +9085,6 @@ SDValue SelectionDAG::getStridedLoadVP(EVT VT, const SDLoc &DL, SDValue Chain, Undef, Stride, Mask, EVL, VT, MMO, IsExpanding); } -SDValue SelectionDAG::getExtStridedLoadVP( - ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT, SDValue Chain, - SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL, - MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment, - MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo, - bool IsExpanding) { - SDValue Undef = getUNDEF(Ptr.getValueType()); - return getStridedLoadVP(ISD::UNINDEXED, ExtType, VT, DL, Chain, Ptr, Undef, - Stride, Mask, EVL, PtrInfo, MemVT, Alignment, - MMOFlags, AAInfo, nullptr, IsExpanding); -} - SDValue SelectionDAG::getExtStridedLoadVP( ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT, SDValue Chain, SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL, EVT MemVT, @@ -9150,11 +9104,14 @@ SDValue SelectionDAG::getIndexedStridedLoadVP(SDValue OrigLoad, const SDLoc &DL, auto MMOFlags = SLD->getMemOperand()->getFlags() & ~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); - return getStridedLoadVP( - AM, SLD->getExtensionType(), OrigLoad.getValueType(), DL, SLD->getChain(), - Base, Offset, SLD->getStride(), SLD->getMask(), SLD->getVectorLength(), - SLD->getPointerInfo(), SLD->getMemoryVT(), SLD->getAlign(), MMOFlags, - SLD->getAAInfo(), nullptr, SLD->isExpandingLoad()); + MachineFunction &MF = getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + SLD->getPointerInfo(), MMOFlags, SLD->getMemOperand()->getSize(), + SLD->getOriginalAlign(), SLD->getAAInfo()); + return getStridedLoadVP(AM, 
SLD->getExtensionType(), OrigLoad.getValueType(), + DL, SLD->getChain(), Base, Offset, SLD->getStride(), + SLD->getMask(), SLD->getVectorLength(), + SLD->getMemoryVT(), MMO, SLD->isExpandingLoad()); } SDValue SelectionDAG::getStridedStoreVP(SDValue Chain, const SDLoc &DL, @@ -9193,26 +9150,6 @@ SDValue SelectionDAG::getStridedStoreVP(SDValue Chain, const SDLoc &DL, return V; } -SDValue SelectionDAG::getTruncStridedStoreVP( - SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Stride, - SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo, EVT SVT, - Align Alignment, MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo, - bool IsCompressing) { - assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); - - MMOFlags |= MachineMemOperand::MOStore; - assert((MMOFlags & MachineMemOperand::MOLoad) == 0); - - if (PtrInfo.V.isNull()) - PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr); - - MachineFunction &MF = getMachineFunction(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, MMOFlags, MemoryLocation::UnknownSize, Alignment, AAInfo); - return getTruncStridedStoreVP(Chain, DL, Val, Ptr, Stride, Mask, EVL, SVT, - MMO, IsCompressing); -} - SDValue SelectionDAG::getTruncStridedStoreVP(SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Stride, SDValue Mask, From 42f6f95e084a9157a5801dba5e32a7af0616360a Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 23 Feb 2024 18:44:02 +0000 Subject: [PATCH 195/546] [AMDGPU] Simplify AMDGPUDisassembler::getInstruction by removing Res. (#82775) Remove all the code that set and tested Res. Change all convert* functions to return void since none of them can fail. getInstruction only has one main point of failure, after all calls to tryDecodeInst have failed. 
--- .../Disassembler/AMDGPUDisassembler.cpp | 252 ++++++++---------- .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 18 +- 2 files changed, 119 insertions(+), 151 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 70e2275c58745..e1cca17bdbf43 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -453,7 +453,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, unsigned MaxInstBytesNum = std::min((size_t)TargetMaxInstBytes, Bytes_.size()); Bytes = Bytes_.slice(0, MaxInstBytesNum); - DecodeStatus Res = MCDisassembler::Fail; + // In case the opcode is not recognized we'll assume a Size of 4 bytes (unless + // there are fewer bytes left). This will be overridden on success. + Size = std::min((size_t)4, Bytes_.size()); + do { // ToDo: better to switch encoding length using some bit predicate // but it is unknown yet, so try all we can @@ -462,87 +465,69 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // encodings if (isGFX11Plus() && Bytes.size() >= 12 ) { DecoderUInt128 DecW = eat12Bytes(Bytes); - Res = tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI, - DecW, Address, CS); - if (Res) + + if (tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI, + DecW, Address, CS)) break; - Res = tryDecodeInst(DecoderTableGFX1296, DecoderTableGFX12_FAKE1696, MI, - DecW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX1296, DecoderTableGFX12_FAKE1696, MI, + DecW, Address, CS)) break; - Res = tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS)) break; } + // Reinitialize Bytes Bytes = Bytes_.slice(0, MaxInstBytesNum); if (Bytes.size() >= 8) { const uint64_t QW = eatBytes(Bytes); - if 
(STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) { - Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS); - if (Res) - break; - } + if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding) && + tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS)) + break; - if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem)) { - Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS); - if (Res) - break; - } + if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem) && + tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS)) + break; // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and // v_mad_mixhi_f16 for FMA variants. Try to decode using this special // table first so we print the correct name. - if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts)) { - Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS); - if (Res) - break; - } + if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts) && + tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS)) + break; - if (STI.hasFeature(AMDGPU::FeatureGFX940Insts)) { - Res = tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS); - if (Res) - break; - } + if (STI.hasFeature(AMDGPU::FeatureGFX940Insts) && + tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS)) + break; - if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) { - Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS); - if (Res) - break; - } + if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) && + tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS)) + break; - Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS)) break; - Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS)) break; - Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS)) break; - Res = 
tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, - QW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW, + Address, CS)) break; - Res = tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, - QW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW, + Address, CS)) break; - Res = tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS)) break; - Res = tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS)) break; } @@ -550,40 +535,42 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Bytes = Bytes_.slice(0, MaxInstBytesNum); // Try decode 32-bit instruction - if (Bytes.size() < 4) break; - const uint32_t DW = eatBytes(Bytes); - Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS); - if (Res) break; + if (Bytes.size() >= 4) { + const uint32_t DW = eatBytes(Bytes); - Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS); - if (Res) break; + if (tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS)) + break; - Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS); - if (Res) break; + if (tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS)) + break; - if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) { - Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS); - if (Res) + if (tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS)) break; - } - if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) { - Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS); - if (Res) break; - } + if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) && + tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS)) + break; - Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS); - if (Res) break; + if 
(STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding) && + tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS)) + break; - Res = tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW, - Address, CS); - if (Res) break; + if (tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS)) + break; - Res = tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW, - Address, CS); + if (tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW, + Address, CS)) + break; + + if (tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW, + Address, CS)) + break; + } + + return MCDisassembler::Fail; } while (false); - if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DPP)) { + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DPP) { if (isMacDPP(MI)) convertMacDPPInst(MI); @@ -599,26 +586,26 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, convertVOP3DPPInst(MI); // Regular VOP3 case } - if (Res && AMDGPU::isMAC(MI.getOpcode())) { + if (AMDGPU::isMAC(MI.getOpcode())) { // Insert dummy unused src2_modifiers. insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::src2_modifiers); } - if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp || - MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) { + if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp || + MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp) { // Insert dummy unused src2_modifiers. 
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::src2_modifiers); } - if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) && + if ((MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) && !AMDGPU::hasGDS(STI)) { insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds); } - if (Res && (MCII->get(MI.getOpcode()).TSFlags & - (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD))) { + if (MCII->get(MI.getOpcode()).TSFlags & + (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD)) { int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::cpol); if (CPolPos != -1) { @@ -634,9 +621,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - if (Res && (MCII->get(MI.getOpcode()).TSFlags & - (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) && - (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) { + if ((MCII->get(MI.getOpcode()).TSFlags & + (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) && + (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) { // GFX90A lost TFE, its place is occupied by ACC. 
int TFEOpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe); @@ -647,8 +634,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - if (Res && (MCII->get(MI.getOpcode()).TSFlags & - (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF))) { + if (MCII->get(MI.getOpcode()).TSFlags & + (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) { int SWZOpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz); if (SWZOpIdx != -1) { @@ -658,7 +645,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) { + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG) { int VAddr0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); int RsrcIdx = @@ -666,36 +653,32 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, unsigned NSAArgs = RsrcIdx - VAddr0Idx - 1; if (VAddr0Idx >= 0 && NSAArgs > 0) { unsigned NSAWords = (NSAArgs + 3) / 4; - if (Bytes.size() < 4 * NSAWords) { - Res = MCDisassembler::Fail; - } else { - for (unsigned i = 0; i < NSAArgs; ++i) { - const unsigned VAddrIdx = VAddr0Idx + 1 + i; - auto VAddrRCID = - MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass; - MI.insert(MI.begin() + VAddrIdx, - createRegOperand(VAddrRCID, Bytes[i])); - } - Bytes = Bytes.slice(4 * NSAWords); + if (Bytes.size() < 4 * NSAWords) + return MCDisassembler::Fail; + for (unsigned i = 0; i < NSAArgs; ++i) { + const unsigned VAddrIdx = VAddr0Idx + 1 + i; + auto VAddrRCID = + MCII->get(MI.getOpcode()).operands()[VAddrIdx].RegClass; + MI.insert(MI.begin() + VAddrIdx, createRegOperand(VAddrRCID, Bytes[i])); } + Bytes = Bytes.slice(4 * NSAWords); } - if (Res) - Res = convertMIMGInst(MI); + convertMIMGInst(MI); } - if (Res && (MCII->get(MI.getOpcode()).TSFlags & - (SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE))) - Res = convertMIMGInst(MI); + if (MCII->get(MI.getOpcode()).TSFlags & + 
(SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE)) + convertMIMGInst(MI); - if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP)) - Res = convertEXPInst(MI); + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP) + convertEXPInst(MI); - if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP)) - Res = convertVINTERPInst(MI); + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VINTERP) + convertVINTERPInst(MI); - if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SDWA)) - Res = convertSDWAInst(MI); + if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SDWA) + convertSDWAInst(MI); int VDstIn_Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in); @@ -716,27 +699,23 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, int ImmLitIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::imm); bool IsSOPK = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SOPK; - if (Res && ImmLitIdx != -1 && !IsSOPK) - Res = convertFMAanyK(MI, ImmLitIdx); + if (ImmLitIdx != -1 && !IsSOPK) + convertFMAanyK(MI, ImmLitIdx); - // if the opcode was not recognized we'll assume a Size of 4 bytes - // (unless there are fewer bytes left) - Size = Res ? (MaxInstBytesNum - Bytes.size()) - : std::min((size_t)4, Bytes_.size()); - return Res; + Size = MaxInstBytesNum - Bytes.size(); + return MCDisassembler::Success; } -DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const { +void AMDGPUDisassembler::convertEXPInst(MCInst &MI) const { if (STI.hasFeature(AMDGPU::FeatureGFX11Insts)) { // The MCInst still has these fields even though they are no longer encoded // in the GFX11 instruction. 
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm); insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::compr); } - return MCDisassembler::Success; } -DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const { +void AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const { if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 || MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx12 || MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 || @@ -749,10 +728,9 @@ DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const { // instruction. insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel); } - return MCDisassembler::Success; } -DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { +void AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { if (STI.hasFeature(AMDGPU::FeatureGFX9) || STI.hasFeature(AMDGPU::FeatureGFX10)) { if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::sdst)) @@ -769,7 +747,6 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod); } } - return MCDisassembler::Success; } struct VOPModifiers { @@ -873,7 +850,7 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const { AMDGPU::OpName::src2_modifiers); } -DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { +void AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); int VDstInIdx = @@ -904,10 +881,9 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const { insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::src1_modifiers); } - return MCDisassembler::Success; } -DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { +void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { convertTrue16OpSel(MI); int VDstInIdx = @@ -927,13 +903,12 @@ DecodeStatus 
AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const { insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel), AMDGPU::OpName::op_sel); } - return MCDisassembler::Success; } // Note that before gfx10, the MIMG encoding provided no information about // VADDR size. Consequently, decoded instructions always show address as if it // has 1 dword, which could be not really so. -DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { +void AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { auto TSFlags = MCII->get(MI.getOpcode()).TSFlags; int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), @@ -962,7 +937,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { if (BaseOpcode->BVH) { // Add A16 operand for intersect_ray instructions addOperand(MI, MCOperand::createImm(BaseOpcode->A16)); - return MCDisassembler::Success; + return; } bool IsAtomic = (VDstIdx != -1); @@ -997,7 +972,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { if (!STI.hasFeature(AMDGPU::FeaturePartialNSAEncoding)) { // The NSA encoding does not contain enough operands for the // combination of base opcode / dimension. Should this be an error? - return MCDisassembler::Success; + return; } IsPartialNSA = true; } @@ -1016,12 +991,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { DstSize += 1; if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords) - return MCDisassembler::Success; + return; int NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, Info->MIMGEncoding, DstSize, AddrSize); if (NewOpcode == -1) - return MCDisassembler::Success; + return; // Widen the register to the correct number of enabled channels. unsigned NewVdata = AMDGPU::NoRegister; @@ -1038,7 +1013,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { if (NewVdata == AMDGPU::NoRegister) { // It's possible to encode this such that the low register + enabled // components exceeds the register count. 
- return MCDisassembler::Success; + return; } } @@ -1056,7 +1031,7 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0, &MRI.getRegClass(AddrRCID)); if (!NewVAddrSA) - return MCDisassembler::Success; + return; } MI.setOpcode(NewOpcode); @@ -1077,14 +1052,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { MI.erase(MI.begin() + VAddr0Idx + AddrSize, MI.begin() + VAddr0Idx + Info->VAddrDwords); } - - return MCDisassembler::Success; } // Opsel and neg bits are used in src_modifiers and standalone operands. Autogen // decoder only adds to src_modifiers, so manually add the bits to the other // operands. -DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const { +void AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); unsigned DescNumOps = MCII->get(Opc).getNumOperands(); auto Mods = collectVOPModifiers(MI, true); @@ -1109,12 +1082,10 @@ DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const { AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::neg_hi)) insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegHi), AMDGPU::OpName::neg_hi); - - return MCDisassembler::Success; } // Create dummy old operand and insert optional operands -DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const { +void AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const { unsigned Opc = MI.getOpcode(); unsigned DescNumOps = MCII->get(Opc).getNumOperands(); @@ -1131,11 +1102,9 @@ DecodeStatus AMDGPUDisassembler::convertVOPCDPPInst(MCInst &MI) const { AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::src1_modifiers)) insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::src1_modifiers); - return MCDisassembler::Success; } -DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI, - int ImmLitIdx) const { +void AMDGPUDisassembler::convertFMAanyK(MCInst &MI, int ImmLitIdx) const { assert(HasLiteral && "Should 
have decoded a literal"); const MCInstrDesc &Desc = MCII->get(MI.getOpcode()); unsigned DescNumOps = Desc.getNumOperands(); @@ -1151,7 +1120,6 @@ DecodeStatus AMDGPUDisassembler::convertFMAanyK(MCInst &MI, IsDeferredOp) Op.setImm(Literal); } - return MCDisassembler::Success; } const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const { diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index dd0581576bd22..2e1b6fb1c740b 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -194,15 +194,15 @@ class AMDGPUDisassembler : public MCDisassembler { DecodeStatus decodeCOMPUTE_PGM_RSRC3(uint32_t FourByteBuffer, raw_string_ostream &KdStream) const; - DecodeStatus convertEXPInst(MCInst &MI) const; - DecodeStatus convertVINTERPInst(MCInst &MI) const; - DecodeStatus convertFMAanyK(MCInst &MI, int ImmLitIdx) const; - DecodeStatus convertSDWAInst(MCInst &MI) const; - DecodeStatus convertDPP8Inst(MCInst &MI) const; - DecodeStatus convertMIMGInst(MCInst &MI) const; - DecodeStatus convertVOP3DPPInst(MCInst &MI) const; - DecodeStatus convertVOP3PDPPInst(MCInst &MI) const; - DecodeStatus convertVOPCDPPInst(MCInst &MI) const; + void convertEXPInst(MCInst &MI) const; + void convertVINTERPInst(MCInst &MI) const; + void convertFMAanyK(MCInst &MI, int ImmLitIdx) const; + void convertSDWAInst(MCInst &MI) const; + void convertDPP8Inst(MCInst &MI) const; + void convertMIMGInst(MCInst &MI) const; + void convertVOP3DPPInst(MCInst &MI) const; + void convertVOP3PDPPInst(MCInst &MI) const; + void convertVOPCDPPInst(MCInst &MI) const; void convertMacDPPInst(MCInst &MI) const; void convertTrue16OpSel(MCInst &MI) const; From a24421fef713e5b3c0a885cf36a62cc3257be1f3 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 23 Feb 2024 12:51:26 -0600 Subject: [PATCH 196/546] [flang][bbc] Fix dangling reference to 
`envDefaults` (#82800) The lowering bridge stores the evvironment defaults (passed to the constructor) as a reference. In the call to the constructor in bbc, the defaults were passed as `{}`, which creates a temporary whose lifetime ends immediately after the call. The flang driver passes a member of the compilation instance to the constructor, which presumably remains alive long enough, so storing the reference in the bridge is justified. To avoid the dangling reference, create an actual object `envDefaults` in bbc. --- flang/tools/bbc/bbc.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index c9358c83e795c..e701fdeb227a4 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -354,10 +354,11 @@ static mlir::LogicalResult convertFortranSourceToMLIR( loweringOptions.setPolymorphicTypeImpl(enablePolymorphic); loweringOptions.setNoPPCNativeVecElemOrder(enableNoPPCNativeVecElemOrder); loweringOptions.setLowerToHighLevelFIR(useHLFIR || emitHLFIR); + std::vector envDefaults = {}; auto burnside = Fortran::lower::LoweringBridge::create( ctx, semanticsContext, defKinds, semanticsContext.intrinsics(), semanticsContext.targetCharacteristics(), parsing.allCooked(), - targetTriple, kindMap, loweringOptions, {}, + targetTriple, kindMap, loweringOptions, envDefaults, semanticsContext.languageFeatures(), targetMachine); burnside.lower(parseTree, semanticsContext); mlir::ModuleOp mlirModule = burnside.getModule(); From f8ce460e48ccc774354df75520d00a67ddbf84c0 Mon Sep 17 00:00:00 2001 From: Aart Bik <39774503+aartbik@users.noreply.github.com> Date: Fri, 23 Feb 2024 10:52:28 -0800 Subject: [PATCH 197/546] [mlir][sparse] cleanup sparse runtime library (#82807) remove some obsoleted APIs from the library that have been fully replaced with actual direct IR codegen --- .../mlir/Dialect/SparseTensor/IR/Enums.h | 9 ++-- .../ExecutionEngine/SparseTensor/Storage.h | 43 ++-------------- 
.../ExecutionEngine/SparseTensorRuntime.h | 24 +-------- .../ExecutionEngine/SparseTensor/Storage.cpp | 7 --- .../ExecutionEngine/SparseTensorRuntime.cpp | 50 ++----------------- 5 files changed, 12 insertions(+), 121 deletions(-) diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h index a00c9c31256c9..1c81d80ea7ec4 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h @@ -145,12 +145,9 @@ constexpr bool isComplexPrimaryType(PrimaryType valTy) { /// The actions performed by @newSparseTensor. enum class Action : uint32_t { kEmpty = 0, - kEmptyForward = 1, - kFromCOO = 2, - kFromReader = 4, - kToCOO = 5, - kPack = 7, - kSortCOOInPlace = 8, + kFromReader = 1, + kPack = 2, + kSortCOOInPlace = 3, }; /// This enum defines all supported storage format without the level properties. diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h index eff1aca42d3e8..fe0e08b5c8403 100644 --- a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h +++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h @@ -149,13 +149,6 @@ class SparseTensorStorageBase { MLIR_SPARSETENSOR_FOREVERY_V(DECL_GETVALUES) #undef DECL_GETVALUES - /// Element-wise forwarding insertions. The first argument is the - /// dimension-coordinates for the value being inserted. -#define DECL_FORWARDINGINSERT(VNAME, V) \ - virtual void forwardingInsert(const uint64_t *, V); - MLIR_SPARSETENSOR_FOREVERY_V(DECL_FORWARDINGINSERT) -#undef DECL_FORWARDINGINSERT - /// Element-wise insertion in lexicographic coordinate order. The first /// argument is the level-coordinates for the value being inserted. 
#define DECL_LEXINSERT(VNAME, V) virtual void lexInsert(const uint64_t *, V); @@ -171,9 +164,6 @@ class SparseTensorStorageBase { MLIR_SPARSETENSOR_FOREVERY_V(DECL_EXPINSERT) #undef DECL_EXPINSERT - /// Finalizes forwarding insertions. - virtual void endForwardingInsert() = 0; - /// Finalizes lexicographic insertions. virtual void endLexInsert() = 0; @@ -248,7 +238,7 @@ class SparseTensorStorage final : public SparseTensorStorageBase { static SparseTensorStorage * newEmpty(uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank, const uint64_t *lvlSizes, const LevelType *lvlTypes, - const uint64_t *dim2lvl, const uint64_t *lvl2dim, bool forwarding); + const uint64_t *dim2lvl, const uint64_t *lvl2dim); /// Allocates a new sparse tensor and initializes it from the given COO. static SparseTensorStorage * @@ -284,13 +274,6 @@ class SparseTensorStorage final : public SparseTensorStorageBase { *out = &values; } - /// Partially specialize forwarding insertions based on template types. - void forwardingInsert(const uint64_t *dimCoords, V val) final { - assert(dimCoords && coo); - map.pushforward(dimCoords, lvlCursor.data()); - coo->add(lvlCursor, val); - } - /// Partially specialize lexicographical insertions based on template types. void lexInsert(const uint64_t *lvlCoords, V val) final { assert(lvlCoords); @@ -345,21 +328,6 @@ class SparseTensorStorage final : public SparseTensorStorageBase { } } - /// Finalizes forwarding insertions. - void endForwardingInsert() final { - // Ensure COO is sorted. - assert(coo); - coo->sort(); - // Now actually insert the `elements`. - const auto &elements = coo->getElements(); - const uint64_t nse = elements.size(); - assert(values.size() == 0); - values.reserve(nse); - fromCOO(elements, 0, nse, 0); - delete coo; - coo = nullptr; - } - /// Finalizes lexicographic insertions. 
void endLexInsert() final { if (!allDense) { @@ -653,13 +621,10 @@ template SparseTensorStorage *SparseTensorStorage::newEmpty( uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank, const uint64_t *lvlSizes, const LevelType *lvlTypes, - const uint64_t *dim2lvl, const uint64_t *lvl2dim, bool forwarding) { - SparseTensorCOO *lvlCOO = nullptr; - if (forwarding) - lvlCOO = new SparseTensorCOO(lvlRank, lvlSizes); + const uint64_t *dim2lvl, const uint64_t *lvl2dim) { return new SparseTensorStorage(dimRank, dimSizes, lvlRank, lvlSizes, - lvlTypes, dim2lvl, lvl2dim, lvlCOO, - !forwarding); + lvlTypes, dim2lvl, lvl2dim, nullptr, + true); } template diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h b/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h index 8b0829aab0d8d..d916186c835c2 100644 --- a/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h +++ b/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h @@ -38,15 +38,12 @@ extern "C" { /// This is the "swiss army knife" method for materializing sparse /// tensors into the computation. The types of the `ptr` argument and /// the result depend on the action, as explained in the following table, -/// where "STS" means a sparse-tensor-storage object and "COO" means -/// a coordinate-scheme object. +/// where "STS" means a sparse-tensor-storage object. 
/// /// Action: `ptr`: Returns: +/// --------------------------------------------------------------------------- /// kEmpty - STS, empty -/// kEmptyForward - STS, empty, with forwarding COO -/// kFromCOO COO STS, copied from the COO source /// kFromReader reader STS, input from reader -/// kToCOO STS COO, copied from the STS source /// kPack buffers STS, from level buffers /// kSortCOOInPlace STS STS, sorted in place MLIR_CRUNNERUTILS_EXPORT void *_mlir_ciface_newSparseTensor( // NOLINT @@ -80,14 +77,6 @@ MLIR_SPARSETENSOR_FOREVERY_O(DECL_SPARSEPOSITIONS) MLIR_SPARSETENSOR_FOREVERY_O(DECL_SPARSECOORDINATES) #undef DECL_SPARSECOORDINATES -/// Tensor-storage method for a dim to lvl forwarding insertion. -#define DECL_FORWARDINGINSERT(VNAME, V) \ - MLIR_CRUNNERUTILS_EXPORT void _mlir_ciface_forwardingInsert##VNAME( \ - void *tensor, StridedMemRefType *vref, \ - StridedMemRefType *dimCoordsRef); \ - MLIR_SPARSETENSOR_FOREVERY_V(DECL_FORWARDINGINSERT) -#undef DECL_FORWARDINGINSERT - /// Tensor-storage method to insert elements in lexicographical /// level-coordinate order. #define DECL_LEXINSERT(VNAME, V) \ @@ -160,21 +149,12 @@ MLIR_CRUNNERUTILS_EXPORT index_type sparseLvlSize(void *tensor, index_type l); /// Tensor-storage method to get the size of the given dimension. MLIR_CRUNNERUTILS_EXPORT index_type sparseDimSize(void *tensor, index_type d); -/// Tensor-storage method to finalize forwarding insertions. -MLIR_CRUNNERUTILS_EXPORT void endForwardingInsert(void *tensor); - /// Tensor-storage method to finalize lexicographic insertions. MLIR_CRUNNERUTILS_EXPORT void endLexInsert(void *tensor); /// Releases the memory for the tensor-storage object. MLIR_CRUNNERUTILS_EXPORT void delSparseTensor(void *tensor); -/// Releases the memory for the coordinate-scheme object. 
-#define DECL_DELCOO(VNAME, V) \ - MLIR_CRUNNERUTILS_EXPORT void delSparseTensorCOO##VNAME(void *coo); -MLIR_SPARSETENSOR_FOREVERY_V(DECL_DELCOO) -#undef DECL_DELCOO - /// Helper function to read a sparse tensor filename from the environment, /// defined with the naming convention ${TENSOR0}, ${TENSOR1}, etc. MLIR_CRUNNERUTILS_EXPORT char *getTensorFilename(index_type id); diff --git a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp index 9e8b240899d80..bbe10b0dcdd46 100644 --- a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp +++ b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp @@ -74,13 +74,6 @@ MLIR_SPARSETENSOR_FOREVERY_FIXED_O(IMPL_GETCOORDINATES) MLIR_SPARSETENSOR_FOREVERY_V(IMPL_GETVALUES) #undef IMPL_GETVALUES -#define IMPL_FORWARDINGINSERT(VNAME, V) \ - void SparseTensorStorageBase::forwardingInsert(const uint64_t *, V) { \ - FATAL_PIV("forwardingInsert" #VNAME); \ - } -MLIR_SPARSETENSOR_FOREVERY_V(IMPL_FORWARDINGINSERT) -#undef IMPL_FORWARDINGINSERT - #define IMPL_LEXINSERT(VNAME, V) \ void SparseTensorStorageBase::lexInsert(const uint64_t *, V) { \ FATAL_PIV("lexInsert" #VNAME); \ diff --git a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp index a5e75a77b4e47..0bc90b281b7ce 100644 --- a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp +++ b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp @@ -117,20 +117,7 @@ extern "C" { switch (action) { \ case Action::kEmpty: { \ return SparseTensorStorage::newEmpty( \ - dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim, \ - false); \ - } \ - case Action::kEmptyForward: { \ - return SparseTensorStorage::newEmpty( \ - dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim, \ - true); \ - } \ - case Action::kFromCOO: { \ - assert(ptr && "Received nullptr for SparseTensorCOO object"); \ - auto &coo = *static_cast *>(ptr); \ - return SparseTensorStorage::newFromCOO( \ - dimRank, 
dimSizes, lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim, \ - coo); \ + dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim); \ } \ case Action::kFromReader: { \ assert(ptr && "Received nullptr for SparseTensorReader object"); \ @@ -138,11 +125,6 @@ extern "C" { return static_cast(reader.readSparseTensor( \ lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim)); \ } \ - case Action::kToCOO: { \ - assert(ptr && "Received nullptr for SparseTensorStorage object"); \ - auto &tensor = *static_cast *>(ptr); \ - return tensor.toCOO(); \ - } \ case Action::kPack: { \ assert(ptr && "Received nullptr for SparseTensorStorage object"); \ intptr_t *buffers = static_cast(ptr); \ @@ -341,21 +323,6 @@ MLIR_SPARSETENSOR_FOREVERY_O(IMPL_SPARSECOORDINATES) #undef IMPL_SPARSECOORDINATES #undef IMPL_GETOVERHEAD -#define IMPL_FORWARDINGINSERT(VNAME, V) \ - void _mlir_ciface_forwardingInsert##VNAME( \ - void *t, StridedMemRefType *vref, \ - StridedMemRefType *dimCoordsRef) { \ - assert(t &&vref); \ - ASSERT_NO_STRIDE(dimCoordsRef); \ - const index_type *dimCoords = MEMREF_GET_PAYLOAD(dimCoordsRef); \ - assert(dimCoords); \ - const V *value = MEMREF_GET_PAYLOAD(vref); \ - static_cast(t)->forwardingInsert(dimCoords, \ - *value); \ - } -MLIR_SPARSETENSOR_FOREVERY_V(IMPL_FORWARDINGINSERT) -#undef IMPL_FORWARDINGINSERT - #define IMPL_LEXINSERT(VNAME, V) \ void _mlir_ciface_lexInsert##VNAME( \ void *t, StridedMemRefType *lvlCoordsRef, \ @@ -427,8 +394,8 @@ void _mlir_ciface_getSparseTensorReaderDimSizes( const uint64_t cSize = MEMREF_GET_USIZE(cref); \ const uint64_t vSize = MEMREF_GET_USIZE(vref); \ ASSERT_USIZE_EQ(lvl2dimRef, dimRank); \ - assert(cSize >= lvlRank * vSize); \ - assert(vSize >= reader.getNSE() && "Not enough space in buffers"); \ + assert(cSize >= lvlRank * reader.getNSE()); \ + assert(vSize >= reader.getNSE()); \ (void)dimRank; \ (void)cSize; \ (void)vSize; \ @@ -488,10 +455,6 @@ index_type sparseDimSize(void *tensor, index_type d) { return 
static_cast(tensor)->getDimSize(d); } -void endForwardingInsert(void *tensor) { - return static_cast(tensor)->endForwardingInsert(); -} - void endLexInsert(void *tensor) { return static_cast(tensor)->endLexInsert(); } @@ -500,13 +463,6 @@ void delSparseTensor(void *tensor) { delete static_cast(tensor); } -#define IMPL_DELCOO(VNAME, V) \ - void delSparseTensorCOO##VNAME(void *coo) { \ - delete static_cast *>(coo); \ - } -MLIR_SPARSETENSOR_FOREVERY_V(IMPL_DELCOO) -#undef IMPL_DELCOO - char *getTensorFilename(index_type id) { constexpr size_t bufSize = 80; char var[bufSize]; From 5874874c24720dc24fde12327f81369ef4af4e0b Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 23 Feb 2024 11:03:36 -0800 Subject: [PATCH 198/546] [SelectionDAG] Introducing the SelectionDAG pattern matching framework (#78654) Akin to `llvm::PatternMatch` and `llvm::MIPatternMatch`, the `llvm::SDPatternMatch` introduced in this patch provides a DSL-alike framework to match SDValue / SDNode with a more succinct syntax. --- llvm/include/llvm/CodeGen/SDPatternMatch.h | 694 ++++++++++++++++++ llvm/unittests/CodeGen/CMakeLists.txt | 1 + .../CodeGen/SelectionDAGPatternMatchTest.cpp | 292 ++++++++ 3 files changed, 987 insertions(+) create mode 100644 llvm/include/llvm/CodeGen/SDPatternMatch.h create mode 100644 llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h new file mode 100644 index 0000000000000..412bf42677cc5 --- /dev/null +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -0,0 +1,694 @@ +//==--------------- llvm/CodeGen/SDPatternMatch.h ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// Contains matchers for matching SelectionDAG nodes and values. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_SDPATTERNMATCH_H +#define LLVM_CODEGEN_SDPATTERNMATCH_H + +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/TargetLowering.h" + +namespace llvm { +namespace SDPatternMatch { + +/// MatchContext can repurpose existing patterns to behave differently under +/// a certain context. For instance, `m_Opc(ISD::ADD)` matches plain ADD nodes +/// in normal circumstances, but matches VP_ADD nodes under a custom +/// VPMatchContext. This design is meant to facilitate code / pattern reusing. +class BasicMatchContext { + const SelectionDAG *DAG; + const TargetLowering *TLI; + +public: + explicit BasicMatchContext(const SelectionDAG *DAG) + : DAG(DAG), TLI(DAG ? &DAG->getTargetLoweringInfo() : nullptr) {} + + explicit BasicMatchContext(const TargetLowering *TLI) + : DAG(nullptr), TLI(TLI) {} + + // A valid MatchContext has to implement the following functions. + + const SelectionDAG *getDAG() const { return DAG; } + + const TargetLowering *getTLI() const { return TLI; } + + /// Return true if N effectively has opcode Opcode. 
+ bool match(SDValue N, unsigned Opcode) const { + return N->getOpcode() == Opcode; + } +}; + +template +[[nodiscard]] bool sd_context_match(SDValue N, const MatchContext &Ctx, + Pattern &&P) { + return P.match(Ctx, N); +} + +template +[[nodiscard]] bool sd_context_match(SDNode *N, const MatchContext &Ctx, + Pattern &&P) { + return sd_context_match(SDValue(N, 0), Ctx, P); +} + +template +[[nodiscard]] bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P) { + return sd_context_match(N, BasicMatchContext(DAG), P); +} + +template +[[nodiscard]] bool sd_match(SDValue N, const SelectionDAG *DAG, Pattern &&P) { + return sd_context_match(N, BasicMatchContext(DAG), P); +} + +template +[[nodiscard]] bool sd_match(SDNode *N, Pattern &&P) { + return sd_match(N, nullptr, P); +} + +template +[[nodiscard]] bool sd_match(SDValue N, Pattern &&P) { + return sd_match(N, nullptr, P); +} + +// === Utilities === +struct Value_match { + SDValue MatchVal; + + Value_match() = default; + + explicit Value_match(SDValue Match) : MatchVal(Match) {} + + template bool match(const MatchContext &, SDValue N) { + if (MatchVal) + return MatchVal == N; + return N.getNode(); + } +}; + +/// Match any valid SDValue. +inline Value_match m_Value() { return Value_match(); } + +inline Value_match m_Specific(SDValue N) { + assert(N); + return Value_match(N); +} + +struct DeferredValue_match { + SDValue &MatchVal; + + explicit DeferredValue_match(SDValue &Match) : MatchVal(Match) {} + + template bool match(const MatchContext &, SDValue N) { + return N == MatchVal; + } +}; + +/// Similar to m_Specific, but the specific value to match is determined by +/// another sub-pattern in the same sd_match() expression. For instance, +/// We cannot match `(add V, V)` with `m_Add(m_Value(X), m_Specific(X))` since +/// `X` is not initialized at the time it got copied into `m_Specific`. Instead, +/// we should use `m_Add(m_Value(X), m_Deferred(X))`. 
+inline DeferredValue_match m_Deferred(SDValue &V) { + return DeferredValue_match(V); +} + +struct Opcode_match { + unsigned Opcode; + + explicit Opcode_match(unsigned Opc) : Opcode(Opc) {} + + template + bool match(const MatchContext &Ctx, SDValue N) { + return Ctx.match(N, Opcode); + } +}; + +inline Opcode_match m_Opc(unsigned Opcode) { return Opcode_match(Opcode); } + +template struct NUses_match { + Pattern P; + + explicit NUses_match(const Pattern &P) : P(P) {} + + template + bool match(const MatchContext &Ctx, SDValue N) { + // SDNode::hasNUsesOfValue is pretty expensive when the SDNode produces + // multiple results, hence we check the subsequent pattern here before + // checking the number of value users. + return P.match(Ctx, N) && N->hasNUsesOfValue(NumUses, N.getResNo()); + } +}; + +template +inline NUses_match<1, Pattern> m_OneUse(const Pattern &P) { + return NUses_match<1, Pattern>(P); +} +template +inline NUses_match m_NUses(const Pattern &P) { + return NUses_match(P); +} + +inline NUses_match<1, Value_match> m_OneUse() { + return NUses_match<1, Value_match>(m_Value()); +} +template inline NUses_match m_NUses() { + return NUses_match(m_Value()); +} + +struct Value_bind { + SDValue &BindVal; + + explicit Value_bind(SDValue &N) : BindVal(N) {} + + template bool match(const MatchContext &, SDValue N) { + BindVal = N; + return true; + } +}; + +inline Value_bind m_Value(SDValue &N) { return Value_bind(N); } + +template struct TLI_pred_match { + Pattern P; + PredFuncT PredFunc; + + TLI_pred_match(const PredFuncT &Pred, const Pattern &P) + : P(P), PredFunc(Pred) {} + + template + bool match(const MatchContext &Ctx, SDValue N) { + assert(Ctx.getTLI() && "TargetLowering is required for this pattern."); + return PredFunc(*Ctx.getTLI(), N) && P.match(Ctx, N); + } +}; + +// Explicit deduction guide. 
+template +TLI_pred_match(const PredFuncT &Pred, const Pattern &P) + -> TLI_pred_match; + +/// Match legal SDNodes based on the information provided by TargetLowering. +template inline auto m_LegalOp(const Pattern &P) { + return TLI_pred_match{[](const TargetLowering &TLI, SDValue N) { + return TLI.isOperationLegal(N->getOpcode(), + N.getValueType()); + }, + P}; +} + +/// Switch to a different MatchContext for subsequent patterns. +template struct SwitchContext { + const NewMatchContext &Ctx; + Pattern P; + + template + bool match(const OrigMatchContext &, SDValue N) { + return P.match(Ctx, N); + } +}; + +template +inline SwitchContext m_Context(const MatchContext &Ctx, + Pattern &&P) { + return SwitchContext{Ctx, std::move(P)}; +} + +// === Value type === +struct ValueType_bind { + EVT &BindVT; + + explicit ValueType_bind(EVT &Bind) : BindVT(Bind) {} + + template bool match(const MatchContext &, SDValue N) { + BindVT = N.getValueType(); + return true; + } +}; + +/// Retreive the ValueType of the current SDValue. +inline ValueType_bind m_VT(EVT &VT) { return ValueType_bind(VT); } + +template struct ValueType_match { + PredFuncT PredFunc; + Pattern P; + + ValueType_match(const PredFuncT &Pred, const Pattern &P) + : PredFunc(Pred), P(P) {} + + template + bool match(const MatchContext &Ctx, SDValue N) { + return PredFunc(N.getValueType()) && P.match(Ctx, N); + } +}; + +// Explicit deduction guide. +template +ValueType_match(const PredFuncT &Pred, const Pattern &P) + -> ValueType_match; + +/// Match a specific ValueType. +template +inline auto m_SpecificVT(EVT RefVT, const Pattern &P) { + return ValueType_match{[=](EVT VT) { return VT == RefVT; }, P}; +} +inline auto m_SpecificVT(EVT RefVT) { + return ValueType_match{[=](EVT VT) { return VT == RefVT; }, m_Value()}; +} + +inline auto m_Glue() { return m_SpecificVT(MVT::Glue); } +inline auto m_OtherVT() { return m_SpecificVT(MVT::Other); } + +/// Match any integer ValueTypes. 
+template inline auto m_IntegerVT(const Pattern &P) { + return ValueType_match{[](EVT VT) { return VT.isInteger(); }, P}; +} +inline auto m_IntegerVT() { + return ValueType_match{[](EVT VT) { return VT.isInteger(); }, m_Value()}; +} + +/// Match any floating point ValueTypes. +template inline auto m_FloatingPointVT(const Pattern &P) { + return ValueType_match{[](EVT VT) { return VT.isFloatingPoint(); }, P}; +} +inline auto m_FloatingPointVT() { + return ValueType_match{[](EVT VT) { return VT.isFloatingPoint(); }, + m_Value()}; +} + +/// Match any vector ValueTypes. +template inline auto m_VectorVT(const Pattern &P) { + return ValueType_match{[](EVT VT) { return VT.isVector(); }, P}; +} +inline auto m_VectorVT() { + return ValueType_match{[](EVT VT) { return VT.isVector(); }, m_Value()}; +} + +/// Match fixed-length vector ValueTypes. +template inline auto m_FixedVectorVT(const Pattern &P) { + return ValueType_match{[](EVT VT) { return VT.isFixedLengthVector(); }, P}; +} +inline auto m_FixedVectorVT() { + return ValueType_match{[](EVT VT) { return VT.isFixedLengthVector(); }, + m_Value()}; +} + +/// Match scalable vector ValueTypes. +template inline auto m_ScalableVectorVT(const Pattern &P) { + return ValueType_match{[](EVT VT) { return VT.isScalableVector(); }, P}; +} +inline auto m_ScalableVectorVT() { + return ValueType_match{[](EVT VT) { return VT.isScalableVector(); }, + m_Value()}; +} + +/// Match legal ValueTypes based on the information provided by TargetLowering. 
+template inline auto m_LegalType(const Pattern &P) { + return TLI_pred_match{[](const TargetLowering &TLI, SDValue N) { + return TLI.isTypeLegal(N.getValueType()); + }, + P}; +} + +// === Patterns combinators === +template struct And { + template bool match(const MatchContext &, SDValue N) { + return true; + } +}; + +template +struct And : And { + Pred P; + And(Pred &&p, Preds &&...preds) + : And(std::forward(preds)...), P(std::forward(p)) { + } + + template + bool match(const MatchContext &Ctx, SDValue N) { + return P.match(Ctx, N) && And::match(Ctx, N); + } +}; + +template struct Or { + template bool match(const MatchContext &, SDValue N) { + return false; + } +}; + +template +struct Or : Or { + Pred P; + Or(Pred &&p, Preds &&...preds) + : Or(std::forward(preds)...), P(std::forward(p)) {} + + template + bool match(const MatchContext &Ctx, SDValue N) { + return P.match(Ctx, N) || Or::match(Ctx, N); + } +}; + +template And m_AllOf(Preds &&...preds) { + return And(std::forward(preds)...); +} + +template Or m_AnyOf(Preds &&...preds) { + return Or(std::forward(preds)...); +} + +// === Generic node matching === +template struct Operands_match { + template + bool match(const MatchContext &Ctx, SDValue N) { + // Returns false if there are more operands than predicates; + return N->getNumOperands() == OpIdx; + } +}; + +template +struct Operands_match + : Operands_match { + OpndPred P; + + Operands_match(OpndPred &&p, OpndPreds &&...preds) + : Operands_match( + std::forward(preds)...), + P(std::forward(p)) {} + + template + bool match(const MatchContext &Ctx, SDValue N) { + if (OpIdx < N->getNumOperands()) + return P.match(Ctx, N->getOperand(OpIdx)) && + Operands_match::match(Ctx, N); + + // This is the case where there are more predicates than operands. 
+ return false; + } +}; + +template +auto m_Node(unsigned Opcode, OpndPreds &&...preds) { + return m_AllOf(m_Opc(Opcode), Operands_match<0, OpndPreds...>( + std::forward(preds)...)); +} + +/// Provide number of operands that are not chain or glue, as well as the first +/// index of such operand. +template struct EffectiveOperands { + unsigned Size = 0; + unsigned FirstIndex = 0; + + explicit EffectiveOperands(SDValue N) { + const unsigned TotalNumOps = N->getNumOperands(); + FirstIndex = TotalNumOps; + for (unsigned I = 0; I < TotalNumOps; ++I) { + // Count the number of non-chain and non-glue nodes (we ignore chain + // and glue by default) and retreive the operand index offset. + EVT VT = N->getOperand(I).getValueType(); + if (VT != MVT::Glue && VT != MVT::Other) { + ++Size; + if (FirstIndex == TotalNumOps) + FirstIndex = I; + } + } + } +}; + +template <> struct EffectiveOperands { + unsigned Size = 0; + unsigned FirstIndex = 0; + + explicit EffectiveOperands(SDValue N) : Size(N->getNumOperands()) {} +}; + +// === Binary operations === +template +struct BinaryOpc_match { + unsigned Opcode; + LHS_P LHS; + RHS_P RHS; + + BinaryOpc_match(unsigned Opc, const LHS_P &L, const RHS_P &R) + : Opcode(Opc), LHS(L), RHS(R) {} + + template + bool match(const MatchContext &Ctx, SDValue N) { + if (sd_context_match(N, Ctx, m_Opc(Opcode))) { + EffectiveOperands EO(N); + assert(EO.Size == 2); + return (LHS.match(Ctx, N->getOperand(EO.FirstIndex)) && + RHS.match(Ctx, N->getOperand(EO.FirstIndex + 1))) || + (Commutable && LHS.match(Ctx, N->getOperand(EO.FirstIndex + 1)) && + RHS.match(Ctx, N->getOperand(EO.FirstIndex))); + } + + return false; + } +}; + +template +inline BinaryOpc_match m_BinOp(unsigned Opc, const LHS &L, + const RHS &R) { + return BinaryOpc_match(Opc, L, R); +} +template +inline BinaryOpc_match m_c_BinOp(unsigned Opc, const LHS &L, + const RHS &R) { + return BinaryOpc_match(Opc, L, R); +} + +template +inline BinaryOpc_match +m_ChainedBinOp(unsigned Opc, const LHS 
&L, const RHS &R) { + return BinaryOpc_match(Opc, L, R); +} +template +inline BinaryOpc_match +m_c_ChainedBinOp(unsigned Opc, const LHS &L, const RHS &R) { + return BinaryOpc_match(Opc, L, R); +} + +// Common binary operations +template +inline BinaryOpc_match m_Add(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::ADD, L, R); +} + +template +inline BinaryOpc_match m_Sub(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::SUB, L, R); +} + +template +inline BinaryOpc_match m_Mul(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::MUL, L, R); +} + +template +inline BinaryOpc_match m_UDiv(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::UDIV, L, R); +} +template +inline BinaryOpc_match m_SDiv(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::SDIV, L, R); +} + +template +inline BinaryOpc_match m_URem(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::UREM, L, R); +} +template +inline BinaryOpc_match m_SRem(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::SREM, L, R); +} + +template +inline BinaryOpc_match m_Shl(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::SHL, L, R); +} + +template +inline BinaryOpc_match m_Sra(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::SRA, L, R); +} +template +inline BinaryOpc_match m_Srl(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::SRL, L, R); +} + +template +inline BinaryOpc_match m_FAdd(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::FADD, L, R); +} + +template +inline BinaryOpc_match m_FSub(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::FSUB, L, R); +} + +template +inline BinaryOpc_match m_FMul(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::FMUL, L, R); +} + +template +inline BinaryOpc_match m_FDiv(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::FDIV, L, R); +} + +template +inline BinaryOpc_match m_FRem(const LHS &L, const RHS &R) { + return BinaryOpc_match(ISD::FREM, L, R); +} + +// 
=== Unary operations === +template struct UnaryOpc_match { + unsigned Opcode; + Opnd_P Opnd; + + UnaryOpc_match(unsigned Opc, const Opnd_P &Op) : Opcode(Opc), Opnd(Op) {} + + template + bool match(const MatchContext &Ctx, SDValue N) { + if (sd_context_match(N, Ctx, m_Opc(Opcode))) { + EffectiveOperands EO(N); + assert(EO.Size == 1); + return Opnd.match(Ctx, N->getOperand(EO.FirstIndex)); + } + + return false; + } +}; + +template +inline UnaryOpc_match m_UnaryOp(unsigned Opc, const Opnd &Op) { + return UnaryOpc_match(Opc, Op); +} +template +inline UnaryOpc_match m_ChainedUnaryOp(unsigned Opc, + const Opnd &Op) { + return UnaryOpc_match(Opc, Op); +} + +template inline UnaryOpc_match m_ZExt(const Opnd &Op) { + return UnaryOpc_match(ISD::ZERO_EXTEND, Op); +} + +template inline UnaryOpc_match m_SExt(const Opnd &Op) { + return UnaryOpc_match(ISD::SIGN_EXTEND, Op); +} + +template inline UnaryOpc_match m_AnyExt(const Opnd &Op) { + return UnaryOpc_match(ISD::ANY_EXTEND, Op); +} + +template inline UnaryOpc_match m_Trunc(const Opnd &Op) { + return UnaryOpc_match(ISD::TRUNCATE, Op); +} + +// === Constants === +struct ConstantInt_match { + APInt *BindVal; + + explicit ConstantInt_match(APInt *V) : BindVal(V) {} + + template bool match(const MatchContext &, SDValue N) { + // The logics here are similar to that in + // SelectionDAG::isConstantIntBuildVectorOrConstantInt, but the latter also + // treats GlobalAddressSDNode as a constant, which is difficult to turn into + // APInt. + if (auto *C = dyn_cast_or_null(N.getNode())) { + if (BindVal) + *BindVal = C->getAPIntValue(); + return true; + } + + APInt Discard; + return ISD::isConstantSplatVector(N.getNode(), + BindVal ? *BindVal : Discard); + } +}; +/// Match any interger constants or splat of an integer constant. +inline ConstantInt_match m_ConstInt() { return ConstantInt_match(nullptr); } +/// Match any interger constants or splat of an integer constant; return the +/// specific constant or constant splat value. 
+inline ConstantInt_match m_ConstInt(APInt &V) { return ConstantInt_match(&V); } + +struct SpecificInt_match { + APInt IntVal; + + explicit SpecificInt_match(APInt APV) : IntVal(std::move(APV)) {} + + template + bool match(const MatchContext &Ctx, SDValue N) { + APInt ConstInt; + if (sd_context_match(N, Ctx, m_ConstInt(ConstInt))) + return APInt::isSameValue(IntVal, ConstInt); + return false; + } +}; + +/// Match a specific integer constant or constant splat value. +inline SpecificInt_match m_SpecificInt(APInt V) { + return SpecificInt_match(std::move(V)); +} +inline SpecificInt_match m_SpecificInt(uint64_t V) { + return SpecificInt_match(APInt(64, V)); +} + +inline SpecificInt_match m_Zero() { return m_SpecificInt(0U); } +inline SpecificInt_match m_AllOnes() { return m_SpecificInt(~0U); } + +/// Match true boolean value based on the information provided by +/// TargetLowering. +inline auto m_True() { + return TLI_pred_match{ + [](const TargetLowering &TLI, SDValue N) { + APInt ConstVal; + if (sd_match(N, m_ConstInt(ConstVal))) + switch (TLI.getBooleanContents(N.getValueType())) { + case TargetLowering::ZeroOrOneBooleanContent: + return ConstVal.isOne(); + case TargetLowering::ZeroOrNegativeOneBooleanContent: + return ConstVal.isAllOnes(); + case TargetLowering::UndefinedBooleanContent: + return (ConstVal & 0x01) == 1; + } + + return false; + }, + m_Value()}; +} +/// Match false boolean value based on the information provided by +/// TargetLowering. 
+inline auto m_False() { + return TLI_pred_match{ + [](const TargetLowering &TLI, SDValue N) { + APInt ConstVal; + if (sd_match(N, m_ConstInt(ConstVal))) + switch (TLI.getBooleanContents(N.getValueType())) { + case TargetLowering::ZeroOrOneBooleanContent: + case TargetLowering::ZeroOrNegativeOneBooleanContent: + return ConstVal.isZero(); + case TargetLowering::UndefinedBooleanContent: + return (ConstVal & 0x01) == 0; + } + + return false; + }, + m_Value()}; +} +} // namespace SDPatternMatch +} // namespace llvm +#endif diff --git a/llvm/unittests/CodeGen/CMakeLists.txt b/llvm/unittests/CodeGen/CMakeLists.txt index 6140e0d6fb370..dbbacdd95ec9f 100644 --- a/llvm/unittests/CodeGen/CMakeLists.txt +++ b/llvm/unittests/CodeGen/CMakeLists.txt @@ -40,6 +40,7 @@ add_llvm_unittest(CodeGenTests ScalableVectorMVTsTest.cpp SchedBoundary.cpp SelectionDAGAddressAnalysisTest.cpp + SelectionDAGPatternMatchTest.cpp TypeTraitsTest.cpp TargetOptionsTest.cpp TestAsmPrinter.cpp diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp new file mode 100644 index 0000000000000..17fc3ce8af267 --- /dev/null +++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp @@ -0,0 +1,292 @@ +//===---- llvm/unittest/CodeGen/SelectionDAGPatternMatchTest.cpp ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" +#include "gtest/gtest.h" + +using namespace llvm; + +class SelectionDAGPatternMatchTest : public testing::Test { +protected: + static void SetUpTestCase() { + InitializeAllTargets(); + InitializeAllTargetMCs(); + } + + void SetUp() override { + StringRef Assembly = "@g = global i32 0\n" + "@g_alias = alias i32, i32* @g\n" + "define i32 @f() {\n" + " %1 = load i32, i32* @g\n" + " ret i32 %1\n" + "}"; + + Triple TargetTriple("riscv64--"); + std::string Error; + const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error); + // FIXME: These tests do not depend on RISCV specifically, but we have to + // initialize a target. A skeleton Target for unittests would allow us to + // always run these tests. 
+ if (!T) + GTEST_SKIP(); + + TargetOptions Options; + TM = std::unique_ptr(static_cast( + T->createTargetMachine("riscv64", "", "+m,+f,+d,+v", Options, + std::nullopt, std::nullopt, + CodeGenOptLevel::Aggressive))); + if (!TM) + GTEST_SKIP(); + + SMDiagnostic SMError; + M = parseAssemblyString(Assembly, SMError, Context); + if (!M) + report_fatal_error(SMError.getMessage()); + M->setDataLayout(TM->createDataLayout()); + + F = M->getFunction("f"); + if (!F) + report_fatal_error("F?"); + G = M->getGlobalVariable("g"); + if (!G) + report_fatal_error("G?"); + AliasedG = M->getNamedAlias("g_alias"); + if (!AliasedG) + report_fatal_error("AliasedG?"); + + MachineModuleInfo MMI(TM.get()); + + MF = std::make_unique(*F, *TM, *TM->getSubtargetImpl(*F), + 0, MMI); + + DAG = std::make_unique(*TM, CodeGenOptLevel::None); + if (!DAG) + report_fatal_error("DAG?"); + OptimizationRemarkEmitter ORE(F); + DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr); + } + + TargetLoweringBase::LegalizeTypeAction getTypeAction(EVT VT) { + return DAG->getTargetLoweringInfo().getTypeAction(Context, VT); + } + + EVT getTypeToTransformTo(EVT VT) { + return DAG->getTargetLoweringInfo().getTypeToTransformTo(Context, VT); + } + + LLVMContext Context; + std::unique_ptr TM; + std::unique_ptr M; + Function *F; + GlobalVariable *G; + GlobalAlias *AliasedG; + std::unique_ptr MF; + std::unique_ptr DAG; +}; + +TEST_F(SelectionDAGPatternMatchTest, matchValueType) { + SDLoc DL; + auto Int32VT = EVT::getIntegerVT(Context, 32); + auto Float32VT = EVT::getFloatingPointVT(32); + auto VInt32VT = EVT::getVectorVT(Context, Int32VT, 4); + + SDValue Op0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT); + SDValue Op1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, Float32VT); + SDValue Op2 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, VInt32VT); + + using namespace SDPatternMatch; + EXPECT_TRUE(sd_match(Op0, m_SpecificVT(Int32VT))); + EVT BindVT; + EXPECT_TRUE(sd_match(Op1, 
m_VT(BindVT))); + EXPECT_EQ(BindVT, Float32VT); + EXPECT_TRUE(sd_match(Op0, m_IntegerVT())); + EXPECT_TRUE(sd_match(Op1, m_FloatingPointVT())); + EXPECT_TRUE(sd_match(Op2, m_VectorVT())); + EXPECT_FALSE(sd_match(Op2, m_ScalableVectorVT())); +} + +TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) { + SDLoc DL; + auto Int32VT = EVT::getIntegerVT(Context, 32); + auto Float32VT = EVT::getFloatingPointVT(32); + + SDValue Op0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT); + SDValue Op1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, Int32VT); + SDValue Op2 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 3, Float32VT); + + SDValue Add = DAG->getNode(ISD::ADD, DL, Int32VT, Op0, Op1); + SDValue Sub = DAG->getNode(ISD::SUB, DL, Int32VT, Add, Op0); + SDValue Mul = DAG->getNode(ISD::MUL, DL, Int32VT, Add, Sub); + + SDValue SFAdd = DAG->getNode(ISD::STRICT_FADD, DL, {Float32VT, MVT::Other}, + {DAG->getEntryNode(), Op2, Op2}); + + using namespace SDPatternMatch; + EXPECT_TRUE(sd_match(Sub, m_BinOp(ISD::SUB, m_Value(), m_Value()))); + EXPECT_TRUE(sd_match(Sub, m_Sub(m_Value(), m_Value()))); + EXPECT_TRUE(sd_match(Add, m_c_BinOp(ISD::ADD, m_Value(), m_Value()))); + EXPECT_TRUE(sd_match(Add, m_Add(m_Value(), m_Value()))); + EXPECT_TRUE(sd_match( + Mul, m_Mul(m_OneUse(m_Opc(ISD::SUB)), m_NUses<2>(m_Specific(Add))))); + EXPECT_TRUE( + sd_match(SFAdd, m_ChainedBinOp(ISD::STRICT_FADD, m_SpecificVT(Float32VT), + m_SpecificVT(Float32VT)))); + SDValue BindVal; + EXPECT_TRUE(sd_match(SFAdd, m_ChainedBinOp(ISD::STRICT_FADD, m_Value(BindVal), + m_Deferred(BindVal)))); + EXPECT_FALSE(sd_match(SFAdd, m_ChainedBinOp(ISD::STRICT_FADD, m_OtherVT(), + m_SpecificVT(Float32VT)))); +} + +TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) { + SDLoc DL; + auto Int32VT = EVT::getIntegerVT(Context, 32); + auto Int64VT = EVT::getIntegerVT(Context, 64); + + SDValue Op0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT); + SDValue Op1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, 
Int64VT); + + SDValue ZExt = DAG->getNode(ISD::ZERO_EXTEND, DL, Int64VT, Op0); + SDValue SExt = DAG->getNode(ISD::SIGN_EXTEND, DL, Int64VT, Op0); + SDValue Trunc = DAG->getNode(ISD::TRUNCATE, DL, Int32VT, Op1); + + using namespace SDPatternMatch; + EXPECT_TRUE(sd_match(ZExt, m_UnaryOp(ISD::ZERO_EXTEND, m_Value()))); + EXPECT_TRUE(sd_match(SExt, m_SExt(m_Value()))); + EXPECT_TRUE(sd_match(Trunc, m_Trunc(m_Specific(Op1)))); +} + +TEST_F(SelectionDAGPatternMatchTest, matchConstants) { + SDLoc DL; + auto Int32VT = EVT::getIntegerVT(Context, 32); + auto VInt32VT = EVT::getVectorVT(Context, Int32VT, 4); + + SDValue Arg0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT); + + SDValue Const3 = DAG->getConstant(3, DL, Int32VT); + SDValue Const87 = DAG->getConstant(87, DL, Int32VT); + SDValue Splat = DAG->getSplat(VInt32VT, DL, Arg0); + SDValue ConstSplat = DAG->getSplat(VInt32VT, DL, Const3); + SDValue Zero = DAG->getConstant(0, DL, Int32VT); + SDValue One = DAG->getConstant(1, DL, Int32VT); + SDValue AllOnes = DAG->getConstant(APInt::getAllOnes(32), DL, Int32VT); + + using namespace SDPatternMatch; + EXPECT_TRUE(sd_match(Const87, m_ConstInt())); + EXPECT_FALSE(sd_match(Arg0, m_ConstInt())); + APInt ConstVal; + EXPECT_TRUE(sd_match(ConstSplat, m_ConstInt(ConstVal))); + EXPECT_EQ(ConstVal, 3); + EXPECT_FALSE(sd_match(Splat, m_ConstInt())); + + EXPECT_TRUE(sd_match(Const87, m_SpecificInt(87))); + EXPECT_TRUE(sd_match(Const3, m_SpecificInt(ConstVal))); + EXPECT_TRUE(sd_match(AllOnes, m_AllOnes())); + + EXPECT_TRUE(sd_match(Zero, DAG.get(), m_False())); + EXPECT_TRUE(sd_match(One, DAG.get(), m_True())); + EXPECT_FALSE(sd_match(AllOnes, DAG.get(), m_True())); +} + +TEST_F(SelectionDAGPatternMatchTest, patternCombinators) { + SDLoc DL; + auto Int32VT = EVT::getIntegerVT(Context, 32); + + SDValue Op0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT); + SDValue Op1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, Int32VT); + + SDValue Add = DAG->getNode(ISD::ADD, 
DL, Int32VT, Op0, Op1); + SDValue Sub = DAG->getNode(ISD::SUB, DL, Int32VT, Add, Op0); + + using namespace SDPatternMatch; + EXPECT_TRUE(sd_match( + Sub, m_AnyOf(m_Opc(ISD::ADD), m_Opc(ISD::SUB), m_Opc(ISD::MUL)))); + EXPECT_TRUE(sd_match(Add, m_AllOf(m_Opc(ISD::ADD), m_OneUse()))); +} + +TEST_F(SelectionDAGPatternMatchTest, matchNode) { + SDLoc DL; + auto Int32VT = EVT::getIntegerVT(Context, 32); + + SDValue Op0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT); + SDValue Op1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, Int32VT); + + SDValue Add = DAG->getNode(ISD::ADD, DL, Int32VT, Op0, Op1); + + using namespace SDPatternMatch; + EXPECT_TRUE(sd_match(Add, m_Node(ISD::ADD, m_Value(), m_Value()))); + EXPECT_FALSE(sd_match(Add, m_Node(ISD::SUB, m_Value(), m_Value()))); + EXPECT_FALSE(sd_match(Add, m_Node(ISD::ADD, m_Value()))); + EXPECT_FALSE( + sd_match(Add, m_Node(ISD::ADD, m_Value(), m_Value(), m_Value()))); + EXPECT_FALSE(sd_match(Add, m_Node(ISD::ADD, m_ConstInt(), m_Value()))); +} + +namespace { +struct VPMatchContext : public SDPatternMatch::BasicMatchContext { + using SDPatternMatch::BasicMatchContext::BasicMatchContext; + + bool match(SDValue OpVal, unsigned Opc) const { + if (!OpVal->isVPOpcode()) + return OpVal->getOpcode() == Opc; + + auto BaseOpc = ISD::getBaseOpcodeForVP(OpVal->getOpcode(), false); + return BaseOpc.has_value() && *BaseOpc == Opc; + } +}; +} // anonymous namespace +TEST_F(SelectionDAGPatternMatchTest, matchContext) { + SDLoc DL; + auto BoolVT = EVT::getIntegerVT(Context, 1); + auto Int32VT = EVT::getIntegerVT(Context, 32); + auto VInt32VT = EVT::getVectorVT(Context, Int32VT, 4); + auto MaskVT = EVT::getVectorVT(Context, BoolVT, 4); + + SDValue Scalar0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT); + SDValue Vector0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, VInt32VT); + SDValue Mask0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 3, MaskVT); + + SDValue VPAdd = DAG->getNode(ISD::VP_ADD, DL, VInt32VT, + 
{Vector0, Vector0, Mask0, Scalar0}); + SDValue VPReduceAdd = DAG->getNode(ISD::VP_REDUCE_ADD, DL, Int32VT, + {Scalar0, VPAdd, Mask0, Scalar0}); + + using namespace SDPatternMatch; + VPMatchContext VPCtx(DAG.get()); + EXPECT_TRUE(sd_context_match(VPAdd, VPCtx, m_Opc(ISD::ADD))); + // VP_REDUCE_ADD doesn't have a based opcode, so we use a normal + // sd_match before switching to VPMatchContext when checking VPAdd. + EXPECT_TRUE(sd_match(VPReduceAdd, m_Node(ISD::VP_REDUCE_ADD, m_Value(), + m_Context(VPCtx, m_Opc(ISD::ADD)), + m_Value(), m_Value()))); +} + +TEST_F(SelectionDAGPatternMatchTest, matchAdvancedProperties) { + SDLoc DL; + auto Int16VT = EVT::getIntegerVT(Context, 16); + auto Int64VT = EVT::getIntegerVT(Context, 64); + + SDValue Op0 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int64VT); + SDValue Op1 = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, Int16VT); + + SDValue Add = DAG->getNode(ISD::ADD, DL, Int64VT, Op0, Op0); + + using namespace SDPatternMatch; + EXPECT_TRUE(sd_match(Op0, DAG.get(), m_LegalType(m_Value()))); + EXPECT_FALSE(sd_match(Op1, DAG.get(), m_LegalType(m_Value()))); + EXPECT_TRUE(sd_match(Add, DAG.get(), + m_LegalOp(m_IntegerVT(m_Add(m_Value(), m_Value()))))); +} From 07fd5ca3a8bd270b26b21ea28501f5edcb519709 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 23 Feb 2024 19:03:52 +0000 Subject: [PATCH 199/546] [gn build] Port 5874874c2472 --- llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn index 8aff3e4d230e4..df71d089ac0e2 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn @@ -40,6 +40,7 @@ unittest("CodeGenTests") { "ScalableVectorMVTsTest.cpp", "SchedBoundary.cpp", "SelectionDAGAddressAnalysisTest.cpp", + "SelectionDAGPatternMatchTest.cpp", "TargetOptionsTest.cpp", 
"TestAsmPrinter.cpp", "TypeTraitsTest.cpp", From 59e5519c81c57a66424d657864ce69cb0efdc7d8 Mon Sep 17 00:00:00 2001 From: David Goldman Date: Fri, 23 Feb 2024 14:11:39 -0500 Subject: [PATCH 200/546] [clangd] Fix renaming single argument ObjC methods (#82396) Use the legacy non-ObjC rename logic when dealing with selectors that have zero or one arguments. In addition, make sure we don't add an extra `:` during the rename. Add a few more tests to verify this works (thanks to @ahoppen for the tests and finding this bug). --- clang-tools-extra/clangd/refactor/Rename.cpp | 19 ++- .../clangd/unittests/RenameTests.cpp | 138 ++++++++++++++++++ 2 files changed, 152 insertions(+), 5 deletions(-) diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp index 650862c99bcd1..4e135801f6853 100644 --- a/clang-tools-extra/clangd/refactor/Rename.cpp +++ b/clang-tools-extra/clangd/refactor/Rename.cpp @@ -811,8 +811,18 @@ renameWithinFile(ParsedAST &AST, const NamedDecl &RenameDecl, continue; Locs.push_back(RenameLoc); } - if (const auto *MD = dyn_cast(&RenameDecl)) - return renameObjCMethodWithinFile(AST, MD, NewName, std::move(Locs)); + if (const auto *MD = dyn_cast(&RenameDecl)) { + // The custom ObjC selector logic doesn't handle the zero arg selector + // case, as it relies on parsing selectors via the trailing `:`. + // We also choose to use regular rename logic for the single-arg selectors + // as the AST/Index has the right locations in that case. + if (MD->getSelector().getNumArgs() > 1) + return renameObjCMethodWithinFile(AST, MD, NewName, std::move(Locs)); + + // Eat trailing : for single argument methods since they're actually + // considered a separate token during rename. 
+ NewName.consume_back(":"); + } for (const auto &Loc : Locs) { if (auto Err = FilteredChanges.add(tooling::Replacement( SM, CharSourceRange::getTokenRange(Loc), NewName))) @@ -930,10 +940,9 @@ renameOutsideFile(const NamedDecl &RenameDecl, llvm::StringRef MainFilePath, std::optional Selector = std::nullopt; llvm::SmallVector NewNames; if (const auto *MD = dyn_cast(&RenameDecl)) { - if (MD->getSelector().getNumArgs() > 1) { - RenameIdentifier = MD->getSelector().getNameForSlot(0).str(); + RenameIdentifier = MD->getSelector().getNameForSlot(0).str(); + if (MD->getSelector().getNumArgs() > 1) Selector = MD->getSelector(); - } } NewName.split(NewNames, ":"); diff --git a/clang-tools-extra/clangd/unittests/RenameTests.cpp b/clang-tools-extra/clangd/unittests/RenameTests.cpp index d91ef85d67271..7d9252110b27d 100644 --- a/clang-tools-extra/clangd/unittests/RenameTests.cpp +++ b/clang-tools-extra/clangd/unittests/RenameTests.cpp @@ -1943,6 +1943,144 @@ TEST(CrossFileRenameTests, WithUpToDateIndex) { } } +TEST(CrossFileRenameTests, ObjC) { + MockCompilationDatabase CDB; + CDB.ExtraClangFlags = {"-xobjective-c"}; + // rename is runnning on all "^" points in FooH. 
+ struct Case { + llvm::StringRef FooH; + llvm::StringRef FooM; + llvm::StringRef NewName; + llvm::StringRef ExpectedFooH; + llvm::StringRef ExpectedFooM; + }; + Case Cases[] = {// --- Zero arg selector + { + // Input + R"cpp( + @interface Foo + - (int)performA^ction; + @end + )cpp", + R"cpp( + @implementation Foo + - (int)performAction { + [self performAction]; + } + @end + )cpp", + // New name + "performNewAction", + // Expected + R"cpp( + @interface Foo + - (int)performNewAction; + @end + )cpp", + R"cpp( + @implementation Foo + - (int)performNewAction { + [self performNewAction]; + } + @end + )cpp", + }, + // --- Single arg selector + { + // Input + R"cpp( + @interface Foo + - (int)performA^ction:(int)action; + @end + )cpp", + R"cpp( + @implementation Foo + - (int)performAction:(int)action { + [self performAction:action]; + } + @end + )cpp", + // New name + "performNewAction:", + // Expected + R"cpp( + @interface Foo + - (int)performNewAction:(int)action; + @end + )cpp", + R"cpp( + @implementation Foo + - (int)performNewAction:(int)action { + [self performNewAction:action]; + } + @end + )cpp", + }, + // --- Multi arg selector + { + // Input + R"cpp( + @interface Foo + - (int)performA^ction:(int)action with:(int)value; + @end + )cpp", + R"cpp( + @implementation Foo + - (int)performAction:(int)action with:(int)value { + [self performAction:action with:value]; + } + @end + )cpp", + // New name + "performNewAction:by:", + // Expected + R"cpp( + @interface Foo + - (int)performNewAction:(int)action by:(int)value; + @end + )cpp", + R"cpp( + @implementation Foo + - (int)performNewAction:(int)action by:(int)value { + [self performNewAction:action by:value]; + } + @end + )cpp", + }}; + + trace::TestTracer Tracer; + for (const auto &T : Cases) { + SCOPED_TRACE(T.FooH); + Annotations FooH(T.FooH); + Annotations FooM(T.FooM); + std::string FooHPath = testPath("foo.h"); + std::string FooMPath = testPath("foo.m"); + + MockFS FS; + FS.Files[FooHPath] = std::string(FooH.code()); 
+ FS.Files[FooMPath] = std::string(FooM.code()); + + auto ServerOpts = ClangdServer::optsForTest(); + ServerOpts.BuildDynamicSymbolIndex = true; + ClangdServer Server(CDB, FS, ServerOpts); + + // Add all files to clangd server to make sure the dynamic index has been + // built. + runAddDocument(Server, FooHPath, FooH.code()); + runAddDocument(Server, FooMPath, FooM.code()); + + for (const auto &RenamePos : FooH.points()) { + EXPECT_THAT(Tracer.takeMetric("rename_files"), SizeIs(0)); + auto FileEditsList = + llvm::cantFail(runRename(Server, FooHPath, RenamePos, T.NewName, {})); + EXPECT_THAT(Tracer.takeMetric("rename_files"), ElementsAre(2)); + EXPECT_THAT(applyEdits(std::move(FileEditsList.GlobalChanges)), + UnorderedElementsAre(Pair(Eq(FooHPath), Eq(T.ExpectedFooH)), + Pair(Eq(FooMPath), Eq(T.ExpectedFooM)))); + } + } +} + TEST(CrossFileRenameTests, CrossFileOnLocalSymbol) { // cross-file rename should work for function-local symbols, even there is no // index provided. From a64ff9630ccd305a63fca3ea9cc4bc4b49098495 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Halkenh=C3=A4user?= Date: Fri, 23 Feb 2024 20:17:32 +0100 Subject: [PATCH 201/546] [llvm-link] Improve missing file error message (#82514) Add error messages showing the missing filenames. Currently, we only get 'No such file or directory' without any(!) further info. This patch will (only upon ENOENT error) iterate over all requested files and print which ones are actually missing. --- llvm/tools/llvm-link/llvm-link.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp index e6c219a8cd7ec..9e7f2c3ebac43 100644 --- a/llvm/tools/llvm-link/llvm-link.cpp +++ b/llvm/tools/llvm-link/llvm-link.cpp @@ -393,8 +393,16 @@ static bool linkFiles(const char *argv0, LLVMContext &Context, Linker &L, // Similar to some flags, internalization doesn't apply to the first file. 
bool InternalizeLinkedSymbols = false; for (const auto &File : Files) { + auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(File); + + // When we encounter a missing file, make sure we expose its name. + if (auto EC = BufferOrErr.getError()) + if (EC == std::errc::no_such_file_or_directory) + ExitOnErr(createStringError(EC, "No such file or directory: '%s'", + File.c_str())); + std::unique_ptr Buffer = - ExitOnErr(errorOrToExpected(MemoryBuffer::getFileOrSTDIN(File))); + ExitOnErr(errorOrToExpected(std::move(BufferOrErr))); std::unique_ptr M = identify_magic(Buffer->getBuffer()) == file_magic::archive From 6dd6d487d012a9000fe975133b7935c1f8c658eb Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Fri, 23 Feb 2024 11:28:20 -0800 Subject: [PATCH 202/546] [NFC] Make RingBuffer an atomic pointer (#82547) This will allow us to atomically swap out RingBuffer and StackDepot. Patched into AOSP and ran debuggerd_tests. --- compiler-rt/lib/scudo/standalone/combined.h | 148 ++++++++++++-------- 1 file changed, 86 insertions(+), 62 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index f13cf9498a793..cd5a07be1576e 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -177,6 +177,18 @@ class Allocator { mapAndInitializeRingBuffer(); } + void enableRingBuffer() { + AllocationRingBuffer *RB = getRingBuffer(); + if (RB) + RB->Depot->enable(); + } + + void disableRingBuffer() { + AllocationRingBuffer *RB = getRingBuffer(); + if (RB) + RB->Depot->disable(); + } + // Initialize the embedded GWP-ASan instance. Requires the main allocator to // be functional, best called from PostInitCallback. 
void initGwpAsan() { @@ -688,14 +700,12 @@ class Allocator { Quarantine.disable(); Primary.disable(); Secondary.disable(); - if (Depot) - Depot->disable(); + disableRingBuffer(); } void enable() NO_THREAD_SAFETY_ANALYSIS { initThreadMaybe(); - if (Depot) - Depot->enable(); + enableRingBuffer(); Secondary.enable(); Primary.enable(); Quarantine.enable(); @@ -920,12 +930,14 @@ class Allocator { const char *getStackDepotAddress() { initThreadMaybe(); - return reinterpret_cast(Depot); + AllocationRingBuffer *RB = getRingBuffer(); + return RB ? reinterpret_cast(RB->Depot) : nullptr; } uptr getStackDepotSize() { initThreadMaybe(); - return StackDepotSize; + AllocationRingBuffer *RB = getRingBuffer(); + return RB ? RB->StackDepotSize : 0; } const char *getRegionInfoArrayAddress() const { @@ -938,12 +950,15 @@ class Allocator { const char *getRingBufferAddress() { initThreadMaybe(); - return RawRingBuffer; + return reinterpret_cast(getRingBuffer()); } uptr getRingBufferSize() { initThreadMaybe(); - return RingBufferElements ? ringBufferSizeInBytes(RingBufferElements) : 0; + AllocationRingBuffer *RB = getRingBuffer(); + return RB && RB->RingBufferElements + ? ringBufferSizeInBytes(RB->RingBufferElements) + : 0; } static const uptr MaxTraceSize = 64; @@ -1048,10 +1063,6 @@ class Allocator { uptr GuardedAllocSlotSize = 0; #endif // GWP_ASAN_HOOKS - StackDepot *Depot = nullptr; - uptr StackDepotSize = 0; - MemMapT RawStackDepotMap; - struct AllocationRingBuffer { struct Entry { atomic_uptr Ptr; @@ -1061,16 +1072,23 @@ class Allocator { atomic_u32 DeallocationTrace; atomic_u32 DeallocationTid; }; - + StackDepot *Depot = nullptr; + uptr StackDepotSize = 0; + MemMapT RawRingBufferMap; + MemMapT RawStackDepotMap; + u32 RingBufferElements = 0; atomic_uptr Pos; // An array of Size (at least one) elements of type Entry is immediately // following to this struct. 
}; // Pointer to memory mapped area starting with AllocationRingBuffer struct, // and immediately followed by Size elements of type Entry. - char *RawRingBuffer = {}; - u32 RingBufferElements = 0; - MemMapT RawRingBufferMap; + atomic_uptr RingBufferAddress = {}; + + AllocationRingBuffer *getRingBuffer() { + return reinterpret_cast( + atomic_load(&RingBufferAddress, memory_order_acquire)); + } // The following might get optimized out by the compiler. NOINLINE void performSanityChecks() { @@ -1259,27 +1277,24 @@ class Allocator { storeEndMarker(RoundNewPtr, NewSize, BlockEnd); } - StackDepot *getDepotIfEnabled(const Options &Options) { - if (!UNLIKELY(Options.get(OptionBit::TrackAllocationStacks))) - return nullptr; - return Depot; - } - void storePrimaryAllocationStackMaybe(const Options &Options, void *Ptr) { - auto *Depot = getDepotIfEnabled(Options); - if (!Depot) + if (!UNLIKELY(Options.get(OptionBit::TrackAllocationStacks))) + return; + AllocationRingBuffer *RB = getRingBuffer(); + if (!RB) return; auto *Ptr32 = reinterpret_cast(Ptr); - Ptr32[MemTagAllocationTraceIndex] = collectStackTrace(Depot); + Ptr32[MemTagAllocationTraceIndex] = collectStackTrace(RB->Depot); Ptr32[MemTagAllocationTidIndex] = getThreadID(); } - void storeRingBufferEntry(void *Ptr, u32 AllocationTrace, u32 AllocationTid, + void storeRingBufferEntry(AllocationRingBuffer *RB, void *Ptr, + u32 AllocationTrace, u32 AllocationTid, uptr AllocationSize, u32 DeallocationTrace, u32 DeallocationTid) { - uptr Pos = atomic_fetch_add(&getRingBuffer()->Pos, 1, memory_order_relaxed); + uptr Pos = atomic_fetch_add(&RB->Pos, 1, memory_order_relaxed); typename AllocationRingBuffer::Entry *Entry = - getRingBufferEntry(RawRingBuffer, Pos % RingBufferElements); + getRingBufferEntry(RB, Pos % RB->RingBufferElements); // First invalidate our entry so that we don't attempt to interpret a // partially written state in getSecondaryErrorInfo(). 
The fences below @@ -1300,32 +1315,36 @@ class Allocator { void storeSecondaryAllocationStackMaybe(const Options &Options, void *Ptr, uptr Size) { - auto *Depot = getDepotIfEnabled(Options); - if (!Depot) + if (!UNLIKELY(Options.get(OptionBit::TrackAllocationStacks))) return; - u32 Trace = collectStackTrace(Depot); + AllocationRingBuffer *RB = getRingBuffer(); + if (!RB) + return; + u32 Trace = collectStackTrace(RB->Depot); u32 Tid = getThreadID(); auto *Ptr32 = reinterpret_cast(Ptr); Ptr32[MemTagAllocationTraceIndex] = Trace; Ptr32[MemTagAllocationTidIndex] = Tid; - storeRingBufferEntry(untagPointer(Ptr), Trace, Tid, Size, 0, 0); + storeRingBufferEntry(RB, untagPointer(Ptr), Trace, Tid, Size, 0, 0); } void storeDeallocationStackMaybe(const Options &Options, void *Ptr, u8 PrevTag, uptr Size) { - auto *Depot = getDepotIfEnabled(Options); - if (!Depot) + if (!UNLIKELY(Options.get(OptionBit::TrackAllocationStacks))) + return; + AllocationRingBuffer *RB = getRingBuffer(); + if (!RB) return; auto *Ptr32 = reinterpret_cast(Ptr); u32 AllocationTrace = Ptr32[MemTagAllocationTraceIndex]; u32 AllocationTid = Ptr32[MemTagAllocationTidIndex]; - u32 DeallocationTrace = collectStackTrace(Depot); + u32 DeallocationTrace = collectStackTrace(RB->Depot); u32 DeallocationTid = getThreadID(); - storeRingBufferEntry(addFixedTag(untagPointer(Ptr), PrevTag), + storeRingBufferEntry(RB, addFixedTag(untagPointer(Ptr), PrevTag), AllocationTrace, AllocationTid, Size, DeallocationTrace, DeallocationTid); } @@ -1434,7 +1453,7 @@ class Allocator { for (uptr I = Pos - 1; I != Pos - 1 - RingBufferElements && NextErrorReport != NumErrorReports; --I) { - auto *Entry = getRingBufferEntry(RingBufferPtr, I % RingBufferElements); + auto *Entry = getRingBufferEntry(RingBuffer, I % RingBufferElements); uptr EntryPtr = atomic_load_relaxed(&Entry->Ptr); if (!EntryPtr) continue; @@ -1502,14 +1521,18 @@ class Allocator { } static typename AllocationRingBuffer::Entry * - getRingBufferEntry(char *RawRingBuffer, 
uptr N) { + getRingBufferEntry(AllocationRingBuffer *RB, uptr N) { + char *RBEntryStart = + &reinterpret_cast(RB)[sizeof(AllocationRingBuffer)]; return &reinterpret_cast( - &RawRingBuffer[sizeof(AllocationRingBuffer)])[N]; + RBEntryStart)[N]; } static const typename AllocationRingBuffer::Entry * - getRingBufferEntry(const char *RawRingBuffer, uptr N) { + getRingBufferEntry(const AllocationRingBuffer *RB, uptr N) { + const char *RBEntryStart = + &reinterpret_cast(RB)[sizeof(AllocationRingBuffer)]; return &reinterpret_cast( - &RawRingBuffer[sizeof(AllocationRingBuffer)])[N]; + RBEntryStart)[N]; } void mapAndInitializeRingBuffer() { @@ -1549,15 +1572,14 @@ class Allocator { u32 RingSize = static_cast(TabSize * kFramesPerStack); DCHECK(isPowerOfTwo(RingSize)); - StackDepotSize = sizeof(StackDepot) + sizeof(atomic_u64) * RingSize + - sizeof(atomic_u32) * TabSize; + uptr StackDepotSize = sizeof(StackDepot) + sizeof(atomic_u64) * RingSize + + sizeof(atomic_u32) * TabSize; MemMapT DepotMap; DepotMap.map( /*Addr=*/0U, roundUp(StackDepotSize, getPageSizeCached()), "scudo:stack_depot"); - Depot = reinterpret_cast(DepotMap.getBase()); + auto *Depot = reinterpret_cast(DepotMap.getBase()); Depot->init(RingSize, TabSize); - RawStackDepotMap = DepotMap; MemMapT MemMap; MemMap.map( @@ -1565,9 +1587,15 @@ class Allocator { roundUp(ringBufferSizeInBytes(AllocationRingBufferSize), getPageSizeCached()), "scudo:ring_buffer"); - RawRingBuffer = reinterpret_cast(MemMap.getBase()); - RawRingBufferMap = MemMap; - RingBufferElements = AllocationRingBufferSize; + auto *RB = reinterpret_cast(MemMap.getBase()); + RB->RawRingBufferMap = MemMap; + RB->RingBufferElements = AllocationRingBufferSize; + RB->Depot = Depot; + RB->StackDepotSize = StackDepotSize; + RB->RawStackDepotMap = DepotMap; + + atomic_store(&RingBufferAddress, reinterpret_cast(RB), + memory_order_release); static_assert(sizeof(AllocationRingBuffer) % alignof(typename AllocationRingBuffer::Entry) == 0, @@ -1575,16 +1603,16 @@ 
class Allocator { } void unmapRingBuffer() { - auto *RingBuffer = getRingBuffer(); - if (RingBuffer != nullptr) { - RawRingBufferMap.unmap(RawRingBufferMap.getBase(), - RawRingBufferMap.getCapacity()); - } - RawRingBuffer = nullptr; - if (Depot) { - RawStackDepotMap.unmap(RawStackDepotMap.getBase(), - RawStackDepotMap.getCapacity()); - } + AllocationRingBuffer *RB = getRingBuffer(); + if (RB == nullptr) + return; + // N.B. because RawStackDepotMap is part of RawRingBufferMap, the order + // is very important. + RB->RawStackDepotMap.unmap(RB->RawStackDepotMap.getBase(), + RB->RawStackDepotMap.getCapacity()); + RB->RawRingBufferMap.unmap(RB->RawRingBufferMap.getBase(), + RB->RawRingBufferMap.getCapacity()); + atomic_store(&RingBufferAddress, 0, memory_order_release); } static constexpr size_t ringBufferSizeInBytes(u32 RingBufferElements) { @@ -1599,10 +1627,6 @@ class Allocator { return (Bytes - sizeof(AllocationRingBuffer)) / sizeof(typename AllocationRingBuffer::Entry); } - - inline AllocationRingBuffer *getRingBuffer() { - return reinterpret_cast(RawRingBuffer); - } }; } // namespace scudo From a3a316e2875258929f062fbffb81e2a9d5b4ce48 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 23 Feb 2024 14:10:53 -0600 Subject: [PATCH 203/546] [libc] Remove use of BlockStore for GPU atexit (#82823) Summary: The GPU backends have restrictions on the kinds of initializers they can produce. The use of BlockStore here currently breaks the backends through the use of recursive initializers. This prevents it from actually being included in any builds. This patch changes it to just use a fixed size of 64 slots. The chances of someone exceeding the 64 slots in practice are very, very low. However, this is primarily a bandaid solution as a real solution will need to use a lock free data structure to push work in parallel. Currently the mutexes on the GPU build do nothing, so they only work if the user guards the use themselves.
--- libc/src/stdlib/atexit.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/libc/src/stdlib/atexit.cpp b/libc/src/stdlib/atexit.cpp index 10dff42b1be92..1513b7969f0db 100644 --- a/libc/src/stdlib/atexit.cpp +++ b/libc/src/stdlib/atexit.cpp @@ -28,7 +28,14 @@ struct AtExitUnit { constexpr AtExitUnit(AtExitCallback *c, void *p) : callback(c), payload(p) {} }; -#ifdef LIBC_COPT_PUBLIC_PACKAGING +#if defined(LIBC_TARGET_ARCH_IS_GPU) +// The GPU build cannot handle the potentially recursive definitions required by +// the BlockStore class. Additionally, the likelihood that someone exceeds this +// while executing on the GPU is extremely small. +// FIXME: It is not generally safe to use 'atexit' on the GPU because the +// mutexes simply passthrough. We will need a lock free stack. +using ExitCallbackList = FixedVector; +#elif defined(LIBC_COPT_PUBLIC_PACKAGING) using ExitCallbackList = cpp::ReverseOrderBlockStore; #else // BlockStore uses dynamic memory allocation. To avoid dynamic memory From 1a2ecbb3980a3005c2027eb5b69bbbe32c9c8294 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 23 Feb 2024 14:11:31 -0600 Subject: [PATCH 204/546] [libc] Remove 'llvm-gpu-none' directory from build (#82816) Summary: This directory is leftover from when we handled both AMDGPU and NVPTX in the same build and merged them into a pseudo triple. Now the only thing it contains is the RPC server header. This gets rid of it, but now that it's in the base install directory we should make it clear that it's an LLVM libc header.
--- libc/docs/gpu/rpc.rst | 4 ++-- libc/utils/gpu/loader/Loader.h | 2 +- libc/utils/gpu/server/CMakeLists.txt | 4 ++-- .../gpu/server/{rpc_server.h => llvmlibc_rpc_server.h} | 0 libc/utils/gpu/server/rpc_server.cpp | 2 +- openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp | 10 ++-------- 6 files changed, 8 insertions(+), 14 deletions(-) rename libc/utils/gpu/server/{rpc_server.h => llvmlibc_rpc_server.h} (100%) diff --git a/libc/docs/gpu/rpc.rst b/libc/docs/gpu/rpc.rst index 78ae778671881..fb738138568f6 100644 --- a/libc/docs/gpu/rpc.rst +++ b/libc/docs/gpu/rpc.rst @@ -104,7 +104,7 @@ Some operations are too divergent to provide generic implementations for, such as allocating device accessible memory. For these cases, we provide a callback registration scheme to add a custom handler for any given opcode through the port API. More information can be found in the installed header -``/include/gpu-none-llvm/rpc_server.h``. +``/include/llvmlibc_rpc_server.h``. Client Example -------------- @@ -194,7 +194,7 @@ but the following example shows how it can be used by a standard user. 
#include #include - #include + #include [[noreturn]] void handle_error(cudaError_t err) { fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err)); diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h index d2b2ee5baebed..e2aabb08c11da 100644 --- a/libc/utils/gpu/loader/Loader.h +++ b/libc/utils/gpu/loader/Loader.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H #define LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H -#include "utils/gpu/server/rpc_server.h" +#include "utils/gpu/server/llvmlibc_rpc_server.h" #include "include/llvm-libc-types/test_rpc_opcodes_t.h" diff --git a/libc/utils/gpu/server/CMakeLists.txt b/libc/utils/gpu/server/CMakeLists.txt index 94cdfe5bf6521..745a248d9b63d 100644 --- a/libc/utils/gpu/server/CMakeLists.txt +++ b/libc/utils/gpu/server/CMakeLists.txt @@ -21,8 +21,8 @@ if(LLVM_RUNTIMES_TARGET OR LIBC_TARGET_TRIPLE) endif() # Install the server and associated header. -install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/rpc_server.h - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gpu-none-llvm/ +install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/llvmlibc_rpc_server.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} COMPONENT libc-headers) install(TARGETS llvmlibc_rpc_server ARCHIVE DESTINATION "lib${LLVM_LIBDIR_SUFFIX}" diff --git a/libc/utils/gpu/server/rpc_server.h b/libc/utils/gpu/server/llvmlibc_rpc_server.h similarity index 100% rename from libc/utils/gpu/server/rpc_server.h rename to libc/utils/gpu/server/llvmlibc_rpc_server.h diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp index 4e535a294a19e..707807a5cbaf7 100644 --- a/libc/utils/gpu/server/rpc_server.cpp +++ b/libc/utils/gpu/server/rpc_server.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "rpc_server.h" +#include "llvmlibc_rpc_server.h" #include "src/__support/RPC/rpc.h" #include "src/stdio/gpu/file.h" diff --git a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp 
b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp index cb6a5086bc4dd..05ae5acb01ddf 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/RPC.cpp @@ -12,14 +12,8 @@ #include "PluginInterface.h" -// This header file may be present in-tree or from an LLVM installation. The -// installed version lives alongside the GPU headers so we do not want to -// include it directly. -#if __has_include() -#include -#elif defined(LIBOMPTARGET_RPC_SUPPORT) -// Just pull this out of the source if available. -#include "rpc_server.h" +#if defined(LIBOMPTARGET_RPC_SUPPORT) +#include "llvmlibc_rpc_server.h" #endif using namespace llvm; From b43dd08aa31f37cb9517a80f394631a7d8ff6b90 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 23 Feb 2024 14:11:43 -0600 Subject: [PATCH 205/546] [libc] Install a single LLVM-IR version of the GPU library (#82791) Summary: Recent patches have allowed us to treat these libraries as direct builds. This makes it easier to simply build them to a single LLVM-IR file. This matches the way these files are presented by the ROCm and CUDA toolchains and makes it easier to work with. 
--- libc/cmake/modules/LLVMLibCLibraryRules.cmake | 49 +++++++++++++++++-- .../modules/prepare_libc_gpu_build.cmake | 8 +++ libc/lib/CMakeLists.txt | 36 ++++++++++---- 3 files changed, 80 insertions(+), 13 deletions(-) diff --git a/libc/cmake/modules/LLVMLibCLibraryRules.cmake b/libc/cmake/modules/LLVMLibCLibraryRules.cmake index f15ffd5f9c218..9fba51f8ee7f4 100644 --- a/libc/cmake/modules/LLVMLibCLibraryRules.cmake +++ b/libc/cmake/modules/LLVMLibCLibraryRules.cmake @@ -89,7 +89,7 @@ endfunction() # add_gpu_entrypoint_library( # DEPENDS # ) -function(add_gpu_entrypoint_library target_name) +function(add_gpu_entrypoint_library target_name base_target_name) cmake_parse_arguments( "ENTRYPOINT_LIBRARY" "" # No optional arguments @@ -127,7 +127,7 @@ function(add_gpu_entrypoint_library target_name) COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER} "${prefix},file=$" -o ${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin - DEPENDS ${dep} + DEPENDS ${dep} ${base_target_name} COMMENT "Packaging LLVM offloading binary for '${object}'" ) add_custom_target(${dep}.__gpubin__ DEPENDS ${dep} @@ -140,7 +140,7 @@ function(add_gpu_entrypoint_library target_name) OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp" COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/stubs COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp - DEPENDS ${dep} ${dep}.__gpubin__ + DEPENDS ${dep} ${dep}.__gpubin__ ${base_target_name} ) add_custom_target(${dep}.__stub__ DEPENDS ${dep}.__gpubin__ "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp") @@ -156,7 +156,8 @@ function(add_gpu_entrypoint_library target_name) target_compile_options(${dep}.__fatbin__ PRIVATE --target=${LLVM_HOST_TRIPLE} "SHELL:-Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin") - add_dependencies(${dep}.__fatbin__ ${dep} ${dep}.__stub__ ${dep}.__gpubin__) + add_dependencies(${dep}.__fatbin__ + ${dep} ${dep}.__stub__ ${dep}.__gpubin__ ${base_target_name}) # Set the list of 
newly create fat binaries containing embedded device code. list(APPEND objects $) @@ -170,6 +171,46 @@ function(add_gpu_entrypoint_library target_name) set_target_properties(${target_name} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${LIBC_LIBRARY_DIR}) endfunction(add_gpu_entrypoint_library) +# A rule to build a library from a collection of entrypoint objects and bundle +# it in a single LLVM-IR bitcode file. +# Usage: +# add_gpu_entrypoint_library( +# DEPENDS +# ) +function(add_bitcode_entrypoint_library target_name base_target_name) + cmake_parse_arguments( + "ENTRYPOINT_LIBRARY" + "" # No optional arguments + "" # No single value arguments + "DEPENDS" # Multi-value arguments + ${ARGN} + ) + if(NOT ENTRYPOINT_LIBRARY_DEPENDS) + message(FATAL_ERROR "'add_entrypoint_library' target requires a DEPENDS list " + "of 'add_entrypoint_object' targets.") + endif() + + get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS}) + get_all_object_file_deps(all_deps "${fq_deps_list}") + + set(objects "") + foreach(dep IN LISTS all_deps) + set(object $<$,${dep}>:$>) + list(APPEND objects ${object}) + endforeach() + + set(output ${CMAKE_CURRENT_BINARY_DIR}/${target_name}.bc) + add_custom_command( + OUTPUT ${output} + COMMAND ${LIBC_LLVM_LINK} ${objects} -o ${output} + DEPENDS ${all_deps} ${base_target_name} + COMMENT "Linking LLVM-IR bitcode for ${base_target_name}" + COMMAND_EXPAND_LISTS + ) + add_custom_target(${target_name} DEPENDS ${output} ${all_deps}) + set_target_properties(${target_name} PROPERTIES TARGET_OBJECT ${output}) +endfunction(add_bitcode_entrypoint_library) + # A rule to build a library from a collection of entrypoint objects. 
# Usage: # add_entrypoint_library( diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake index 548990a1c18b8..7e9fc746ea91d 100644 --- a/libc/cmake/modules/prepare_libc_gpu_build.cmake +++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake @@ -26,6 +26,14 @@ if(NOT LIBC_CLANG_OFFLOAD_PACKAGER) "build") endif() +# Identify llvm-link program so we can merge the output IR into a single blob. +find_program(LIBC_LLVM_LINK + NAMES llvm-link NO_DEFAULT_PATH + PATHS ${LLVM_BINARY_DIR}/bin ${compiler_path}) +if(NOT LIBC_LLVM_LINK) + message(FATAL_ERROR "Cannot find 'llvm-link' for the GPU build") +endif() + # Optionally set up a job pool to limit the number of GPU tests run in parallel. # This is sometimes necessary as running too many tests in parallel can cause # the GPU or driver to run out of resources. diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt index 615f4270646fb..e5ebd1e10084d 100644 --- a/libc/lib/CMakeLists.txt +++ b/libc/lib/CMakeLists.txt @@ -40,22 +40,33 @@ foreach(archive IN ZIP_LISTS # Add the offloading version of the library for offloading languages. These # are installed in the standard search path separate from the other libraries. 
if(LIBC_TARGET_OS_IS_GPU) - set(libc_gpu_archive_target ${archive_1}gpu) - set(libc_gpu_archive_name ${archive_0}gpu-${LIBC_TARGET_ARCHITECTURE}) - add_gpu_entrypoint_library( - ${libc_gpu_archive_target} + ${archive_1}gpu + ${archive_1} DEPENDS ${${archive_2}} ) set_target_properties( - ${libc_gpu_archive_target} + ${archive_1}gpu PROPERTIES - ARCHIVE_OUTPUT_NAME ${libc_gpu_archive_name} + ARCHIVE_OUTPUT_NAME ${archive_0}gpu-${LIBC_TARGET_ARCHITECTURE} + ARCHIVE_OUTPUT_DIRECTORY ${LLVM_LIBRARY_OUTPUT_INTDIR} + ) + list(APPEND added_gpu_archive_targets ${archive_1}gpu) + + add_bitcode_entrypoint_library( + ${archive_1}bitcode + ${archive_1} + DEPENDS + ${${archive_2}} ) - set_target_properties(${libc_gpu_archive_target} PROPERTIES - ARCHIVE_OUTPUT_DIRECTORY ${LLVM_LIBRARY_OUTPUT_INTDIR}) - list(APPEND added_gpu_archive_targets ${libc_gpu_archive_target}) + set_target_properties( + ${archive_1}bitcode + PROPERTIES + OUTPUT_NAME ${archive_1}.bc + ) + add_dependencies(${archive_1}gpu ${archive_1}bitcode) + list(APPEND added_gpu_bitcode_targets ${archive_1}bitcode) endif() endforeach() @@ -71,6 +82,13 @@ if(LIBC_TARGET_OS_IS_GPU) ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT libc ) + foreach(file ${added_gpu_bitcode_targets}) + install(FILES $ + DESTINATION ${LIBC_INSTALL_LIBRARY_DIR} + RENAME $ + COMPONENT libc + ) + endforeach() endif() if(NOT LIBC_TARGET_OS_IS_BAREMETAL) From 99660082cb387c9bf7974fef558c5f73d8b5a198 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 23 Feb 2024 14:21:02 -0600 Subject: [PATCH 206/546] [Clang] Append target search paths for direct offloading compilation (#82699) Summary: Recent changes to the `libc` project caused the headers to be installed to `include/` for the GPU and the libraries to be in `lib/`. This means we should automatically append these search paths so they can be found by default. This allows the following to work targeting AMDGPU. 
```shell $ clang foo.c -flto -mcpu=native --target=amdgcn-amd-amdhsa -lc /lib/amdgcn-amd-amdhsa/crt1.o $ amdhsa-loader a.out ``` --- clang/lib/Driver/ToolChains/AMDGPU.cpp | 1 + clang/lib/Driver/ToolChains/Clang.cpp | 4 ++-- clang/lib/Driver/ToolChains/Cuda.cpp | 4 ++++ clang/test/Driver/gpu-libc-headers.c | 9 ++++++++- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 60e8c123c591d..6fcbcffd6f0d6 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -625,6 +625,7 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, addLinkerCompressDebugSectionsOption(getToolChain(), Args, CmdArgs); Args.AddAllArgs(CmdArgs, options::OPT_L); + getToolChain().AddFilePathLibArgs(Args, CmdArgs); AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA); if (C.getDriver().isUsingLTO()) addLTOOptions(getToolChain(), Args, CmdArgs, Output, Inputs[0], diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 7c0409f0c3097..6e1b7e8657d0d 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1111,8 +1111,8 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA, C.getActiveOffloadKinds() == Action::OFK_None) { SmallString<128> P(llvm::sys::path::parent_path(D.InstalledDir)); llvm::sys::path::append(P, "include"); - llvm::sys::path::append(P, "gpu-none-llvm"); - CmdArgs.push_back("-c-isystem"); + llvm::sys::path::append(P, getToolChain().getTripleString()); + CmdArgs.push_back("-internal-isystem"); CmdArgs.push_back(Args.MakeArgString(P)); } else if (C.getActiveOffloadKinds() == Action::OFK_OpenMP) { // TODO: CUDA / HIP include their own headers for some common functions diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 94d4982d102bb..ff3687ca7dae3 100644 --- 
a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -609,6 +609,10 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA, // Add paths specified in LIBRARY_PATH environment variable as -L options. addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH"); + // Add standard library search paths passed on the command line. + Args.AddAllArgs(CmdArgs, options::OPT_L); + getToolChain().AddFilePathLibArgs(Args, CmdArgs); + // Add paths for the default clang library path. SmallString<256> DefaultLibPath = llvm::sys::path::parent_path(TC.getDriver().Dir); diff --git a/clang/test/Driver/gpu-libc-headers.c b/clang/test/Driver/gpu-libc-headers.c index 74e9a764dfcb3..356a401550399 100644 --- a/clang/test/Driver/gpu-libc-headers.c +++ b/clang/test/Driver/gpu-libc-headers.c @@ -10,10 +10,17 @@ // CHECK-HEADERS: "-cc1"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}llvm_libc_wrappers"{{.*}}"-isysroot" "./" // CHECK-HEADERS: "-cc1"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}llvm_libc_wrappers"{{.*}}"-isysroot" "./" +// RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx90a --sysroot=./ \ +// RUN: -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-AMDGPU +// RUN: %clang -### --target=nvptx64-nvidia-cuda -march=sm_89 --sysroot=./ \ +// RUN: -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-NVPTX +// CHECK-HEADERS-AMDGPU: "-cc1"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}amdgcn-amd-amdhsa"{{.*}}"-isysroot" "./" +// CHECK-HEADERS-NVPTX: "-cc1"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}nvptx64-nvidia-cuda"{{.*}}"-isysroot" "./" + // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -nogpulib \ // RUN: -nogpuinc %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-DISABLED // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -nogpulib \ // RUN: -nostdinc %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-DISABLED // RUN: %clang -### --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -nogpulib \ // RUN: 
-nobuiltininc %s 2>&1 | FileCheck %s --check-prefix=CHECK-HEADERS-DISABLED -// CHECK-HEADERS-DISABLED-NOT: "-cc1"{{.*}}"-c-isystem" "{{.*}}include{{.*}}gpu-none-llvm" +// CHECK-HEADERS-DISABLED-NOT: "-cc1"{{.*}}"-internal-isystem" "{{.*}}include{{.*}}gpu-none-llvm" From 1c2456d6593cea317a00627889b5c35766a732e0 Mon Sep 17 00:00:00 2001 From: Aart Bik <39774503+aartbik@users.noreply.github.com> Date: Fri, 23 Feb 2024 12:37:36 -0800 Subject: [PATCH 207/546] [mlir][sparse] remove very thin header file from sparse runtime support (#82820) --- .../SparseTensor/ErrorHandling.h | 34 -------- .../mlir/ExecutionEngine/SparseTensor/File.h | 10 ++- .../ExecutionEngine/SparseTensor/Storage.h | 1 - .../lib/ExecutionEngine/SparseTensor/File.cpp | 84 ++++++++++++------- .../ExecutionEngine/SparseTensor/Storage.cpp | 3 +- .../ExecutionEngine/SparseTensorRuntime.cpp | 19 +++-- .../llvm-project-overlay/mlir/BUILD.bazel | 1 - 7 files changed, 70 insertions(+), 82 deletions(-) delete mode 100644 mlir/include/mlir/ExecutionEngine/SparseTensor/ErrorHandling.h diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/ErrorHandling.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/ErrorHandling.h deleted file mode 100644 index 6b39526211c77..0000000000000 --- a/mlir/include/mlir/ExecutionEngine/SparseTensor/ErrorHandling.h +++ /dev/null @@ -1,34 +0,0 @@ -//===- ErrorHandling.h - Helpers for errors ---------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines an extremely lightweight API for fatal errors (not -// arising from assertions). The API does not attempt to be sophisticated -// in any way, it's just the usual "I give up" style of error reporting. 
-// -//===----------------------------------------------------------------------===// - -#ifndef MLIR_EXECUTIONENGINE_SPARSETENSOR_ERRORHANDLING_H -#define MLIR_EXECUTIONENGINE_SPARSETENSOR_ERRORHANDLING_H - -#include -#include - -/// This macro helps minimize repetition of the printf-and-exit idiom, -/// as well as ensuring that we print some additional output indicating -/// where the error is coming from--- to make it easier to determine -/// whether some particular error is coming from the runtime library's -/// code vs from somewhere else in the MLIR stack. (Since that can be -/// hard to determine without the stacktraces provided by assertion failures.) -#define MLIR_SPARSETENSOR_FATAL(...) \ - do { \ - fprintf(stderr, "SparseTensorUtils: " __VA_ARGS__); \ - fprintf(stderr, "SparseTensorUtils: at %s:%d\n", __FILE__, __LINE__); \ - exit(1); \ - } while (0) - -#endif // MLIR_EXECUTIONENGINE_SPARSETENSOR_ERRORHANDLING_H diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h index ccdc605d75643..f927b82628b1a 100644 --- a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h +++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h @@ -115,10 +115,12 @@ class SparseTensorReader final { SparseTensorReader *reader = new SparseTensorReader(filename); reader->openFile(); reader->readHeader(); - if (!reader->canReadAs(valTp)) - MLIR_SPARSETENSOR_FATAL( - "Tensor element type %d not compatible with values in file %s\n", - static_cast(valTp), filename); + if (!reader->canReadAs(valTp)) { + fprintf(stderr, + "Tensor element type %d not compatible with values in file %s\n", + static_cast(valTp), filename); + exit(1); + } reader->assertMatchesShape(dimRank, dimShape); return reader; } diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h index fe0e08b5c8403..468782ebef4e3 100644 --- 
a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h +++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h @@ -20,7 +20,6 @@ #include "mlir/ExecutionEngine/Float16bits.h" #include "mlir/ExecutionEngine/SparseTensor/ArithmeticUtils.h" #include "mlir/ExecutionEngine/SparseTensor/COO.h" -#include "mlir/ExecutionEngine/SparseTensor/ErrorHandling.h" #include "mlir/ExecutionEngine/SparseTensor/MapRef.h" namespace mlir { diff --git a/mlir/lib/ExecutionEngine/SparseTensor/File.cpp b/mlir/lib/ExecutionEngine/SparseTensor/File.cpp index c49ec0998fb44..63b29ba34ba02 100644 --- a/mlir/lib/ExecutionEngine/SparseTensor/File.cpp +++ b/mlir/lib/ExecutionEngine/SparseTensor/File.cpp @@ -19,11 +19,15 @@ using namespace mlir::sparse_tensor; /// Opens the file for reading. void SparseTensorReader::openFile() { - if (file) - MLIR_SPARSETENSOR_FATAL("Already opened file %s\n", filename); + if (file) { + fprintf(stderr, "Already opened file %s\n", filename); + exit(1); + } file = fopen(filename, "r"); - if (!file) - MLIR_SPARSETENSOR_FATAL("Cannot find file %s\n", filename); + if (!file) { + fprintf(stderr, "Cannot find file %s\n", filename); + exit(1); + } } /// Closes the file. @@ -36,19 +40,23 @@ void SparseTensorReader::closeFile() { /// Attempts to read a line from the file. void SparseTensorReader::readLine() { - if (!fgets(line, kColWidth, file)) - MLIR_SPARSETENSOR_FATAL("Cannot read next line of %s\n", filename); + if (!fgets(line, kColWidth, file)) { + fprintf(stderr, "Cannot read next line of %s\n", filename); + exit(1); + } } /// Reads and parses the file's header. 
void SparseTensorReader::readHeader() { assert(file && "Attempt to readHeader() before openFile()"); - if (strstr(filename, ".mtx")) + if (strstr(filename, ".mtx")) { readMMEHeader(); - else if (strstr(filename, ".tns")) + } else if (strstr(filename, ".tns")) { readExtFROSTTHeader(); - else - MLIR_SPARSETENSOR_FATAL("Unknown format %s\n", filename); + } else { + fprintf(stderr, "Unknown format %s\n", filename); + exit(1); + } assert(isValid() && "Failed to read the header"); } @@ -57,7 +65,7 @@ void SparseTensorReader::readHeader() { void SparseTensorReader::assertMatchesShape(uint64_t rank, const uint64_t *shape) const { assert(rank == getRank() && "Rank mismatch"); - for (uint64_t r = 0; r < rank; ++r) + for (uint64_t r = 0; r < rank; r++) assert((shape[r] == 0 || shape[r] == idata[2 + r]) && "Dimension size mismatch"); } @@ -87,13 +95,13 @@ bool SparseTensorReader::canReadAs(PrimaryType valTy) const { // integer and floating primary-types. return isRealPrimaryType(valTy); } - MLIR_SPARSETENSOR_FATAL("Unknown ValueKind: %d\n", - static_cast(valueKind_)); + fprintf(stderr, "Unknown ValueKind: %d\n", static_cast(valueKind_)); + return false; } /// Helper to convert C-style strings (i.e., '\0' terminated) to lower case. static inline void toLower(char *token) { - for (char *c = token; *c; ++c) + for (char *c = token; *c; c++) *c = tolower(*c); } @@ -116,8 +124,10 @@ void SparseTensorReader::readMMEHeader() { char symmetry[64]; // Read header line. if (fscanf(file, "%63s %63s %63s %63s %63s\n", header, object, format, field, - symmetry) != 5) - MLIR_SPARSETENSOR_FATAL("Corrupt header in %s\n", filename); + symmetry) != 5) { + fprintf(stderr, "Corrupt header in %s\n", filename); + exit(1); + } // Convert all to lowercase up front (to avoid accidental redundancy). toLower(header); toLower(object); @@ -125,24 +135,27 @@ void SparseTensorReader::readMMEHeader() { toLower(field); toLower(symmetry); // Process `field`, which specify pattern or the data type of the values. 
- if (streq(field, "pattern")) + if (streq(field, "pattern")) { valueKind_ = ValueKind::kPattern; - else if (streq(field, "real")) + } else if (streq(field, "real")) { valueKind_ = ValueKind::kReal; - else if (streq(field, "integer")) + } else if (streq(field, "integer")) { valueKind_ = ValueKind::kInteger; - else if (streq(field, "complex")) + } else if (streq(field, "complex")) { valueKind_ = ValueKind::kComplex; - else - MLIR_SPARSETENSOR_FATAL("Unexpected header field value in %s\n", filename); + } else { + fprintf(stderr, "Unexpected header field value in %s\n", filename); + exit(1); + } // Set properties. isSymmetric_ = streq(symmetry, "symmetric"); // Make sure this is a general sparse matrix. if (strne(header, "%%matrixmarket") || strne(object, "matrix") || strne(format, "coordinate") || - (strne(symmetry, "general") && !isSymmetric_)) - MLIR_SPARSETENSOR_FATAL("Cannot find a general sparse matrix in %s\n", - filename); + (strne(symmetry, "general") && !isSymmetric_)) { + fprintf(stderr, "Cannot find a general sparse matrix in %s\n", filename); + exit(1); + } // Skip comments. while (true) { readLine(); @@ -152,8 +165,10 @@ void SparseTensorReader::readMMEHeader() { // Next line contains M N NNZ. idata[0] = 2; // rank if (sscanf(line, "%" PRIu64 "%" PRIu64 "%" PRIu64 "\n", idata + 2, idata + 3, - idata + 1) != 3) - MLIR_SPARSETENSOR_FATAL("Cannot find size in %s\n", filename); + idata + 1) != 3) { + fprintf(stderr, "Cannot find size in %s\n", filename); + exit(1); + } } /// Read the "extended" FROSTT header. Although not part of the documented @@ -168,12 +183,17 @@ void SparseTensorReader::readExtFROSTTHeader() { break; } // Next line contains RANK and NNZ. 
- if (sscanf(line, "%" PRIu64 "%" PRIu64 "\n", idata, idata + 1) != 2) - MLIR_SPARSETENSOR_FATAL("Cannot find metadata in %s\n", filename); + if (sscanf(line, "%" PRIu64 "%" PRIu64 "\n", idata, idata + 1) != 2) { + fprintf(stderr, "Cannot find metadata in %s\n", filename); + exit(1); + } // Followed by a line with the dimension sizes (one per rank). - for (uint64_t r = 0; r < idata[0]; ++r) - if (fscanf(file, "%" PRIu64, idata + 2 + r) != 1) - MLIR_SPARSETENSOR_FATAL("Cannot find dimension size %s\n", filename); + for (uint64_t r = 0; r < idata[0]; r++) { + if (fscanf(file, "%" PRIu64, idata + 2 + r) != 1) { + fprintf(stderr, "Cannot find dimension size %s\n", filename); + exit(1); + } + } readLine(); // end of line // The FROSTT format does not define the data type of the nonzero elements. valueKind_ = ValueKind::kUndefined; diff --git a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp index bbe10b0dcdd46..aaa42a7e3a31b 100644 --- a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp +++ b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp @@ -51,7 +51,8 @@ SparseTensorStorageBase::SparseTensorStorageBase( // NOLINT // Helper macro for wrong "partial method specialization" errors. #define FATAL_PIV(NAME) \ - MLIR_SPARSETENSOR_FATAL(" type mismatch for: " #NAME); + fprintf(stderr, " type mismatch for: " #NAME); \ + exit(1); #define IMPL_GETPOSITIONS(PNAME, P) \ void SparseTensorStorageBase::getPositions(std::vector
`` to the integer ````. Can be specified multiple times to update multiple sections. -.. option:: --set-start-addr +.. option:: --set-start Set the start address of the output to ````. Overrides any previously specified :option:`--change-start` or :option:`--adjust-start` options. @@ -484,7 +484,7 @@ them. .. option:: --weaken-symbol , -W - Mark any global symbol named ```` as a weak symbol in the output. Can + Mark global symbols named ```` as weak symbols in the output. Can be specified multiple times to mark multiple symbols as weak. .. option:: --weaken-symbols diff --git a/llvm/tools/llvm-objcopy/CommonOpts.td b/llvm/tools/llvm-objcopy/CommonOpts.td index 4222532a1a38f..c247c93f6e0f2 100644 --- a/llvm/tools/llvm-objcopy/CommonOpts.td +++ b/llvm/tools/llvm-objcopy/CommonOpts.td @@ -7,6 +7,9 @@ multiclass Eq { HelpText; } +def grp_coff : OptionGroup<"kind">, HelpText<"OPTIONS (COFF specific)">; +def grp_macho : OptionGroup<"kind">, HelpText<"OPTIONS (Mach-O specific)">; + def help : Flag<["--"], "help">; def h : Flag<["-"], "h">, Alias; @@ -39,13 +42,13 @@ def p : Flag<["-"], "p">, HelpText<"Alias for --preserve-dates">; def strip_all : Flag<["--"], "strip-all">, - HelpText<"Remove non-allocated sections outside segments. " - ".gnu.warning* and .ARM.attribute sections are not " - "removed">; + HelpText<"For ELF, remove all symbols and non-alloc sections not within " + "segments, except for .gnu.warning*, .ARM.attribute, and the section name table. " + "For COFF and Mach-O, remove all symbols, debug sections, and relocations">; def strip_all_gnu : Flag<["--"], "strip-all-gnu">, - HelpText<"Compatible with GNU's --strip-all">; + HelpText<"Remove all symbols, debug sections and relocations. 
Compatible with GNU's --strip-all">; def strip_debug : Flag<["--"], "strip-debug">, HelpText<"Remove all debug sections">; @@ -64,7 +67,7 @@ def R : JoinedOrSeparate<["-"], "R">, def strip_sections : Flag<["--"], "strip-sections">, - HelpText<"Remove all section headers and all sections not in segments">; + HelpText<"Remove all section headers and all section data not within segments">; defm strip_symbol : Eq<"strip-symbol", "Strip ">, MetaVarName<"symbol">; @@ -75,17 +78,17 @@ def N : JoinedOrSeparate<["-"], "N">, defm keep_section : Eq<"keep-section", "Keep
">, MetaVarName<"section">; -defm keep_symbol : Eq<"keep-symbol", "Do not remove symbol ">, +defm keep_symbol : Eq<"keep-symbol", "When removing symbols, do not remove ">, MetaVarName<"symbol">; def K : JoinedOrSeparate<["-"], "K">, Alias, HelpText<"Alias for --keep-symbol">; def keep_file_symbols : Flag<["--"], "keep-file-symbols">, - HelpText<"Do not remove file symbols">; + HelpText<"Keep symbols of type STT_FILE, even if they would otherwise be stripped">; def keep_undefined : Flag<["--"], "keep-undefined">, - HelpText<"Do not remove undefined symbols">; + HelpText<"Do not remove undefined symbols">, Group; def only_keep_debug : Flag<["--"], "only-keep-debug">, @@ -94,16 +97,16 @@ def only_keep_debug "sections useful for debugging purposes">; def discard_locals : Flag<["--"], "discard-locals">, - HelpText<"Remove compiler-generated local symbols, (e.g. " - "symbols starting with .L)">; + HelpText<"Remove local symbols starting with .L">; def X : Flag<["-"], "X">, Alias, HelpText<"Alias for --discard-locals">; def discard_all : Flag<["--"], "discard-all">, - HelpText<"Remove all local symbols except file and section symbols. Also " - "remove all debug sections">; + HelpText<"Remove most local symbols. Different file formats may limit this to a subset. " + "For ELF, file and section symbols are not discarded. 
" + "Additionally, remove all debug sections">; def x : Flag<["-"], "x">, Alias, HelpText<"Alias for --discard-all">; diff --git a/llvm/tools/llvm-objcopy/ObjcopyOpts.td b/llvm/tools/llvm-objcopy/ObjcopyOpts.td index bd041fabbdd7a..86774c889ab86 100644 --- a/llvm/tools/llvm-objcopy/ObjcopyOpts.td +++ b/llvm/tools/llvm-objcopy/ObjcopyOpts.td @@ -6,28 +6,25 @@ def B : JoinedOrSeparate<["-"], "B">, Alias, HelpText<"Alias for --binary-architecture">; -defm target : Eq<"target", "Format of the input and output file">, +defm target : Eq<"target", "Equivalent to --input-target and --output-target for the specified format">, Values<"binary">; def F : JoinedOrSeparate<["-"], "F">, Alias, HelpText<"Alias for --target">; -defm input_target : Eq<"input-target", "Format of the input file">, - Values<"binary">; +defm input_target : Eq<"input-target", "Read the input as the specified format">, MetaVarName<"format">; def I : JoinedOrSeparate<["-"], "I">, Alias, HelpText<"Alias for --input-target">; -defm output_target : Eq<"output-target", "Format of the output file">, - Values<"binary">; +defm output_target : Eq<"output-target", "Write the output as the specified format">, MetaVarName<"format">; def O : JoinedOrSeparate<["-"], "O">, Alias, HelpText<"Alias for --output-target">; -defm new_symbol_visibility : Eq<"new-symbol-visibility", "Visibility of " - "symbols generated for binary input or added" - " with --add-symbol unless otherwise" - " specified. The default value is 'default'">; +defm new_symbol_visibility + : Eq<"new-symbol-visibility", "Specify the visibility of symbols automatically " + "created when using binary input or --add-symbol. 
The default is 'default'">; def compress_debug_sections : Joined<["--"], "compress-debug-sections=">, @@ -39,8 +36,8 @@ def : Flag<["--"], "compress-debug-sections">, Alias, def decompress_debug_sections : Flag<["--"], "decompress-debug-sections">, HelpText<"Decompress DWARF debug sections">; defm split_dwo - : Eq<"split-dwo", "Equivalent to extract-dwo on the input file to " - ", then strip-dwo on the input file">, + : Eq<"split-dwo", "Equivalent to --extract-dwo and as the output file and no other options, " + "and then --strip-dwo on the input file">, MetaVarName<"dwo-file">; defm add_gnu_debuglink @@ -49,17 +46,15 @@ defm add_gnu_debuglink defm rename_section : Eq<"rename-section", - "Renames a section from old to new, optionally with specified flags. " - "Flags supported for GNU compatibility: alloc, load, noload, " - "readonly, exclude, debug, code, data, rom, share, contents, merge, " - "strings, large">, + "Rename sections called to , and apply any specified values. " + "See --set-section-flags for a list of supported flags">, MetaVarName<"old=new[,flag1,...]">; defm redefine_symbol : Eq<"redefine-sym", "Change the name of a symbol old to new">, MetaVarName<"old=new">; defm redefine_symbols : Eq<"redefine-syms", - "Reads a list of symbol pairs from and runs as if " + "Read a list of symbol pairs from and run as if " "--redefine-sym== is set for each one. " "contains two symbols per line separated with whitespace and may " "contain comments beginning with '#'. Leading and trailing " @@ -74,7 +69,7 @@ def j : JoinedOrSeparate<["-"], "j">, HelpText<"Alias for --only-section">; defm add_section : Eq<"add-section", - "Make a section named
with the contents of ">, + "Add a section named
with the contents of ">, MetaVarName<"section=file">; defm set_section_alignment @@ -83,8 +78,8 @@ defm set_section_alignment defm set_section_flags : Eq<"set-section-flags", - "Set section flags for a given section. Flags supported for GNU " - "compatibility: alloc, load, noload, readonly, exclude, debug, code, " + "Set section properties based on the specified . Supported flag names are: " + "alloc, load, noload, readonly, exclude, debug, code, " "data, rom, share, contents, merge, strings, large">, MetaVarName<"section=flag1[,flag2,...]">; @@ -97,24 +92,25 @@ def S : Flag<["-"], "S">, Alias, HelpText<"Alias for --strip-all">; def strip_dwo : Flag<["--"], "strip-dwo">, - HelpText<"Remove all DWARF .dwo sections from file">; + HelpText<"Remove all DWARF .dwo sections">; def strip_non_alloc : Flag<["--"], "strip-non-alloc">, - HelpText<"Remove all non-allocated sections outside segments">; + HelpText<"Remove all non-allocated sections that are not within segments">; defm strip_unneeded_symbol : Eq<"strip-unneeded-symbol", - "Remove symbol if it is not needed by relocations">, + "Remove all symbols named that are local or undefined and " + "are not required by any relocation">, MetaVarName<"symbol">; defm strip_unneeded_symbols : Eq<"strip-unneeded-symbols", - "Reads a list of symbols from and removes them " - "if they are not needed by relocations">, + "Remove all symbols whose names appear in the file , if they " + "are local or undefined and are not required by any relocation">, MetaVarName<"filename">; defm subsystem : Eq<"subsystem", - "Set PE subsystem and version">, - MetaVarName<"name[:version]">; + "Set the PE subsystem, and optionally subsystem version">, + MetaVarName<"name[:version]">, Group; def extract_dwo : Flag<["--"], "extract-dwo">, @@ -132,11 +128,13 @@ def localize_hidden : Flag<["--"], "localize-hidden">, HelpText< "Mark all symbols that have hidden or internal visibility as local">; -defm localize_symbol : Eq<"localize-symbol", "Mark as 
local">, - MetaVarName<"symbol">; +defm localize_symbol + : Eq<"localize-symbol", "Mark any defined non-common symbol named as local">, + MetaVarName<"symbol">; defm localize_symbols : Eq<"localize-symbols", - "Reads a list of symbols from and marks them local">, + "Read a list of names from and mark any defined non-common " + "symbols with those names as local">, MetaVarName<"filename">; def L : JoinedOrSeparate<["-"], "L">, @@ -148,13 +146,14 @@ defm globalize_symbol : Eq<"globalize-symbol", "Mark as global">, defm globalize_symbols : Eq<"globalize-symbols", - "Reads a list of symbols from and marks them global">, + "Read a list of symbols from and mark defined symbols" + " with those names as global">, MetaVarName<"filename">; defm keep_global_symbol : Eq<"keep-global-symbol", - "Convert all symbols except to local. May be repeated to " - "convert all except a set of symbols to local">, + "Mark all symbols local, except for symbols with the name . " + "Can be specified multiple times to ignore multiple symbols">, MetaVarName<"symbol">; def G : JoinedOrSeparate<["-"], "G">, Alias, @@ -162,34 +161,34 @@ def G : JoinedOrSeparate<["-"], "G">, defm keep_global_symbols : Eq<"keep-global-symbols", - "Reads a list of symbols from and runs as if " + "Read a list of symbols from and run as if " "--keep-global-symbol= is set for each one. " "contains one symbol per line and may contain comments beginning with " "'#'. Leading and trailing whitespace is stripped from each line. 
May " "be repeated to read symbols from many files">, MetaVarName<"filename">; -defm weaken_symbol : Eq<"weaken-symbol", "Mark as weak">, +defm weaken_symbol : Eq<"weaken-symbol", "Mark global symbols named as weak">, MetaVarName<"symbol">; defm weaken_symbols : Eq<"weaken-symbols", - "Reads a list of symbols from and marks them weak">, + "Read a list of symbols from and mark global symbols with those names as weak">, MetaVarName<"filename">; def W : JoinedOrSeparate<["-"], "W">, Alias, HelpText<"Alias for --weaken-symbol">; def weaken : Flag<["--"], "weaken">, - HelpText<"Mark all global symbols as weak">; + HelpText<"Mark all defined global symbols as weak">; defm strip_symbols : Eq<"strip-symbols", - "Reads a list of symbols from and removes them">, + "Remove all symbols whose names appear in the file ">, MetaVarName<"filename">; defm keep_symbols : Eq<"keep-symbols", - "Reads a list of symbols from and runs as if " + "Read a list of symbols from and run as if " "--keep-symbol= is set for each one. " "contains one symbol per line and may contain comments beginning with " "'#'. Leading and trailing whitespace is stripped from each line. May " From 8a87f763a6841832e71bcd24dea45eac8d2dbee1 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Tue, 27 Feb 2024 08:14:46 -0800 Subject: [PATCH 452/546] Aim debugserver workaround more precisely. 
(#83099) --- .../gdb-remote/GDBRemoteCommunicationClient.cpp | 15 ++++++++++++++- lldb/source/Target/Target.cpp | 8 -------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index 6f8aa26228994..1f6116967a4f6 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -17,6 +17,7 @@ #include "lldb/Core/ModuleSpec.h" #include "lldb/Host/HostInfo.h" +#include "lldb/Host/SafeMachO.h" #include "lldb/Host/XML.h" #include "lldb/Symbol/Symbol.h" #include "lldb/Target/MemoryRegionInfo.h" @@ -2147,8 +2148,20 @@ bool GDBRemoteCommunicationClient::GetCurrentProcessInfo(bool allow_lazy) { if (!value.getAsInteger(16, cpu)) ++num_keys_decoded; } else if (name.equals("cpusubtype")) { - if (!value.getAsInteger(16, sub)) + if (!value.getAsInteger(16, sub)) { ++num_keys_decoded; + // Workaround for pre-2024 Apple debugserver, which always + // returns arm64e on arm64e-capable hardware regardless of + // what the process is. This can be deleted at some point + // in the future. 
+ if (cpu == llvm::MachO::CPU_TYPE_ARM64 && + sub == llvm::MachO::CPU_SUBTYPE_ARM64E) { + if (GetGDBServerVersion()) + if (m_gdb_server_version >= 1000 && + m_gdb_server_version <= 1504) + sub = 0; + } + } } else if (name.equals("triple")) { StringExtractor extractor(value); extractor.GetHexByteString(triple); diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 089915cab4915..e982a30a3ae4f 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -33,7 +33,6 @@ #include "lldb/Expression/UtilityFunction.h" #include "lldb/Host/Host.h" #include "lldb/Host/PosixApi.h" -#include "lldb/Host/SafeMachO.h" #include "lldb/Host/StreamFile.h" #include "lldb/Interpreter/CommandInterpreter.h" #include "lldb/Interpreter/CommandReturnObject.h" @@ -1572,13 +1571,6 @@ bool Target::SetArchitecture(const ArchSpec &arch_spec, bool set_platform, if (m_arch.GetSpec().GetTriple() == other.GetTriple()) replace_local_arch = false; - // Workaround for for pre-2024 debugserver, which always - // returns arm64e on arm64e-capable hardware regardless of - // what the process is. This can be deleted at some point in - // the future. - if (!m_arch.GetSpec().GetMachOCPUSubType() && - other.GetMachOCPUSubType() == llvm::MachO::CPU_SUBTYPE_ARM64E) - replace_local_arch = true; } } } From b98e6a5ced8328fdefa9a519ae98052a29462e23 Mon Sep 17 00:00:00 2001 From: Elvina Yakubova Date: Tue, 27 Feb 2024 19:19:47 +0300 Subject: [PATCH 453/546] [BOLT][AArch64] Skip BBs only instead of functions (#81989) After [this ](https://github.com/llvm/llvm-project/commit/846eb76761c858cbfc75700bf68445e0e3ade48e) commit we noticed that the size of fdata file decreased a lot. 
That's why the better and more precise way will be to skip basic blocks with exclusive instructions only instead of the whole function --- bolt/include/bolt/Core/MCPlusBuilder.h | 12 ++- bolt/lib/Passes/Instrumentation.cpp | 96 +++++++++++++++++-- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 26 +++-- bolt/test/AArch64/exclusive-instrument.s | 90 +++++++++++++++-- 4 files changed, 196 insertions(+), 28 deletions(-) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 687b49a3cbda4..eeb7609ff6b5f 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -620,7 +620,17 @@ class MCPlusBuilder { return Info->get(Inst.getOpcode()).mayStore(); } - virtual bool isAArch64Exclusive(const MCInst &Inst) const { + virtual bool isAArch64ExclusiveLoad(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + + virtual bool isAArch64ExclusiveStore(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + + virtual bool isAArch64ExclusiveClear(const MCInst &Inst) const { llvm_unreachable("not implemented"); return false; } diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp index 760ca84b4ef1c..68acff7e6a867 100644 --- a/bolt/lib/Passes/Instrumentation.cpp +++ b/bolt/lib/Passes/Instrumentation.cpp @@ -17,7 +17,9 @@ #include "bolt/Utils/Utils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/RWMutex.h" +#include #include +#include #define DEBUG_TYPE "bolt-instrumentation" @@ -86,21 +88,89 @@ cl::opt InstrumentCalls("instrument-calls", namespace llvm { namespace bolt { -static bool hasAArch64ExclusiveMemop(BinaryFunction &Function) { +static bool hasAArch64ExclusiveMemop( + BinaryFunction &Function, + std::unordered_set &BBToSkip) { // FIXME ARMv8-a architecture reference manual says that software must avoid // having any explicit memory accesses between exclusive load and associated - 
// store instruction. So for now skip instrumentation for functions that have - // these instructions, since it might lead to runtime deadlock. + // store instruction. So for now skip instrumentation for basic blocks that + // have these instructions, since it might lead to runtime deadlock. BinaryContext &BC = Function.getBinaryContext(); - for (const BinaryBasicBlock &BB : Function) - for (const MCInst &Inst : BB) - if (BC.MIB->isAArch64Exclusive(Inst)) { - if (opts::Verbosity >= 1) - BC.outs() << "BOLT-INSTRUMENTER: Function " << Function - << " has exclusive instructions, skip instrumentation\n"; + std::queue> BBQueue; // {BB, isLoad} + std::unordered_set Visited; + + if (Function.getLayout().block_begin() == Function.getLayout().block_end()) + return 0; + + BinaryBasicBlock *BBfirst = *Function.getLayout().block_begin(); + BBQueue.push({BBfirst, false}); + + while (!BBQueue.empty()) { + BinaryBasicBlock *BB = BBQueue.front().first; + bool IsLoad = BBQueue.front().second; + BBQueue.pop(); + if (Visited.find(BB) != Visited.end()) + continue; + Visited.insert(BB); + + for (const MCInst &Inst : *BB) { + // Two loads one after another - skip whole function + if (BC.MIB->isAArch64ExclusiveLoad(Inst) && IsLoad) { + if (opts::Verbosity >= 2) { + outs() << "BOLT-INSTRUMENTER: function " << Function.getPrintName() + << " has two exclusive loads. Ignoring the function.\n"; + } return true; } + if (BC.MIB->isAArch64ExclusiveLoad(Inst)) + IsLoad = true; + + if (IsLoad && BBToSkip.find(BB) == BBToSkip.end()) { + BBToSkip.insert(BB); + if (opts::Verbosity >= 2) { + outs() << "BOLT-INSTRUMENTER: skip BB " << BB->getName() + << " due to exclusive instruction in function " + << Function.getPrintName() << "\n"; + } + } + + if (!IsLoad && BC.MIB->isAArch64ExclusiveStore(Inst)) { + if (opts::Verbosity >= 2) { + outs() << "BOLT-INSTRUMENTER: function " << Function.getPrintName() + << " has exclusive store without corresponding load. 
Ignoring " + "the function.\n"; + } + return true; + } + + if (IsLoad && (BC.MIB->isAArch64ExclusiveStore(Inst) || + BC.MIB->isAArch64ExclusiveClear(Inst))) + IsLoad = false; + } + + if (IsLoad && BB->succ_size() == 0) { + if (opts::Verbosity >= 2) { + outs() + << "BOLT-INSTRUMENTER: function " << Function.getPrintName() + << " has exclusive load in trailing BB. Ignoring the function.\n"; + } + return true; + } + + for (BinaryBasicBlock *BBS : BB->successors()) + BBQueue.push({BBS, IsLoad}); + } + + if (BBToSkip.size() == Visited.size()) { + if (opts::Verbosity >= 2) { + outs() << "BOLT-INSTRUMENTER: all BBs are marked with true. Ignoring the " + "function " + << Function.getPrintName() << "\n"; + } + return true; + } + return false; } @@ -307,7 +377,8 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function, if (BC.isMachO() && Function.hasName("___GLOBAL_init_65535/1")) return; - if (BC.isAArch64() && hasAArch64ExclusiveMemop(Function)) + std::unordered_set BBToSkip; + if (BC.isAArch64() && hasAArch64ExclusiveMemop(Function, BBToSkip)) return; SplitWorklistTy SplitWorklist; @@ -389,6 +460,11 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function, for (auto BBI = Function.begin(), BBE = Function.end(); BBI != BBE; ++BBI) { BinaryBasicBlock &BB = *BBI; + + // Skip BBs with exclusive load/stores + if (BBToSkip.find(&BB) != BBToSkip.end()) + continue; + bool HasUnconditionalBranch = false; bool HasJumpTable = false; bool IsInvokeBlock = InvokeBlocks.count(&BB) > 0; diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index d90512e212258..c1c09c70ab01d 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -270,32 +270,38 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return isLDRB(Inst) || isLDRH(Inst) || isLDRW(Inst) || isLDRX(Inst); } - bool isAArch64Exclusive(const MCInst &Inst) const override { + bool 
isAArch64ExclusiveLoad(const MCInst &Inst) const override { return (Inst.getOpcode() == AArch64::LDXPX || Inst.getOpcode() == AArch64::LDXPW || Inst.getOpcode() == AArch64::LDXRX || Inst.getOpcode() == AArch64::LDXRW || Inst.getOpcode() == AArch64::LDXRH || Inst.getOpcode() == AArch64::LDXRB || - Inst.getOpcode() == AArch64::STXPX || - Inst.getOpcode() == AArch64::STXPW || - Inst.getOpcode() == AArch64::STXRX || - Inst.getOpcode() == AArch64::STXRW || - Inst.getOpcode() == AArch64::STXRH || - Inst.getOpcode() == AArch64::STXRB || Inst.getOpcode() == AArch64::LDAXPX || Inst.getOpcode() == AArch64::LDAXPW || Inst.getOpcode() == AArch64::LDAXRX || Inst.getOpcode() == AArch64::LDAXRW || Inst.getOpcode() == AArch64::LDAXRH || - Inst.getOpcode() == AArch64::LDAXRB || + Inst.getOpcode() == AArch64::LDAXRB); + } + + bool isAArch64ExclusiveStore(const MCInst &Inst) const override { + return (Inst.getOpcode() == AArch64::STXPX || + Inst.getOpcode() == AArch64::STXPW || + Inst.getOpcode() == AArch64::STXRX || + Inst.getOpcode() == AArch64::STXRW || + Inst.getOpcode() == AArch64::STXRH || + Inst.getOpcode() == AArch64::STXRB || Inst.getOpcode() == AArch64::STLXPX || Inst.getOpcode() == AArch64::STLXPW || Inst.getOpcode() == AArch64::STLXRX || Inst.getOpcode() == AArch64::STLXRW || Inst.getOpcode() == AArch64::STLXRH || - Inst.getOpcode() == AArch64::STLXRB || - Inst.getOpcode() == AArch64::CLREX); + Inst.getOpcode() == AArch64::STLXRB); + } + + bool isAArch64ExclusiveClear(const MCInst &Inst) const override { + return (Inst.getOpcode() == AArch64::CLREX); } bool isLoadFromStack(const MCInst &Inst) const { diff --git a/bolt/test/AArch64/exclusive-instrument.s b/bolt/test/AArch64/exclusive-instrument.s index 502dd83b2f2a5..69cc0707aba8e 100644 --- a/bolt/test/AArch64/exclusive-instrument.s +++ b/bolt/test/AArch64/exclusive-instrument.s @@ -6,32 +6,108 @@ // RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ // RUN: %s -o %t.o // RUN: %clang %cflags -fPIC -pie %t.o -o 
%t.exe -nostdlib -Wl,-q -Wl,-fini=dummy -// RUN: llvm-bolt %t.exe -o %t.bolt -instrument -v=1 | FileCheck %s +// RUN: llvm-bolt %t.exe -o %t.bolt -instrument -v=2 | FileCheck %s -// CHECK: Function foo has exclusive instructions, skip instrumentation +// CHECK: BOLT-INSTRUMENTER: skip BB {{.*}} due to exclusive instruction in function foo +// CHECK: BOLT-INSTRUMENTER: skip BB {{.*}} due to exclusive instruction in function foo +// CHECK: BOLT-INSTRUMENTER: skip BB {{.*}} due to exclusive instruction in function foo +// CHECK: BOLT-INSTRUMENTER: skip BB {{.*}} due to exclusive instruction in function case1 +// CHECK: BOLT-INSTRUMENTER: skip BB {{.*}} due to exclusive instruction in function case2 +// CHECK: BOLT-INSTRUMENTER: skip BB {{.*}} due to exclusive instruction in function case2 +// CHECK: BOLT-INSTRUMENTER: function case3 has exclusive store without corresponding load. Ignoring the function. +// CHECK: BOLT-INSTRUMENTER: skip BB {{.*}} due to exclusive instruction in function case4 +// CHECK: BOLT-INSTRUMENTER: function case4 has two exclusive loads. Ignoring the function. +// CHECK: BOLT-INSTRUMENTER: skip BB {{.*}} due to exclusive instruction in function case5 +// CHECK: BOLT-INSTRUMENTER: function case5 has exclusive load in trailing BB. Ignoring the function. 
.global foo .type foo, %function foo: + # exclusive load and store in two bbs ldaxr w9, [x10] cbnz w9, .Lret stlxr w12, w11, [x9] cbz w12, foo - clrex .Lret: + clrex ret .size foo, .-foo .global _start .type _start, %function _start: - cmp x0, #0 - b.eq .Lexit - bl foo -.Lexit: + mov x0, #0 + mov x1, #1 + mov x2, #2 + mov x3, #3 + + bl case1 + bl case2 + bl case3 + bl case4 + bl case5 + ret .size _start, .-_start +# Case 1: exclusive load and store in one basic block +.global case1 +.type case1, %function +case1: + str x0, [x2] + ldxr w0, [x2] + add w0, w0, #1 + stxr w1, w0, [x2] + ret +.size case1, .-case1 + +# Case 2: exclusive load and store in different blocks +.global case2 +.type case2, %function +case2: + b case2_load + +case2_load: + ldxr x0, [x2] + b case2_store + +case2_store: + add x0, x0, #1 + stxr w1, x0, [x2] + ret +.size case2, .-case2 + +# Case 3: store without preceding load +.global case3 +.type case3, %function +case3: + stxr w1, x3, [x2] + ret +.size case3, .-case3 + +# Case 4: two exclusive load instructions in neighboring blocks +.global case4 +.type case4, %function +case4: + b case4_load + +case4_load: + ldxr x0, [x2] + b case4_load_next + +case4_load_next: + ldxr x1, [x2] + ret +.size case4, .-case4 + +# Case 5: Exclusive load without successor +.global case5 +.type case5, %function +case5: + ldxr x0, [x2] + ret +.size case5, .-case5 + .global dummy .type dummy, %function dummy: From 183b6b56f2602ea171502f9f2843c2c1caca2919 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 27 Feb 2024 14:11:26 +0100 Subject: [PATCH 454/546] [clang][Interp] Ignore unnamed bitfields when checking init Unnamed bitfields need to be ignored here. 
--- clang/lib/AST/Interp/EvaluationResult.cpp | 2 ++ clang/test/AST/Interp/cxx20.cpp | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/clang/lib/AST/Interp/EvaluationResult.cpp b/clang/lib/AST/Interp/EvaluationResult.cpp index 693ae9ce4a94e..07b28d07326f9 100644 --- a/clang/lib/AST/Interp/EvaluationResult.cpp +++ b/clang/lib/AST/Interp/EvaluationResult.cpp @@ -105,6 +105,8 @@ static bool CheckFieldsInitialized(InterpState &S, SourceLocation Loc, Result &= CheckFieldsInitialized(S, Loc, FieldPtr, FieldPtr.getRecord()); } else if (FieldType->isIncompleteArrayType()) { // Nothing to do here. + } else if (F.Decl->isUnnamedBitfield()) { + // Nothing do do here. } else if (FieldType->isArrayType()) { const auto *CAT = cast(FieldType->getAsArrayTypeUnsafe()); diff --git a/clang/test/AST/Interp/cxx20.cpp b/clang/test/AST/Interp/cxx20.cpp index 78c09661c6dd7..5c9c625796510 100644 --- a/clang/test/AST/Interp/cxx20.cpp +++ b/clang/test/AST/Interp/cxx20.cpp @@ -776,3 +776,26 @@ namespace RewrittenBinaryOperators { }; static_assert(X() < X(), ""); } + +namespace GH61417 { +struct A { + unsigned x : 1; + unsigned : 0; + unsigned y : 1; + + constexpr A() : x(0), y(0) {} + bool operator==(const A& rhs) const noexcept = default; +}; + +void f1() { + constexpr A a, b; + constexpr bool c = (a == b); // no diagnostic, we should not be comparing the + // unnamed bit-field which is indeterminate +} + +void f2() { + A a, b; + bool c = (a == b); // no diagnostic nor crash during codegen attempting to + // access info for unnamed bit-field +} +} From ed35ad18ae7b5e61c24b63c22028ee0d1ba10e19 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 27 Feb 2024 16:30:31 +0000 Subject: [PATCH 455/546] [RemoveDIs][DebugInfo] Add DPValue checks to the verifier, prepare DPValue for parsing support (#79810) As part of the RemoveDIs project, this patch adds support for checking DPValues in the verifier. 
Although this is not strictly parsing-related, and we currently automatically convert back to the old debug info format immediately after parsing, we are approaching the point where the we can operate end-to-end in the new debug info format, at which point it is appropriate that we can actually validate modules in the new format. This patch also contains some changes that aren't strictly parsing-related, but are necessary class refactors for parsing support, and are used in the verifier checks (i.e. changing the DILocalVariable field to be a tracking MD reference, and adding a Verifier check to confirm that it is a DILocalVariable). --- llvm/include/llvm/IR/BasicBlock.h | 9 -- .../include/llvm/IR/DebugProgramInstruction.h | 7 +- llvm/lib/IR/AsmWriter.cpp | 6 +- llvm/lib/IR/BasicBlock.cpp | 61 ---------- llvm/lib/IR/DebugProgramInstruction.cpp | 6 + llvm/lib/IR/Verifier.cpp | 115 +++++++++++++++++- llvm/unittests/IR/DebugInfoTest.cpp | 13 +- 7 files changed, 132 insertions(+), 85 deletions(-) diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h index ed25cb96ae964..179305e9260f9 100644 --- a/llvm/include/llvm/IR/BasicBlock.h +++ b/llvm/include/llvm/IR/BasicBlock.h @@ -93,15 +93,6 @@ class BasicBlock final : public Value, // Basic blocks are data objects also /// if necessary. void setIsNewDbgInfoFormat(bool NewFlag); - /// Validate any DPMarkers / DPValues attached to instructions in this block, - /// and block-level stored data too (TrailingDPValues). - /// \p Assert Should this method fire an assertion if a problem is found? - /// \p Msg Should this method print a message to errs() if a problem is found? - /// \p OS Output stream to write errors to. - /// \returns True if a problem is found. - bool validateDbgValues(bool Assert = true, bool Msg = false, - raw_ostream *OS = nullptr); - /// Record that the collection of DPValues in \p M "trails" after the last /// instruction of this block. 
These are equivalent to dbg.value intrinsics /// that exist at the end of a basic block with no terminator (a transient diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h index 97089098ee535..2dd546ce709c7 100644 --- a/llvm/include/llvm/IR/DebugProgramInstruction.h +++ b/llvm/include/llvm/IR/DebugProgramInstruction.h @@ -224,7 +224,7 @@ class DPValue : public DbgRecord, protected DebugValueUser { // DebugValueUser superclass instead. The referred to Value can either be a // ValueAsMetadata or a DIArgList. - DILocalVariable *Variable; + TrackingMDNodeRef Variable; DIExpression *Expression; DIExpression *AddressExpression; @@ -331,7 +331,7 @@ class DPValue : public DbgRecord, protected DebugValueUser { void addVariableLocationOps(ArrayRef NewValues, DIExpression *NewExpr); - void setVariable(DILocalVariable *NewVar) { Variable = NewVar; } + void setVariable(DILocalVariable *NewVar); void setExpression(DIExpression *NewExpr) { Expression = NewExpr; } @@ -349,7 +349,8 @@ class DPValue : public DbgRecord, protected DebugValueUser { void setKillLocation(); bool isKillLocation() const; - DILocalVariable *getVariable() const { return Variable; } + DILocalVariable *getVariable() const; + MDNode *getRawVariable() const { return Variable; } DIExpression *getExpression() const { return Expression; } diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index a3da6ca811489..4e1e48b4ad4a3 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -1143,15 +1143,15 @@ void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) { // Process metadata used by DbgRecords; we only specifically care about the // DILocalVariable, DILocation, and DIAssignID fields, as the Value and // Expression fields should only be printed inline and so do not use a slot. 
- CreateMetadataSlot(DPV->getVariable()); + CreateMetadataSlot(DPV->getRawVariable()); if (DPV->isDbgAssign()) - CreateMetadataSlot(DPV->getAssignID()); + CreateMetadataSlot(cast(DPV->getRawAssignID())); } else if (const DPLabel *DPL = dyn_cast(&DR)) { CreateMetadataSlot(DPL->getLabel()); } else { llvm_unreachable("unsupported DbgRecord kind"); } - CreateMetadataSlot(DR.getDebugLoc()); + CreateMetadataSlot(DR.getDebugLoc().getAsMDNode()); } void SlotTracker::processInstructionMetadata(const Instruction &I) { diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index c110c4c1437c3..25aa326116451 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -122,67 +122,6 @@ void BasicBlock::convertFromNewDbgValues() { assert(!getTrailingDPValues()); } -bool BasicBlock::validateDbgValues(bool Assert, bool Msg, raw_ostream *OS) { - bool RetVal = false; - if (!OS) - OS = &errs(); - - // Helper lambda for reporting failures: via assertion, printing, and return - // value. - auto TestFailure = [Assert, Msg, &RetVal, OS](bool Val, const char *Text) { - // Did the test fail? - if (Val) - return; - - // If we're asserting, then fire off an assertion. - if (Assert) - llvm_unreachable(Text); - - if (Msg) - *OS << Text << "\n"; - RetVal = true; - }; - - // We should have the same debug-format as the parent function. - TestFailure(getParent()->IsNewDbgInfoFormat == IsNewDbgInfoFormat, - "Parent function doesn't have the same debug-info format"); - - // Only validate if we are using the new format. - if (!IsNewDbgInfoFormat) - return RetVal; - - // Match every DPMarker to every Instruction and vice versa, and - // verify that there are no invalid DPValues. - for (auto It = begin(); It != end(); ++It) { - if (!It->DbgMarker) - continue; - - // Validate DebugProgramMarkers. - DPMarker *CurrentDebugMarker = It->DbgMarker; - - // If this is a marker, it should match the instruction and vice versa. 
- TestFailure(CurrentDebugMarker->MarkedInstr == &*It, - "Debug Marker points to incorrect instruction?"); - - // Now validate any DPValues in the marker. - for (DbgRecord &DPR : CurrentDebugMarker->getDbgValueRange()) { - // Validate DebugProgramValues. - TestFailure(DPR.getMarker() == CurrentDebugMarker, - "Not pointing at correct next marker!"); - - // Verify that no DbgValues appear prior to PHIs. - TestFailure( - !isa(It), - "DebugProgramValues must not appear before PHI nodes in a block!"); - } - } - - // Except transiently when removing + re-inserting the block terminator, there - // should be no trailing DPValues. - TestFailure(!getTrailingDPValues(), "Trailing DPValues in block"); - return RetVal; -} - #ifndef NDEBUG void BasicBlock::dumpDbgValues() const { for (auto &Inst : *this) { diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp index 389bac4de6a1c..3a8b94a87bbcf 100644 --- a/llvm/lib/IR/DebugProgramInstruction.cpp +++ b/llvm/lib/IR/DebugProgramInstruction.cpp @@ -175,6 +175,8 @@ DPValue *DPValue::createLinkedDPVAssign(Instruction *LinkedInstr, Value *Val, return NewDPVAssign; } +void DPValue::setVariable(DILocalVariable *NewVar) { Variable.reset(NewVar); } + iterator_range DPValue::location_ops() const { auto *MD = getRawLocation(); // If a Value has been deleted, the "location" for this DPValue will be @@ -313,6 +315,10 @@ bool DPValue::isKillLocation() const { any_of(location_ops(), [](Value *V) { return isa(V); }); } +DILocalVariable *DPValue::getVariable() const { + return cast(Variable.get()); +} + std::optional DPValue::getFragmentSizeInBits() const { if (auto Fragment = getExpression()->getFragmentInfo()) return Fragment->SizeInBits; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 4f321bc516cc3..3741e5deaa4cd 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -173,11 +173,36 @@ struct VerifierSupport { } } + void Write(const DbgRecord *DR) { + if (DR) + 
DR->print(*OS, MST, false); + } + void Write(const DPValue *V) { if (V) V->print(*OS, MST, false); } + void Write(DPValue::LocationType Type) { + switch (Type) { + case DPValue::LocationType::Value: + *OS << "value"; + break; + case DPValue::LocationType::Declare: + *OS << "declare"; + break; + case DPValue::LocationType::Assign: + *OS << "assign"; + break; + case DPValue::LocationType::End: + *OS << "end"; + break; + case DPValue::LocationType::Any: + *OS << "any"; + break; + }; + } + void Write(const Metadata *MD) { if (!MD) return; @@ -522,8 +547,10 @@ class Verifier : public InstVisitor, VerifierSupport { void visitTemplateParams(const MDNode &N, const Metadata &RawParams); + void visit(DPValue &DPV); // InstVisitor overrides... using InstVisitor::visit; + void visitDbgRecords(Instruction &I); void visit(Instruction &I); void visitTruncInst(TruncInst &I); @@ -649,7 +676,22 @@ class Verifier : public InstVisitor, VerifierSupport { } \ } while (false) +void Verifier::visitDbgRecords(Instruction &I) { + if (!I.DbgMarker) + return; + CheckDI(I.DbgMarker->MarkedInstr == &I, "Instruction has invalid DbgMarker", + &I); + CheckDI(!isa(&I) || !I.hasDbgValues(), + "PHI Node must not have any attached DbgRecords", &I); + for (DPValue &DPV : DPValue::filter(I.getDbgValueRange())) { + CheckDI(DPV.getMarker() == I.DbgMarker, "DbgRecord had invalid DbgMarker", + &I, &DPV); + visit(DPV); + } +} + void Verifier::visit(Instruction &I) { + visitDbgRecords(I); for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) Check(I.getOperand(i) != nullptr, "Operand is null", &I); InstVisitor::visit(I); @@ -2976,12 +3018,9 @@ void Verifier::visitBasicBlock(BasicBlock &BB) { } // Confirm that no issues arise from the debug program. - if (BB.IsNewDbgInfoFormat) { - // Configure the validate function to not fire assertions, instead print - // errors and return true if there's a problem. 
- bool RetVal = BB.validateDbgValues(false, true, OS); - Check(!RetVal, "Invalid configuration of new-debug-info data found"); - } + if (BB.IsNewDbgInfoFormat) + CheckDI(!BB.getTrailingDPValues(), "Basic Block has trailing DbgRecords!", + &BB); } void Verifier::visitTerminator(Instruction &I) { @@ -6148,6 +6187,70 @@ static DISubprogram *getSubprogram(Metadata *LocalScope) { return nullptr; } +void Verifier::visit(DPValue &DPV) { + CheckDI(DPV.getType() == DPValue::LocationType::Value || + DPV.getType() == DPValue::LocationType::Declare || + DPV.getType() == DPValue::LocationType::Assign, + "invalid #dbg record type", &DPV, DPV.getType()); + // The location for a DPValue must be either a ValueAsMetadata, DIArgList, or + // an empty MDNode (which is a legacy representation for an "undef" location). + auto *MD = DPV.getRawLocation(); + CheckDI(isa(MD) || isa(MD) || + (isa(MD) && !cast(MD)->getNumOperands()), + "invalid #dbg record address/value", &DPV, MD); + CheckDI(isa(DPV.getRawVariable()), + "invalid #dbg record variable", &DPV, DPV.getRawVariable()); + CheckDI(DPV.getExpression(), "missing #dbg record expression", &DPV, + DPV.getExpression()); + + if (DPV.isDbgAssign()) { + CheckDI(isa(DPV.getRawAssignID()), + "invalid #dbg_assign DIAssignID", &DPV, DPV.getRawAssignID()); + const auto *RawAddr = DPV.getRawAddress(); + // Similarly to the location above, the address for an assign DPValue must + // be a ValueAsMetadata or an empty MDNode, which represents an undef + // address. + CheckDI( + isa(RawAddr) || + (isa(RawAddr) && !cast(RawAddr)->getNumOperands()), + "invalid #dbg_assign address", &DPV, DPV.getRawAddress()); + CheckDI(DPV.getAddressExpression(), + "missing #dbg_assign address expression", &DPV, + DPV.getAddressExpression()); + // All of the linked instructions should be in the same function as DPV. 
+ for (Instruction *I : at::getAssignmentInsts(&DPV)) + CheckDI(DPV.getFunction() == I->getFunction(), + "inst not in same function as #dbg_assign", I, &DPV); + } + + if (MDNode *N = DPV.getDebugLoc().getAsMDNode()) { + CheckDI(isa(N), "invalid #dbg record location", &DPV, N); + visitDILocation(*cast(N)); + } + + BasicBlock *BB = DPV.getParent(); + Function *F = BB ? BB->getParent() : nullptr; + + // The scopes for variables and !dbg attachments must agree. + DILocalVariable *Var = DPV.getVariable(); + DILocation *Loc = DPV.getDebugLoc(); + CheckDI(Loc, "missing #dbg record DILocation", &DPV, BB, F); + + DISubprogram *VarSP = getSubprogram(Var->getRawScope()); + DISubprogram *LocSP = getSubprogram(Loc->getRawScope()); + if (!VarSP || !LocSP) + return; // Broken scope chains are checked elsewhere. + + CheckDI(VarSP == LocSP, + "mismatched subprogram between #dbg record variable and DILocation", + &DPV, BB, F, Var, Var->getScope()->getSubprogram(), Loc, + Loc->getScope()->getSubprogram()); + + // This check is redundant with one in visitLocalVariable(). + CheckDI(isType(Var->getRawType()), "invalid type ref", Var, + Var->getRawType()); +} + void Verifier::visitVPIntrinsic(VPIntrinsic &VPI) { if (auto *VPCast = dyn_cast(&VPI)) { auto *RetTy = cast(VPCast->getType()); diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp index 65db93e537f7d..c99f928de8a9d 100644 --- a/llvm/unittests/IR/DebugInfoTest.cpp +++ b/llvm/unittests/IR/DebugInfoTest.cpp @@ -1082,7 +1082,10 @@ TEST(MetadataTest, DPValueConversionRoutines) { EXPECT_FALSE(BB1->IsNewDbgInfoFormat); // Validating the block for DPValues / DPMarkers shouldn't fail -- there's // no data stored right now. - EXPECT_FALSE(BB1->validateDbgValues(false, false)); + bool BrokenDebugInfo = false; + bool Error = verifyModule(*M, &errs(), &BrokenDebugInfo); + EXPECT_FALSE(Error); + EXPECT_FALSE(BrokenDebugInfo); // Function and module should be marked as not having the new format too. 
EXPECT_FALSE(F->IsNewDbgInfoFormat); @@ -1135,13 +1138,17 @@ TEST(MetadataTest, DPValueConversionRoutines) { EXPECT_TRUE(!Inst.DbgMarker || Inst.DbgMarker->StoredDPValues.empty()); // Validating the first block should continue to not be a problem, - EXPECT_FALSE(BB1->validateDbgValues(false, false)); + Error = verifyModule(*M, &errs(), &BrokenDebugInfo); + EXPECT_FALSE(Error); + EXPECT_FALSE(BrokenDebugInfo); // But if we were to break something, it should be able to fire. Don't attempt // to comprehensively test the validator, it's a smoke-test rather than a // "proper" verification pass. DPV1->setMarker(nullptr); // A marker pointing the wrong way should be an error. - EXPECT_TRUE(BB1->validateDbgValues(false, false)); + Error = verifyModule(*M, &errs(), &BrokenDebugInfo); + EXPECT_FALSE(Error); + EXPECT_TRUE(BrokenDebugInfo); DPV1->setMarker(FirstInst->DbgMarker); DILocalVariable *DLV1 = DPV1->getVariable(); From a6b4e29c77ceb49e16bda38cfc4eddc2c4c76c0b Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 27 Feb 2024 10:59:45 -0600 Subject: [PATCH 456/546] [libc] Re-enable several GPU math smoke tests (#83147) Summary: These were originally disabled after some changes caused them to regress. It seems that whatever broke them the first time a few months ago has been fixed elsewhere and they now work again. Re-enable all the ones that work. There are still a few that we need to look into, but this is a good start. --- libc/test/src/math/smoke/CMakeLists.txt | 76 ------------------------- 1 file changed, 76 deletions(-) diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 0d6b33bd7d66e..8ccb54f2926bb 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -65,8 +65,6 @@ add_fp_unittest( libc.include.math libc.src.math.fabs libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. 
- UNIT_TEST_ONLY ) add_fp_unittest( @@ -81,14 +79,10 @@ add_fp_unittest( libc.include.math libc.src.math.fabsf libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( fabsl_test - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY SUITE libc-math-smoke-tests SRCS @@ -103,8 +97,6 @@ add_fp_unittest( add_fp_unittest( fabsf128_test - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY SUITE libc-math-smoke-tests SRCS @@ -129,8 +121,6 @@ add_fp_unittest( libc.include.math libc.src.math.trunc libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -145,8 +135,6 @@ add_fp_unittest( libc.include.math libc.src.math.truncf libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -161,8 +149,6 @@ add_fp_unittest( libc.include.math libc.src.math.truncl libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -177,8 +163,6 @@ add_fp_unittest( libc.include.math libc.src.math.truncf128 libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -193,8 +177,6 @@ add_fp_unittest( libc.include.math libc.src.math.ceil libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -209,8 +191,6 @@ add_fp_unittest( libc.include.math libc.src.math.ceilf libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -225,8 +205,6 @@ add_fp_unittest( libc.include.math libc.src.math.ceill libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -241,8 +219,6 @@ add_fp_unittest( libc.include.math libc.src.math.ceilf128 libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. 
- UNIT_TEST_ONLY ) add_fp_unittest( @@ -257,8 +233,6 @@ add_fp_unittest( libc.include.math libc.src.math.floor libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -273,8 +247,6 @@ add_fp_unittest( libc.include.math libc.src.math.floorf libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -289,8 +261,6 @@ add_fp_unittest( libc.include.math libc.src.math.floorl libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -305,8 +275,6 @@ add_fp_unittest( libc.include.math libc.src.math.floorf128 libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -321,8 +289,6 @@ add_fp_unittest( libc.include.math libc.src.math.round libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -337,8 +303,6 @@ add_fp_unittest( libc.include.math libc.src.math.roundf libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -353,8 +317,6 @@ add_fp_unittest( libc.include.math libc.src.math.roundl libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -369,8 +331,6 @@ add_fp_unittest( libc.include.math libc.src.math.roundf128 libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -389,8 +349,6 @@ add_fp_unittest( libc.src.fenv.fetestexcept libc.src.math.lround libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -409,8 +367,6 @@ add_fp_unittest( libc.src.fenv.fetestexcept libc.src.math.lroundf libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. 
- UNIT_TEST_ONLY ) add_fp_unittest( @@ -429,8 +385,6 @@ add_fp_unittest( libc.src.fenv.fetestexcept libc.src.math.lroundl libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -449,8 +403,6 @@ add_fp_unittest( libc.src.fenv.fetestexcept libc.src.math.llround libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -469,8 +421,6 @@ add_fp_unittest( libc.src.fenv.fetestexcept libc.src.math.llroundf libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -489,8 +439,6 @@ add_fp_unittest( libc.src.fenv.fetestexcept libc.src.math.llroundl libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -718,8 +666,6 @@ add_fp_unittest( libc.include.math libc.src.math.copysign libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -734,8 +680,6 @@ add_fp_unittest( libc.include.math libc.src.math.copysignf libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -750,8 +694,6 @@ add_fp_unittest( libc.include.math libc.src.math.copysignl libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -766,8 +708,6 @@ add_fp_unittest( libc.include.math libc.src.math.copysignf128 libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -1189,8 +1129,6 @@ add_fp_unittest( libc.include.math libc.src.math.sqrtf libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -1203,8 +1141,6 @@ add_fp_unittest( libc.include.math libc.src.math.sqrt libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. 
- UNIT_TEST_ONLY ) add_fp_unittest( @@ -1217,8 +1153,6 @@ add_fp_unittest( libc.include.math libc.src.math.sqrtl libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -1231,8 +1165,6 @@ add_fp_unittest( libc.include.math libc.src.math.sqrtf128 libc.src.__support.FPUtil.fp_bits - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -1247,8 +1179,6 @@ add_fp_unittest( libc.src.__support.FPUtil.generic.sqrt COMPILE_OPTIONS -O3 - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -1263,8 +1193,6 @@ add_fp_unittest( libc.src.__support.FPUtil.generic.sqrt COMPILE_OPTIONS -O3 - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -1279,8 +1207,6 @@ add_fp_unittest( libc.src.__support.FPUtil.generic.sqrt COMPILE_OPTIONS -O3 - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( @@ -1295,8 +1221,6 @@ add_fp_unittest( libc.src.__support.FPUtil.generic.sqrt COMPILE_OPTIONS -O3 - # FIXME: Currently fails on the GPU build. - UNIT_TEST_ONLY ) add_fp_unittest( From c1c2c928f935ba142e4ee7804dd3ff0ebf0ce4c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 27 Feb 2024 09:01:50 -0800 Subject: [PATCH 457/546] [flang][cuda][NFC] Add test for attributes on procedure (#83044) Similar to #82844. Test that CUDA attributes on procedure are correctly imported. 
--- flang/test/Lower/CUDA/cuda-mod.cuf | 6 ++++++ flang/test/Lower/CUDA/cuda-module-use.cuf | 14 ++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/flang/test/Lower/CUDA/cuda-mod.cuf b/flang/test/Lower/CUDA/cuda-mod.cuf index 2cc6439789a27..ae5bf63d2da49 100644 --- a/flang/test/Lower/CUDA/cuda-mod.cuf +++ b/flang/test/Lower/CUDA/cuda-mod.cuf @@ -4,6 +4,12 @@ module cuf_mod real, device :: md + +contains + attributes(device) subroutine devsub() + end end module ! CHECK: fir.global @_QMcuf_modEmd {cuda_attr = #fir.cuda} : f32 + +! CHECK: func.func @_QMcuf_modPdevsub() attributes {fir.cuda_attr = #fir.cuda_proc} diff --git a/flang/test/Lower/CUDA/cuda-module-use.cuf b/flang/test/Lower/CUDA/cuda-module-use.cuf index 43ab0f6ff8d68..f54083b026ee8 100644 --- a/flang/test/Lower/CUDA/cuda-module-use.cuf +++ b/flang/test/Lower/CUDA/cuda-module-use.cuf @@ -1,7 +1,7 @@ ! RUN: bbc -emit-hlfir -fcuda %S/cuda-mod.cuf ! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s -! Test importing module with variable with CUDA attributes. +! Test importing module containing variable and subroutine with CUDA attributes. subroutine sub1() use cuf_mod @@ -12,4 +12,14 @@ end ! CHECK: %[[ADDR:.*]] = fir.address_of(@_QMcuf_modEmd) : !fir.ref ! CHECK: %{{.*}}:2 = hlfir.declare %[[ADDR]] {cuda_attr = #fir.cuda, uniq_name = "_QMcuf_modEmd"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: fir.global @_QMcuf_modEmd {cuda_attr = #fir.cuda} : f32 +attributes(device) subroutine sub2() + use cuf_mod + call devsub() +end + +! CHECK-LABEL: func.func @_QPsub2() attributes {fir.cuda_attr = #fir.cuda_proc} +! CHECK: fir.call @_QMcuf_modPdevsub() + +! CHECK-LABEL: fir.global @_QMcuf_modEmd {cuda_attr = #fir.cuda} : f32 + +! 
CHECK-LABEL: func.func private @_QMcuf_modPdevsub() attributes {fir.cuda_attr = #fir.cuda_proc} From b50bcc7ffb6ad6caa4c141a22915ab59f725b7ae Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Tue, 27 Feb 2024 18:10:53 +0100 Subject: [PATCH 458/546] [libc++][modules] Fixes naming inconsistency. (#83036) The modules used is-standard-library and is-std-library. The latter is the name used in the SG15 proposal, Fixes: https://github.com/llvm/llvm-project/issues/82879 --- libcxx/modules/modules.json.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/modules/modules.json.in b/libcxx/modules/modules.json.in index ddc377f28f919..759ac92d81f18 100644 --- a/libcxx/modules/modules.json.in +++ b/libcxx/modules/modules.json.in @@ -5,7 +5,7 @@ { "logical-name": "std", "source-path": "@LIBCXX_MODULE_RELATIVE_PATH@/std.cppm", - "is-standard-library": true, + "is-std-library": true, "local-arguments": { "system-include-directories": [ "@LIBCXX_MODULE_RELATIVE_PATH@" From aa95aa69b96f51a3c19a24a0387ebc1fb7e6e52c Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Tue, 27 Feb 2024 12:23:19 -0500 Subject: [PATCH 459/546] [libc][math][c23] Add C23 math functions ilogbf128, logbf128, and llogb(f|l|f128). 
(#82144) --- libc/config/gpu/entrypoints.txt | 2 + libc/config/linux/aarch64/entrypoints.txt | 6 + libc/config/linux/arm/entrypoints.txt | 3 + libc/config/linux/riscv/entrypoints.txt | 6 + libc/config/linux/x86_64/entrypoints.txt | 6 + libc/docs/math/index.rst | 12 ++ libc/include/llvm-libc-macros/CMakeLists.txt | 2 + libc/include/llvm-libc-macros/math-macros.h | 11 +- libc/spec/stdc.td | 7 ++ .../__support/FPUtil/ManipulationFunctions.h | 86 ++++++++----- libc/src/__support/FPUtil/dyadic_float.h | 5 + libc/src/math/CMakeLists.txt | 7 ++ libc/src/math/generic/CMakeLists.txt | 98 +++++++++++++-- libc/src/math/generic/ilogb.cpp | 2 +- libc/src/math/generic/ilogbf.cpp | 2 +- libc/src/math/generic/ilogbf128.cpp | 19 +++ libc/src/math/generic/ilogbl.cpp | 4 +- libc/src/math/generic/llogb.cpp | 17 +++ libc/src/math/generic/llogbf.cpp | 17 +++ libc/src/math/generic/llogbf128.cpp | 19 +++ libc/src/math/generic/llogbl.cpp | 19 +++ libc/src/math/generic/logbf.cpp | 2 +- libc/src/math/generic/logbf128.cpp | 17 +++ libc/src/math/ilogbf128.h | 20 +++ libc/src/math/llogb.h | 20 +++ libc/src/math/llogbf.h | 20 +++ libc/src/math/llogbf128.h | 20 +++ libc/src/math/llogbl.h | 20 +++ libc/src/math/logbf128.h | 20 +++ libc/test/src/math/smoke/CMakeLists.txt | 92 +++++++++++++- libc/test/src/math/smoke/ILogbTest.h | 117 ++++++++++-------- libc/test/src/math/smoke/LogbTest.h | 8 +- libc/test/src/math/smoke/ilogb_test.cpp | 25 +--- libc/test/src/math/smoke/ilogbf128_test.cpp | 13 ++ libc/test/src/math/smoke/ilogbf_test.cpp | 25 +--- libc/test/src/math/smoke/ilogbl_test.cpp | 25 +--- libc/test/src/math/smoke/llogb_test.cpp | 13 ++ libc/test/src/math/smoke/llogbf128_test.cpp | 13 ++ libc/test/src/math/smoke/llogbf_test.cpp | 13 ++ libc/test/src/math/smoke/llogbl_test.cpp | 13 ++ libc/test/src/math/smoke/logbf128_test.cpp | 13 ++ 41 files changed, 674 insertions(+), 185 deletions(-) create mode 100644 libc/src/math/generic/ilogbf128.cpp create mode 100644 libc/src/math/generic/llogb.cpp 
create mode 100644 libc/src/math/generic/llogbf.cpp create mode 100644 libc/src/math/generic/llogbf128.cpp create mode 100644 libc/src/math/generic/llogbl.cpp create mode 100644 libc/src/math/generic/logbf128.cpp create mode 100644 libc/src/math/ilogbf128.h create mode 100644 libc/src/math/llogb.h create mode 100644 libc/src/math/llogbf.h create mode 100644 libc/src/math/llogbf128.h create mode 100644 libc/src/math/llogbl.h create mode 100644 libc/src/math/logbf128.h create mode 100644 libc/test/src/math/smoke/ilogbf128_test.cpp create mode 100644 libc/test/src/math/smoke/llogb_test.cpp create mode 100644 libc/test/src/math/smoke/llogbf128_test.cpp create mode 100644 libc/test/src/math/smoke/llogbf_test.cpp create mode 100644 libc/test/src/math/smoke/llogbl_test.cpp create mode 100644 libc/test/src/math/smoke/logbf128_test.cpp diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index 5224e92bbcc58..fca5315fc4f0a 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -239,6 +239,8 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.ilogbf libc.src.math.ldexp libc.src.math.ldexpf + libc.src.math.llogb + libc.src.math.llogbf libc.src.math.llrint libc.src.math.llrintf libc.src.math.llround diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 8a6c160c09932..a6dc74101dbcc 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -341,6 +341,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.ilogb libc.src.math.ilogbf libc.src.math.ilogbl + libc.src.math.llogb + libc.src.math.llogbf + libc.src.math.llogbl libc.src.math.llrint libc.src.math.llrintf libc.src.math.llrintl @@ -422,7 +425,10 @@ if(LIBC_COMPILER_HAS_FLOAT128) libc.src.math.fmaxf128 libc.src.math.fminf128 libc.src.math.frexpf128 + libc.src.math.ilogbf128 libc.src.math.ldexpf128 + libc.src.math.llogbf128 + libc.src.math.logbf128 libc.src.math.roundf128 
libc.src.math.sqrtf128 libc.src.math.truncf128 diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt index 7df1904908886..2fca96c5601b0 100644 --- a/libc/config/linux/arm/entrypoints.txt +++ b/libc/config/linux/arm/entrypoints.txt @@ -217,6 +217,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.ldexp libc.src.math.ldexpf libc.src.math.ldexpl + libc.src.math.llogb + libc.src.math.llogbf + libc.src.math.llogbl libc.src.math.llrint libc.src.math.llrintf libc.src.math.llrintl diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 5c8cc7618a9e8..fc4d8828f4c68 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -353,6 +353,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.ldexp libc.src.math.ldexpf libc.src.math.ldexpl + libc.src.math.llogb + libc.src.math.llogbf + libc.src.math.llogbl libc.src.math.llrint libc.src.math.llrintf libc.src.math.llrintl @@ -431,7 +434,10 @@ if(LIBC_COMPILER_HAS_FLOAT128) libc.src.math.fmaxf128 libc.src.math.fminf128 libc.src.math.frexpf128 + libc.src.math.ilogbf128 libc.src.math.ldexpf128 + libc.src.math.llogbf128 + libc.src.math.logbf128 libc.src.math.roundf128 libc.src.math.sqrtf128 libc.src.math.truncf128 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index cd44b5c52b58c..c2300a2aa681a 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -365,6 +365,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.ldexp libc.src.math.ldexpf libc.src.math.ldexpl + libc.src.math.llogb + libc.src.math.llogbf + libc.src.math.llogbl libc.src.math.llrint libc.src.math.llrintf libc.src.math.llrintl @@ -445,7 +448,10 @@ if(LIBC_COMPILER_HAS_FLOAT128) libc.src.math.fmaxf128 libc.src.math.fminf128 libc.src.math.frexpf128 + libc.src.math.ilogbf128 libc.src.math.ldexpf128 + libc.src.math.llogbf128 + libc.src.math.logbf128 libc.src.math.roundf128 
libc.src.math.sqrtf128 libc.src.math.truncf128 diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 8d584338c10f3..80d12718edccd 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -185,6 +185,8 @@ Basic Operations +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | ilogbl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ +| ilogf128 | |check| | |check| | | |check| | | | | | | | | | ++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | ldexp | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | ldexpf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | @@ -193,6 +195,14 @@ Basic Operations +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | ldexpf128 | |check| | |check| | | |check| | | | | | | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ +| llogb | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | ++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ +| llogbf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | 
++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ +| llogbl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | ++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ +| llogbf128 | |check| | |check| | | |check| | | | | | | | | | ++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | llrint | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | llrintf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | @@ -211,6 +221,8 @@ Basic Operations +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | logbl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ +| logbf128 | |check| | |check| | | |check| | | | | | | | | | ++--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | lrint | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | lrintf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | diff --git a/libc/include/llvm-libc-macros/CMakeLists.txt b/libc/include/llvm-libc-macros/CMakeLists.txt index 6e0875829127e..157b786aa7e81
100644 --- a/libc/include/llvm-libc-macros/CMakeLists.txt +++ b/libc/include/llvm-libc-macros/CMakeLists.txt @@ -83,6 +83,8 @@ add_macro_header( math_macros HDR math-macros.h + DEPENDS + .limits_macros ) add_macro_header( diff --git a/libc/include/llvm-libc-macros/math-macros.h b/libc/include/llvm-libc-macros/math-macros.h index 136670e665e6b..0a23647319f4d 100644 --- a/libc/include/llvm-libc-macros/math-macros.h +++ b/libc/include/llvm-libc-macros/math-macros.h @@ -9,6 +9,8 @@ #ifndef __LLVM_LIBC_MACROS_MATH_MACROS_H #define __LLVM_LIBC_MACROS_MATH_MACROS_H +#include "limits-macros.h" + #define MATH_ERRNO 1 #define MATH_ERREXCEPT 2 @@ -16,8 +18,11 @@ #define INFINITY __builtin_inf() #define NAN __builtin_nanf("") -#define FP_ILOGB0 (-__INT_MAX__ - 1) -#define FP_ILOGBNAN __INT_MAX__ +#define FP_ILOGB0 (-INT_MAX - 1) +#define FP_ILOGBNAN INT_MAX + +#define FP_LLOGB0 (-LONG_MAX - 1) +#define FP_LLOGBNAN LONG_MAX #define isfinite(x) __builtin_isfinite(x) #define isinf(x) __builtin_isinf(x) @@ -25,7 +30,7 @@ #ifdef __FAST_MATH__ #define math_errhandling 0 -#elif defined __NO_MATH_ERRNO__ +#elif defined(__NO_MATH_ERRNO__) #define math_errhandling (MATH_ERREXCEPT) #else #define math_errhandling (MATH_ERRNO | MATH_ERREXCEPT) diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 6b292588b6c7a..8a1a235e4eecd 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -414,6 +414,12 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"ilogb", RetValSpec, [ArgSpec]>, FunctionSpec<"ilogbf", RetValSpec, [ArgSpec]>, FunctionSpec<"ilogbl", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"ilogbf128", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, + + FunctionSpec<"llogb", RetValSpec, [ArgSpec]>, + FunctionSpec<"llogbf", RetValSpec, [ArgSpec]>, + FunctionSpec<"llogbl", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"llogbf128", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"ldexp", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"ldexpf", RetValSpec, 
[ArgSpec, ArgSpec]>, @@ -435,6 +441,7 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"logb", RetValSpec, [ArgSpec]>, FunctionSpec<"logbf", RetValSpec, [ArgSpec]>, FunctionSpec<"logbl", RetValSpec, [ArgSpec]>, + GuardedFunctionSpec<"logbf128", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FLOAT128">, FunctionSpec<"modf", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"modff", RetValSpec, [ArgSpec, ArgSpec]>, diff --git a/libc/src/__support/FPUtil/ManipulationFunctions.h b/libc/src/__support/FPUtil/ManipulationFunctions.h index 9e760a28f42d7..c1d57bd37c197 100644 --- a/libc/src/__support/FPUtil/ManipulationFunctions.h +++ b/libc/src/__support/FPUtil/ManipulationFunctions.h @@ -71,54 +71,80 @@ LIBC_INLINE T copysign(T x, T y) { return xbits.get_val(); } -template , int> = 0> -LIBC_INLINE int ilogb(T x) { - // TODO: Raise appropriate floating point exceptions and set errno to the - // an appropriate error value wherever relevant. - FPBits bits(x); - if (bits.is_zero()) { - return FP_ILOGB0; - } else if (bits.is_nan()) { - return FP_ILOGBNAN; - } else if (bits.is_inf()) { - return INT_MAX; +template struct IntLogbConstants; + +template <> struct IntLogbConstants { + LIBC_INLINE_VAR static constexpr int FP_LOGB0 = FP_ILOGB0; + LIBC_INLINE_VAR static constexpr int FP_LOGBNAN = FP_ILOGBNAN; + LIBC_INLINE_VAR static constexpr int T_MAX = INT_MAX; + LIBC_INLINE_VAR static constexpr int T_MIN = INT_MIN; +}; + +template <> struct IntLogbConstants { + LIBC_INLINE_VAR static constexpr long FP_LOGB0 = FP_ILOGB0; + LIBC_INLINE_VAR static constexpr long FP_LOGBNAN = FP_ILOGBNAN; + LIBC_INLINE_VAR static constexpr long T_MAX = LONG_MAX; + LIBC_INLINE_VAR static constexpr long T_MIN = LONG_MIN; +}; + +template +LIBC_INLINE constexpr cpp::enable_if_t, T> +intlogb(U x) { + FPBits bits(x); + if (LIBC_UNLIKELY(bits.is_zero() || bits.is_inf_or_nan())) { + set_errno_if_required(EDOM); + raise_except_if_required(FE_INVALID); + + if (bits.is_zero()) + return IntLogbConstants::FP_LOGB0; + if 
(bits.is_nan()) + return IntLogbConstants::FP_LOGBNAN; + // bits is inf. + return IntLogbConstants::T_MAX; } - NormalFloat normal(bits); + DyadicFloat::STORAGE_LEN> normal(bits.get_val()); + int exponent = normal.get_unbiased_exponent(); // The C standard does not specify the return value when an exponent is // out of int range. However, XSI conformance required that INT_MAX or // INT_MIN are returned. // NOTE: It is highly unlikely that exponent will be out of int range as // the exponent is only 15 bits wide even for the 128-bit floating point // format. - if (normal.exponent > INT_MAX) - return INT_MAX; - else if (normal.exponent < INT_MIN) - return INT_MIN; - else - return normal.exponent; + if (LIBC_UNLIKELY(exponent > IntLogbConstants::T_MAX || + exponent < IntLogbConstants::T_MIN)) { + set_errno_if_required(ERANGE); + raise_except_if_required(FE_INVALID); + return exponent > 0 ? IntLogbConstants::T_MAX + : IntLogbConstants::T_MIN; + } + + return static_cast(exponent); } template , int> = 0> -LIBC_INLINE T logb(T x) { +LIBC_INLINE constexpr T logb(T x) { FPBits bits(x); - if (bits.is_zero()) { - // TODO(Floating point exception): Raise div-by-zero exception. - // TODO(errno): POSIX requires setting errno to ERANGE. - return FPBits::inf(Sign::NEG).get_val(); - } else if (bits.is_nan()) { - return x; - } else if (bits.is_inf()) { - // Return positive infinity. + if (LIBC_UNLIKELY(bits.is_zero() || bits.is_inf_or_nan())) { + if (bits.is_nan()) + return x; + + raise_except_if_required(FE_DIVBYZERO); + + if (bits.is_zero()) { + set_errno_if_required(ERANGE); + return FPBits::inf(Sign::NEG).get_val(); + } + // bits is inf. 
return FPBits::inf().get_val(); } - NormalFloat normal(bits); - return static_cast(normal.exponent); + DyadicFloat::STORAGE_LEN> normal(bits.get_val()); + return static_cast(normal.get_unbiased_exponent()); } template , int> = 0> -LIBC_INLINE T ldexp(T x, int exp) { +LIBC_INLINE constexpr T ldexp(T x, int exp) { FPBits bits(x); if (LIBC_UNLIKELY((exp == 0) || bits.is_zero() || bits.is_inf_or_nan())) return x; diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h index 7797c57b96fd8..14bc73433097b 100644 --- a/libc/src/__support/FPUtil/dyadic_float.h +++ b/libc/src/__support/FPUtil/dyadic_float.h @@ -79,6 +79,11 @@ template struct DyadicFloat { return *this; } + // Assume that it is already normalized. Output the unbiased exponent. + LIBC_INLINE constexpr int get_unbiased_exponent() const { + return exponent + (Bits - 1); + } + // Assume that it is already normalized. // Output is rounded correctly with respect to the current rounding mode. template (x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/ilogbf.cpp b/libc/src/math/generic/ilogbf.cpp index ca15879bc25fe..422788cec9e04 100644 --- a/libc/src/math/generic/ilogbf.cpp +++ b/libc/src/math/generic/ilogbf.cpp @@ -12,6 +12,6 @@ namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(int, ilogbf, (float x)) { return fputil::ilogb(x); } +LLVM_LIBC_FUNCTION(int, ilogbf, (float x)) { return fputil::intlogb(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/ilogbf128.cpp b/libc/src/math/generic/ilogbf128.cpp new file mode 100644 index 0000000000000..4049eccc5f36b --- /dev/null +++ b/libc/src/math/generic/ilogbf128.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of ilogbf128 function ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/ilogbf128.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(int, ilogbf128, (float128 x)) { + return fputil::intlogb(x); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/ilogbl.cpp b/libc/src/math/generic/ilogbl.cpp index 4c18daab1a535..b7f7eb40c4410 100644 --- a/libc/src/math/generic/ilogbl.cpp +++ b/libc/src/math/generic/ilogbl.cpp @@ -12,6 +12,8 @@ namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(int, ilogbl, (long double x)) { return fputil::ilogb(x); } +LLVM_LIBC_FUNCTION(int, ilogbl, (long double x)) { + return fputil::intlogb(x); +} } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/llogb.cpp b/libc/src/math/generic/llogb.cpp new file mode 100644 index 0000000000000..917bc38c03792 --- /dev/null +++ b/libc/src/math/generic/llogb.cpp @@ -0,0 +1,17 @@ +//===-- Implementation of llogb function ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/llogb.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(long, llogb, (double x)) { return fputil::intlogb(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/llogbf.cpp b/libc/src/math/generic/llogbf.cpp new file mode 100644 index 0000000000000..ca1c03db5c2e4 --- /dev/null +++ b/libc/src/math/generic/llogbf.cpp @@ -0,0 +1,17 @@ +//===-- Implementation of llogbf function ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/llogbf.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(long, llogbf, (float x)) { return fputil::intlogb(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/llogbf128.cpp b/libc/src/math/generic/llogbf128.cpp new file mode 100644 index 0000000000000..5ae4af302110c --- /dev/null +++ b/libc/src/math/generic/llogbf128.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of llogbf128 function ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/llogbf128.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(long, llogbf128, (float128 x)) { + return fputil::intlogb(x); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/llogbl.cpp b/libc/src/math/generic/llogbl.cpp new file mode 100644 index 0000000000000..a092997b92449 --- /dev/null +++ b/libc/src/math/generic/llogbl.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of llogbl function ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/llogbl.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(long, llogbl, (long double x)) { + return fputil::intlogb(x); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/logbf.cpp b/libc/src/math/generic/logbf.cpp index 78aa33ebbf4a9..9f9f7fbcfbb88 100644 --- a/libc/src/math/generic/logbf.cpp +++ b/libc/src/math/generic/logbf.cpp @@ -1,4 +1,4 @@ -//===-- Implementation of logbf function ---------------------------------===// +//===-- Implementation of logbf function ----------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/libc/src/math/generic/logbf128.cpp b/libc/src/math/generic/logbf128.cpp new file mode 100644 index 0000000000000..090433d1fb860 --- /dev/null +++ b/libc/src/math/generic/logbf128.cpp @@ -0,0 +1,17 @@ +//===-- Implementation of logbf128 function -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/logbf128.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float128, logbf128, (float128 x)) { return fputil::logb(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/ilogbf128.h b/libc/src/math/ilogbf128.h new file mode 100644 index 0000000000000..df1145ffc0f8a --- /dev/null +++ b/libc/src/math/ilogbf128.h @@ -0,0 +1,20 @@ +//===-- Implementation header for ilogbf128 ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_ILOGBF128_H +#define LLVM_LIBC_SRC_MATH_ILOGBF128_H + +#include "src/__support/macros/properties/float.h" + +namespace LIBC_NAMESPACE { + +int ilogbf128(float128 x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_ILOGBF128_H diff --git a/libc/src/math/llogb.h b/libc/src/math/llogb.h new file mode 100644 index 0000000000000..2d95877425e56 --- /dev/null +++ b/libc/src/math/llogb.h @@ -0,0 +1,20 @@ +//===-- Implementation header for llogb -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_LLOGB_H +#define LLVM_LIBC_SRC_MATH_LLOGB_H + +#include "src/__support/macros/properties/float.h" + +namespace LIBC_NAMESPACE { + +long llogb(double x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_LLOGB_H diff --git a/libc/src/math/llogbf.h b/libc/src/math/llogbf.h new file mode 100644 index 0000000000000..512e174b66ee4 --- /dev/null +++ b/libc/src/math/llogbf.h @@ -0,0 +1,20 @@ +//===-- Implementation header for llogbf ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_LLOGBF_H +#define LLVM_LIBC_SRC_MATH_LLOGBF_H + +#include "src/__support/macros/properties/float.h" + +namespace LIBC_NAMESPACE { + +long llogbf(float x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_LLOGBF_H diff --git a/libc/src/math/llogbf128.h b/libc/src/math/llogbf128.h new file mode 100644 index 0000000000000..7fb74d4bbe730 --- /dev/null +++ b/libc/src/math/llogbf128.h @@ -0,0 +1,20 @@ +//===-- Implementation header for llogbf128 ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_LLOGBF128_H +#define LLVM_LIBC_SRC_MATH_LLOGBF128_H + +#include "src/__support/macros/properties/float.h" + +namespace LIBC_NAMESPACE { + +long llogbf128(float128 x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_LLOGBF128_H diff --git a/libc/src/math/llogbl.h b/libc/src/math/llogbl.h new file mode 100644 index 0000000000000..4033100fbe3da --- /dev/null +++ b/libc/src/math/llogbl.h @@ -0,0 +1,20 @@ +//===-- Implementation header for llogbl ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_LLOGBL_H +#define LLVM_LIBC_SRC_MATH_LLOGBL_H + +#include "src/__support/macros/properties/float.h" + +namespace LIBC_NAMESPACE { + +long llogbl(long double x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_LLOGBL_H diff --git a/libc/src/math/logbf128.h b/libc/src/math/logbf128.h new file mode 100644 index 0000000000000..8baa076af1bfd --- /dev/null +++ b/libc/src/math/logbf128.h @@ -0,0 +1,20 @@ +//===-- Implementation header for logbf128 ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_LOGBF128_H +#define LLVM_LIBC_SRC_MATH_LOGBF128_H + +#include "src/__support/macros/properties/float.h" + +namespace LIBC_NAMESPACE { + +float128 logbf128(float128 x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_LOGBF128_H diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 8ccb54f2926bb..7ef87a0dd2eec 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -769,7 +769,6 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) HDRS ILogbTest.h DEPENDS - libc.include.math libc.src.math.ilogb libc.src.__support.CPP.limits libc.src.__support.FPUtil.fp_bits @@ -785,7 +784,6 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) HDRS ILogbTest.h DEPENDS - libc.include.math libc.src.math.ilogbf libc.src.__support.CPP.limits libc.src.__support.FPUtil.fp_bits @@ -802,13 +800,87 @@ add_fp_unittest( HDRS ILogbTest.h DEPENDS - libc.include.math libc.src.math.ilogbl libc.src.__support.CPP.limits 
libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.manipulation_functions ) +add_fp_unittest( + ilogbf128_test + SUITE + libc-math-smoke-tests + SRCS + ilogbf128_test.cpp + HDRS + ILogbTest.h + DEPENDS + libc.src.math.ilogbf128 + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.manipulation_functions +) + +add_fp_unittest( + llogb_test + SUITE + libc-math-smoke-tests + SRCS + llogb_test.cpp + HDRS + ILogbTest.h + DEPENDS + libc.src.math.llogb + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.manipulation_functions +) + +add_fp_unittest( + llogbf_test + SUITE + libc-math-smoke-tests + SRCS + llogbf_test.cpp + HDRS + ILogbTest.h + DEPENDS + libc.src.math.llogbf + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.manipulation_functions +) + +add_fp_unittest( + llogbl_test + SUITE + libc-math-smoke-tests + SRCS + llogbl_test.cpp + HDRS + ILogbTest.h + DEPENDS + libc.src.math.llogbl + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.manipulation_functions +) + +add_fp_unittest( + llogbf128_test + SUITE + libc-math-smoke-tests + SRCS + llogbf128_test.cpp + HDRS + ILogbTest.h + DEPENDS + libc.src.math.llogbf128 + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.manipulation_functions +) + add_fp_unittest( ldexp_test SUITE @@ -876,7 +948,6 @@ add_fp_unittest( SRCS logb_test.cpp DEPENDS - libc.include.math libc.src.math.logb libc.src.__support.FPUtil.manipulation_functions ) @@ -888,7 +959,6 @@ add_fp_unittest( SRCS logbf_test.cpp DEPENDS - libc.include.math libc.src.math.logbf libc.src.__support.FPUtil.manipulation_functions ) @@ -902,11 +972,21 @@ add_fp_unittest( HDRS LogbTest.h DEPENDS - libc.include.math libc.src.math.logbl libc.src.__support.FPUtil.manipulation_functions ) +add_fp_unittest( + logbf128_test + SUITE + libc-math-smoke-tests 
+ SRCS + logbf128_test.cpp + DEPENDS + libc.src.math.logbf128 + libc.src.__support.FPUtil.manipulation_functions +) + add_fp_unittest( modf_test SUITE diff --git a/libc/test/src/math/smoke/ILogbTest.h b/libc/test/src/math/smoke/ILogbTest.h index 3e2db33e2c052..cbee25b139d48 100644 --- a/libc/test/src/math/smoke/ILogbTest.h +++ b/libc/test/src/math/smoke/ILogbTest.h @@ -13,101 +13,110 @@ #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/ManipulationFunctions.h" #include "test/UnitTest/Test.h" -#include +template class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test { + using FPBits = LIBC_NAMESPACE::fputil::FPBits; + using StorageType = typename FPBits::StorageType; + using Sign = LIBC_NAMESPACE::fputil::Sign; + public: - template struct ILogbFunc { - typedef int (*Func)(T); - }; - - template - void test_special_numbers(typename ILogbFunc::Func func) { - using FPBits = LIBC_NAMESPACE::fputil::FPBits; - using Sign = LIBC_NAMESPACE::fputil::Sign; - EXPECT_EQ(FP_ILOGB0, func(FPBits::zero(Sign::POS).get_val())); - EXPECT_EQ(FP_ILOGB0, func(FPBits::zero(Sign::NEG).get_val())); - EXPECT_EQ(FP_ILOGBNAN, func(FPBits::quiet_nan().get_val())); - EXPECT_EQ(INT_MAX, func(FPBits::inf(Sign::POS).get_val())); - EXPECT_EQ(INT_MAX, func(FPBits::inf(Sign::NEG).get_val())); + typedef OutType (*Func)(InType); + + void test_special_numbers(Func func) { + EXPECT_EQ(LIBC_NAMESPACE::fputil::IntLogbConstants::FP_LOGB0, + func(FPBits::zero(Sign::POS).get_val())); + EXPECT_EQ(LIBC_NAMESPACE::fputil::IntLogbConstants::FP_LOGB0, + func(FPBits::zero(Sign::NEG).get_val())); + EXPECT_EQ(LIBC_NAMESPACE::fputil::IntLogbConstants::FP_LOGBNAN, + func(FPBits::quiet_nan().get_val())); + EXPECT_EQ(LIBC_NAMESPACE::fputil::IntLogbConstants::T_MAX, + func(FPBits::inf(Sign::POS).get_val())); + EXPECT_EQ(LIBC_NAMESPACE::fputil::IntLogbConstants::T_MAX, + func(FPBits::inf(Sign::NEG).get_val())); } - template - void test_powers_of_two(typename ILogbFunc::Func func) { - 
EXPECT_EQ(0, func(T(1.0))); - EXPECT_EQ(0, func(T(-1.0))); + void test_powers_of_two(Func func) { + EXPECT_EQ(OutType(0), func(InType(1.0))); + EXPECT_EQ(OutType(0), func(InType(-1.0))); - EXPECT_EQ(1, func(T(2.0))); - EXPECT_EQ(1, func(T(-2.0))); + EXPECT_EQ(OutType(1), func(InType(2.0))); + EXPECT_EQ(OutType(1), func(InType(-2.0))); - EXPECT_EQ(2, func(T(4.0))); - EXPECT_EQ(2, func(T(-4.0))); + EXPECT_EQ(OutType(2), func(InType(4.0))); + EXPECT_EQ(OutType(2), func(InType(-4.0))); - EXPECT_EQ(3, func(T(8.0))); - EXPECT_EQ(3, func(-8.0)); + EXPECT_EQ(OutType(3), func(InType(8.0))); + EXPECT_EQ(OutType(3), func(-8.0)); - EXPECT_EQ(4, func(16.0)); - EXPECT_EQ(4, func(-16.0)); + EXPECT_EQ(OutType(4), func(16.0)); + EXPECT_EQ(OutType(4), func(-16.0)); - EXPECT_EQ(5, func(32.0)); - EXPECT_EQ(5, func(-32.0)); + EXPECT_EQ(OutType(5), func(32.0)); + EXPECT_EQ(OutType(5), func(-32.0)); } - template - void test_some_integers(typename ILogbFunc::Func func) { - EXPECT_EQ(1, func(T(3.0))); - EXPECT_EQ(1, func(T(-3.0))); + void test_some_integers(Func func) { + EXPECT_EQ(OutType(1), func(InType(3.0))); + EXPECT_EQ(OutType(1), func(InType(-3.0))); - EXPECT_EQ(2, func(T(7.0))); - EXPECT_EQ(2, func(T(-7.0))); + EXPECT_EQ(OutType(2), func(InType(7.0))); + EXPECT_EQ(OutType(2), func(InType(-7.0))); - EXPECT_EQ(3, func(T(10.0))); - EXPECT_EQ(3, func(T(-10.0))); + EXPECT_EQ(OutType(3), func(InType(10.0))); + EXPECT_EQ(OutType(3), func(InType(-10.0))); - EXPECT_EQ(4, func(T(31.0))); - EXPECT_EQ(4, func(-31.0)); + EXPECT_EQ(OutType(4), func(InType(31.0))); + EXPECT_EQ(OutType(4), func(-31.0)); - EXPECT_EQ(5, func(55.0)); - EXPECT_EQ(5, func(-55.0)); + EXPECT_EQ(OutType(5), func(55.0)); + EXPECT_EQ(OutType(5), func(-55.0)); } - template - void test_subnormal_range(typename ILogbFunc::Func func) { - using FPBits = LIBC_NAMESPACE::fputil::FPBits; - using StorageType = typename FPBits::StorageType; + void test_subnormal_range(Func func) { constexpr StorageType MIN_SUBNORMAL = 
FPBits::min_subnormal().uintval(); constexpr StorageType MAX_SUBNORMAL = FPBits::max_subnormal().uintval(); constexpr StorageType COUNT = 10'001; constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; for (StorageType v = MIN_SUBNORMAL; v <= MAX_SUBNORMAL; v += STEP) { - T x = FPBits(v).get_val(); - if (isnan(x) || isinf(x) || x == 0.0) + FPBits x_bits = FPBits(v); + if (x_bits.is_zero() || x_bits.is_inf_or_nan()) continue; + InType x = x_bits.get_val(); + int exponent; LIBC_NAMESPACE::fputil::frexp(x, exponent); - ASSERT_EQ(exponent, func(x) + 1); + ASSERT_EQ(static_cast(exponent), func(x) + OutType(1)); } } - template - void test_normal_range(typename ILogbFunc::Func func) { - using FPBits = LIBC_NAMESPACE::fputil::FPBits; - using StorageType = typename FPBits::StorageType; + void test_normal_range(Func func) { constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); constexpr StorageType COUNT = 10'001; constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT; for (StorageType v = MIN_NORMAL; v <= MAX_NORMAL; v += STEP) { - T x = FPBits(v).get_val(); - if (isnan(x) || isinf(x) || x == 0.0) + FPBits x_bits = FPBits(v); + if (x_bits.is_zero() || x_bits.is_inf_or_nan()) continue; + InType x = x_bits.get_val(); + int exponent; LIBC_NAMESPACE::fputil::frexp(x, exponent); - ASSERT_EQ(exponent, func(x) + 1); + ASSERT_EQ(static_cast(exponent), func(x) + OutType(1)); } } }; +#define LIST_INTLOGB_TESTS(OutType, InType, Func) \ + using LlvmLibcIntLogbTest = LlvmLibcILogbTest; \ + TEST_F(LlvmLibcIntLogbTest, SpecialNumbers) { test_special_numbers(&Func); } \ + TEST_F(LlvmLibcIntLogbTest, PowersOfTwo) { test_powers_of_two(&Func); } \ + TEST_F(LlvmLibcIntLogbTest, SomeIntegers) { test_some_integers(&Func); } \ + TEST_F(LlvmLibcIntLogbTest, SubnormalRange) { test_subnormal_range(&Func); } \ + TEST_F(LlvmLibcIntLogbTest, NormalRange) { test_normal_range(&Func); } \ + 
static_assert(true) + #endif // LLVM_LIBC_TEST_SRC_MATH_ILOGBTEST_H diff --git a/libc/test/src/math/smoke/LogbTest.h b/libc/test/src/math/smoke/LogbTest.h index e2698e2b7b81b..01e1050b4c4f8 100644 --- a/libc/test/src/math/smoke/LogbTest.h +++ b/libc/test/src/math/smoke/LogbTest.h @@ -10,8 +10,6 @@ #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" -#include - template class LogbTest : public LIBC_NAMESPACE::testing::Test { DECLARE_SPECIAL_CONSTANTS(T) @@ -72,10 +70,12 @@ template class LogbTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 100'000; constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - T x = FPBits(v).get_val(); - if (isnan(x) || isinf(x) || x == 0.0l) + FPBits x_bits = FPBits(v); + if (x_bits.is_zero() || x_bits.is_inf_or_nan()) continue; + T x = x_bits.get_val(); + int exponent; LIBC_NAMESPACE::fputil::frexp(x, exponent); ASSERT_FP_EQ(T(exponent), func(x) + T(1.0)); diff --git a/libc/test/src/math/smoke/ilogb_test.cpp b/libc/test/src/math/smoke/ilogb_test.cpp index 7011c43386e66..67c7939608e8b 100644 --- a/libc/test/src/math/smoke/ilogb_test.cpp +++ b/libc/test/src/math/smoke/ilogb_test.cpp @@ -8,29 +8,6 @@ #include "ILogbTest.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/ManipulationFunctions.h" #include "src/math/ilogb.h" -#include "test/UnitTest/FPMatcher.h" -#include "test/UnitTest/Test.h" -#include -TEST_F(LlvmLibcILogbTest, SpecialNumbers_ilogb) { - test_special_numbers(&LIBC_NAMESPACE::ilogb); -} - -TEST_F(LlvmLibcILogbTest, PowersOfTwo_ilogb) { - test_powers_of_two(&LIBC_NAMESPACE::ilogb); -} - -TEST_F(LlvmLibcILogbTest, SomeIntegers_ilogb) { - test_some_integers(&LIBC_NAMESPACE::ilogb); -} - -TEST_F(LlvmLibcILogbTest, SubnormalRange_ilogb) { - test_subnormal_range(&LIBC_NAMESPACE::ilogb); -} - -TEST_F(LlvmLibcILogbTest, NormalRange_ilogb) { - test_normal_range(&LIBC_NAMESPACE::ilogb); -} 
+LIST_INTLOGB_TESTS(int, double, LIBC_NAMESPACE::ilogb); diff --git a/libc/test/src/math/smoke/ilogbf128_test.cpp b/libc/test/src/math/smoke/ilogbf128_test.cpp new file mode 100644 index 0000000000000..21ed0dd112b91 --- /dev/null +++ b/libc/test/src/math/smoke/ilogbf128_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for ilogbf128 -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ILogbTest.h" + +#include "src/math/ilogbf128.h" + +LIST_INTLOGB_TESTS(int, float128, LIBC_NAMESPACE::ilogbf128); diff --git a/libc/test/src/math/smoke/ilogbf_test.cpp b/libc/test/src/math/smoke/ilogbf_test.cpp index dcff8eeb15180..68e6950cdf725 100644 --- a/libc/test/src/math/smoke/ilogbf_test.cpp +++ b/libc/test/src/math/smoke/ilogbf_test.cpp @@ -8,29 +8,6 @@ #include "ILogbTest.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/ManipulationFunctions.h" #include "src/math/ilogbf.h" -#include "test/UnitTest/FPMatcher.h" -#include "test/UnitTest/Test.h" -#include -TEST_F(LlvmLibcILogbTest, SpecialNumbers_ilogbf) { - test_special_numbers(&LIBC_NAMESPACE::ilogbf); -} - -TEST_F(LlvmLibcILogbTest, PowersOfTwo_ilogbf) { - test_powers_of_two(&LIBC_NAMESPACE::ilogbf); -} - -TEST_F(LlvmLibcILogbTest, SomeIntegers_ilogbf) { - test_some_integers(&LIBC_NAMESPACE::ilogbf); -} - -TEST_F(LlvmLibcILogbTest, SubnormalRange_ilogbf) { - test_subnormal_range(&LIBC_NAMESPACE::ilogbf); -} - -TEST_F(LlvmLibcILogbTest, NormalRange_ilogbf) { - test_normal_range(&LIBC_NAMESPACE::ilogbf); -} +LIST_INTLOGB_TESTS(int, float, LIBC_NAMESPACE::ilogbf); diff --git a/libc/test/src/math/smoke/ilogbl_test.cpp b/libc/test/src/math/smoke/ilogbl_test.cpp index 29a221ad7f08f..afc961f168639 100644 --- 
a/libc/test/src/math/smoke/ilogbl_test.cpp +++ b/libc/test/src/math/smoke/ilogbl_test.cpp @@ -8,29 +8,6 @@ #include "ILogbTest.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/ManipulationFunctions.h" #include "src/math/ilogbl.h" -#include "test/UnitTest/FPMatcher.h" -#include "test/UnitTest/Test.h" -#include -TEST_F(LlvmLibcILogbTest, SpecialNumbers_ilogbl) { - test_special_numbers(&LIBC_NAMESPACE::ilogbl); -} - -TEST_F(LlvmLibcILogbTest, PowersOfTwo_ilogbl) { - test_powers_of_two(&LIBC_NAMESPACE::ilogbl); -} - -TEST_F(LlvmLibcILogbTest, SomeIntegers_ilogbl) { - test_some_integers(&LIBC_NAMESPACE::ilogbl); -} - -TEST_F(LlvmLibcILogbTest, SubnormalRange_ilogbl) { - test_subnormal_range(&LIBC_NAMESPACE::ilogbl); -} - -TEST_F(LlvmLibcILogbTest, NormalRange_ilogbl) { - test_normal_range(&LIBC_NAMESPACE::ilogbl); -} +LIST_INTLOGB_TESTS(int, long double, LIBC_NAMESPACE::ilogbl); diff --git a/libc/test/src/math/smoke/llogb_test.cpp b/libc/test/src/math/smoke/llogb_test.cpp new file mode 100644 index 0000000000000..3bccded6c3e33 --- /dev/null +++ b/libc/test/src/math/smoke/llogb_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for llogb -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ILogbTest.h" + +#include "src/math/llogb.h" + +LIST_INTLOGB_TESTS(long, double, LIBC_NAMESPACE::llogb); diff --git a/libc/test/src/math/smoke/llogbf128_test.cpp b/libc/test/src/math/smoke/llogbf128_test.cpp new file mode 100644 index 0000000000000..a1d2021d2a378 --- /dev/null +++ b/libc/test/src/math/smoke/llogbf128_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for llogbf128 -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ILogbTest.h" + +#include "src/math/llogbf128.h" + +LIST_INTLOGB_TESTS(long, float128, LIBC_NAMESPACE::llogbf128); diff --git a/libc/test/src/math/smoke/llogbf_test.cpp b/libc/test/src/math/smoke/llogbf_test.cpp new file mode 100644 index 0000000000000..60c92fbc2c461 --- /dev/null +++ b/libc/test/src/math/smoke/llogbf_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for llogbf ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ILogbTest.h" + +#include "src/math/llogbf.h" + +LIST_INTLOGB_TESTS(long, float, LIBC_NAMESPACE::llogbf); diff --git a/libc/test/src/math/smoke/llogbl_test.cpp b/libc/test/src/math/smoke/llogbl_test.cpp new file mode 100644 index 0000000000000..c698210fc3de1 --- /dev/null +++ b/libc/test/src/math/smoke/llogbl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for llogbl ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ILogbTest.h" + +#include "src/math/llogbl.h" + +LIST_INTLOGB_TESTS(long, long double, LIBC_NAMESPACE::llogbl); diff --git a/libc/test/src/math/smoke/logbf128_test.cpp b/libc/test/src/math/smoke/logbf128_test.cpp new file mode 100644 index 0000000000000..49485f8ee7144 --- /dev/null +++ b/libc/test/src/math/smoke/logbf128_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for logbf128 --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "LogbTest.h" + +#include "src/math/logbf128.h" + +LIST_LOGB_TESTS(float128, LIBC_NAMESPACE::logbf128) From d29261074cd18383f4d3c84e740c66de738cb61f Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 27 Feb 2024 11:39:13 -0600 Subject: [PATCH 460/546] [libc] Clean up GPU math implementations (#83133) Summary: The math directory likes to do architecture specific implementations of these math functions. For the GPU case it was complicated by the fact that both NVPTX and AMDGPU had to go through the same code paths. Since reworking the GPU target this is no longer the case and we can simply use the same scheme. This patch moves all the old code into two separate directories. This likely results in a net increase in code, but it's easier to reason with. --- libc/cmake/modules/LLVMLibCObjectRules.cmake | 2 +- libc/src/math/CMakeLists.txt | 26 - .../{gpu/vendor => amdgpu}/CMakeLists.txt | 563 ++++++-- libc/src/math/amdgpu/acos.cpp | 18 + libc/src/math/amdgpu/acosf.cpp | 18 + libc/src/math/amdgpu/acosh.cpp | 18 + libc/src/math/amdgpu/acoshf.cpp | 18 + libc/src/math/amdgpu/asin.cpp | 18 + libc/src/math/amdgpu/asinf.cpp | 18 + libc/src/math/amdgpu/asinh.cpp | 18 + libc/src/math/amdgpu/asinhf.cpp | 18 + libc/src/math/amdgpu/atan.cpp | 18 + libc/src/math/amdgpu/atan2.cpp | 20 + libc/src/math/amdgpu/atan2f.cpp | 20 + libc/src/math/amdgpu/atanf.cpp | 18 + libc/src/math/amdgpu/atanh.cpp | 18 + libc/src/math/amdgpu/atanhf.cpp | 18 + libc/src/math/{gpu => amdgpu}/ceil.cpp | 0 libc/src/math/{gpu => amdgpu}/ceilf.cpp | 0 libc/src/math/{gpu => amdgpu}/copysign.cpp | 0 libc/src/math/{gpu => amdgpu}/copysignf.cpp | 0 libc/src/math/amdgpu/cos.cpp | 18 + libc/src/math/amdgpu/cosf.cpp | 18 + libc/src/math/amdgpu/cosh.cpp | 18 + libc/src/math/amdgpu/coshf.cpp | 18 + .../{gpu/vendor => }/amdgpu/declarations.h | 2 + 
libc/src/math/amdgpu/erf.cpp | 18 + libc/src/math/amdgpu/erff.cpp | 18 + libc/src/math/{gpu/vendor => amdgpu}/exp.cpp | 4 +- libc/src/math/amdgpu/exp10.cpp | 18 + libc/src/math/amdgpu/exp10f.cpp | 18 + libc/src/math/amdgpu/exp2.cpp | 18 + libc/src/math/amdgpu/exp2f.cpp | 18 + libc/src/math/{gpu/vendor => amdgpu}/expf.cpp | 4 +- libc/src/math/amdgpu/expm1.cpp | 18 + libc/src/math/amdgpu/expm1f.cpp | 18 + libc/src/math/{gpu => amdgpu}/fabs.cpp | 0 libc/src/math/{gpu => amdgpu}/fabsf.cpp | 0 libc/src/math/amdgpu/fdim.cpp | 20 + libc/src/math/amdgpu/fdimf.cpp | 20 + libc/src/math/{gpu => amdgpu}/floor.cpp | 0 libc/src/math/{gpu => amdgpu}/floorf.cpp | 0 libc/src/math/{gpu => amdgpu}/fma.cpp | 0 libc/src/math/{gpu => amdgpu}/fmaf.cpp | 0 libc/src/math/{gpu => amdgpu}/fmax.cpp | 0 libc/src/math/{gpu => amdgpu}/fmaxf.cpp | 0 libc/src/math/{gpu => amdgpu}/fmin.cpp | 0 libc/src/math/{gpu => amdgpu}/fminf.cpp | 0 libc/src/math/{gpu => amdgpu}/fmod.cpp | 0 libc/src/math/{gpu => amdgpu}/fmodf.cpp | 0 libc/src/math/amdgpu/frexp.cpp | 20 + libc/src/math/amdgpu/frexpf.cpp | 20 + libc/src/math/amdgpu/hypot.cpp | 20 + libc/src/math/amdgpu/hypotf.cpp | 20 + libc/src/math/amdgpu/ilogb.cpp | 18 + libc/src/math/amdgpu/ilogbf.cpp | 18 + libc/src/math/amdgpu/ldexp.cpp | 20 + libc/src/math/amdgpu/ldexpf.cpp | 20 + .../math/{gpu/vendor => amdgpu}/llrint.cpp | 4 +- .../math/{gpu/vendor => amdgpu}/llrintf.cpp | 4 +- libc/src/math/{gpu => amdgpu}/llround.cpp | 0 libc/src/math/{gpu => amdgpu}/llroundf.cpp | 0 libc/src/math/amdgpu/log.cpp | 18 + libc/src/math/amdgpu/log10.cpp | 18 + libc/src/math/amdgpu/log10f.cpp | 18 + libc/src/math/amdgpu/log1p.cpp | 18 + libc/src/math/amdgpu/log1pf.cpp | 18 + libc/src/math/amdgpu/log2.cpp | 18 + libc/src/math/amdgpu/log2f.cpp | 18 + libc/src/math/amdgpu/logb.cpp | 18 + libc/src/math/amdgpu/logbf.cpp | 18 + libc/src/math/amdgpu/logf.cpp | 18 + libc/src/math/amdgpu/lrint.cpp | 20 + libc/src/math/amdgpu/lrintf.cpp | 20 + libc/src/math/{gpu => 
amdgpu}/lround.cpp | 0 libc/src/math/{gpu => amdgpu}/lroundf.cpp | 0 libc/src/math/{gpu => amdgpu}/modf.cpp | 0 libc/src/math/{gpu => amdgpu}/modff.cpp | 0 libc/src/math/{gpu => amdgpu}/nearbyint.cpp | 0 libc/src/math/{gpu => amdgpu}/nearbyintf.cpp | 0 libc/src/math/amdgpu/nextafter.cpp | 20 + libc/src/math/amdgpu/nextafterf.cpp | 20 + .../math/{gpu/vendor => }/amdgpu/platform.h | 0 libc/src/math/{gpu/vendor => amdgpu}/pow.cpp | 4 +- libc/src/math/{gpu/vendor => amdgpu}/powf.cpp | 4 +- libc/src/math/{gpu => amdgpu}/remainder.cpp | 0 libc/src/math/{gpu => amdgpu}/remainderf.cpp | 0 libc/src/math/amdgpu/remquo.cpp | 23 + libc/src/math/amdgpu/remquof.cpp | 23 + libc/src/math/{gpu => amdgpu}/rint.cpp | 0 libc/src/math/{gpu => amdgpu}/rintf.cpp | 0 libc/src/math/{gpu => amdgpu}/round.cpp | 0 libc/src/math/{gpu => amdgpu}/roundf.cpp | 0 libc/src/math/amdgpu/scalbn.cpp | 20 + libc/src/math/amdgpu/scalbnf.cpp | 20 + libc/src/math/amdgpu/sin.cpp | 18 + libc/src/math/amdgpu/sincos.cpp | 20 + .../math/{gpu/vendor => amdgpu}/sincosf.cpp | 4 +- libc/src/math/amdgpu/sinf.cpp | 18 + libc/src/math/{gpu => amdgpu}/sinh.cpp | 6 +- libc/src/math/amdgpu/sinhf.cpp | 18 + libc/src/math/{gpu => amdgpu}/sqrt.cpp | 0 libc/src/math/{gpu => amdgpu}/sqrtf.cpp | 0 libc/src/math/{gpu => amdgpu}/tan.cpp | 6 +- libc/src/math/amdgpu/tanf.cpp | 18 + libc/src/math/{gpu => amdgpu}/tanh.cpp | 6 +- libc/src/math/amdgpu/tanhf.cpp | 18 + libc/src/math/amdgpu/tgamma.cpp | 18 + libc/src/math/amdgpu/tgammaf.cpp | 18 + libc/src/math/{gpu => amdgpu}/trunc.cpp | 0 libc/src/math/{gpu => amdgpu}/truncf.cpp | 0 libc/src/math/gpu/CMakeLists.txt | 384 ------ libc/src/math/gpu/vendor/amdgpu/amdgpu.h | 127 -- libc/src/math/gpu/vendor/common.h | 22 - libc/src/math/nvptx/CMakeLists.txt | 1179 +++++++++++++++++ libc/src/math/{gpu/vendor => nvptx}/acos.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/acosf.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/acosh.cpp | 4 +- .../src/math/{gpu/vendor => nvptx}/acoshf.cpp | 4 +- 
libc/src/math/{gpu/vendor => nvptx}/asin.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/asinf.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/asinh.cpp | 4 +- .../src/math/{gpu/vendor => nvptx}/asinhf.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/atan.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/atan2.cpp | 4 +- .../src/math/{gpu/vendor => nvptx}/atan2f.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/atanf.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/atanh.cpp | 4 +- .../src/math/{gpu/vendor => nvptx}/atanhf.cpp | 4 +- libc/src/math/nvptx/ceil.cpp | 16 + libc/src/math/nvptx/ceilf.cpp | 16 + libc/src/math/nvptx/copysign.cpp | 18 + libc/src/math/nvptx/copysignf.cpp | 18 + libc/src/math/{gpu/vendor => nvptx}/cos.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/cosf.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/cosh.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/coshf.cpp | 4 +- .../{gpu/vendor => }/nvptx/declarations.h | 0 libc/src/math/{gpu/vendor => nvptx}/erf.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/erff.cpp | 4 +- libc/src/math/nvptx/exp.cpp | 18 + libc/src/math/{gpu/vendor => nvptx}/exp10.cpp | 4 +- .../src/math/{gpu/vendor => nvptx}/exp10f.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/exp2.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/exp2f.cpp | 4 +- libc/src/math/nvptx/expf.cpp | 18 + libc/src/math/{gpu/vendor => nvptx}/expm1.cpp | 4 +- .../src/math/{gpu/vendor => nvptx}/expm1f.cpp | 4 +- libc/src/math/nvptx/fabs.cpp | 16 + libc/src/math/nvptx/fabsf.cpp | 16 + libc/src/math/{gpu/vendor => nvptx}/fdim.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/fdimf.cpp | 4 +- libc/src/math/nvptx/floor.cpp | 16 + libc/src/math/nvptx/floorf.cpp | 16 + libc/src/math/nvptx/fma.cpp | 18 + libc/src/math/nvptx/fmaf.cpp | 18 + libc/src/math/nvptx/fmax.cpp | 18 + libc/src/math/nvptx/fmaxf.cpp | 18 + libc/src/math/nvptx/fmin.cpp | 18 + libc/src/math/nvptx/fminf.cpp | 18 + libc/src/math/nvptx/fmod.cpp | 18 + libc/src/math/nvptx/fmodf.cpp | 18 + libc/src/math/{gpu/vendor => 
nvptx}/frexp.cpp | 4 +- .../src/math/{gpu/vendor => nvptx}/frexpf.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/hypot.cpp | 4 +- .../src/math/{gpu/vendor => nvptx}/hypotf.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/ilogb.cpp | 4 +- .../src/math/{gpu/vendor => nvptx}/ilogbf.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/ldexp.cpp | 4 +- .../src/math/{gpu/vendor => nvptx}/ldexpf.cpp | 4 +- libc/src/math/nvptx/llrint.cpp | 18 + libc/src/math/nvptx/llrintf.cpp | 18 + libc/src/math/nvptx/llround.cpp | 18 + libc/src/math/nvptx/llroundf.cpp | 18 + libc/src/math/{gpu/vendor => nvptx}/log.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/log10.cpp | 4 +- .../src/math/{gpu/vendor => nvptx}/log10f.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/log1p.cpp | 4 +- .../src/math/{gpu/vendor => nvptx}/log1pf.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/log2.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/log2f.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/logb.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/logbf.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/logf.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/lrint.cpp | 4 +- .../src/math/{gpu/vendor => nvptx}/lrintf.cpp | 4 +- libc/src/math/nvptx/lround.cpp | 16 + libc/src/math/nvptx/lroundf.cpp | 16 + libc/src/math/nvptx/modf.cpp | 18 + libc/src/math/nvptx/modff.cpp | 18 + libc/src/math/nvptx/nearbyint.cpp | 18 + libc/src/math/nvptx/nearbyintf.cpp | 18 + .../math/{gpu/vendor => nvptx}/nextafter.cpp | 4 +- .../math/{gpu/vendor => nvptx}/nextafterf.cpp | 4 +- libc/src/math/{gpu/vendor => }/nvptx/nvptx.h | 0 libc/src/math/nvptx/pow.cpp | 18 + libc/src/math/nvptx/powf.cpp | 18 + libc/src/math/nvptx/remainder.cpp | 18 + libc/src/math/nvptx/remainderf.cpp | 18 + .../src/math/{gpu/vendor => nvptx}/remquo.cpp | 4 +- .../math/{gpu/vendor => nvptx}/remquof.cpp | 4 +- libc/src/math/nvptx/rint.cpp | 16 + libc/src/math/nvptx/rintf.cpp | 16 + libc/src/math/nvptx/round.cpp | 16 + libc/src/math/nvptx/roundf.cpp | 16 + .../src/math/{gpu/vendor 
=> nvptx}/scalbn.cpp | 4 +- .../math/{gpu/vendor => nvptx}/scalbnf.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/sin.cpp | 4 +- .../src/math/{gpu/vendor => nvptx}/sincos.cpp | 4 +- libc/src/math/nvptx/sincosf.cpp | 20 + libc/src/math/{gpu/vendor => nvptx}/sinf.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/sinh.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/sinhf.cpp | 4 +- libc/src/math/nvptx/sqrt.cpp | 16 + libc/src/math/nvptx/sqrtf.cpp | 16 + libc/src/math/{gpu/vendor => nvptx}/tan.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/tanf.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/tanh.cpp | 4 +- libc/src/math/{gpu/vendor => nvptx}/tanhf.cpp | 4 +- .../src/math/{gpu/vendor => nvptx}/tgamma.cpp | 4 +- .../math/{gpu/vendor => nvptx}/tgammaf.cpp | 4 +- libc/src/math/nvptx/trunc.cpp | 16 + libc/src/math/nvptx/truncf.cpp | 16 + 223 files changed, 3682 insertions(+), 797 deletions(-) rename libc/src/math/{gpu/vendor => amdgpu}/CMakeLists.txt (59%) create mode 100644 libc/src/math/amdgpu/acos.cpp create mode 100644 libc/src/math/amdgpu/acosf.cpp create mode 100644 libc/src/math/amdgpu/acosh.cpp create mode 100644 libc/src/math/amdgpu/acoshf.cpp create mode 100644 libc/src/math/amdgpu/asin.cpp create mode 100644 libc/src/math/amdgpu/asinf.cpp create mode 100644 libc/src/math/amdgpu/asinh.cpp create mode 100644 libc/src/math/amdgpu/asinhf.cpp create mode 100644 libc/src/math/amdgpu/atan.cpp create mode 100644 libc/src/math/amdgpu/atan2.cpp create mode 100644 libc/src/math/amdgpu/atan2f.cpp create mode 100644 libc/src/math/amdgpu/atanf.cpp create mode 100644 libc/src/math/amdgpu/atanh.cpp create mode 100644 libc/src/math/amdgpu/atanhf.cpp rename libc/src/math/{gpu => amdgpu}/ceil.cpp (100%) rename libc/src/math/{gpu => amdgpu}/ceilf.cpp (100%) rename libc/src/math/{gpu => amdgpu}/copysign.cpp (100%) rename libc/src/math/{gpu => amdgpu}/copysignf.cpp (100%) create mode 100644 libc/src/math/amdgpu/cos.cpp create mode 100644 libc/src/math/amdgpu/cosf.cpp create mode 100644 
libc/src/math/amdgpu/cosh.cpp create mode 100644 libc/src/math/amdgpu/coshf.cpp rename libc/src/math/{gpu/vendor => }/amdgpu/declarations.h (99%) create mode 100644 libc/src/math/amdgpu/erf.cpp create mode 100644 libc/src/math/amdgpu/erff.cpp rename libc/src/math/{gpu/vendor => amdgpu}/exp.cpp (84%) create mode 100644 libc/src/math/amdgpu/exp10.cpp create mode 100644 libc/src/math/amdgpu/exp10f.cpp create mode 100644 libc/src/math/amdgpu/exp2.cpp create mode 100644 libc/src/math/amdgpu/exp2f.cpp rename libc/src/math/{gpu/vendor => amdgpu}/expf.cpp (84%) create mode 100644 libc/src/math/amdgpu/expm1.cpp create mode 100644 libc/src/math/amdgpu/expm1f.cpp rename libc/src/math/{gpu => amdgpu}/fabs.cpp (100%) rename libc/src/math/{gpu => amdgpu}/fabsf.cpp (100%) create mode 100644 libc/src/math/amdgpu/fdim.cpp create mode 100644 libc/src/math/amdgpu/fdimf.cpp rename libc/src/math/{gpu => amdgpu}/floor.cpp (100%) rename libc/src/math/{gpu => amdgpu}/floorf.cpp (100%) rename libc/src/math/{gpu => amdgpu}/fma.cpp (100%) rename libc/src/math/{gpu => amdgpu}/fmaf.cpp (100%) rename libc/src/math/{gpu => amdgpu}/fmax.cpp (100%) rename libc/src/math/{gpu => amdgpu}/fmaxf.cpp (100%) rename libc/src/math/{gpu => amdgpu}/fmin.cpp (100%) rename libc/src/math/{gpu => amdgpu}/fminf.cpp (100%) rename libc/src/math/{gpu => amdgpu}/fmod.cpp (100%) rename libc/src/math/{gpu => amdgpu}/fmodf.cpp (100%) create mode 100644 libc/src/math/amdgpu/frexp.cpp create mode 100644 libc/src/math/amdgpu/frexpf.cpp create mode 100644 libc/src/math/amdgpu/hypot.cpp create mode 100644 libc/src/math/amdgpu/hypotf.cpp create mode 100644 libc/src/math/amdgpu/ilogb.cpp create mode 100644 libc/src/math/amdgpu/ilogbf.cpp create mode 100644 libc/src/math/amdgpu/ldexp.cpp create mode 100644 libc/src/math/amdgpu/ldexpf.cpp rename libc/src/math/{gpu/vendor => amdgpu}/llrint.cpp (87%) rename libc/src/math/{gpu/vendor => amdgpu}/llrintf.cpp (87%) rename libc/src/math/{gpu => amdgpu}/llround.cpp (100%) rename 
libc/src/math/{gpu => amdgpu}/llroundf.cpp (100%) create mode 100644 libc/src/math/amdgpu/log.cpp create mode 100644 libc/src/math/amdgpu/log10.cpp create mode 100644 libc/src/math/amdgpu/log10f.cpp create mode 100644 libc/src/math/amdgpu/log1p.cpp create mode 100644 libc/src/math/amdgpu/log1pf.cpp create mode 100644 libc/src/math/amdgpu/log2.cpp create mode 100644 libc/src/math/amdgpu/log2f.cpp create mode 100644 libc/src/math/amdgpu/logb.cpp create mode 100644 libc/src/math/amdgpu/logbf.cpp create mode 100644 libc/src/math/amdgpu/logf.cpp create mode 100644 libc/src/math/amdgpu/lrint.cpp create mode 100644 libc/src/math/amdgpu/lrintf.cpp rename libc/src/math/{gpu => amdgpu}/lround.cpp (100%) rename libc/src/math/{gpu => amdgpu}/lroundf.cpp (100%) rename libc/src/math/{gpu => amdgpu}/modf.cpp (100%) rename libc/src/math/{gpu => amdgpu}/modff.cpp (100%) rename libc/src/math/{gpu => amdgpu}/nearbyint.cpp (100%) rename libc/src/math/{gpu => amdgpu}/nearbyintf.cpp (100%) create mode 100644 libc/src/math/amdgpu/nextafter.cpp create mode 100644 libc/src/math/amdgpu/nextafterf.cpp rename libc/src/math/{gpu/vendor => }/amdgpu/platform.h (100%) rename libc/src/math/{gpu/vendor => amdgpu}/pow.cpp (90%) rename libc/src/math/{gpu/vendor => amdgpu}/powf.cpp (90%) rename libc/src/math/{gpu => amdgpu}/remainder.cpp (100%) rename libc/src/math/{gpu => amdgpu}/remainderf.cpp (100%) create mode 100644 libc/src/math/amdgpu/remquo.cpp create mode 100644 libc/src/math/amdgpu/remquof.cpp rename libc/src/math/{gpu => amdgpu}/rint.cpp (100%) rename libc/src/math/{gpu => amdgpu}/rintf.cpp (100%) rename libc/src/math/{gpu => amdgpu}/round.cpp (100%) rename libc/src/math/{gpu => amdgpu}/roundf.cpp (100%) create mode 100644 libc/src/math/amdgpu/scalbn.cpp create mode 100644 libc/src/math/amdgpu/scalbnf.cpp create mode 100644 libc/src/math/amdgpu/sin.cpp create mode 100644 libc/src/math/amdgpu/sincos.cpp rename libc/src/math/{gpu/vendor => amdgpu}/sincosf.cpp (89%) create mode 100644 
libc/src/math/amdgpu/sinf.cpp rename libc/src/math/{gpu => amdgpu}/sinh.cpp (71%) create mode 100644 libc/src/math/amdgpu/sinhf.cpp rename libc/src/math/{gpu => amdgpu}/sqrt.cpp (100%) rename libc/src/math/{gpu => amdgpu}/sqrtf.cpp (100%) rename libc/src/math/{gpu => amdgpu}/tan.cpp (72%) create mode 100644 libc/src/math/amdgpu/tanf.cpp rename libc/src/math/{gpu => amdgpu}/tanh.cpp (71%) create mode 100644 libc/src/math/amdgpu/tanhf.cpp create mode 100644 libc/src/math/amdgpu/tgamma.cpp create mode 100644 libc/src/math/amdgpu/tgammaf.cpp rename libc/src/math/{gpu => amdgpu}/trunc.cpp (100%) rename libc/src/math/{gpu => amdgpu}/truncf.cpp (100%) delete mode 100644 libc/src/math/gpu/CMakeLists.txt delete mode 100644 libc/src/math/gpu/vendor/amdgpu/amdgpu.h delete mode 100644 libc/src/math/gpu/vendor/common.h create mode 100644 libc/src/math/nvptx/CMakeLists.txt rename libc/src/math/{gpu/vendor => nvptx}/acos.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/acosf.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/acosh.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/acoshf.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/asin.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/asinf.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/asinh.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/asinhf.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/atan.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/atan2.cpp (91%) rename libc/src/math/{gpu/vendor => nvptx}/atan2f.cpp (91%) rename libc/src/math/{gpu/vendor => nvptx}/atanf.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/atanh.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/atanhf.cpp (83%) create mode 100644 libc/src/math/nvptx/ceil.cpp create mode 100644 libc/src/math/nvptx/ceilf.cpp create mode 100644 libc/src/math/nvptx/copysign.cpp create mode 100644 libc/src/math/nvptx/copysignf.cpp rename libc/src/math/{gpu/vendor => nvptx}/cos.cpp (83%) rename libc/src/math/{gpu/vendor => 
nvptx}/cosf.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/cosh.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/coshf.cpp (83%) rename libc/src/math/{gpu/vendor => }/nvptx/declarations.h (100%) rename libc/src/math/{gpu/vendor => nvptx}/erf.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/erff.cpp (83%) create mode 100644 libc/src/math/nvptx/exp.cpp rename libc/src/math/{gpu/vendor => nvptx}/exp10.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/exp10f.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/exp2.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/exp2f.cpp (83%) create mode 100644 libc/src/math/nvptx/expf.cpp rename libc/src/math/{gpu/vendor => nvptx}/expm1.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/expm1f.cpp (83%) create mode 100644 libc/src/math/nvptx/fabs.cpp create mode 100644 libc/src/math/nvptx/fabsf.cpp rename libc/src/math/{gpu/vendor => nvptx}/fdim.cpp (91%) rename libc/src/math/{gpu/vendor => nvptx}/fdimf.cpp (91%) create mode 100644 libc/src/math/nvptx/floor.cpp create mode 100644 libc/src/math/nvptx/floorf.cpp create mode 100644 libc/src/math/nvptx/fma.cpp create mode 100644 libc/src/math/nvptx/fmaf.cpp create mode 100644 libc/src/math/nvptx/fmax.cpp create mode 100644 libc/src/math/nvptx/fmaxf.cpp create mode 100644 libc/src/math/nvptx/fmin.cpp create mode 100644 libc/src/math/nvptx/fminf.cpp create mode 100644 libc/src/math/nvptx/fmod.cpp create mode 100644 libc/src/math/nvptx/fmodf.cpp rename libc/src/math/{gpu/vendor => nvptx}/frexp.cpp (91%) rename libc/src/math/{gpu/vendor => nvptx}/frexpf.cpp (91%) rename libc/src/math/{gpu/vendor => nvptx}/hypot.cpp (91%) rename libc/src/math/{gpu/vendor => nvptx}/hypotf.cpp (91%) rename libc/src/math/{gpu/vendor => nvptx}/ilogb.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/ilogbf.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/ldexp.cpp (91%) rename libc/src/math/{gpu/vendor => nvptx}/ldexpf.cpp (91%) create mode 100644 libc/src/math/nvptx/llrint.cpp 
create mode 100644 libc/src/math/nvptx/llrintf.cpp create mode 100644 libc/src/math/nvptx/llround.cpp create mode 100644 libc/src/math/nvptx/llroundf.cpp rename libc/src/math/{gpu/vendor => nvptx}/log.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/log10.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/log10f.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/log1p.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/log1pf.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/log2.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/log2f.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/logb.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/logbf.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/logf.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/lrint.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/lrintf.cpp (83%) create mode 100644 libc/src/math/nvptx/lround.cpp create mode 100644 libc/src/math/nvptx/lroundf.cpp create mode 100644 libc/src/math/nvptx/modf.cpp create mode 100644 libc/src/math/nvptx/modff.cpp create mode 100644 libc/src/math/nvptx/nearbyint.cpp create mode 100644 libc/src/math/nvptx/nearbyintf.cpp rename libc/src/math/{gpu/vendor => nvptx}/nextafter.cpp (90%) rename libc/src/math/{gpu/vendor => nvptx}/nextafterf.cpp (90%) rename libc/src/math/{gpu/vendor => }/nvptx/nvptx.h (100%) create mode 100644 libc/src/math/nvptx/pow.cpp create mode 100644 libc/src/math/nvptx/powf.cpp create mode 100644 libc/src/math/nvptx/remainder.cpp create mode 100644 libc/src/math/nvptx/remainderf.cpp rename libc/src/math/{gpu/vendor => nvptx}/remquo.cpp (90%) rename libc/src/math/{gpu/vendor => nvptx}/remquof.cpp (90%) create mode 100644 libc/src/math/nvptx/rint.cpp create mode 100644 libc/src/math/nvptx/rintf.cpp create mode 100644 libc/src/math/nvptx/round.cpp create mode 100644 libc/src/math/nvptx/roundf.cpp rename libc/src/math/{gpu/vendor => nvptx}/scalbn.cpp (91%) rename libc/src/math/{gpu/vendor => nvptx}/scalbnf.cpp (90%) rename 
libc/src/math/{gpu/vendor => nvptx}/sin.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/sincos.cpp (89%) create mode 100644 libc/src/math/nvptx/sincosf.cpp rename libc/src/math/{gpu/vendor => nvptx}/sinf.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/sinh.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/sinhf.cpp (83%) create mode 100644 libc/src/math/nvptx/sqrt.cpp create mode 100644 libc/src/math/nvptx/sqrtf.cpp rename libc/src/math/{gpu/vendor => nvptx}/tan.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/tanf.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/tanh.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/tanhf.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/tgamma.cpp (83%) rename libc/src/math/{gpu/vendor => nvptx}/tgammaf.cpp (83%) create mode 100644 libc/src/math/nvptx/trunc.cpp create mode 100644 libc/src/math/nvptx/truncf.cpp diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake index 8a84c82206ba6..0649e9f7a7670 100644 --- a/libc/cmake/modules/LLVMLibCObjectRules.cmake +++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake @@ -307,7 +307,7 @@ function(create_entrypoint_object fq_target_name) ${fq_target_name} PROPERTIES ENTRYPOINT_NAME ${ADD_ENTRYPOINT_OBJ_NAME} - TARGET_TYPE ${ENTRYPOINT_OBJ_TARGET_TYPE} + TARGET_TYPE ${entrypoint_target_type} OBJECT_FILE "$" CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD} DEPS "${fq_deps_list}" diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index efa14c42e389b..882befd9f7e7f 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -1,9 +1,6 @@ add_subdirectory(generic) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE}) add_subdirectory(${LIBC_TARGET_ARCHITECTURE}) -elseif(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) - # TODO: We should split this into 'nvptx' and 'amdgpu' for the GPU build. 
- add_subdirectory(${LIBC_TARGET_OS}) endif() function(add_math_entrypoint_object name) @@ -11,7 +8,6 @@ function(add_math_entrypoint_object name) # that first and return early if we are able to add an alias target for the # machine specific implementation. get_fq_target_name("${LIBC_TARGET_ARCHITECTURE}.${name}" fq_machine_specific_target_name) - get_fq_target_name("${LIBC_TARGET_OS}.${name}" fq_os_specific_target_name) if(TARGET ${fq_machine_specific_target_name}) add_entrypoint_object( ${name} @@ -20,28 +16,6 @@ function(add_math_entrypoint_object name) .${LIBC_TARGET_ARCHITECTURE}.${name} ) return() - elseif(TARGET ${fq_os_specific_target_name}) - add_entrypoint_object( - ${name} - ALIAS - DEPENDS - .${LIBC_TARGET_OS}.${name} - ) - return() - endif() - - # The GPU optionally depends on vendor libraries. If we emitted one of these - # entrypoints it means the user requested it and we should use it instead. - get_fq_target_name("${LIBC_TARGET_OS}.vendor.${name}" fq_vendor_specific_target_name) - if(TARGET ${fq_vendor_specific_target_name}) - add_entrypoint_object( - ${name} - ALIAS - DEPENDS - .${LIBC_TARGET_OS}.vendor.${name} - VENDOR - ) - return() endif() get_fq_target_name("generic.${name}" fq_generic_target_name) diff --git a/libc/src/math/gpu/vendor/CMakeLists.txt b/libc/src/math/amdgpu/CMakeLists.txt similarity index 59% rename from libc/src/math/gpu/vendor/CMakeLists.txt rename to libc/src/math/amdgpu/CMakeLists.txt index 36087ade63bfc..cb77341aa5052 100644 --- a/libc/src/math/gpu/vendor/CMakeLists.txt +++ b/libc/src/math/amdgpu/CMakeLists.txt @@ -1,39 +1,360 @@ +# Math functions not yet available in the libc project, or those not yet tuned +# for GPU workloads are provided as wrappers over vendor libraries. If we find +# them ahead of time we will import them statically. Otherwise, we will keep +# them as external references and expect them to be resolved by the user when +# they compile. 
In the future,we will use implementations from the 'libc' +# project and not provide these wrappers. find_package(AMDDeviceLibs QUIET HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm) if(AMDDeviceLibs_FOUND) message(STATUS "Found the ROCm device library. Implementations falling back " "to the vendor libraries will be resolved statically.") get_target_property(ocml_path ocml IMPORTED_LOCATION) - list(APPEND bitcode_link_flags - "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${ocml_path}") + set(bitcode_link_flags + "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${ocml_path}") else() message(STATUS "Could not find the ROCm device library. Unimplemented " "functions will be an external reference to the vendor libraries.") endif() -if(CUDAToolkit_FOUND) - set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc) - if (EXISTS ${libdevice_path}) - message(STATUS "Found the CUDA device library. Implementations falling back " - "to the vendor libraries will be resolved statically.") - list(APPEND bitcode_link_flags - "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${libdevice_path}") - endif() -else() - message(STATUS "Could not find the CUDA device library. 
Unimplemented " - "functions will be an external reference to the vendor libraries.") -endif() +add_entrypoint_object( + ceil + SRCS + ceil.cpp + HDRS + ../ceil.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + ceilf + SRCS + ceilf.cpp + HDRS + ../ceilf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + copysign + SRCS + copysign.cpp + HDRS + ../copysign.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + copysignf + SRCS + copysignf.cpp + HDRS + ../copysignf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fabs + SRCS + fabs.cpp + HDRS + ../fabs.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fabsf + SRCS + fabsf.cpp + HDRS + ../fabsf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + floor + SRCS + floor.cpp + HDRS + ../floor.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + floorf + SRCS + floorf.cpp + HDRS + ../floorf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fma + SRCS + fma.cpp + HDRS + ../fma.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fmaf + SRCS + fmaf.cpp + HDRS + ../fmaf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fmax + SRCS + fmax.cpp + HDRS + ../fmax.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fmaxf + SRCS + fmaxf.cpp + HDRS + ../fmaxf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fmin + SRCS + fmin.cpp + HDRS + ../fmin.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fminf + SRCS + fminf.cpp + HDRS + ../fminf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fmod + SRCS + fmod.cpp + HDRS + ../fmod.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fmodf + SRCS + fmodf.cpp + HDRS + ../fmodf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + lround + SRCS + lround.cpp + HDRS + ../lround.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + lroundf + SRCS + lroundf.cpp + HDRS + ../lroundf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + llround + SRCS + llround.cpp + HDRS + ../llround.h + COMPILE_OPTIONS 
+ -O2 +) + +add_entrypoint_object( + llroundf + SRCS + llroundf.cpp + HDRS + ../llroundf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + modf + SRCS + modf.cpp + HDRS + ../modf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + modff + SRCS + modff.cpp + HDRS + ../modff.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + nearbyint + SRCS + nearbyint.cpp + HDRS + ../nearbyint.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + nearbyintf + SRCS + nearbyintf.cpp + HDRS + ../nearbyintf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + remainder + SRCS + remainder.cpp + HDRS + ../remainder.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + remainderf + SRCS + remainderf.cpp + HDRS + ../remainderf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + rint + SRCS + rint.cpp + HDRS + ../rint.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + rintf + SRCS + rintf.cpp + HDRS + ../rintf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + round + SRCS + round.cpp + HDRS + ../round.h + COMPILE_OPTIONS + -O2 +) -# FIXME: We need a way to pass the library to only the NVTPX / AMDGPU build. -# This shouldn't cause issues because we only link in needed symbols, but it -# will link in identity metadata from both libraries. This silences the warning. -list(APPEND bitcode_link_flags "-Wno-linker-warnings") +add_entrypoint_object( + sqrt + SRCS + sqrt.cpp + HDRS + ../sqrt.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + sqrtf + SRCS + sqrtf.cpp + HDRS + ../sqrtf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + trunc + SRCS + trunc.cpp + HDRS + ../trunc.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + truncf + SRCS + truncf.cpp + HDRS + ../truncf.h + COMPILE_OPTIONS + -O2 +) +# The following functions currently are not implemented natively and borrow from +# existing implementations. This will be removed in the future. 
add_entrypoint_object( acos SRCS acos.cpp HDRS - ../../acos.h + ../acos.h + VENDOR COMPILE_OPTIONS ${bitcode_link_flags} -O2 @@ -44,10 +365,11 @@ add_entrypoint_object( SRCS acosf.cpp HDRS - ../../acosf.h + ../acosf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -55,10 +377,11 @@ add_entrypoint_object( SRCS acosh.cpp HDRS - ../../acosh.h + ../acosh.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -66,10 +389,11 @@ add_entrypoint_object( SRCS acoshf.cpp HDRS - ../../acoshf.h + ../acoshf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -77,10 +401,11 @@ add_entrypoint_object( SRCS asin.cpp HDRS - ../../asin.h + ../asin.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -88,10 +413,11 @@ add_entrypoint_object( SRCS asinf.cpp HDRS - ../../asinf.h + ../asinf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -99,10 +425,11 @@ add_entrypoint_object( SRCS asinh.cpp HDRS - ../../asinh.h + ../asinh.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -110,10 +437,11 @@ add_entrypoint_object( SRCS atan.cpp HDRS - ../../atan.h + ../atan.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -121,10 +449,11 @@ add_entrypoint_object( SRCS atanf.cpp HDRS - ../../atanf.h + ../atanf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -132,10 +461,11 @@ add_entrypoint_object( SRCS atan2.cpp HDRS - ../../atan2.h + ../atan2.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -143,10 +473,11 @@ add_entrypoint_object( SRCS atan2f.cpp HDRS - ../../atan2f.h + ../atan2f.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -154,10 +485,11 @@ add_entrypoint_object( SRCS atanh.cpp HDRS - ../../atanh.h + ../atanh.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -165,10 
+497,11 @@ add_entrypoint_object( SRCS atanhf.cpp HDRS - ../../atanhf.h + ../atanhf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -176,10 +509,11 @@ add_entrypoint_object( SRCS cos.cpp HDRS - ../../cos.h + ../cos.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -187,10 +521,11 @@ add_entrypoint_object( SRCS cosf.cpp HDRS - ../../cosf.h + ../cosf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -198,10 +533,11 @@ add_entrypoint_object( SRCS cosh.cpp HDRS - ../../cosh.h + ../cosh.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -209,10 +545,11 @@ add_entrypoint_object( SRCS coshf.cpp HDRS - ../../coshf.h + ../coshf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -220,10 +557,11 @@ add_entrypoint_object( SRCS erf.cpp HDRS - ../../erf.h + ../erf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -231,10 +569,11 @@ add_entrypoint_object( SRCS erff.cpp HDRS - ../../erff.h + ../erff.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -242,10 +581,11 @@ add_entrypoint_object( SRCS exp.cpp HDRS - ../../exp.h + ../exp.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -253,10 +593,11 @@ add_entrypoint_object( SRCS exp10.cpp HDRS - ../../exp10.h + ../exp10.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -264,10 +605,11 @@ add_entrypoint_object( SRCS exp10f.cpp HDRS - ../../exp10f.h + ../exp10f.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -275,10 +617,11 @@ add_entrypoint_object( SRCS exp2.cpp HDRS - ../../exp2.h + ../exp2.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -286,10 +629,11 @@ add_entrypoint_object( SRCS exp2f.cpp HDRS - ../../exp2f.h + ../exp2f.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) 
add_entrypoint_object( @@ -297,10 +641,11 @@ add_entrypoint_object( SRCS expf.cpp HDRS - ../../expf.h + ../expf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -308,10 +653,11 @@ add_entrypoint_object( SRCS expm1.cpp HDRS - ../../expm1.h + ../expm1.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -319,10 +665,11 @@ add_entrypoint_object( SRCS expm1f.cpp HDRS - ../../expm1f.h + ../expm1f.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -330,10 +677,11 @@ add_entrypoint_object( SRCS fdim.cpp HDRS - ../../fdim.h + ../fdim.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -341,10 +689,11 @@ add_entrypoint_object( SRCS fdimf.cpp HDRS - ../../fdimf.h + ../fdimf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -352,10 +701,11 @@ add_entrypoint_object( SRCS hypot.cpp HDRS - ../../hypot.h + ../hypot.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -363,10 +713,11 @@ add_entrypoint_object( SRCS hypotf.cpp HDRS - ../../hypotf.h + ../hypotf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -374,10 +725,11 @@ add_entrypoint_object( SRCS ilogb.cpp HDRS - ../../ilogb.h + ../ilogb.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -385,10 +737,11 @@ add_entrypoint_object( SRCS ilogbf.cpp HDRS - ../../ilogbf.h + ../ilogbf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -396,10 +749,11 @@ add_entrypoint_object( SRCS log10.cpp HDRS - ../../log10.h + ../log10.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -407,10 +761,11 @@ add_entrypoint_object( SRCS log10f.cpp HDRS - ../../log10f.h + ../log10f.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -418,10 +773,11 @@ add_entrypoint_object( SRCS log2.cpp HDRS - ../../log2.h + ../log2.h 
COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -429,10 +785,11 @@ add_entrypoint_object( SRCS log2f.cpp HDRS - ../../log2f.h + ../log2f.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -440,10 +797,11 @@ add_entrypoint_object( SRCS log.cpp HDRS - ../../log.h + ../log.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -451,10 +809,11 @@ add_entrypoint_object( SRCS logf.cpp HDRS - ../../logf.h + ../logf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -462,10 +821,11 @@ add_entrypoint_object( SRCS lrint.cpp HDRS - ../../lrint.h + ../lrint.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -473,10 +833,11 @@ add_entrypoint_object( SRCS lrintf.cpp HDRS - ../../lrintf.h + ../lrintf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -484,10 +845,11 @@ add_entrypoint_object( SRCS ldexp.cpp HDRS - ../../ldexp.h + ../ldexp.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -495,10 +857,11 @@ add_entrypoint_object( SRCS ldexpf.cpp HDRS - ../../ldexpf.h + ../ldexpf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -506,10 +869,11 @@ add_entrypoint_object( SRCS log1p.cpp HDRS - ../../log1p.h + ../log1p.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -517,10 +881,11 @@ add_entrypoint_object( SRCS log1pf.cpp HDRS - ../../log1pf.h + ../log1pf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -528,10 +893,11 @@ add_entrypoint_object( SRCS llrint.cpp HDRS - ../../llrint.h + ../llrint.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -539,10 +905,11 @@ add_entrypoint_object( SRCS llrintf.cpp HDRS - ../../llrintf.h + ../llrintf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -550,10 +917,11 @@ add_entrypoint_object( 
SRCS remquo.cpp HDRS - ../../remquo.h + ../remquo.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -561,10 +929,11 @@ add_entrypoint_object( SRCS remquof.cpp HDRS - ../../remquof.h + ../remquof.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -572,10 +941,11 @@ add_entrypoint_object( SRCS scalbn.cpp HDRS - ../../scalbn.h + ../scalbn.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -583,10 +953,11 @@ add_entrypoint_object( SRCS scalbnf.cpp HDRS - ../../scalbnf.h + ../scalbnf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) @@ -595,10 +966,11 @@ add_entrypoint_object( SRCS nextafter.cpp HDRS - ../../nextafter.h + ../nextafter.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -606,10 +978,11 @@ add_entrypoint_object( SRCS nextafterf.cpp HDRS - ../../nextafterf.h + ../nextafterf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -617,10 +990,11 @@ add_entrypoint_object( SRCS pow.cpp HDRS - ../../pow.h + ../pow.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -628,10 +1002,11 @@ add_entrypoint_object( SRCS powf.cpp HDRS - ../../powf.h + ../powf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -639,10 +1014,11 @@ add_entrypoint_object( SRCS sin.cpp HDRS - ../../sin.h + ../sin.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -650,10 +1026,11 @@ add_entrypoint_object( SRCS sinf.cpp HDRS - ../../sinf.h + ../sinf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -661,10 +1038,11 @@ add_entrypoint_object( SRCS sincos.cpp HDRS - ../../sincos.h + ../sincos.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -672,10 +1050,11 @@ add_entrypoint_object( SRCS sincosf.cpp HDRS - ../../sincosf.h + ../sincosf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) 
add_entrypoint_object( @@ -683,10 +1062,11 @@ add_entrypoint_object( SRCS sinh.cpp HDRS - ../../sinh.h + ../sinh.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -694,10 +1074,11 @@ add_entrypoint_object( SRCS sinhf.cpp HDRS - ../../sinhf.h + ../sinhf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -705,10 +1086,11 @@ add_entrypoint_object( SRCS tan.cpp HDRS - ../../tan.h + ../tan.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -716,10 +1098,11 @@ add_entrypoint_object( SRCS tanf.cpp HDRS - ../../tanf.h + ../tanf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -727,10 +1110,11 @@ add_entrypoint_object( SRCS tanh.cpp HDRS - ../../tanh.h + ../tanh.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -738,10 +1122,11 @@ add_entrypoint_object( SRCS tanhf.cpp HDRS - ../../tanhf.h + ../tanhf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -749,10 +1134,11 @@ add_entrypoint_object( SRCS tgamma.cpp HDRS - ../../tgamma.h + ../tgamma.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -760,10 +1146,11 @@ add_entrypoint_object( SRCS tgammaf.cpp HDRS - ../../tgammaf.h + ../tgammaf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -771,10 +1158,11 @@ add_entrypoint_object( SRCS frexp.cpp HDRS - ../../frexp.h + ../frexp.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) add_entrypoint_object( @@ -782,8 +1170,9 @@ add_entrypoint_object( SRCS frexpf.cpp HDRS - ../../frexpf.h + ../frexpf.h COMPILE_OPTIONS ${bitcode_link_flags} -O2 + VENDOR ) diff --git a/libc/src/math/amdgpu/acos.cpp b/libc/src/math/amdgpu/acos.cpp new file mode 100644 index 0000000000000..b1e30fef82ded --- /dev/null +++ b/libc/src/math/amdgpu/acos.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU acos function ---------------------------===// +// +// Part 
of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/acos.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, acos, (double x)) { return __ocml_acos_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/acosf.cpp b/libc/src/math/amdgpu/acosf.cpp new file mode 100644 index 0000000000000..4c2dd4bcf4353 --- /dev/null +++ b/libc/src/math/amdgpu/acosf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the acosf function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/acosf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, acosf, (float x)) { return __ocml_acos_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/acosh.cpp b/libc/src/math/amdgpu/acosh.cpp new file mode 100644 index 0000000000000..dcdeeab29454e --- /dev/null +++ b/libc/src/math/amdgpu/acosh.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU acosh function --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/acosh.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, acosh, (double x)) { return __ocml_acosh_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/acoshf.cpp b/libc/src/math/amdgpu/acoshf.cpp new file mode 100644 index 0000000000000..52baa2eaecc79 --- /dev/null +++ b/libc/src/math/amdgpu/acoshf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the acoshf function for GPU ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/acoshf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, acoshf, (float x)) { return __ocml_acosh_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/asin.cpp b/libc/src/math/amdgpu/asin.cpp new file mode 100644 index 0000000000000..835c317112e2e --- /dev/null +++ b/libc/src/math/amdgpu/asin.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU asin function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/asin.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, asin, (double x)) { return __ocml_asin_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/asinf.cpp b/libc/src/math/amdgpu/asinf.cpp new file mode 100644 index 0000000000000..72c45d5cf1723 --- /dev/null +++ b/libc/src/math/amdgpu/asinf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the asinf function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/asinf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, asinf, (float x)) { return __ocml_asin_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/asinh.cpp b/libc/src/math/amdgpu/asinh.cpp new file mode 100644 index 0000000000000..7a9f7ea4e9887 --- /dev/null +++ b/libc/src/math/amdgpu/asinh.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU asinh function --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/asinh.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, asinh, (double x)) { return __ocml_asinh_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/asinhf.cpp b/libc/src/math/amdgpu/asinhf.cpp new file mode 100644 index 0000000000000..28d6bde5c9184 --- /dev/null +++ b/libc/src/math/amdgpu/asinhf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the asinhf function for GPU ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/asinhf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, asinhf, (float x)) { return __ocml_asinh_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/atan.cpp b/libc/src/math/amdgpu/atan.cpp new file mode 100644 index 0000000000000..a1fa38ba451de --- /dev/null +++ b/libc/src/math/amdgpu/atan.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU atan function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/atan.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, atan, (double x)) { return __ocml_atan_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/atan2.cpp b/libc/src/math/amdgpu/atan2.cpp new file mode 100644 index 0000000000000..9cfdba75eb8d6 --- /dev/null +++ b/libc/src/math/amdgpu/atan2.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the GPU atan2 function --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/atan2.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, atan2, (double x, double y)) { + return __ocml_atan2_f64(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/atan2f.cpp b/libc/src/math/amdgpu/atan2f.cpp new file mode 100644 index 0000000000000..ef56293b4caf5 --- /dev/null +++ b/libc/src/math/amdgpu/atan2f.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the GPU atan2f function -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/atan2f.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, atan2f, (float x, float y)) { + return __ocml_atan2_f32(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/atanf.cpp b/libc/src/math/amdgpu/atanf.cpp new file mode 100644 index 0000000000000..bbcceca3ed092 --- /dev/null +++ b/libc/src/math/amdgpu/atanf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the atanf function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/atanf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, atanf, (float x)) { return __ocml_atan_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/atanh.cpp b/libc/src/math/amdgpu/atanh.cpp new file mode 100644 index 0000000000000..ec462586450bf --- /dev/null +++ b/libc/src/math/amdgpu/atanh.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU atanh function --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/atanh.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, atanh, (double x)) { return __ocml_atanh_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/atanhf.cpp b/libc/src/math/amdgpu/atanhf.cpp new file mode 100644 index 0000000000000..227269369ab6b --- /dev/null +++ b/libc/src/math/amdgpu/atanhf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the atanhf function for GPU ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/atanhf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, atanhf, (float x)) { return __ocml_atanh_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/ceil.cpp b/libc/src/math/amdgpu/ceil.cpp similarity index 100% rename from libc/src/math/gpu/ceil.cpp rename to libc/src/math/amdgpu/ceil.cpp diff --git a/libc/src/math/gpu/ceilf.cpp b/libc/src/math/amdgpu/ceilf.cpp similarity index 100% rename from libc/src/math/gpu/ceilf.cpp rename to libc/src/math/amdgpu/ceilf.cpp diff --git a/libc/src/math/gpu/copysign.cpp b/libc/src/math/amdgpu/copysign.cpp similarity index 100% rename from libc/src/math/gpu/copysign.cpp rename to libc/src/math/amdgpu/copysign.cpp diff --git a/libc/src/math/gpu/copysignf.cpp b/libc/src/math/amdgpu/copysignf.cpp similarity index 100% rename from libc/src/math/gpu/copysignf.cpp rename to libc/src/math/amdgpu/copysignf.cpp diff --git a/libc/src/math/amdgpu/cos.cpp 
b/libc/src/math/amdgpu/cos.cpp new file mode 100644 index 0000000000000..68239d9337801 --- /dev/null +++ b/libc/src/math/amdgpu/cos.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the cos function for GPU ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/cos.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, cos, (double x)) { return __ocml_cos_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/cosf.cpp b/libc/src/math/amdgpu/cosf.cpp new file mode 100644 index 0000000000000..a60e9ea28907c --- /dev/null +++ b/libc/src/math/amdgpu/cosf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the cosf function for GPU -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/cosf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, cosf, (float x)) { return __ocml_cos_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/cosh.cpp b/libc/src/math/amdgpu/cosh.cpp new file mode 100644 index 0000000000000..b71df0c0170c3 --- /dev/null +++ b/libc/src/math/amdgpu/cosh.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the cosh function for GPU -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/cosh.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, cosh, (double x)) { return __ocml_cosh_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/coshf.cpp b/libc/src/math/amdgpu/coshf.cpp new file mode 100644 index 0000000000000..699fb0478aee8 --- /dev/null +++ b/libc/src/math/amdgpu/coshf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the coshf function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/coshf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, coshf, (float x)) { return __ocml_cosh_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/amdgpu/declarations.h b/libc/src/math/amdgpu/declarations.h similarity index 99% rename from libc/src/math/gpu/vendor/amdgpu/declarations.h rename to libc/src/math/amdgpu/declarations.h index 7a01cbc6ab19d..780d5f0a11403 100644 --- a/libc/src/math/gpu/vendor/amdgpu/declarations.h +++ b/libc/src/math/amdgpu/declarations.h @@ -9,6 +9,8 @@ #ifndef LLVM_LIBC_SRC_MATH_GPU_AMDGPU_DECLARATIONS_H #define LLVM_LIBC_SRC_MATH_GPU_AMDGPU_DECLARATIONS_H +#include "platform.h" + #include "src/__support/GPU/utils.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/math/amdgpu/erf.cpp b/libc/src/math/amdgpu/erf.cpp new file mode 100644 index 0000000000000..7a464550c7e06 --- /dev/null +++ b/libc/src/math/amdgpu/erf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU 
erf function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/erf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, erf, (double x)) { return __ocml_erf_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/erff.cpp b/libc/src/math/amdgpu/erff.cpp new file mode 100644 index 0000000000000..1f77d08585a3d --- /dev/null +++ b/libc/src/math/amdgpu/erff.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU erff function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/erff.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, erff, (float x)) { return __ocml_erf_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/exp.cpp b/libc/src/math/amdgpu/exp.cpp similarity index 84% rename from libc/src/math/gpu/vendor/exp.cpp rename to libc/src/math/amdgpu/exp.cpp index ee5a22019f6a5..8590ac7590193 100644 --- a/libc/src/math/gpu/vendor/exp.cpp +++ b/libc/src/math/amdgpu/exp.cpp @@ -9,10 +9,10 @@ #include "src/math/exp.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, exp, (double x)) { return internal::exp(x); } +LLVM_LIBC_FUNCTION(double, exp, (double x)) { return __builtin_exp(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/exp10.cpp b/libc/src/math/amdgpu/exp10.cpp new file mode 100644 index 0000000000000..17d8f3350ac24 --- /dev/null +++ b/libc/src/math/amdgpu/exp10.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU exp10 function --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/exp10.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, exp10, (double x)) { return __ocml_exp10_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/exp10f.cpp b/libc/src/math/amdgpu/exp10f.cpp new file mode 100644 index 0000000000000..ddab555a8fbd3 --- /dev/null +++ b/libc/src/math/amdgpu/exp10f.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the exp10f function for GPU ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/exp10f.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, exp10f, (float x)) { return __ocml_exp10_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/exp2.cpp b/libc/src/math/amdgpu/exp2.cpp new file mode 100644 index 0000000000000..dfbb1f80d1294 --- /dev/null +++ b/libc/src/math/amdgpu/exp2.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU exp2 function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/exp2.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, exp2, (double x)) { return __ocml_exp2_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/exp2f.cpp b/libc/src/math/amdgpu/exp2f.cpp new file mode 100644 index 0000000000000..016dfe3a62223 --- /dev/null +++ b/libc/src/math/amdgpu/exp2f.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the exp2f function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/exp2f.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, exp2f, (float x)) { return __ocml_exp2_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/expf.cpp b/libc/src/math/amdgpu/expf.cpp similarity index 84% rename from libc/src/math/gpu/vendor/expf.cpp rename to libc/src/math/amdgpu/expf.cpp index 89c194e4bc291..d682f6293a6cb 100644 --- a/libc/src/math/gpu/vendor/expf.cpp +++ b/libc/src/math/amdgpu/expf.cpp @@ -9,10 +9,10 @@ #include "src/math/expf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, expf, (float x)) { return internal::expf(x); } +LLVM_LIBC_FUNCTION(float, expf, (float x)) { return __builtin_expf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/expm1.cpp b/libc/src/math/amdgpu/expm1.cpp new file mode 100644 index 0000000000000..d2ac28ae6a3e2 --- /dev/null +++ 
b/libc/src/math/amdgpu/expm1.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU expm1 function --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/expm1.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, expm1, (double x)) { return __ocml_expm1_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/expm1f.cpp b/libc/src/math/amdgpu/expm1f.cpp new file mode 100644 index 0000000000000..0ffe1a362af12 --- /dev/null +++ b/libc/src/math/amdgpu/expm1f.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the expm1f function for GPU ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/expm1f.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { return __ocml_expm1_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/fabs.cpp b/libc/src/math/amdgpu/fabs.cpp similarity index 100% rename from libc/src/math/gpu/fabs.cpp rename to libc/src/math/amdgpu/fabs.cpp diff --git a/libc/src/math/gpu/fabsf.cpp b/libc/src/math/amdgpu/fabsf.cpp similarity index 100% rename from libc/src/math/gpu/fabsf.cpp rename to libc/src/math/amdgpu/fabsf.cpp diff --git a/libc/src/math/amdgpu/fdim.cpp b/libc/src/math/amdgpu/fdim.cpp new file mode 100644 index 0000000000000..f16942dc61934 --- /dev/null +++ b/libc/src/math/amdgpu/fdim.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the fdim function for GPU -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fdim.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, fdim, (double x, double y)) { + return __ocml_fdim_f64(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/fdimf.cpp b/libc/src/math/amdgpu/fdimf.cpp new file mode 100644 index 0000000000000..eccb441f14556 --- /dev/null +++ b/libc/src/math/amdgpu/fdimf.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the fdimf function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fdimf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, fdimf, (float x, float y)) { + return __ocml_fdim_f32(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/floor.cpp b/libc/src/math/amdgpu/floor.cpp similarity index 100% rename from libc/src/math/gpu/floor.cpp rename to libc/src/math/amdgpu/floor.cpp diff --git a/libc/src/math/gpu/floorf.cpp b/libc/src/math/amdgpu/floorf.cpp similarity index 100% rename from libc/src/math/gpu/floorf.cpp rename to libc/src/math/amdgpu/floorf.cpp diff --git a/libc/src/math/gpu/fma.cpp b/libc/src/math/amdgpu/fma.cpp similarity index 100% rename from libc/src/math/gpu/fma.cpp rename to libc/src/math/amdgpu/fma.cpp diff --git a/libc/src/math/gpu/fmaf.cpp b/libc/src/math/amdgpu/fmaf.cpp similarity index 100% rename from libc/src/math/gpu/fmaf.cpp rename to libc/src/math/amdgpu/fmaf.cpp diff --git a/libc/src/math/gpu/fmax.cpp b/libc/src/math/amdgpu/fmax.cpp similarity index 100% rename from libc/src/math/gpu/fmax.cpp rename to libc/src/math/amdgpu/fmax.cpp diff --git a/libc/src/math/gpu/fmaxf.cpp b/libc/src/math/amdgpu/fmaxf.cpp similarity index 100% rename from libc/src/math/gpu/fmaxf.cpp rename to libc/src/math/amdgpu/fmaxf.cpp diff --git a/libc/src/math/gpu/fmin.cpp b/libc/src/math/amdgpu/fmin.cpp similarity index 100% rename from libc/src/math/gpu/fmin.cpp rename to libc/src/math/amdgpu/fmin.cpp diff --git a/libc/src/math/gpu/fminf.cpp b/libc/src/math/amdgpu/fminf.cpp similarity index 100% rename from libc/src/math/gpu/fminf.cpp rename to libc/src/math/amdgpu/fminf.cpp diff --git a/libc/src/math/gpu/fmod.cpp b/libc/src/math/amdgpu/fmod.cpp similarity index 100% rename from libc/src/math/gpu/fmod.cpp rename to libc/src/math/amdgpu/fmod.cpp diff --git 
a/libc/src/math/gpu/fmodf.cpp b/libc/src/math/amdgpu/fmodf.cpp similarity index 100% rename from libc/src/math/gpu/fmodf.cpp rename to libc/src/math/amdgpu/fmodf.cpp diff --git a/libc/src/math/amdgpu/frexp.cpp b/libc/src/math/amdgpu/frexp.cpp new file mode 100644 index 0000000000000..0acf97342fa08 --- /dev/null +++ b/libc/src/math/amdgpu/frexp.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the frexp function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/frexp.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, frexp, (double x, int *p)) { + return __builtin_frexp(x, p); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/frexpf.cpp b/libc/src/math/amdgpu/frexpf.cpp new file mode 100644 index 0000000000000..d870bf3095b25 --- /dev/null +++ b/libc/src/math/amdgpu/frexpf.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the frexpf function for GPU ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/frexpf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, frexpf, (float x, int *p)) { + return __builtin_frexpf(x, p); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/hypot.cpp b/libc/src/math/amdgpu/hypot.cpp new file mode 100644 index 0000000000000..ffc13504c8f41 --- /dev/null +++ b/libc/src/math/amdgpu/hypot.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the hypot function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/hypot.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, hypot, (double x, double y)) { + return __ocml_hypot_f64(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/hypotf.cpp b/libc/src/math/amdgpu/hypotf.cpp new file mode 100644 index 0000000000000..811fc540488d3 --- /dev/null +++ b/libc/src/math/amdgpu/hypotf.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the hypotf function for GPU ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/hypotf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) { + return __ocml_hypot_f32(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/ilogb.cpp b/libc/src/math/amdgpu/ilogb.cpp new file mode 100644 index 0000000000000..4479908d38564 --- /dev/null +++ b/libc/src/math/amdgpu/ilogb.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the ilogb function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/ilogb.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(int, ilogb, (double x)) { return __ocml_ilogb_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/ilogbf.cpp b/libc/src/math/amdgpu/ilogbf.cpp new file mode 100644 index 0000000000000..cded285c72f38 --- /dev/null +++ b/libc/src/math/amdgpu/ilogbf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the ilogbf function for GPU ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/ilogbf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(int, ilogbf, (float x)) { return __ocml_ilogb_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/ldexp.cpp b/libc/src/math/amdgpu/ldexp.cpp new file mode 100644 index 0000000000000..70c5b0d6e555f --- /dev/null +++ b/libc/src/math/amdgpu/ldexp.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the ldexp function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/ldexp.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, ldexp, (double x, int y)) { + return __builtin_ldexp(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/ldexpf.cpp b/libc/src/math/amdgpu/ldexpf.cpp new file mode 100644 index 0000000000000..8dc7c132ee213 --- /dev/null +++ b/libc/src/math/amdgpu/ldexpf.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the ldexpf function for GPU ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/ldexpf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, ldexpf, (float x, int y)) { + return __builtin_ldexpf(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/llrint.cpp b/libc/src/math/amdgpu/llrint.cpp similarity index 87% rename from libc/src/math/gpu/vendor/llrint.cpp rename to libc/src/math/amdgpu/llrint.cpp index aafd1609002b2..307420a9b8b26 100644 --- a/libc/src/math/gpu/vendor/llrint.cpp +++ b/libc/src/math/amdgpu/llrint.cpp @@ -9,12 +9,12 @@ #include "src/math/llrint.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(long long, llrint, (double x)) { - return internal::llrint(x); + return static_cast(__builtin_rint(x)); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/llrintf.cpp b/libc/src/math/amdgpu/llrintf.cpp similarity index 87% rename from libc/src/math/gpu/vendor/llrintf.cpp rename to libc/src/math/amdgpu/llrintf.cpp index 39cd3ad0c021a..23404990fb1bd 100644 --- a/libc/src/math/gpu/vendor/llrintf.cpp +++ b/libc/src/math/amdgpu/llrintf.cpp @@ -9,12 +9,12 @@ #include "src/math/llrintf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(long long, llrintf, (float x)) { - return internal::llrintf(x); + return static_cast(__builtin_rintf(x)); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/llround.cpp b/libc/src/math/amdgpu/llround.cpp similarity index 100% rename from libc/src/math/gpu/llround.cpp rename to libc/src/math/amdgpu/llround.cpp diff --git a/libc/src/math/gpu/llroundf.cpp b/libc/src/math/amdgpu/llroundf.cpp similarity index 100% rename from libc/src/math/gpu/llroundf.cpp rename 
to libc/src/math/amdgpu/llroundf.cpp diff --git a/libc/src/math/amdgpu/log.cpp b/libc/src/math/amdgpu/log.cpp new file mode 100644 index 0000000000000..3f2489580356f --- /dev/null +++ b/libc/src/math/amdgpu/log.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU log function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/log.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, log, (double x)) { return __ocml_log_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/log10.cpp b/libc/src/math/amdgpu/log10.cpp new file mode 100644 index 0000000000000..d522d5e84291d --- /dev/null +++ b/libc/src/math/amdgpu/log10.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU log10 function --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/log10.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, log10, (double x)) { return __ocml_log10_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/log10f.cpp b/libc/src/math/amdgpu/log10f.cpp new file mode 100644 index 0000000000000..47b9b162ad9db --- /dev/null +++ b/libc/src/math/amdgpu/log10f.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU log10f function -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/log10f.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, log10f, (float x)) { return __ocml_log10_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/log1p.cpp b/libc/src/math/amdgpu/log1p.cpp new file mode 100644 index 0000000000000..fae60e4485861 --- /dev/null +++ b/libc/src/math/amdgpu/log1p.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU log1p function --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/log1p.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, log1p, (double x)) { return __ocml_log1p_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/log1pf.cpp b/libc/src/math/amdgpu/log1pf.cpp new file mode 100644 index 0000000000000..e7b1772158fcd --- /dev/null +++ b/libc/src/math/amdgpu/log1pf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU log1pf function -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/log1pf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, log1pf, (float x)) { return __ocml_log1p_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/log2.cpp b/libc/src/math/amdgpu/log2.cpp new file mode 100644 index 0000000000000..9d84f62dff6fc --- /dev/null +++ b/libc/src/math/amdgpu/log2.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU log2 function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/log2.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, log2, (double x)) { return __ocml_log2_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/log2f.cpp b/libc/src/math/amdgpu/log2f.cpp new file mode 100644 index 0000000000000..7742a6141c37d --- /dev/null +++ b/libc/src/math/amdgpu/log2f.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU log2f function --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/log2f.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, log2f, (float x)) { return __ocml_log2_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/logb.cpp b/libc/src/math/amdgpu/logb.cpp new file mode 100644 index 0000000000000..1344fbb182af6 --- /dev/null +++ b/libc/src/math/amdgpu/logb.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU logb function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/logb.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, logb, (double x)) { return __ocml_logb_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/logbf.cpp b/libc/src/math/amdgpu/logbf.cpp new file mode 100644 index 0000000000000..fdb493fe28daf --- /dev/null +++ b/libc/src/math/amdgpu/logbf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU logbf function --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/logbf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, logbf, (float x)) { return __ocml_logb_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/logf.cpp b/libc/src/math/amdgpu/logf.cpp new file mode 100644 index 0000000000000..d4d4b265ee5cd --- /dev/null +++ b/libc/src/math/amdgpu/logf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU logf function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/logf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, logf, (float x)) { return __ocml_log_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/lrint.cpp b/libc/src/math/amdgpu/lrint.cpp new file mode 100644 index 0000000000000..b335b4f06393c --- /dev/null +++ b/libc/src/math/amdgpu/lrint.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the lrint function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/lrint.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(long, lrint, (double x)) { + return static_cast(__builtin_rint(x)); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/lrintf.cpp b/libc/src/math/amdgpu/lrintf.cpp new file mode 100644 index 0000000000000..7959e76728a43 --- /dev/null +++ b/libc/src/math/amdgpu/lrintf.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the lrintf function for GPU ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/lrintf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(long, lrintf, (float x)) { + return static_cast(__builtin_rintf(x)); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/lround.cpp b/libc/src/math/amdgpu/lround.cpp similarity index 100% rename from libc/src/math/gpu/lround.cpp rename to libc/src/math/amdgpu/lround.cpp diff --git a/libc/src/math/gpu/lroundf.cpp b/libc/src/math/amdgpu/lroundf.cpp similarity index 100% rename from libc/src/math/gpu/lroundf.cpp rename to libc/src/math/amdgpu/lroundf.cpp diff --git a/libc/src/math/gpu/modf.cpp b/libc/src/math/amdgpu/modf.cpp similarity index 100% rename from libc/src/math/gpu/modf.cpp rename to libc/src/math/amdgpu/modf.cpp diff --git a/libc/src/math/gpu/modff.cpp b/libc/src/math/amdgpu/modff.cpp similarity index 100% rename from libc/src/math/gpu/modff.cpp rename to libc/src/math/amdgpu/modff.cpp diff --git a/libc/src/math/gpu/nearbyint.cpp b/libc/src/math/amdgpu/nearbyint.cpp similarity index 100% rename from libc/src/math/gpu/nearbyint.cpp rename to libc/src/math/amdgpu/nearbyint.cpp diff --git a/libc/src/math/gpu/nearbyintf.cpp b/libc/src/math/amdgpu/nearbyintf.cpp similarity index 100% rename from libc/src/math/gpu/nearbyintf.cpp rename to libc/src/math/amdgpu/nearbyintf.cpp diff --git a/libc/src/math/amdgpu/nextafter.cpp b/libc/src/math/amdgpu/nextafter.cpp new file mode 100644 index 0000000000000..5c74ef165268b --- /dev/null +++ b/libc/src/math/amdgpu/nextafter.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the nextafter function for GPU ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/nextafter.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, nextafter, (double x, double y)) { + return __ocml_nextafter_f64(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/nextafterf.cpp b/libc/src/math/amdgpu/nextafterf.cpp new file mode 100644 index 0000000000000..a97b990a61fcd --- /dev/null +++ b/libc/src/math/amdgpu/nextafterf.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the nextafterf function for GPU -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/nextafterf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, nextafterf, (float x, float y)) { + return __ocml_nextafter_f32(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/amdgpu/platform.h b/libc/src/math/amdgpu/platform.h similarity index 100% rename from libc/src/math/gpu/vendor/amdgpu/platform.h rename to libc/src/math/amdgpu/platform.h diff --git a/libc/src/math/gpu/vendor/pow.cpp b/libc/src/math/amdgpu/pow.cpp similarity index 90% rename from libc/src/math/gpu/vendor/pow.cpp rename to libc/src/math/amdgpu/pow.cpp index d49f2610a6919..e5056f67292d5 100644 --- a/libc/src/math/gpu/vendor/pow.cpp +++ b/libc/src/math/amdgpu/pow.cpp @@ -9,12 +9,12 @@ #include "src/math/pow.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(double, pow, (double x, double y)) { - return 
internal::pow(x, y); + return __ocml_pow_f64(x, y); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/powf.cpp b/libc/src/math/amdgpu/powf.cpp similarity index 90% rename from libc/src/math/gpu/vendor/powf.cpp rename to libc/src/math/amdgpu/powf.cpp index 37e02d252b74f..6114bcc642e11 100644 --- a/libc/src/math/gpu/vendor/powf.cpp +++ b/libc/src/math/amdgpu/powf.cpp @@ -9,12 +9,12 @@ #include "src/math/powf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { - return internal::powf(x, y); + return __ocml_pow_f32(x, y); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/remainder.cpp b/libc/src/math/amdgpu/remainder.cpp similarity index 100% rename from libc/src/math/gpu/remainder.cpp rename to libc/src/math/amdgpu/remainder.cpp diff --git a/libc/src/math/gpu/remainderf.cpp b/libc/src/math/amdgpu/remainderf.cpp similarity index 100% rename from libc/src/math/gpu/remainderf.cpp rename to libc/src/math/amdgpu/remainderf.cpp diff --git a/libc/src/math/amdgpu/remquo.cpp b/libc/src/math/amdgpu/remquo.cpp new file mode 100644 index 0000000000000..d8074a9626ecb --- /dev/null +++ b/libc/src/math/amdgpu/remquo.cpp @@ -0,0 +1,23 @@ +//===-- Implementation of the GPU remquo function -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/remquo.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, remquo, (double x, double y, int *quo)) { + int tmp; + double r = __ocml_remquo_f64(x, y, (gpu::Private *)&tmp); + *quo = tmp; + return r; +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/remquof.cpp b/libc/src/math/amdgpu/remquof.cpp new file mode 100644 index 0000000000000..b6584dfb97c3b --- /dev/null +++ b/libc/src/math/amdgpu/remquof.cpp @@ -0,0 +1,23 @@ +//===-- Implementation of the GPU remquof function ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/remquof.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, remquof, (float x, float y, int *quo)) { + int tmp; + float r = __ocml_remquo_f32(x, y, (gpu::Private *)&tmp); + *quo = tmp; + return r; +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/rint.cpp b/libc/src/math/amdgpu/rint.cpp similarity index 100% rename from libc/src/math/gpu/rint.cpp rename to libc/src/math/amdgpu/rint.cpp diff --git a/libc/src/math/gpu/rintf.cpp b/libc/src/math/amdgpu/rintf.cpp similarity index 100% rename from libc/src/math/gpu/rintf.cpp rename to libc/src/math/amdgpu/rintf.cpp diff --git a/libc/src/math/gpu/round.cpp b/libc/src/math/amdgpu/round.cpp similarity index 100% rename from libc/src/math/gpu/round.cpp rename to libc/src/math/amdgpu/round.cpp diff --git a/libc/src/math/gpu/roundf.cpp b/libc/src/math/amdgpu/roundf.cpp 
similarity index 100% rename from libc/src/math/gpu/roundf.cpp rename to libc/src/math/amdgpu/roundf.cpp diff --git a/libc/src/math/amdgpu/scalbn.cpp b/libc/src/math/amdgpu/scalbn.cpp new file mode 100644 index 0000000000000..c2a43e03a7bcd --- /dev/null +++ b/libc/src/math/amdgpu/scalbn.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the GPU scalbn function -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/scalbn.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, scalbn, (double x, int y)) { + return __builtin_amdgcn_ldexp(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/scalbnf.cpp b/libc/src/math/amdgpu/scalbnf.cpp new file mode 100644 index 0000000000000..63de26ccbc470 --- /dev/null +++ b/libc/src/math/amdgpu/scalbnf.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the GPU scalbnf function ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/scalbnf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, scalbnf, (float x, int y)) { + return __builtin_amdgcn_ldexpf(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/sin.cpp b/libc/src/math/amdgpu/sin.cpp new file mode 100644 index 0000000000000..dbc29a725db07 --- /dev/null +++ b/libc/src/math/amdgpu/sin.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the sin function for GPU ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sin.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, sin, (double x)) { return __ocml_sin_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/sincos.cpp b/libc/src/math/amdgpu/sincos.cpp new file mode 100644 index 0000000000000..7cdd0d1f97690 --- /dev/null +++ b/libc/src/math/amdgpu/sincos.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the sincos function for GPU ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sincos.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sinptr, double *cosptr)) { + *sinptr = __ocml_sincos_f64(x, cosptr); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/sincosf.cpp b/libc/src/math/amdgpu/sincosf.cpp similarity index 89% rename from libc/src/math/gpu/vendor/sincosf.cpp rename to libc/src/math/amdgpu/sincosf.cpp index 17892ceb107da..37a5e2a6d11c2 100644 --- a/libc/src/math/gpu/vendor/sincosf.cpp +++ b/libc/src/math/amdgpu/sincosf.cpp @@ -9,12 +9,12 @@ #include "src/math/sincosf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinptr, float *cosptr)) { - return internal::sincosf(x, sinptr, cosptr); + *sinptr = __ocml_sincos_f32(x, cosptr); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/sinf.cpp b/libc/src/math/amdgpu/sinf.cpp new file mode 100644 index 0000000000000..cda2c626f8ac6 --- /dev/null +++ b/libc/src/math/amdgpu/sinf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the sinf function for GPU -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sinf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, sinf, (float x)) { return __ocml_sin_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/sinh.cpp b/libc/src/math/amdgpu/sinh.cpp similarity index 71% rename from libc/src/math/gpu/sinh.cpp rename to libc/src/math/amdgpu/sinh.cpp index 054e046f2abd3..66cacd19e4632 100644 --- a/libc/src/math/gpu/sinh.cpp +++ b/libc/src/math/amdgpu/sinh.cpp @@ -1,4 +1,4 @@ -//===-- Implementation of the GPU sinh function ---------------------------===// +//===-- Implementation of the sinh function for GPU -----------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -9,8 +9,10 @@ #include "src/math/sinh.h" #include "src/__support/common.h" +#include "declarations.h" + namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, sinh, (double x)) { return __builtin_sinh(x); } +LLVM_LIBC_FUNCTION(double, sinh, (double x)) { return __ocml_sinh_f64(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/sinhf.cpp b/libc/src/math/amdgpu/sinhf.cpp new file mode 100644 index 0000000000000..5d3f5ea6b36ae --- /dev/null +++ b/libc/src/math/amdgpu/sinhf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the sinhf function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sinhf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, sinhf, (float x)) { return __ocml_sinh_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/sqrt.cpp b/libc/src/math/amdgpu/sqrt.cpp similarity index 100% rename from libc/src/math/gpu/sqrt.cpp rename to libc/src/math/amdgpu/sqrt.cpp diff --git a/libc/src/math/gpu/sqrtf.cpp b/libc/src/math/amdgpu/sqrtf.cpp similarity index 100% rename from libc/src/math/gpu/sqrtf.cpp rename to libc/src/math/amdgpu/sqrtf.cpp diff --git a/libc/src/math/gpu/tan.cpp b/libc/src/math/amdgpu/tan.cpp similarity index 72% rename from libc/src/math/gpu/tan.cpp rename to libc/src/math/amdgpu/tan.cpp index d02b106356124..6121a9319a2ab 100644 --- a/libc/src/math/gpu/tan.cpp +++ b/libc/src/math/amdgpu/tan.cpp @@ -1,4 +1,4 @@ -//===-- Implementation of the GPU tan function ----------------------------===// +//===-- Implementation of the tan function for GPU ------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -9,8 +9,10 @@ #include "src/math/tan.h" #include "src/__support/common.h" +#include "declarations.h" + namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, tan, (double x)) { return __builtin_tan(x); } +LLVM_LIBC_FUNCTION(double, tan, (double x)) { return __ocml_tan_f64(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/tanf.cpp b/libc/src/math/amdgpu/tanf.cpp new file mode 100644 index 0000000000000..fdd83ee7aeb94 --- /dev/null +++ b/libc/src/math/amdgpu/tanf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the tanf function for GPU -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/tanf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, tanf, (float x)) { return __ocml_tan_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/tanh.cpp b/libc/src/math/amdgpu/tanh.cpp similarity index 71% rename from libc/src/math/gpu/tanh.cpp rename to libc/src/math/amdgpu/tanh.cpp index 778e883acba04..25a9c2954bf39 100644 --- a/libc/src/math/gpu/tanh.cpp +++ b/libc/src/math/amdgpu/tanh.cpp @@ -1,4 +1,4 @@ -//===-- Implementation of the GPU tanh function ---------------------------===// +//===-- Implementation of the tanh function for GPU -----------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -9,8 +9,10 @@ #include "src/math/tanh.h" #include "src/__support/common.h" +#include "declarations.h" + namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, tanh, (double x)) { return __builtin_tanh(x); } +LLVM_LIBC_FUNCTION(double, tanh, (double x)) { return __ocml_tanh_f64(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/tanhf.cpp b/libc/src/math/amdgpu/tanhf.cpp new file mode 100644 index 0000000000000..a4bfd2095eb35 --- /dev/null +++ b/libc/src/math/amdgpu/tanhf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the tanhf function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/tanhf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, tanhf, (float x)) { return __ocml_tanh_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/tgamma.cpp b/libc/src/math/amdgpu/tgamma.cpp new file mode 100644 index 0000000000000..10f58d566f88a --- /dev/null +++ b/libc/src/math/amdgpu/tgamma.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU tgamma function -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/tgamma.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, tgamma, (double x)) { return __ocml_tgamma_f64(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/amdgpu/tgammaf.cpp b/libc/src/math/amdgpu/tgammaf.cpp new file mode 100644 index 0000000000000..e7d22059a7c41 --- /dev/null +++ b/libc/src/math/amdgpu/tgammaf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU tgammaf function ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/tgammaf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, tgammaf, (float x)) { return __ocml_tgamma_f32(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/trunc.cpp b/libc/src/math/amdgpu/trunc.cpp similarity index 100% rename from libc/src/math/gpu/trunc.cpp rename to libc/src/math/amdgpu/trunc.cpp diff --git a/libc/src/math/gpu/truncf.cpp b/libc/src/math/amdgpu/truncf.cpp similarity index 100% rename from libc/src/math/gpu/truncf.cpp rename to libc/src/math/amdgpu/truncf.cpp diff --git a/libc/src/math/gpu/CMakeLists.txt b/libc/src/math/gpu/CMakeLists.txt deleted file mode 100644 index 75a916e2a0112..0000000000000 --- a/libc/src/math/gpu/CMakeLists.txt +++ /dev/null @@ -1,384 +0,0 @@ -# Math functions not yet available in the libc project, or those not yet tuned -# for GPU workloads are provided as wrappers over vendor libraries. If we find -# them ahead of time we will import them statically. 
Otherwise, we will keep -# them as external references and expect them to be resolved by the user when -# they compile. In the future,we will use implementations from the 'libc' -# project and not provide these wrappers. -add_subdirectory(vendor) - -# For the GPU we want to be able to optionally depend on the vendor libraries -# until we have a suitable replacement inside `libc`. -# TODO: We should have an option to enable or disable these on a per-function -# basis. -option(LIBC_GPU_VENDOR_MATH "Use vendor wrappers for GPU math" ON) -function(add_math_entrypoint_gpu_object name) - get_fq_target_name("vendor.${name}" fq_vendor_specific_target_name) - if(TARGET ${fq_vendor_specific_target_name} AND ${LIBC_GPU_VENDOR_MATH}) - return() - endif() - - add_entrypoint_object( - ${name} - ${ARGN} - ) -endfunction() - -add_math_entrypoint_gpu_object( - ceil - SRCS - ceil.cpp - HDRS - ../ceil.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - ceilf - SRCS - ceilf.cpp - HDRS - ../ceilf.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - copysign - SRCS - copysign.cpp - HDRS - ../copysign.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - copysignf - SRCS - copysignf.cpp - HDRS - ../copysignf.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - fabs - SRCS - fabs.cpp - HDRS - ../fabs.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - fabsf - SRCS - fabsf.cpp - HDRS - ../fabsf.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - floor - SRCS - floor.cpp - HDRS - ../floor.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - floorf - SRCS - floorf.cpp - HDRS - ../floorf.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - fma - SRCS - fma.cpp - HDRS - ../fma.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - fmaf - SRCS - fmaf.cpp - HDRS - ../fmaf.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - fmax - SRCS - fmax.cpp - HDRS - ../fmax.h - 
COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - fmaxf - SRCS - fmaxf.cpp - HDRS - ../fmaxf.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - fmin - SRCS - fmin.cpp - HDRS - ../fmin.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - fminf - SRCS - fminf.cpp - HDRS - ../fminf.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - fmod - SRCS - fmod.cpp - HDRS - ../fmod.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - fmodf - SRCS - fmodf.cpp - HDRS - ../fmodf.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - lround - SRCS - lround.cpp - HDRS - ../lround.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - lroundf - SRCS - lroundf.cpp - HDRS - ../lroundf.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - llround - SRCS - llround.cpp - HDRS - ../llround.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - llroundf - SRCS - llroundf.cpp - HDRS - ../llroundf.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - modf - SRCS - modf.cpp - HDRS - ../modf.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - modff - SRCS - modff.cpp - HDRS - ../modff.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - nearbyint - SRCS - nearbyint.cpp - HDRS - ../nearbyint.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - nearbyintf - SRCS - nearbyintf.cpp - HDRS - ../nearbyintf.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - remainder - SRCS - remainder.cpp - HDRS - ../remainder.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - remainderf - SRCS - remainderf.cpp - HDRS - ../remainderf.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - rint - SRCS - rint.cpp - HDRS - ../rint.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - rintf - SRCS - rintf.cpp - HDRS - ../rintf.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - round - SRCS - 
round.cpp - HDRS - ../round.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - sinh - SRCS - sinh.cpp - HDRS - ../sinh.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - sqrt - SRCS - sqrt.cpp - HDRS - ../sqrt.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - sqrtf - SRCS - sqrtf.cpp - HDRS - ../sqrtf.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - tan - SRCS - tan.cpp - HDRS - ../tan.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - tanh - SRCS - tanh.cpp - HDRS - ../tanh.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - trunc - SRCS - trunc.cpp - HDRS - ../trunc.h - COMPILE_OPTIONS - -O2 -) - -add_math_entrypoint_gpu_object( - truncf - SRCS - truncf.cpp - HDRS - ../truncf.h - COMPILE_OPTIONS - -O2 -) diff --git a/libc/src/math/gpu/vendor/amdgpu/amdgpu.h b/libc/src/math/gpu/vendor/amdgpu/amdgpu.h deleted file mode 100644 index 43961fc75982a..0000000000000 --- a/libc/src/math/gpu/vendor/amdgpu/amdgpu.h +++ /dev/null @@ -1,127 +0,0 @@ -//===-- AMDGPU specific definitions for math support ----------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_MATH_GPU_AMDGPU_H -#define LLVM_LIBC_SRC_MATH_GPU_AMDGPU_H - -#include "declarations.h" -#include "platform.h" - -#include "src/__support/macros/attributes.h" - -namespace LIBC_NAMESPACE { -namespace internal { -LIBC_INLINE double acos(double x) { return __ocml_acos_f64(x); } -LIBC_INLINE float acosf(float x) { return __ocml_acos_f32(x); } -LIBC_INLINE double acosh(double x) { return __ocml_acosh_f64(x); } -LIBC_INLINE float acoshf(float x) { return __ocml_acosh_f32(x); } -LIBC_INLINE double asin(double x) { return __ocml_asin_f64(x); } -LIBC_INLINE float asinf(float x) { return __ocml_asin_f32(x); } -LIBC_INLINE double asinh(double x) { return __ocml_asinh_f64(x); } -LIBC_INLINE float asinhf(float x) { return __ocml_asinh_f32(x); } -LIBC_INLINE double atan(double x) { return __ocml_atan_f64(x); } -LIBC_INLINE float atanf(float x) { return __ocml_atan_f32(x); } -LIBC_INLINE double atan2(double x, double y) { return __ocml_atan2_f64(x, y); } -LIBC_INLINE float atan2f(float x, float y) { return __ocml_atan2_f32(x, y); } -LIBC_INLINE double atanh(double x) { return __ocml_atanh_f64(x); } -LIBC_INLINE float atanhf(float x) { return __ocml_atanh_f32(x); } -LIBC_INLINE double cos(double x) { return __ocml_cos_f64(x); } -LIBC_INLINE float cosf(float x) { return __ocml_cos_f32(x); } -LIBC_INLINE double cosh(double x) { return __ocml_cosh_f64(x); } -LIBC_INLINE float coshf(float x) { return __ocml_cosh_f32(x); } -LIBC_INLINE double erf(double x) { return __ocml_erf_f64(x); } -LIBC_INLINE float erff(float x) { return __ocml_erf_f32(x); } -LIBC_INLINE double exp(double x) { return __builtin_exp(x); } -LIBC_INLINE float expf(float x) { return __builtin_expf(x); } -LIBC_INLINE double exp2(double x) { return __ocml_exp2_f64(x); } -LIBC_INLINE float exp2f(float x) { return __ocml_exp2_f32(x); } -LIBC_INLINE 
double exp10(double x) { return __ocml_exp10_f64(x); } -LIBC_INLINE float exp10f(float x) { return __ocml_exp10_f32(x); } -LIBC_INLINE double expm1(double x) { return __ocml_expm1_f64(x); } -LIBC_INLINE float expm1f(float x) { return __ocml_expm1_f32(x); } -LIBC_INLINE double fdim(double x, double y) { return __ocml_fdim_f64(x, y); } -LIBC_INLINE float fdimf(float x, float y) { return __ocml_fdim_f32(x, y); } -LIBC_INLINE double hypot(double x, double y) { return __ocml_hypot_f64(x, y); } -LIBC_INLINE float hypotf(float x, float y) { return __ocml_hypot_f32(x, y); } -LIBC_INLINE int ilogb(double x) { return __ocml_ilogb_f64(x); } -LIBC_INLINE int ilogbf(float x) { return __ocml_ilogb_f32(x); } -LIBC_INLINE double ldexp(double x, int i) { return __builtin_ldexp(x, i); } -LIBC_INLINE float ldexpf(float x, int i) { return __builtin_ldexpf(x, i); } -LIBC_INLINE long long llrint(double x) { - return static_cast(__builtin_rint(x)); -} -LIBC_INLINE long long llrintf(float x) { - return static_cast(__builtin_rintf(x)); -} -LIBC_INLINE double log10(double x) { return __ocml_log10_f64(x); } -LIBC_INLINE float log10f(float x) { return __ocml_log10_f32(x); } -LIBC_INLINE double log1p(double x) { return __ocml_log1p_f64(x); } -LIBC_INLINE float log1pf(float x) { return __ocml_log1p_f32(x); } -LIBC_INLINE double log2(double x) { return __ocml_log2_f64(x); } -LIBC_INLINE float log2f(float x) { return __ocml_log2_f32(x); } -LIBC_INLINE double log(double x) { return __ocml_log_f64(x); } -LIBC_INLINE float logf(float x) { return __ocml_log_f32(x); } -LIBC_INLINE long lrint(double x) { - return static_cast(__builtin_rint(x)); -} -LIBC_INLINE long lrintf(float x) { - return static_cast(__builtin_rintf(x)); -} -LIBC_INLINE double nextafter(double x, double y) { - return __ocml_nextafter_f64(x, y); -} -LIBC_INLINE float nextafterf(float x, float y) { - return __ocml_nextafter_f32(x, y); -} -LIBC_INLINE double pow(double x, double y) { return __ocml_pow_f64(x, y); } -LIBC_INLINE float 
powf(float x, float y) { return __ocml_pow_f32(x, y); } -LIBC_INLINE double sin(double x) { return __ocml_sin_f64(x); } -LIBC_INLINE float sinf(float x) { return __ocml_sin_f32(x); } -LIBC_INLINE void sincos(double x, double *sinptr, double *cosptr) { - *sinptr = __ocml_sincos_f64(x, cosptr); -} -LIBC_INLINE void sincosf(float x, float *sinptr, float *cosptr) { - *sinptr = __ocml_sincos_f32(x, cosptr); -} -LIBC_INLINE double sinh(double x) { return __ocml_sinh_f64(x); } -LIBC_INLINE float sinhf(float x) { return __ocml_sinh_f32(x); } -LIBC_INLINE double tan(double x) { return __ocml_tan_f64(x); } -LIBC_INLINE float tanf(float x) { return __ocml_tan_f32(x); } -LIBC_INLINE double tanh(double x) { return __ocml_tanh_f64(x); } -LIBC_INLINE float tanhf(float x) { return __ocml_tanh_f32(x); } -LIBC_INLINE double scalbn(double x, int i) { - return __builtin_amdgcn_ldexp(x, i); -} -LIBC_INLINE float scalbnf(float x, int i) { - return __builtin_amdgcn_ldexpf(x, i); -} -LIBC_INLINE double frexp(double x, int *nptr) { - return __builtin_frexp(x, nptr); -} -LIBC_INLINE float frexpf(float x, int *nptr) { - return __builtin_frexpf(x, nptr); -} -LIBC_INLINE double remquo(double x, double y, int *q) { - int tmp; - double r = __ocml_remquo_f64(x, y, (gpu::Private *)&tmp); - *q = tmp; - return r; -} -LIBC_INLINE float remquof(float x, float y, int *q) { - int tmp; - float r = __ocml_remquo_f32(x, y, (gpu::Private *)&tmp); - *q = tmp; - return r; -} -LIBC_INLINE double tgamma(double x) { return __ocml_tgamma_f64(x); } -LIBC_INLINE float tgammaf(float x) { return __ocml_tgamma_f32(x); } - -} // namespace internal -} // namespace LIBC_NAMESPACE - -#endif // LLVM_LIBC_SRC_MATH_GPU_AMDGPU_H diff --git a/libc/src/math/gpu/vendor/common.h b/libc/src/math/gpu/vendor/common.h deleted file mode 100644 index 041a9a01c30e9..0000000000000 --- a/libc/src/math/gpu/vendor/common.h +++ /dev/null @@ -1,22 +0,0 @@ -//===-- Common interface for compiling the GPU math -----------------------===// -// 
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_MATH_GPU_COMMON_H -#define LLVM_LIBC_SRC_MATH_GPU_COMMON_H - -#include "src/__support/macros/properties/architectures.h" - -#if defined(LIBC_TARGET_ARCH_IS_AMDGPU) -#include "amdgpu/amdgpu.h" -#elif defined(LIBC_TARGET_ARCH_IS_NVPTX) -#include "nvptx/nvptx.h" -#else -#error "Unsupported platform" -#endif - -#endif // LLVM_LIBC_SRC_MATH_GPU_COMMON_H diff --git a/libc/src/math/nvptx/CMakeLists.txt b/libc/src/math/nvptx/CMakeLists.txt new file mode 100644 index 0000000000000..194e1fa7af491 --- /dev/null +++ b/libc/src/math/nvptx/CMakeLists.txt @@ -0,0 +1,1179 @@ +# Math functions not yet available in the libc project, or those not yet tuned +# for GPU workloads are provided as wrappers over vendor libraries. If we find +# them ahead of time we will import them statically. Otherwise, we will keep +# them as external references and expect them to be resolved by the user when +# they compile. In the future,we will use implementations from the 'libc' +# project and not provide these wrappers. +if(CUDAToolkit_FOUND) + set(libdevice_path ${CUDAToolkit_BIN_DIR}/../nvvm/libdevice/libdevice.10.bc) + if (EXISTS ${libdevice_path}) + message(STATUS "Found the CUDA device library. Implementations falling back " + "to the vendor libraries will be resolved statically.") + set(bitcode_link_flags + "SHELL:-Xclang -mlink-builtin-bitcode -Xclang ${libdevice_path}") + endif() +else() + message(STATUS "Could not find the CUDA device library. 
Unimplemented " + "functions will be an external reference to the vendor libraries.") +endif() + +add_entrypoint_object( + ceil + SRCS + ceil.cpp + HDRS + ../ceil.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + ceilf + SRCS + ceilf.cpp + HDRS + ../ceilf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + copysign + SRCS + copysign.cpp + HDRS + ../copysign.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + copysignf + SRCS + copysignf.cpp + HDRS + ../copysignf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fabs + SRCS + fabs.cpp + HDRS + ../fabs.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fabsf + SRCS + fabsf.cpp + HDRS + ../fabsf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + floor + SRCS + floor.cpp + HDRS + ../floor.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + floorf + SRCS + floorf.cpp + HDRS + ../floorf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fma + SRCS + fma.cpp + HDRS + ../fma.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fmaf + SRCS + fmaf.cpp + HDRS + ../fmaf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fmax + SRCS + fmax.cpp + HDRS + ../fmax.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fmaxf + SRCS + fmaxf.cpp + HDRS + ../fmaxf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fmin + SRCS + fmin.cpp + HDRS + ../fmin.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fminf + SRCS + fminf.cpp + HDRS + ../fminf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fmod + SRCS + fmod.cpp + HDRS + ../fmod.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fmodf + SRCS + fmodf.cpp + HDRS + ../fmodf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + lround + SRCS + lround.cpp + HDRS + ../lround.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + lroundf + SRCS + lroundf.cpp + HDRS + ../lroundf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + llround + SRCS + llround.cpp + HDRS + ../llround.h + 
COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + llroundf + SRCS + llroundf.cpp + HDRS + ../llroundf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + modf + SRCS + modf.cpp + HDRS + ../modf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + modff + SRCS + modff.cpp + HDRS + ../modff.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + nearbyint + SRCS + nearbyint.cpp + HDRS + ../nearbyint.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + nearbyintf + SRCS + nearbyintf.cpp + HDRS + ../nearbyintf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + remainder + SRCS + remainder.cpp + HDRS + ../remainder.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + remainderf + SRCS + remainderf.cpp + HDRS + ../remainderf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + rint + SRCS + rint.cpp + HDRS + ../rint.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + rintf + SRCS + rintf.cpp + HDRS + ../rintf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + round + SRCS + round.cpp + HDRS + ../round.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + sqrt + SRCS + sqrt.cpp + HDRS + ../sqrt.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + sqrtf + SRCS + sqrtf.cpp + HDRS + ../sqrtf.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + trunc + SRCS + trunc.cpp + HDRS + ../trunc.h + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + truncf + SRCS + truncf.cpp + HDRS + ../truncf.h + COMPILE_OPTIONS + -O2 +) + +# The following functions currently are not implemented natively and borrow from +# existing implementations. This will be removed in the future. 
+add_entrypoint_object( + acos + SRCS + acos.cpp + HDRS + ../acos.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + acosf + SRCS + acosf.cpp + HDRS + ../acosf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + acosh + SRCS + acosh.cpp + HDRS + ../acosh.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + acoshf + SRCS + acoshf.cpp + HDRS + ../acoshf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + asin + SRCS + asin.cpp + HDRS + ../asin.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + asinf + SRCS + asinf.cpp + HDRS + ../asinf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + asinh + SRCS + asinh.cpp + HDRS + ../asinh.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + atan + SRCS + atan.cpp + HDRS + ../atan.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + atanf + SRCS + atanf.cpp + HDRS + ../atanf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + atan2 + SRCS + atan2.cpp + HDRS + ../atan2.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + atan2f + SRCS + atan2f.cpp + HDRS + ../atan2f.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + atanh + SRCS + atanh.cpp + HDRS + ../atanh.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + atanhf + SRCS + atanhf.cpp + HDRS + ../atanhf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + cos + SRCS + cos.cpp + HDRS + ../cos.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + cosf + SRCS + cosf.cpp + HDRS + ../cosf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + 
+add_entrypoint_object( + cosh + SRCS + cosh.cpp + HDRS + ../cosh.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + coshf + SRCS + coshf.cpp + HDRS + ../coshf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + erf + SRCS + erf.cpp + HDRS + ../erf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + erff + SRCS + erff.cpp + HDRS + ../erff.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + exp + SRCS + exp.cpp + HDRS + ../exp.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + exp10 + SRCS + exp10.cpp + HDRS + ../exp10.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + exp10f + SRCS + exp10f.cpp + HDRS + ../exp10f.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + exp2 + SRCS + exp2.cpp + HDRS + ../exp2.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + exp2f + SRCS + exp2f.cpp + HDRS + ../exp2f.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + expf + SRCS + expf.cpp + HDRS + ../expf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + expm1 + SRCS + expm1.cpp + HDRS + ../expm1.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + expm1f + SRCS + expm1f.cpp + HDRS + ../expm1f.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + fdim + SRCS + fdim.cpp + HDRS + ../fdim.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + fdimf + SRCS + fdimf.cpp + HDRS + ../fdimf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + hypot + SRCS + hypot.cpp + HDRS + ../hypot.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + 
hypotf + SRCS + hypotf.cpp + HDRS + ../hypotf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + ilogb + SRCS + ilogb.cpp + HDRS + ../ilogb.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + ilogbf + SRCS + ilogbf.cpp + HDRS + ../ilogbf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + log10 + SRCS + log10.cpp + HDRS + ../log10.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + log10f + SRCS + log10f.cpp + HDRS + ../log10f.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + log2 + SRCS + log2.cpp + HDRS + ../log2.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + log2f + SRCS + log2f.cpp + HDRS + ../log2f.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + log + SRCS + log.cpp + HDRS + ../log.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + logf + SRCS + logf.cpp + HDRS + ../logf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + lrint + SRCS + lrint.cpp + HDRS + ../lrint.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + lrintf + SRCS + lrintf.cpp + HDRS + ../lrintf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + ldexp + SRCS + ldexp.cpp + HDRS + ../ldexp.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + ldexpf + SRCS + ldexpf.cpp + HDRS + ../ldexpf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + log1p + SRCS + log1p.cpp + HDRS + ../log1p.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + log1pf + SRCS + log1pf.cpp + HDRS + ../log1pf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( 
+ llrint + SRCS + llrint.cpp + HDRS + ../llrint.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + llrintf + SRCS + llrintf.cpp + HDRS + ../llrintf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + remquo + SRCS + remquo.cpp + HDRS + ../remquo.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + remquof + SRCS + remquof.cpp + HDRS + ../remquof.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + scalbn + SRCS + scalbn.cpp + HDRS + ../scalbn.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + scalbnf + SRCS + scalbnf.cpp + HDRS + ../scalbnf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + + +add_entrypoint_object( + nextafter + SRCS + nextafter.cpp + HDRS + ../nextafter.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + nextafterf + SRCS + nextafterf.cpp + HDRS + ../nextafterf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + pow + SRCS + pow.cpp + HDRS + ../pow.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + powf + SRCS + powf.cpp + HDRS + ../powf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + sin + SRCS + sin.cpp + HDRS + ../sin.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + sinf + SRCS + sinf.cpp + HDRS + ../sinf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + sincos + SRCS + sincos.cpp + HDRS + ../sincos.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + sincosf + SRCS + sincosf.cpp + HDRS + ../sincosf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + sinh + SRCS + sinh.cpp + HDRS + ../sinh.h + COMPILE_OPTIONS + ${bitcode_link_flags} + 
-O2 + VENDOR +) + +add_entrypoint_object( + sinhf + SRCS + sinhf.cpp + HDRS + ../sinhf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + tan + SRCS + tan.cpp + HDRS + ../tan.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + tanf + SRCS + tanf.cpp + HDRS + ../tanf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + tanh + SRCS + tanh.cpp + HDRS + ../tanh.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + tanhf + SRCS + tanhf.cpp + HDRS + ../tanhf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + tgamma + SRCS + tgamma.cpp + HDRS + ../tgamma.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + tgammaf + SRCS + tgammaf.cpp + HDRS + ../tgammaf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + frexp + SRCS + frexp.cpp + HDRS + ../frexp.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) + +add_entrypoint_object( + frexpf + SRCS + frexpf.cpp + HDRS + ../frexpf.h + COMPILE_OPTIONS + ${bitcode_link_flags} + -O2 + VENDOR +) diff --git a/libc/src/math/gpu/vendor/acos.cpp b/libc/src/math/nvptx/acos.cpp similarity index 83% rename from libc/src/math/gpu/vendor/acos.cpp rename to libc/src/math/nvptx/acos.cpp index 83b674fa6ae3b..da2c7952feba5 100644 --- a/libc/src/math/gpu/vendor/acos.cpp +++ b/libc/src/math/nvptx/acos.cpp @@ -9,10 +9,10 @@ #include "src/math/acos.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, acos, (double x)) { return internal::acos(x); } +LLVM_LIBC_FUNCTION(double, acos, (double x)) { return __nv_acos(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/acosf.cpp b/libc/src/math/nvptx/acosf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/acosf.cpp rename 
to libc/src/math/nvptx/acosf.cpp index ac629761a4398..8a4125f03b8cd 100644 --- a/libc/src/math/gpu/vendor/acosf.cpp +++ b/libc/src/math/nvptx/acosf.cpp @@ -9,10 +9,10 @@ #include "src/math/acosf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, acosf, (float x)) { return internal::acosf(x); } +LLVM_LIBC_FUNCTION(float, acosf, (float x)) { return __nv_acosf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/acosh.cpp b/libc/src/math/nvptx/acosh.cpp similarity index 83% rename from libc/src/math/gpu/vendor/acosh.cpp rename to libc/src/math/nvptx/acosh.cpp index cc1b8b572b3db..06f6e2922f569 100644 --- a/libc/src/math/gpu/vendor/acosh.cpp +++ b/libc/src/math/nvptx/acosh.cpp @@ -9,10 +9,10 @@ #include "src/math/acosh.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, acosh, (double x)) { return internal::acosh(x); } +LLVM_LIBC_FUNCTION(double, acosh, (double x)) { return __nv_acosh(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/acoshf.cpp b/libc/src/math/nvptx/acoshf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/acoshf.cpp rename to libc/src/math/nvptx/acoshf.cpp index a0384f89eed3a..00e8053a50783 100644 --- a/libc/src/math/gpu/vendor/acoshf.cpp +++ b/libc/src/math/nvptx/acoshf.cpp @@ -9,10 +9,10 @@ #include "src/math/acoshf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, acoshf, (float x)) { return internal::acoshf(x); } +LLVM_LIBC_FUNCTION(float, acoshf, (float x)) { return __nv_acoshf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/asin.cpp b/libc/src/math/nvptx/asin.cpp similarity index 83% rename from libc/src/math/gpu/vendor/asin.cpp rename to libc/src/math/nvptx/asin.cpp index 24a8a136e88eb..74d92fded72b9 
100644 --- a/libc/src/math/gpu/vendor/asin.cpp +++ b/libc/src/math/nvptx/asin.cpp @@ -9,10 +9,10 @@ #include "src/math/asin.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, asin, (double x)) { return internal::asin(x); } +LLVM_LIBC_FUNCTION(double, asin, (double x)) { return __nv_asin(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/asinf.cpp b/libc/src/math/nvptx/asinf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/asinf.cpp rename to libc/src/math/nvptx/asinf.cpp index 595a48f82744d..30544bc1313d6 100644 --- a/libc/src/math/gpu/vendor/asinf.cpp +++ b/libc/src/math/nvptx/asinf.cpp @@ -9,10 +9,10 @@ #include "src/math/asinf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, asinf, (float x)) { return internal::asinf(x); } +LLVM_LIBC_FUNCTION(float, asinf, (float x)) { return __nv_asinf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/asinh.cpp b/libc/src/math/nvptx/asinh.cpp similarity index 83% rename from libc/src/math/gpu/vendor/asinh.cpp rename to libc/src/math/nvptx/asinh.cpp index f417d9fd8c1f7..0e5dbb47e667e 100644 --- a/libc/src/math/gpu/vendor/asinh.cpp +++ b/libc/src/math/nvptx/asinh.cpp @@ -9,10 +9,10 @@ #include "src/math/asinh.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, asinh, (double x)) { return internal::asinh(x); } +LLVM_LIBC_FUNCTION(double, asinh, (double x)) { return __nv_asinh(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/asinhf.cpp b/libc/src/math/nvptx/asinhf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/asinhf.cpp rename to libc/src/math/nvptx/asinhf.cpp index 78e5543cf3665..6648108646cd3 100644 --- a/libc/src/math/gpu/vendor/asinhf.cpp +++ 
b/libc/src/math/nvptx/asinhf.cpp @@ -9,10 +9,10 @@ #include "src/math/asinhf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, asinhf, (float x)) { return internal::asinhf(x); } +LLVM_LIBC_FUNCTION(float, asinhf, (float x)) { return __nv_asinhf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/atan.cpp b/libc/src/math/nvptx/atan.cpp similarity index 83% rename from libc/src/math/gpu/vendor/atan.cpp rename to libc/src/math/nvptx/atan.cpp index 45d7f02c02cd6..3af793a53ae5e 100644 --- a/libc/src/math/gpu/vendor/atan.cpp +++ b/libc/src/math/nvptx/atan.cpp @@ -9,10 +9,10 @@ #include "src/math/atan.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, atan, (double x)) { return internal::atan(x); } +LLVM_LIBC_FUNCTION(double, atan, (double x)) { return __nv_atan(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/atan2.cpp b/libc/src/math/nvptx/atan2.cpp similarity index 91% rename from libc/src/math/gpu/vendor/atan2.cpp rename to libc/src/math/nvptx/atan2.cpp index 94e215e52ca60..0c54e0e048996 100644 --- a/libc/src/math/gpu/vendor/atan2.cpp +++ b/libc/src/math/nvptx/atan2.cpp @@ -9,12 +9,12 @@ #include "src/math/atan2.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(double, atan2, (double x, double y)) { - return internal::atan2(x, y); + return __nv_atan2(x, y); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/atan2f.cpp b/libc/src/math/nvptx/atan2f.cpp similarity index 91% rename from libc/src/math/gpu/vendor/atan2f.cpp rename to libc/src/math/nvptx/atan2f.cpp index 70caa568e32d9..c3327d92c97e9 100644 --- a/libc/src/math/gpu/vendor/atan2f.cpp +++ b/libc/src/math/nvptx/atan2f.cpp @@ -9,12 +9,12 @@ #include "src/math/atan2f.h" #include 
"src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, atan2f, (float x, float y)) { - return internal::atan2f(x, y); + return __nv_atan2f(x, y); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/atanf.cpp b/libc/src/math/nvptx/atanf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/atanf.cpp rename to libc/src/math/nvptx/atanf.cpp index 132c43d9e3af7..5595262977323 100644 --- a/libc/src/math/gpu/vendor/atanf.cpp +++ b/libc/src/math/nvptx/atanf.cpp @@ -9,10 +9,10 @@ #include "src/math/atanf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, atanf, (float x)) { return internal::atanf(x); } +LLVM_LIBC_FUNCTION(float, atanf, (float x)) { return __nv_atanf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/atanh.cpp b/libc/src/math/nvptx/atanh.cpp similarity index 83% rename from libc/src/math/gpu/vendor/atanh.cpp rename to libc/src/math/nvptx/atanh.cpp index 07a75fcbbfc7c..6699d959df187 100644 --- a/libc/src/math/gpu/vendor/atanh.cpp +++ b/libc/src/math/nvptx/atanh.cpp @@ -9,10 +9,10 @@ #include "src/math/atanh.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, atanh, (double x)) { return internal::atanh(x); } +LLVM_LIBC_FUNCTION(double, atanh, (double x)) { return __nv_atanh(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/atanhf.cpp b/libc/src/math/nvptx/atanhf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/atanhf.cpp rename to libc/src/math/nvptx/atanhf.cpp index 521c4133243d9..526b7b3e37122 100644 --- a/libc/src/math/gpu/vendor/atanhf.cpp +++ b/libc/src/math/nvptx/atanhf.cpp @@ -9,10 +9,10 @@ #include "src/math/atanhf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { 
-LLVM_LIBC_FUNCTION(float, atanhf, (float x)) { return internal::atanhf(x); } +LLVM_LIBC_FUNCTION(float, atanhf, (float x)) { return __nv_atanhf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/ceil.cpp b/libc/src/math/nvptx/ceil.cpp new file mode 100644 index 0000000000000..ad1407d61f620 --- /dev/null +++ b/libc/src/math/nvptx/ceil.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of the ceil function for GPU -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/ceil.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, ceil, (double x)) { return __builtin_ceil(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/ceilf.cpp b/libc/src/math/nvptx/ceilf.cpp new file mode 100644 index 0000000000000..c4fc58d936038 --- /dev/null +++ b/libc/src/math/nvptx/ceilf.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of the ceilf function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/ceilf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, ceilf, (float x)) { return __builtin_ceilf(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/copysign.cpp b/libc/src/math/nvptx/copysign.cpp new file mode 100644 index 0000000000000..6f804bdb90a1f --- /dev/null +++ b/libc/src/math/nvptx/copysign.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the copysign function for GPU -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/copysign.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, copysign, (double x, double y)) { + return __builtin_copysign(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/copysignf.cpp b/libc/src/math/nvptx/copysignf.cpp new file mode 100644 index 0000000000000..4d7e132462ac9 --- /dev/null +++ b/libc/src/math/nvptx/copysignf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the copysignf function for GPU ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/copysignf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, copysignf, (float x, float y)) { + return __builtin_copysignf(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/cos.cpp b/libc/src/math/nvptx/cos.cpp similarity index 83% rename from libc/src/math/gpu/vendor/cos.cpp rename to libc/src/math/nvptx/cos.cpp index 37c7507911ec2..185ad3cf92159 100644 --- a/libc/src/math/gpu/vendor/cos.cpp +++ b/libc/src/math/nvptx/cos.cpp @@ -9,10 +9,10 @@ #include "src/math/cos.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, cos, (double x)) { return internal::cos(x); } +LLVM_LIBC_FUNCTION(double, cos, (double x)) { return __nv_cos(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/cosf.cpp b/libc/src/math/nvptx/cosf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/cosf.cpp rename to libc/src/math/nvptx/cosf.cpp index 1bd42ba37e92d..3d34de4be51bc 100644 --- a/libc/src/math/gpu/vendor/cosf.cpp +++ b/libc/src/math/nvptx/cosf.cpp @@ -9,10 +9,10 @@ #include "src/math/cosf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, cosf, (float x)) { return internal::cosf(x); } +LLVM_LIBC_FUNCTION(float, cosf, (float x)) { return __nv_cosf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/cosh.cpp b/libc/src/math/nvptx/cosh.cpp similarity index 83% rename from libc/src/math/gpu/vendor/cosh.cpp rename to libc/src/math/nvptx/cosh.cpp index 3be05e58b51d9..179864c5f910d 100644 --- a/libc/src/math/gpu/vendor/cosh.cpp +++ b/libc/src/math/nvptx/cosh.cpp @@ -9,10 +9,10 @@ #include "src/math/cosh.h" #include 
"src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, cosh, (double x)) { return internal::cosh(x); } +LLVM_LIBC_FUNCTION(double, cosh, (double x)) { return __nv_cosh(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/coshf.cpp b/libc/src/math/nvptx/coshf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/coshf.cpp rename to libc/src/math/nvptx/coshf.cpp index 1b945bbd7c47e..9147499db97cf 100644 --- a/libc/src/math/gpu/vendor/coshf.cpp +++ b/libc/src/math/nvptx/coshf.cpp @@ -9,10 +9,10 @@ #include "src/math/coshf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, coshf, (float x)) { return internal::coshf(x); } +LLVM_LIBC_FUNCTION(float, coshf, (float x)) { return __nv_coshf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/nvptx/declarations.h b/libc/src/math/nvptx/declarations.h similarity index 100% rename from libc/src/math/gpu/vendor/nvptx/declarations.h rename to libc/src/math/nvptx/declarations.h diff --git a/libc/src/math/gpu/vendor/erf.cpp b/libc/src/math/nvptx/erf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/erf.cpp rename to libc/src/math/nvptx/erf.cpp index 190321ca25992..5ea0177d5cd34 100644 --- a/libc/src/math/gpu/vendor/erf.cpp +++ b/libc/src/math/nvptx/erf.cpp @@ -9,10 +9,10 @@ #include "src/math/erf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, erf, (double x)) { return internal::erf(x); } +LLVM_LIBC_FUNCTION(double, erf, (double x)) { return __nv_erf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/erff.cpp b/libc/src/math/nvptx/erff.cpp similarity index 83% rename from libc/src/math/gpu/vendor/erff.cpp rename to libc/src/math/nvptx/erff.cpp index a5a08be54b075..03fdceace8e9e 100644 --- 
a/libc/src/math/gpu/vendor/erff.cpp +++ b/libc/src/math/nvptx/erff.cpp @@ -9,10 +9,10 @@ #include "src/math/erff.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, erff, (float x)) { return internal::erff(x); } +LLVM_LIBC_FUNCTION(float, erff, (float x)) { return __nv_erff(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/exp.cpp b/libc/src/math/nvptx/exp.cpp new file mode 100644 index 0000000000000..6bbe87ba2e783 --- /dev/null +++ b/libc/src/math/nvptx/exp.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU exp function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/exp.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, exp, (double x)) { return __nv_exp(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/exp10.cpp b/libc/src/math/nvptx/exp10.cpp similarity index 83% rename from libc/src/math/gpu/vendor/exp10.cpp rename to libc/src/math/nvptx/exp10.cpp index 8557a33f01884..11bb734fd1131 100644 --- a/libc/src/math/gpu/vendor/exp10.cpp +++ b/libc/src/math/nvptx/exp10.cpp @@ -9,10 +9,10 @@ #include "src/math/exp10.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, exp10, (double x)) { return internal::exp10(x); } +LLVM_LIBC_FUNCTION(double, exp10, (double x)) { return __nv_exp10(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/exp10f.cpp b/libc/src/math/nvptx/exp10f.cpp similarity index 83% rename from libc/src/math/gpu/vendor/exp10f.cpp rename to 
libc/src/math/nvptx/exp10f.cpp index 844809355087b..4e3121a0b46e2 100644 --- a/libc/src/math/gpu/vendor/exp10f.cpp +++ b/libc/src/math/nvptx/exp10f.cpp @@ -9,10 +9,10 @@ #include "src/math/exp10f.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, exp10f, (float x)) { return internal::exp10f(x); } +LLVM_LIBC_FUNCTION(float, exp10f, (float x)) { return __nv_exp10f(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/exp2.cpp b/libc/src/math/nvptx/exp2.cpp similarity index 83% rename from libc/src/math/gpu/vendor/exp2.cpp rename to libc/src/math/nvptx/exp2.cpp index ffa23d810a9b1..35fc27b3a26a6 100644 --- a/libc/src/math/gpu/vendor/exp2.cpp +++ b/libc/src/math/nvptx/exp2.cpp @@ -9,10 +9,10 @@ #include "src/math/exp2.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, exp2, (double x)) { return internal::exp2(x); } +LLVM_LIBC_FUNCTION(double, exp2, (double x)) { return __nv_exp2(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/exp2f.cpp b/libc/src/math/nvptx/exp2f.cpp similarity index 83% rename from libc/src/math/gpu/vendor/exp2f.cpp rename to libc/src/math/nvptx/exp2f.cpp index cb61557383dfa..8d137346fe00e 100644 --- a/libc/src/math/gpu/vendor/exp2f.cpp +++ b/libc/src/math/nvptx/exp2f.cpp @@ -9,10 +9,10 @@ #include "src/math/exp2f.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, exp2f, (float x)) { return internal::exp2f(x); } +LLVM_LIBC_FUNCTION(float, exp2f, (float x)) { return __nv_exp2f(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/expf.cpp b/libc/src/math/nvptx/expf.cpp new file mode 100644 index 0000000000000..a6362bd734611 --- /dev/null +++ b/libc/src/math/nvptx/expf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the expf function 
for GPU -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/expf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, expf, (float x)) { return __nv_expf(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/expm1.cpp b/libc/src/math/nvptx/expm1.cpp similarity index 83% rename from libc/src/math/gpu/vendor/expm1.cpp rename to libc/src/math/nvptx/expm1.cpp index 6ac5f753b9e4d..0331903b8fd86 100644 --- a/libc/src/math/gpu/vendor/expm1.cpp +++ b/libc/src/math/nvptx/expm1.cpp @@ -9,10 +9,10 @@ #include "src/math/expm1.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, expm1, (double x)) { return internal::expm1(x); } +LLVM_LIBC_FUNCTION(double, expm1, (double x)) { return __nv_expm1(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/expm1f.cpp b/libc/src/math/nvptx/expm1f.cpp similarity index 83% rename from libc/src/math/gpu/vendor/expm1f.cpp rename to libc/src/math/nvptx/expm1f.cpp index c5497797dbe1b..7b74c548f3df6 100644 --- a/libc/src/math/gpu/vendor/expm1f.cpp +++ b/libc/src/math/nvptx/expm1f.cpp @@ -9,10 +9,10 @@ #include "src/math/expm1f.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { return internal::expm1f(x); } +LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { return __nv_expm1f(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/fabs.cpp b/libc/src/math/nvptx/fabs.cpp new file mode 100644 index 0000000000000..c0d063d50ae53 --- /dev/null +++ 
b/libc/src/math/nvptx/fabs.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of the fabs function for GPU -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fabs.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, fabs, (double x)) { return __builtin_fabs(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/fabsf.cpp b/libc/src/math/nvptx/fabsf.cpp new file mode 100644 index 0000000000000..398ffd0c74c07 --- /dev/null +++ b/libc/src/math/nvptx/fabsf.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of the fabsf function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fabsf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, fabsf, (float x)) { return __builtin_fabsf(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/fdim.cpp b/libc/src/math/nvptx/fdim.cpp similarity index 91% rename from libc/src/math/gpu/vendor/fdim.cpp rename to libc/src/math/nvptx/fdim.cpp index f30dafb46e548..2f1ff51800266 100644 --- a/libc/src/math/gpu/vendor/fdim.cpp +++ b/libc/src/math/nvptx/fdim.cpp @@ -9,12 +9,12 @@ #include "src/math/fdim.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(double, fdim, (double x, double y)) { - return internal::fdim(x, y); + return __nv_fdim(x, y); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/fdimf.cpp b/libc/src/math/nvptx/fdimf.cpp similarity index 91% rename from libc/src/math/gpu/vendor/fdimf.cpp rename to libc/src/math/nvptx/fdimf.cpp index e30736206a9a9..c24e6be72482a 100644 --- a/libc/src/math/gpu/vendor/fdimf.cpp +++ b/libc/src/math/nvptx/fdimf.cpp @@ -9,12 +9,12 @@ #include "src/math/fdimf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, fdimf, (float x, float y)) { - return internal::fdimf(x, y); + return __nv_fdimf(x, y); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/floor.cpp b/libc/src/math/nvptx/floor.cpp new file mode 100644 index 0000000000000..eada89c178d75 --- /dev/null +++ b/libc/src/math/nvptx/floor.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of the floor function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/floor.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, floor, (double x)) { return __builtin_floor(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/floorf.cpp b/libc/src/math/nvptx/floorf.cpp new file mode 100644 index 0000000000000..a5611c515a88d --- /dev/null +++ b/libc/src/math/nvptx/floorf.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of the floorf function for GPU ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/floorf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, floorf, (float x)) { return __builtin_floorf(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/fma.cpp b/libc/src/math/nvptx/fma.cpp new file mode 100644 index 0000000000000..41a6ddf60dbc3 --- /dev/null +++ b/libc/src/math/nvptx/fma.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the fma function for GPU ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fma.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, fma, (double x, double y, double z)) { + return __builtin_fma(x, y, z); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/fmaf.cpp b/libc/src/math/nvptx/fmaf.cpp new file mode 100644 index 0000000000000..c948e32f77eb9 --- /dev/null +++ b/libc/src/math/nvptx/fmaf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the fmaf function for GPU -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fmaf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, fmaf, (float x, float y, float z)) { + return __builtin_fmaf(x, y, z); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/fmax.cpp b/libc/src/math/nvptx/fmax.cpp new file mode 100644 index 0000000000000..a2c35371d12b6 --- /dev/null +++ b/libc/src/math/nvptx/fmax.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the fmax function for GPU -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fmax.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, fmax, (double x, double y)) { + return __builtin_fmax(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/fmaxf.cpp b/libc/src/math/nvptx/fmaxf.cpp new file mode 100644 index 0000000000000..67178b3e27357 --- /dev/null +++ b/libc/src/math/nvptx/fmaxf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the fmaxf function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fmaxf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, fmaxf, (float x, float y)) { + return __builtin_fmaxf(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/fmin.cpp b/libc/src/math/nvptx/fmin.cpp new file mode 100644 index 0000000000000..7303adcd347ee --- /dev/null +++ b/libc/src/math/nvptx/fmin.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the fmin function for GPU -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fmin.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, fmin, (double x, double y)) { + return __builtin_fmin(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/fminf.cpp b/libc/src/math/nvptx/fminf.cpp new file mode 100644 index 0000000000000..bbf0c677b5e3a --- /dev/null +++ b/libc/src/math/nvptx/fminf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the fminf function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fminf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, fminf, (float x, float y)) { + return __builtin_fminf(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/fmod.cpp b/libc/src/math/nvptx/fmod.cpp new file mode 100644 index 0000000000000..0654cdd2abe08 --- /dev/null +++ b/libc/src/math/nvptx/fmod.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the fmod function for GPU -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fmod.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, fmod, (double x, double y)) { + return __builtin_fmod(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/fmodf.cpp b/libc/src/math/nvptx/fmodf.cpp new file mode 100644 index 0000000000000..b689046468fbe --- /dev/null +++ b/libc/src/math/nvptx/fmodf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the fmodf function for GPU ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fmodf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, fmodf, (float x, float y)) { + return __builtin_fmodf(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/frexp.cpp b/libc/src/math/nvptx/frexp.cpp similarity index 91% rename from libc/src/math/gpu/vendor/frexp.cpp rename to libc/src/math/nvptx/frexp.cpp index 5fc2c1409c6e9..2423961f7c61e 100644 --- a/libc/src/math/gpu/vendor/frexp.cpp +++ b/libc/src/math/nvptx/frexp.cpp @@ -9,12 +9,12 @@ #include "src/math/frexp.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(double, frexp, (double x, int *p)) { - return internal::frexp(x, p); + return __nv_frexp(x, p); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/frexpf.cpp b/libc/src/math/nvptx/frexpf.cpp similarity index 91% rename from libc/src/math/gpu/vendor/frexpf.cpp rename to libc/src/math/nvptx/frexpf.cpp index e928d375e03db..f1ea29068777b 
100644 --- a/libc/src/math/gpu/vendor/frexpf.cpp +++ b/libc/src/math/nvptx/frexpf.cpp @@ -9,12 +9,12 @@ #include "src/math/frexpf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, frexpf, (float x, int *p)) { - return internal::frexpf(x, p); + return __nv_frexpf(x, p); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/hypot.cpp b/libc/src/math/nvptx/hypot.cpp similarity index 91% rename from libc/src/math/gpu/vendor/hypot.cpp rename to libc/src/math/nvptx/hypot.cpp index 45b629e3e2858..28bf04aa4a2bd 100644 --- a/libc/src/math/gpu/vendor/hypot.cpp +++ b/libc/src/math/nvptx/hypot.cpp @@ -9,12 +9,12 @@ #include "src/math/hypot.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(double, hypot, (double x, double y)) { - return internal::hypot(x, y); + return __nv_hypot(x, y); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/hypotf.cpp b/libc/src/math/nvptx/hypotf.cpp similarity index 91% rename from libc/src/math/gpu/vendor/hypotf.cpp rename to libc/src/math/nvptx/hypotf.cpp index 533e9dcb8dbfb..c506aab1acc0b 100644 --- a/libc/src/math/gpu/vendor/hypotf.cpp +++ b/libc/src/math/nvptx/hypotf.cpp @@ -9,12 +9,12 @@ #include "src/math/hypotf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) { - return internal::hypotf(x, y); + return __nv_hypotf(x, y); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/ilogb.cpp b/libc/src/math/nvptx/ilogb.cpp similarity index 83% rename from libc/src/math/gpu/vendor/ilogb.cpp rename to libc/src/math/nvptx/ilogb.cpp index 1d075027b41c0..fc75e2fd847a8 100644 --- a/libc/src/math/gpu/vendor/ilogb.cpp +++ b/libc/src/math/nvptx/ilogb.cpp @@ -9,10 +9,10 @@ #include "src/math/ilogb.h" #include 
"src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(int, ilogb, (double x)) { return internal::ilogb(x); } +LLVM_LIBC_FUNCTION(int, ilogb, (double x)) { return __nv_ilogb(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/ilogbf.cpp b/libc/src/math/nvptx/ilogbf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/ilogbf.cpp rename to libc/src/math/nvptx/ilogbf.cpp index 8dc2ff0a374af..3d14fcfa878fb 100644 --- a/libc/src/math/gpu/vendor/ilogbf.cpp +++ b/libc/src/math/nvptx/ilogbf.cpp @@ -9,10 +9,10 @@ #include "src/math/ilogbf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(int, ilogbf, (float x)) { return internal::ilogbf(x); } +LLVM_LIBC_FUNCTION(int, ilogbf, (float x)) { return __nv_ilogbf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/ldexp.cpp b/libc/src/math/nvptx/ldexp.cpp similarity index 91% rename from libc/src/math/gpu/vendor/ldexp.cpp rename to libc/src/math/nvptx/ldexp.cpp index f760a42563305..761dc4816b7a4 100644 --- a/libc/src/math/gpu/vendor/ldexp.cpp +++ b/libc/src/math/nvptx/ldexp.cpp @@ -9,12 +9,12 @@ #include "src/math/ldexp.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(double, ldexp, (double x, int y)) { - return internal::ldexp(x, y); + return __nv_ldexp(x, y); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/ldexpf.cpp b/libc/src/math/nvptx/ldexpf.cpp similarity index 91% rename from libc/src/math/gpu/vendor/ldexpf.cpp rename to libc/src/math/nvptx/ldexpf.cpp index d00d39115ffc3..2d4c556a27140 100644 --- a/libc/src/math/gpu/vendor/ldexpf.cpp +++ b/libc/src/math/nvptx/ldexpf.cpp @@ -9,12 +9,12 @@ #include "src/math/ldexpf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { 
LLVM_LIBC_FUNCTION(float, ldexpf, (float x, int y)) { - return internal::ldexpf(x, y); + return __nv_ldexpf(x, y); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/llrint.cpp b/libc/src/math/nvptx/llrint.cpp new file mode 100644 index 0000000000000..8f95e75e779b5 --- /dev/null +++ b/libc/src/math/nvptx/llrint.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the llrint function for GPU ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/llrint.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(long long, llrint, (double x)) { return __nv_llrint(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/llrintf.cpp b/libc/src/math/nvptx/llrintf.cpp new file mode 100644 index 0000000000000..1432ffbd1bdad --- /dev/null +++ b/libc/src/math/nvptx/llrintf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the llrintf function for GPU --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/llrintf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(long long, llrintf, (float x)) { return __nv_llrintf(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/llround.cpp b/libc/src/math/nvptx/llround.cpp new file mode 100644 index 0000000000000..afd98308730a6 --- /dev/null +++ b/libc/src/math/nvptx/llround.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU llround function ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/llround.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(long long, llround, (double x)) { + return __builtin_llround(x); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/llroundf.cpp b/libc/src/math/nvptx/llroundf.cpp new file mode 100644 index 0000000000000..897ed15b69284 --- /dev/null +++ b/libc/src/math/nvptx/llroundf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU llroundf function -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/llroundf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(long long, llroundf, (float x)) { + return __builtin_lroundf(x); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/log.cpp b/libc/src/math/nvptx/log.cpp similarity index 83% rename from libc/src/math/gpu/vendor/log.cpp rename to libc/src/math/nvptx/log.cpp index a97689abcc217..26b6dfa607b94 100644 --- a/libc/src/math/gpu/vendor/log.cpp +++ b/libc/src/math/nvptx/log.cpp @@ -9,10 +9,10 @@ #include "src/math/log.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, log, (double x)) { return internal::log(x); } +LLVM_LIBC_FUNCTION(double, log, (double x)) { return __nv_log(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/log10.cpp b/libc/src/math/nvptx/log10.cpp similarity index 83% rename from libc/src/math/gpu/vendor/log10.cpp rename to libc/src/math/nvptx/log10.cpp index c7a917a75cc96..ff27025395672 100644 --- a/libc/src/math/gpu/vendor/log10.cpp +++ b/libc/src/math/nvptx/log10.cpp @@ -9,10 +9,10 @@ #include "src/math/log10.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, log10, (double x)) { return internal::log10(x); } +LLVM_LIBC_FUNCTION(double, log10, (double x)) { return __nv_log10(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/log10f.cpp b/libc/src/math/nvptx/log10f.cpp similarity index 83% rename from libc/src/math/gpu/vendor/log10f.cpp rename to libc/src/math/nvptx/log10f.cpp index 489f5f558be1f..af903b60a7ce6 100644 --- a/libc/src/math/gpu/vendor/log10f.cpp +++ b/libc/src/math/nvptx/log10f.cpp @@ -9,10 +9,10 @@ #include "src/math/log10f.h" 
#include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, log10f, (float x)) { return internal::log10f(x); } +LLVM_LIBC_FUNCTION(float, log10f, (float x)) { return __nv_log10f(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/log1p.cpp b/libc/src/math/nvptx/log1p.cpp similarity index 83% rename from libc/src/math/gpu/vendor/log1p.cpp rename to libc/src/math/nvptx/log1p.cpp index 720d23e2f952b..47bc96b0d8812 100644 --- a/libc/src/math/gpu/vendor/log1p.cpp +++ b/libc/src/math/nvptx/log1p.cpp @@ -9,10 +9,10 @@ #include "src/math/log1p.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, log1p, (double x)) { return internal::log1p(x); } +LLVM_LIBC_FUNCTION(double, log1p, (double x)) { return __nv_log1p(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/log1pf.cpp b/libc/src/math/nvptx/log1pf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/log1pf.cpp rename to libc/src/math/nvptx/log1pf.cpp index 96ad48b529cf6..bfa4f7f22d54c 100644 --- a/libc/src/math/gpu/vendor/log1pf.cpp +++ b/libc/src/math/nvptx/log1pf.cpp @@ -9,10 +9,10 @@ #include "src/math/log1pf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, log1pf, (float x)) { return internal::log1pf(x); } +LLVM_LIBC_FUNCTION(float, log1pf, (float x)) { return __nv_log1pf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/log2.cpp b/libc/src/math/nvptx/log2.cpp similarity index 83% rename from libc/src/math/gpu/vendor/log2.cpp rename to libc/src/math/nvptx/log2.cpp index 9fc8a81e7e757..86a980de65d4d 100644 --- a/libc/src/math/gpu/vendor/log2.cpp +++ b/libc/src/math/nvptx/log2.cpp @@ -9,10 +9,10 @@ #include "src/math/log2.h" #include "src/__support/common.h" -#include "common.h" +#include 
"declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, log2, (double x)) { return internal::log2(x); } +LLVM_LIBC_FUNCTION(double, log2, (double x)) { return __nv_log2(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/log2f.cpp b/libc/src/math/nvptx/log2f.cpp similarity index 83% rename from libc/src/math/gpu/vendor/log2f.cpp rename to libc/src/math/nvptx/log2f.cpp index 62df41b69b0bd..5ce46291610dd 100644 --- a/libc/src/math/gpu/vendor/log2f.cpp +++ b/libc/src/math/nvptx/log2f.cpp @@ -9,10 +9,10 @@ #include "src/math/log2f.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, log2f, (float x)) { return internal::log2f(x); } +LLVM_LIBC_FUNCTION(float, log2f, (float x)) { return __nv_log2f(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/logb.cpp b/libc/src/math/nvptx/logb.cpp similarity index 83% rename from libc/src/math/gpu/vendor/logb.cpp rename to libc/src/math/nvptx/logb.cpp index 5dea57d41b08b..b620b16184fc9 100644 --- a/libc/src/math/gpu/vendor/logb.cpp +++ b/libc/src/math/nvptx/logb.cpp @@ -9,10 +9,10 @@ #include "src/math/logb.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, logb, (double x)) { return internal::logb(x); } +LLVM_LIBC_FUNCTION(double, logb, (double x)) { return __nv_logb(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/logbf.cpp b/libc/src/math/nvptx/logbf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/logbf.cpp rename to libc/src/math/nvptx/logbf.cpp index 1a59df3e09a89..f19f0320db9d4 100644 --- a/libc/src/math/gpu/vendor/logbf.cpp +++ b/libc/src/math/nvptx/logbf.cpp @@ -9,10 +9,10 @@ #include "src/math/logbf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, logbf, (float x)) 
{ return internal::logbf(x); } +LLVM_LIBC_FUNCTION(float, logbf, (float x)) { return __nv_logbf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/logf.cpp b/libc/src/math/nvptx/logf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/logf.cpp rename to libc/src/math/nvptx/logf.cpp index 527b028e100d5..6deb482c0ace3 100644 --- a/libc/src/math/gpu/vendor/logf.cpp +++ b/libc/src/math/nvptx/logf.cpp @@ -9,10 +9,10 @@ #include "src/math/logf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, logf, (float x)) { return internal::logf(x); } +LLVM_LIBC_FUNCTION(float, logf, (float x)) { return __nv_logf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/lrint.cpp b/libc/src/math/nvptx/lrint.cpp similarity index 83% rename from libc/src/math/gpu/vendor/lrint.cpp rename to libc/src/math/nvptx/lrint.cpp index a08996b755b52..8585f4ce53a4d 100644 --- a/libc/src/math/gpu/vendor/lrint.cpp +++ b/libc/src/math/nvptx/lrint.cpp @@ -9,10 +9,10 @@ #include "src/math/lrint.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(long, lrint, (double x)) { return internal::lrint(x); } +LLVM_LIBC_FUNCTION(long, lrint, (double x)) { return __nv_lrint(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/lrintf.cpp b/libc/src/math/nvptx/lrintf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/lrintf.cpp rename to libc/src/math/nvptx/lrintf.cpp index 695a9b8202cf4..312a9469fade0 100644 --- a/libc/src/math/gpu/vendor/lrintf.cpp +++ b/libc/src/math/nvptx/lrintf.cpp @@ -9,10 +9,10 @@ #include "src/math/lrintf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(long, lrintf, (float x)) { return internal::lrintf(x); } +LLVM_LIBC_FUNCTION(long, lrintf, (float x)) { return 
__nv_lrintf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/lround.cpp b/libc/src/math/nvptx/lround.cpp new file mode 100644 index 0000000000000..51e8f2245af8e --- /dev/null +++ b/libc/src/math/nvptx/lround.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of the GPU lround function -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/lround.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(long, lround, (double x)) { return __builtin_lround(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/lroundf.cpp b/libc/src/math/nvptx/lroundf.cpp new file mode 100644 index 0000000000000..2a6fe7200d8cb --- /dev/null +++ b/libc/src/math/nvptx/lroundf.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of the GPU lroundf function ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/lroundf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(long, lroundf, (float x)) { return __builtin_lroundf(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/modf.cpp b/libc/src/math/nvptx/modf.cpp new file mode 100644 index 0000000000000..07dbbd6059c35 --- /dev/null +++ b/libc/src/math/nvptx/modf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU modf function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/modf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, modf, (double x, double *iptr)) { + return __builtin_modf(x, iptr); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/modff.cpp b/libc/src/math/nvptx/modff.cpp new file mode 100644 index 0000000000000..ad35f9006b512 --- /dev/null +++ b/libc/src/math/nvptx/modff.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU modff function --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/modff.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, modff, (float x, float *iptr)) { + return __builtin_modff(x, iptr); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/nearbyint.cpp b/libc/src/math/nvptx/nearbyint.cpp new file mode 100644 index 0000000000000..9c7b600df7082 --- /dev/null +++ b/libc/src/math/nvptx/nearbyint.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU nearbyint function ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/nearbyint.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, nearbyint, (double x)) { + return __builtin_nearbyint(x); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/nearbyintf.cpp b/libc/src/math/nvptx/nearbyintf.cpp new file mode 100644 index 0000000000000..7fbe9f4f0e0be --- /dev/null +++ b/libc/src/math/nvptx/nearbyintf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU nearbyintf function ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/nearbyintf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, nearbyintf, (float x)) { + return __builtin_nearbyintf(x); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/nextafter.cpp b/libc/src/math/nvptx/nextafter.cpp similarity index 90% rename from libc/src/math/gpu/vendor/nextafter.cpp rename to libc/src/math/nvptx/nextafter.cpp index f88e17f109084..171aaad6f7cc1 100644 --- a/libc/src/math/gpu/vendor/nextafter.cpp +++ b/libc/src/math/nvptx/nextafter.cpp @@ -9,12 +9,12 @@ #include "src/math/nextafter.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(double, nextafter, (double x, double y)) { - return internal::nextafter(x, y); + return __nv_nextafter(x, y); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/nextafterf.cpp b/libc/src/math/nvptx/nextafterf.cpp similarity index 90% rename from libc/src/math/gpu/vendor/nextafterf.cpp 
rename to libc/src/math/nvptx/nextafterf.cpp index 7a39dc8fc814e..a45937c0dc8ba 100644 --- a/libc/src/math/gpu/vendor/nextafterf.cpp +++ b/libc/src/math/nvptx/nextafterf.cpp @@ -9,12 +9,12 @@ #include "src/math/nextafterf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, nextafterf, (float x, float y)) { - return internal::nextafterf(x, y); + return __nv_nextafterf(x, y); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/nvptx/nvptx.h b/libc/src/math/nvptx/nvptx.h similarity index 100% rename from libc/src/math/gpu/vendor/nvptx/nvptx.h rename to libc/src/math/nvptx/nvptx.h diff --git a/libc/src/math/nvptx/pow.cpp b/libc/src/math/nvptx/pow.cpp new file mode 100644 index 0000000000000..7de3c9e7e5448 --- /dev/null +++ b/libc/src/math/nvptx/pow.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the pow function for GPU ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/pow.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, pow, (double x, double y)) { return __nv_pow(x, y); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/powf.cpp b/libc/src/math/nvptx/powf.cpp new file mode 100644 index 0000000000000..f9f7dbae63ac5 --- /dev/null +++ b/libc/src/math/nvptx/powf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the powf function for GPU -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/powf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { return __nv_powf(x, y); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/remainder.cpp b/libc/src/math/nvptx/remainder.cpp new file mode 100644 index 0000000000000..89b235f9c22af --- /dev/null +++ b/libc/src/math/nvptx/remainder.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU remainder function ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/remainder.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, remainder, (double x, double y)) { + return __builtin_remainder(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/remainderf.cpp b/libc/src/math/nvptx/remainderf.cpp new file mode 100644 index 0000000000000..9fee6f856dc8b --- /dev/null +++ b/libc/src/math/nvptx/remainderf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of the GPU remainderf function ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/remainderf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, remainderf, (float x, float y)) { + return __builtin_remainderf(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/remquo.cpp b/libc/src/math/nvptx/remquo.cpp similarity index 90% rename from libc/src/math/gpu/vendor/remquo.cpp rename to libc/src/math/nvptx/remquo.cpp index e92c9b3c2a6e3..da69a20f8f4f8 100644 --- a/libc/src/math/gpu/vendor/remquo.cpp +++ b/libc/src/math/nvptx/remquo.cpp @@ -9,12 +9,12 @@ #include "src/math/remquo.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(double, remquo, (double x, double y, int *quo)) { - return internal::remquo(x, y, quo); + return __nv_remquo(x, y, quo); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/remquof.cpp b/libc/src/math/nvptx/remquof.cpp similarity index 90% rename from libc/src/math/gpu/vendor/remquof.cpp rename to libc/src/math/nvptx/remquof.cpp index b234885aa88c0..dcfba5d7b5fa9 100644 --- a/libc/src/math/gpu/vendor/remquof.cpp +++ b/libc/src/math/nvptx/remquof.cpp @@ -9,12 +9,12 @@ #include "src/math/remquof.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, remquof, (float x, float y, int *quo)) { - return internal::remquof(x, y, quo); + return __nv_remquof(x, y, quo); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/rint.cpp b/libc/src/math/nvptx/rint.cpp new file mode 100644 index 0000000000000..44d494a8ed57a --- /dev/null +++ b/libc/src/math/nvptx/rint.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of the GPU rint function ---------------------------===// +// +// Part of the LLVM Project, under the Apache 
License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/rint.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, rint, (double x)) { return __builtin_rint(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/rintf.cpp b/libc/src/math/nvptx/rintf.cpp new file mode 100644 index 0000000000000..daf98d9436051 --- /dev/null +++ b/libc/src/math/nvptx/rintf.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of the GPU rintf function --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/rintf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, rintf, (float x)) { return __builtin_rintf(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/round.cpp b/libc/src/math/nvptx/round.cpp new file mode 100644 index 0000000000000..9d8b5582f0407 --- /dev/null +++ b/libc/src/math/nvptx/round.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of the GPU round function --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/round.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, round, (double x)) { return __builtin_round(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/roundf.cpp b/libc/src/math/nvptx/roundf.cpp new file mode 100644 index 0000000000000..8743e4eb7fb8d --- /dev/null +++ b/libc/src/math/nvptx/roundf.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of the GPU roundf function -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/roundf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, roundf, (float x)) { return __builtin_roundf(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/scalbn.cpp b/libc/src/math/nvptx/scalbn.cpp similarity index 91% rename from libc/src/math/gpu/vendor/scalbn.cpp rename to libc/src/math/nvptx/scalbn.cpp index 435533a90501b..80374db4c1c93 100644 --- a/libc/src/math/gpu/vendor/scalbn.cpp +++ b/libc/src/math/nvptx/scalbn.cpp @@ -9,12 +9,12 @@ #include "src/math/scalbn.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(double, scalbn, (double x, int y)) { - return internal::scalbn(x, y); + return __nv_scalbn(x, y); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/scalbnf.cpp b/libc/src/math/nvptx/scalbnf.cpp similarity index 90% rename from libc/src/math/gpu/vendor/scalbnf.cpp rename to libc/src/math/nvptx/scalbnf.cpp index 0a4844c8a433e..24fa3a5ed698a 100644 
--- a/libc/src/math/gpu/vendor/scalbnf.cpp +++ b/libc/src/math/nvptx/scalbnf.cpp @@ -9,12 +9,12 @@ #include "src/math/scalbnf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, scalbnf, (float x, int y)) { - return internal::scalbnf(x, y); + return __nv_scalbnf(x, y); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/sin.cpp b/libc/src/math/nvptx/sin.cpp similarity index 83% rename from libc/src/math/gpu/vendor/sin.cpp rename to libc/src/math/nvptx/sin.cpp index 96e07c99b71a3..1bff129a0151c 100644 --- a/libc/src/math/gpu/vendor/sin.cpp +++ b/libc/src/math/nvptx/sin.cpp @@ -9,10 +9,10 @@ #include "src/math/sin.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, sin, (double x)) { return internal::sin(x); } +LLVM_LIBC_FUNCTION(double, sin, (double x)) { return __nv_sin(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/sincos.cpp b/libc/src/math/nvptx/sincos.cpp similarity index 89% rename from libc/src/math/gpu/vendor/sincos.cpp rename to libc/src/math/nvptx/sincos.cpp index d882157b0bc8d..73f92cfb7c348 100644 --- a/libc/src/math/gpu/vendor/sincos.cpp +++ b/libc/src/math/nvptx/sincos.cpp @@ -9,12 +9,12 @@ #include "src/math/sincos.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sinptr, double *cosptr)) { - return internal::sincos(x, sinptr, cosptr); + return __nv_sincos(x, sinptr, cosptr); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/sincosf.cpp b/libc/src/math/nvptx/sincosf.cpp new file mode 100644 index 0000000000000..d053aa38151b6 --- /dev/null +++ b/libc/src/math/nvptx/sincosf.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of the sincosf function for GPU --------------------===// +// +// Part of the LLVM Project, 
under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sincosf.h" +#include "src/__support/common.h" + +#include "declarations.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinptr, float *cosptr)) { + return __nv_sincosf(x, sinptr, cosptr); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/sinf.cpp b/libc/src/math/nvptx/sinf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/sinf.cpp rename to libc/src/math/nvptx/sinf.cpp index af93227ab63bb..9abd5cb4d5c62 100644 --- a/libc/src/math/gpu/vendor/sinf.cpp +++ b/libc/src/math/nvptx/sinf.cpp @@ -9,10 +9,10 @@ #include "src/math/sinf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, sinf, (float x)) { return internal::sinf(x); } +LLVM_LIBC_FUNCTION(float, sinf, (float x)) { return __nv_sinf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/sinh.cpp b/libc/src/math/nvptx/sinh.cpp similarity index 83% rename from libc/src/math/gpu/vendor/sinh.cpp rename to libc/src/math/nvptx/sinh.cpp index be6b3ae2e2fa0..dc6a1e16c6341 100644 --- a/libc/src/math/gpu/vendor/sinh.cpp +++ b/libc/src/math/nvptx/sinh.cpp @@ -9,10 +9,10 @@ #include "src/math/sinh.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, sinh, (double x)) { return internal::sinh(x); } +LLVM_LIBC_FUNCTION(double, sinh, (double x)) { return __nv_sinh(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/sinhf.cpp b/libc/src/math/nvptx/sinhf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/sinhf.cpp rename to libc/src/math/nvptx/sinhf.cpp index 
99c399b62b7f6..c9ab470ed823c 100644 --- a/libc/src/math/gpu/vendor/sinhf.cpp +++ b/libc/src/math/nvptx/sinhf.cpp @@ -9,10 +9,10 @@ #include "src/math/sinhf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, sinhf, (float x)) { return internal::sinhf(x); } +LLVM_LIBC_FUNCTION(float, sinhf, (float x)) { return __nv_sinhf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/sqrt.cpp b/libc/src/math/nvptx/sqrt.cpp new file mode 100644 index 0000000000000..60ca5af4987b6 --- /dev/null +++ b/libc/src/math/nvptx/sqrt.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of the GPU sqrt function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sqrt.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, sqrt, (double x)) { return __builtin_sqrt(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/sqrtf.cpp b/libc/src/math/nvptx/sqrtf.cpp new file mode 100644 index 0000000000000..e17f942a4d5fc --- /dev/null +++ b/libc/src/math/nvptx/sqrtf.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of the GPU sqrtf function --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sqrtf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, sqrtf, (float x)) { return __builtin_sqrtf(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/tan.cpp b/libc/src/math/nvptx/tan.cpp similarity index 83% rename from libc/src/math/gpu/vendor/tan.cpp rename to libc/src/math/nvptx/tan.cpp index 9a1bd9c89fcf0..deb03dca250a5 100644 --- a/libc/src/math/gpu/vendor/tan.cpp +++ b/libc/src/math/nvptx/tan.cpp @@ -9,10 +9,10 @@ #include "src/math/tan.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, tan, (double x)) { return internal::tan(x); } +LLVM_LIBC_FUNCTION(double, tan, (double x)) { return __nv_tan(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/tanf.cpp b/libc/src/math/nvptx/tanf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/tanf.cpp rename to libc/src/math/nvptx/tanf.cpp index a5266a8c154c5..5739e4a1624de 100644 --- a/libc/src/math/gpu/vendor/tanf.cpp +++ b/libc/src/math/nvptx/tanf.cpp @@ -9,10 +9,10 @@ #include "src/math/tanf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, tanf, (float x)) { return internal::tanf(x); } +LLVM_LIBC_FUNCTION(float, tanf, (float x)) { return __nv_tanf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/tanh.cpp b/libc/src/math/nvptx/tanh.cpp similarity index 83% rename from libc/src/math/gpu/vendor/tanh.cpp rename to libc/src/math/nvptx/tanh.cpp index 57d764f1f2a0d..eabee2cbaf068 100644 --- a/libc/src/math/gpu/vendor/tanh.cpp +++ b/libc/src/math/nvptx/tanh.cpp @@ -9,10 +9,10 @@ #include "src/math/tanh.h" #include "src/__support/common.h" -#include 
"common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, tanh, (double x)) { return internal::tanh(x); } +LLVM_LIBC_FUNCTION(double, tanh, (double x)) { return __nv_tanh(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/tanhf.cpp b/libc/src/math/nvptx/tanhf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/tanhf.cpp rename to libc/src/math/nvptx/tanhf.cpp index 1c9c2f3843a95..582424cb9490a 100644 --- a/libc/src/math/gpu/vendor/tanhf.cpp +++ b/libc/src/math/nvptx/tanhf.cpp @@ -9,10 +9,10 @@ #include "src/math/tanhf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, tanhf, (float x)) { return internal::tanhf(x); } +LLVM_LIBC_FUNCTION(float, tanhf, (float x)) { return __nv_tanhf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/tgamma.cpp b/libc/src/math/nvptx/tgamma.cpp similarity index 83% rename from libc/src/math/gpu/vendor/tgamma.cpp rename to libc/src/math/nvptx/tgamma.cpp index e86116a2b0abe..f92193831f9bb 100644 --- a/libc/src/math/gpu/vendor/tgamma.cpp +++ b/libc/src/math/nvptx/tgamma.cpp @@ -9,10 +9,10 @@ #include "src/math/tgamma.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(double, tgamma, (double x)) { return internal::tgamma(x); } +LLVM_LIBC_FUNCTION(double, tgamma, (double x)) { return __nv_tgamma(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/gpu/vendor/tgammaf.cpp b/libc/src/math/nvptx/tgammaf.cpp similarity index 83% rename from libc/src/math/gpu/vendor/tgammaf.cpp rename to libc/src/math/nvptx/tgammaf.cpp index 552919bae4469..833994455d575 100644 --- a/libc/src/math/gpu/vendor/tgammaf.cpp +++ b/libc/src/math/nvptx/tgammaf.cpp @@ -9,10 +9,10 @@ #include "src/math/tgammaf.h" #include "src/__support/common.h" -#include "common.h" +#include "declarations.h" namespace 
LIBC_NAMESPACE { -LLVM_LIBC_FUNCTION(float, tgammaf, (float x)) { return internal::tgammaf(x); } +LLVM_LIBC_FUNCTION(float, tgammaf, (float x)) { return __nv_tgammaf(x); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/trunc.cpp b/libc/src/math/nvptx/trunc.cpp new file mode 100644 index 0000000000000..773600f0f2501 --- /dev/null +++ b/libc/src/math/nvptx/trunc.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of the GPU trunc function --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/trunc.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(double, trunc, (double x)) { return __builtin_trunc(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nvptx/truncf.cpp b/libc/src/math/nvptx/truncf.cpp new file mode 100644 index 0000000000000..534797a3e5860 --- /dev/null +++ b/libc/src/math/nvptx/truncf.cpp @@ -0,0 +1,16 @@ +//===-- Implementation of the GPU truncf function -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/truncf.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float, truncf, (float x)) { return __builtin_truncf(x); } + +} // namespace LIBC_NAMESPACE From ae94354721d546eb20fc64384bfdaeb87a08564d Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Tue, 27 Feb 2024 17:39:37 +0000 Subject: [PATCH 461/546] [LLVM][LangRef] Remove bogus ':' from vector constant text. 
--- llvm/docs/LangRef.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 570a058bde8da..60e682ae328a8 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -4364,7 +4364,7 @@ constants and smaller complex constants. When creating a vector whose elements have the same constant value, the preferred syntax is ``splat ( Val)``. For example: "``splat (i32 11)``". - These vector constants must have ::ref:`vector type ` with an + These vector constants must have :ref:`vector type ` with an element type that matches the ``splat`` operand. **Zero initialization** The string '``zeroinitializer``' can be used to zero initialize a From 9d9c01243038487402c366787f749349b76473ba Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Tue, 27 Feb 2024 09:41:40 -0800 Subject: [PATCH 462/546] [flang][runtime] Added F128 wrappers for LDBL_MANT_DIG == 113 targets. (#83102) We can use 'long double' variants of the math functions in this case. I used the callees from STD namespace, except for the Bessel's functions. The new code can be enabled with -DFLANG_RUNTIME_F128_MATH_LIB=libm. Support for complex data types is pending. --- flang/runtime/Float128Math/CMakeLists.txt | 18 +++++- flang/runtime/Float128Math/math-entries.h | 68 ++++++++++++++++++++++- 2 files changed, 81 insertions(+), 5 deletions(-) diff --git a/flang/runtime/Float128Math/CMakeLists.txt b/flang/runtime/Float128Math/CMakeLists.txt index 2e20f4fd612f2..8d276e8f12272 100644 --- a/flang/runtime/Float128Math/CMakeLists.txt +++ b/flang/runtime/Float128Math/CMakeLists.txt @@ -14,8 +14,9 @@ # will have a dependency on the third-party library that is being # used for building this FortranFloat128Math library. 
-if (${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "libquadmath" OR - ${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "quadmath") +include(CheckLibraryExists) + +if (${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "libquadmath") check_include_file(quadmath.h FOUND_QUADMATH_HEADER) if(FOUND_QUADMATH_HEADER) add_compile_definitions(HAS_QUADMATHLIB) @@ -25,7 +26,18 @@ if (${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "libquadmath" OR "to be available: ${FLANG_RUNTIME_F128_MATH_LIB}" ) endif() -else() +elseif (${FLANG_RUNTIME_F128_MATH_LIB} STREQUAL "libm") + check_library_exists(m sinl "" FOUND_LIBM) + check_library_exists(m sinf128 "" FOUND_LIBMF128) + if (FOUND_LIBM) + add_compile_definitions(HAS_LIBM) + endif() + if (FOUND_LIBMF128) + add_compile_definitions(HAS_LIBMF128) + endif() +endif() + +if (NOT FOUND_QUADMATH_HEADER AND NOT FOUND_LIBM) message(FATAL_ERROR "Unsupported third-party library for Fortran F128 math runtime: " "${FLANG_RUNTIME_F128_MATH_LIB}" diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h index d7896ac827913..fe1525468edca 100644 --- a/flang/runtime/Float128Math/math-entries.h +++ b/flang/runtime/Float128Math/math-entries.h @@ -12,6 +12,7 @@ #include "tools.h" #include "flang/Common/float128.h" #include "flang/Runtime/entry-names.h" +#include #include namespace Fortran::runtime { @@ -42,7 +43,8 @@ namespace Fortran::runtime { #define DEFINE_SIMPLE_ALIAS(caller, callee) \ template struct caller

{ \ static RT invoke(ATs... args) { \ - static_assert(std::is_invocable_r_v); \ + static_assert(std::is_invocable_r_v()...))(ATs...), ATs...>); \ if constexpr (std::is_same_v) { \ callee(args...); \ } else { \ @@ -98,7 +100,69 @@ typedef _Complex float __attribute__((mode(TC))) ComplexF128; typedef _Complex float __attribute__((mode(KC))) ComplexF128; #endif -#if HAS_QUADMATHLIB +#if HAS_LIBM +// Define wrapper callers for libm. +#include +#include + +#if LDBL_MANT_DIG == 113 +// Use STD math functions. They provide IEEE-754 128-bit float +// support either via 'long double' or __float128. +// The Bessel's functions are not present in STD namespace. +DEFINE_SIMPLE_ALIAS(Acos, std::acos) +DEFINE_SIMPLE_ALIAS(Acosh, std::acosh) +DEFINE_SIMPLE_ALIAS(Asin, std::asin) +DEFINE_SIMPLE_ALIAS(Asinh, std::asinh) +DEFINE_SIMPLE_ALIAS(Atan, std::atan) +DEFINE_SIMPLE_ALIAS(Atan2, std::atan2) +DEFINE_SIMPLE_ALIAS(Atanh, std::atanh) +// TODO: enable complex abs, when ABI adjustment for complex +// data type is resolved. 
+// DEFINE_SIMPLE_ALIAS(CAbs, std::abs) +DEFINE_SIMPLE_ALIAS(Ceil, std::ceil) +DEFINE_SIMPLE_ALIAS(Cos, std::cos) +DEFINE_SIMPLE_ALIAS(Cosh, std::cosh) +DEFINE_SIMPLE_ALIAS(Erf, std::erf) +DEFINE_SIMPLE_ALIAS(Erfc, std::erfc) +DEFINE_SIMPLE_ALIAS(Exp, std::exp) +DEFINE_SIMPLE_ALIAS(Floor, std::floor) +DEFINE_SIMPLE_ALIAS(Hypot, std::hypot) +DEFINE_SIMPLE_ALIAS(J0, j0l) +DEFINE_SIMPLE_ALIAS(J1, j1l) +DEFINE_SIMPLE_ALIAS(Jn, jnl) +DEFINE_SIMPLE_ALIAS(Lgamma, std::lgamma) +DEFINE_SIMPLE_ALIAS(Llround, std::llround) +DEFINE_SIMPLE_ALIAS(Lround, std::lround) +DEFINE_SIMPLE_ALIAS(Log, std::log) +DEFINE_SIMPLE_ALIAS(Log10, std::log10) +DEFINE_SIMPLE_ALIAS(Pow, std::pow) +DEFINE_SIMPLE_ALIAS(Round, std::round) +DEFINE_SIMPLE_ALIAS(Sin, std::sin) +DEFINE_SIMPLE_ALIAS(Sinh, std::sinh) +DEFINE_SIMPLE_ALIAS(Sqrt, std::sqrt) +DEFINE_SIMPLE_ALIAS(Tan, std::tan) +DEFINE_SIMPLE_ALIAS(Tanh, std::tanh) +DEFINE_SIMPLE_ALIAS(Tgamma, std::tgamma) +DEFINE_SIMPLE_ALIAS(Trunc, std::trunc) +DEFINE_SIMPLE_ALIAS(Y0, y0l) +DEFINE_SIMPLE_ALIAS(Y1, y1l) +DEFINE_SIMPLE_ALIAS(Yn, ynl) +#else // LDBL_MANT_DIG != 113 +#if !HAS_LIBMF128 +// glibc >=2.26 seems to have complete support for __float128 +// versions of the math functions. +#error "FLANG_RUNTIME_F128_MATH_LIB=libm build requires libm >=2.26" +#endif + +// We can use __float128 versions of libm functions. +// __STDC_WANT_IEC_60559_TYPES_EXT__ needs to be defined +// before including cmath to enable the *f128 prototypes. +// TODO: this needs to be enabled separately, especially +// for complex data types that require C++ complex to C complex +// adjustment to match the ABIs. +#error "Unsupported FLANG_RUNTIME_F128_MATH_LIB=libm build" +#endif // LDBL_MANT_DIG != 113 +#elif HAS_QUADMATHLIB // Define wrapper callers for libquadmath. 
#include "quadmath.h" DEFINE_SIMPLE_ALIAS(Acos, acosq) From c95febcb4008c40a2b514bd6e2b56e0aa17457a3 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 27 Feb 2024 17:46:18 +0000 Subject: [PATCH 463/546] [X86] LowerBITREVERSE - add handling for all legal 128/256/512-bit vector types, not just vXi8 Move the BITREVERSE(BSWAP(X)) expansion into LowerBITREVERSE to help simplify #81764 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 40 +++++++++++++------------ 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a134ca32c66bd..0722c402348ee 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1241,11 +1241,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ABS, MVT::v16i8, Legal); setOperationAction(ISD::ABS, MVT::v8i16, Legal); setOperationAction(ISD::ABS, MVT::v4i32, Legal); - setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom); - setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); - setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); - setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); - setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); + + for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { + setOperationAction(ISD::BITREVERSE, VT, Custom); + setOperationAction(ISD::CTLZ, VT, Custom); + } // These might be better off as horizontal vector ops. setOperationAction(ISD::ADD, MVT::i16, Custom); @@ -1341,10 +1341,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // XOP can efficiently perform BITREVERSE with VPPERM. 
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) setOperationAction(ISD::BITREVERSE, VT, Custom); - - for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, - MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) - setOperationAction(ISD::BITREVERSE, VT, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) { @@ -1461,12 +1457,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::TRUNCATE, MVT::v32i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i64, Custom); - setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom); - for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); + setOperationAction(ISD::BITREVERSE, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. @@ -1841,8 +1836,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SMULO, MVT::v64i8, Custom); setOperationAction(ISD::UMULO, MVT::v64i8, Custom); - setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); - for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); @@ -1852,6 +1845,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::ABDS, VT, Custom); setOperationAction(ISD::ABDU, VT, Custom); + setOperationAction(ISD::BITREVERSE, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. 
@@ -31180,17 +31174,25 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SDValue In = Op.getOperand(0); SDLoc DL(Op); - assert(VT.getScalarType() == MVT::i8 && - "Only byte vector BITREVERSE supported"); - - // Split v64i8 without BWI so that we can still use the PSHUFB lowering. - if (VT == MVT::v64i8 && !Subtarget.hasBWI()) + // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering. + if (VT.is512BitVector() && !Subtarget.hasBWI()) return splitVectorIntUnary(Op, DAG); // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2. - if (VT == MVT::v32i8 && !Subtarget.hasInt256()) + if (VT.is256BitVector() && !Subtarget.hasInt256()) return splitVectorIntUnary(Op, DAG); + // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE. + if (VT.getScalarType() != MVT::i8) { + MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); + SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In); + Res = DAG.getBitcast(ByteVT, Res); + Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res); + return DAG.getBitcast(VT, Res); + } + assert(VT.isVector() && VT.getScalarType() == MVT::i8 && + "Only byte vector BITREVERSE supported"); + unsigned NumElts = VT.getVectorNumElements(); // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits. From f7cf1f6236ee299d65c2b33429c1d3b729f54c32 Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Thu, 21 Dec 2023 13:17:34 -0800 Subject: [PATCH 464/546] [CodeGen][MISched] dumpSched direction depends on field in DAG. This is a precommit to supporting post reg-alloc bottom up scheduling. We'd like to have post-ra scheduling direction that can be different from pre-ra direction. The current dumpSchedule function is changed in this patch to support the fact that the post-ra and pre-ra directions will depend on different command line options. 
--- llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h | 12 ++++++++++ llvm/lib/CodeGen/MachineScheduler.cpp | 24 ++++++++++++++++--- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h index ef7662a8e7a26..85de18f5169e5 100644 --- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -191,7 +191,19 @@ namespace llvm { /// applicable). using SUList = std::list; + /// The direction that should be used to dump the scheduled Sequence. + enum DumpDirection { + TopDown, + BottomUp, + Bidirectional, + NotSet, + }; + + void setDumpDirection(DumpDirection D) { DumpDir = D; } + protected: + DumpDirection DumpDir = NotSet; + /// A map from ValueType to SUList, used during DAG construction, as /// a means of remembering which SUs depend on which memory locations. class Value2SUsMap; diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 750f739a55990..265d3ac59f7c8 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -440,6 +440,14 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { // Instantiate the selected scheduler for this target, function, and // optimization level. std::unique_ptr Scheduler(createMachineScheduler()); + ScheduleDAGMI::DumpDirection Dir; + if (ForceTopDown) + Dir = ScheduleDAGMI::DumpDirection::TopDown; + else if (ForceBottomUp) + Dir = ScheduleDAGMI::DumpDirection::BottomUp; + else + Dir = ScheduleDAGMI::DumpDirection::Bidirectional; + Scheduler->setDumpDirection(Dir); scheduleRegions(*Scheduler, false); LLVM_DEBUG(LIS->dump()); @@ -473,6 +481,9 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { // Instantiate the selected scheduler for this target, function, and // optimization level. 
std::unique_ptr Scheduler(createPostMachineScheduler()); + Scheduler->setDumpDirection(PostRADirection == MISchedPostRASched::TopDown + ? ScheduleDAGMI::DumpDirection::TopDown + : ScheduleDAGMI::DumpDirection::BottomUp); scheduleRegions(*Scheduler, true); if (VerifyScheduling) @@ -1125,12 +1136,14 @@ LLVM_DUMP_METHOD void ScheduleDAGMI::dumpScheduleTraceBottomUp() const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const { if (MISchedDumpScheduleTrace) { - if (ForceTopDown) + if (DumpDir == TopDown) dumpScheduleTraceTopDown(); - else if (ForceBottomUp) + else if (DumpDir == BottomUp) dumpScheduleTraceBottomUp(); - else { + else if (DumpDir == BottomUp) { dbgs() << "* Schedule table (Bidirectional): not implemented\n"; + } else { + dbgs() << "* Schedule table: DumpDirection not set.\n"; } } @@ -3842,6 +3855,11 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) { DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer( Itin, DAG); } + if (!Bot.HazardRec) { + Bot.HazardRec = + DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer( + Itin, DAG); + } } void PostGenericScheduler::registerRoots() { From 9106b58ce4e8dada167eec50178a9e154342e4ba Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Thu, 21 Dec 2023 11:36:15 -0800 Subject: [PATCH 465/546] [CodeGen][MISched] Add misched post-regalloc bottom-up scheduling There is the possibility that the bottom-up direction will lead to performance improvements on certain targets, as this is certainly the case for the pre-regalloc GenericScheduler. This patch will give people the opportunity to experiment for their sub-targets. However, this patch keeps the top-down approach as the default for the PostGenericScheduler since that is what subtargets expect today. 
--- llvm/include/llvm/CodeGen/MachineScheduler.h | 19 +-- llvm/lib/CodeGen/MachineScheduler.cpp | 128 +++++++++++++----- .../RISCV/misched-postra-direction.mir | 53 ++++++++ 3 files changed, 158 insertions(+), 42 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/misched-postra-direction.mir diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h index 47127a6c29603..25703dd6b61f0 100644 --- a/llvm/include/llvm/CodeGen/MachineScheduler.h +++ b/llvm/include/llvm/CodeGen/MachineScheduler.h @@ -1293,19 +1293,19 @@ class PostGenericScheduler : public GenericSchedulerBase { protected: ScheduleDAGMI *DAG = nullptr; SchedBoundary Top; - SmallVector BotRoots; + SchedBoundary Bot; + MachineSchedPolicy RegionPolicy; public: - PostGenericScheduler(const MachineSchedContext *C): - GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ") {} + PostGenericScheduler(const MachineSchedContext *C) + : GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ"), + Bot(SchedBoundary::BotQID, "BotQ") {} ~PostGenericScheduler() override = default; void initPolicy(MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, - unsigned NumRegionInstrs) override { - /* no configurable policy */ - } + unsigned NumRegionInstrs) override; /// PostRA scheduling does not track pressure. bool shouldTrackPressure() const override { return false; } @@ -1328,15 +1328,16 @@ class PostGenericScheduler : public GenericSchedulerBase { Top.releaseNode(SU, SU->TopReadyCycle, false); } - // Only called for roots. void releaseBottomNode(SUnit *SU) override { - BotRoots.push_back(SU); + if (SU->isScheduled) + return; + Bot.releaseNode(SU, SU->BotReadyCycle, false); } protected: virtual bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand); - void pickNodeFromQueue(SchedCandidate &Cand); + void pickNodeFromQueue(SchedBoundary &Zone, SchedCandidate &Cand); }; /// Create the standard converging machine scheduler. 
This will be used as the diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 265d3ac59f7c8..3bbd126bdaf1a 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -81,6 +81,22 @@ cl::opt ForceTopDown("misched-topdown", cl::Hidden, cl::desc("Force top-down list scheduling")); cl::opt ForceBottomUp("misched-bottomup", cl::Hidden, cl::desc("Force bottom-up list scheduling")); +namespace MISchedPostRASched { +enum Direction { + TopDown, + BottomUp, +}; +} // end namespace MISchedPostRASched +cl::opt PostRADirection( + "misched-postra-direction", cl::Hidden, + cl::desc("Post reg-alloc list scheduling direction"), + // Default to top-down because it was implemented first and existing targets + // expect that behavior by default. + cl::init(MISchedPostRASched::TopDown), + cl::values(clEnumValN(MISchedPostRASched::TopDown, "topdown", + "Force top-down post reg-alloc list scheduling"), + clEnumValN(MISchedPostRASched::BottomUp, "bottomup", + "Force bottom-up post reg-alloc list scheduling"))); cl::opt DumpCriticalPathLength("misched-dcpl", cl::Hidden, cl::desc("Print critical path length to stdout")); @@ -440,14 +456,14 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { // Instantiate the selected scheduler for this target, function, and // optimization level. 
std::unique_ptr Scheduler(createMachineScheduler()); - ScheduleDAGMI::DumpDirection Dir; + ScheduleDAGMI::DumpDirection D; if (ForceTopDown) - Dir = ScheduleDAGMI::DumpDirection::TopDown; + D = ScheduleDAGMI::DumpDirection::TopDown; else if (ForceBottomUp) - Dir = ScheduleDAGMI::DumpDirection::BottomUp; + D = ScheduleDAGMI::DumpDirection::BottomUp; else - Dir = ScheduleDAGMI::DumpDirection::Bidirectional; - Scheduler->setDumpDirection(Dir); + D = ScheduleDAGMI::DumpDirection::Bidirectional; + Scheduler->setDumpDirection(D); scheduleRegions(*Scheduler, false); LLVM_DEBUG(LIS->dump()); @@ -481,9 +497,12 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { // Instantiate the selected scheduler for this target, function, and // optimization level. std::unique_ptr Scheduler(createPostMachineScheduler()); - Scheduler->setDumpDirection(PostRADirection == MISchedPostRASched::TopDown - ? ScheduleDAGMI::DumpDirection::TopDown - : ScheduleDAGMI::DumpDirection::BottomUp); + ScheduleDAGMI::DumpDirection D; + if (PostRADirection == MISchedPostRASched::TopDown) + D = ScheduleDAGMI::DumpDirection::TopDown; + else + D = ScheduleDAGMI::DumpDirection::BottomUp; + Scheduler->setDumpDirection(D); scheduleRegions(*Scheduler, true); if (VerifyScheduling) @@ -1136,11 +1155,11 @@ LLVM_DUMP_METHOD void ScheduleDAGMI::dumpScheduleTraceBottomUp() const { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const { if (MISchedDumpScheduleTrace) { - if (DumpDir == TopDown) + if (DumpDir == DumpDirection::TopDown) dumpScheduleTraceTopDown(); - else if (DumpDir == BottomUp) + else if (DumpDir == DumpDirection::BottomUp) dumpScheduleTraceBottomUp(); - else if (DumpDir == BottomUp) { + else if (DumpDir == DumpDirection::Bidirectional) { dbgs() << "* Schedule table (Bidirectional): not implemented\n"; } else { dbgs() << "* Schedule table: DumpDirection not set.\n"; @@ -3845,7 +3864,7 @@ void 
PostGenericScheduler::initialize(ScheduleDAGMI *Dag) { Rem.init(DAG, SchedModel); Top.init(DAG, SchedModel, &Rem); - BotRoots.clear(); + Bot.init(DAG, SchedModel, &Rem); // Initialize the HazardRecognizers. If itineraries don't exist, are empty, // or are disabled, then these HazardRecs will be disabled. @@ -3862,11 +3881,23 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) { } } +void PostGenericScheduler::initPolicy(MachineBasicBlock::iterator Begin, + MachineBasicBlock::iterator End, + unsigned NumRegionInstrs) { + if (PostRADirection == MISchedPostRASched::TopDown) { + RegionPolicy.OnlyTopDown = true; + RegionPolicy.OnlyBottomUp = false; + } else if (PostRADirection == MISchedPostRASched::BottomUp) { + RegionPolicy.OnlyTopDown = false; + RegionPolicy.OnlyBottomUp = true; + } +} + void PostGenericScheduler::registerRoots() { Rem.CriticalPath = DAG->ExitSU.getDepth(); // Some roots may not feed into ExitSU. Check all of them in case. - for (const SUnit *SU : BotRoots) { + for (const SUnit *SU : Bot.Available) { if (SU->getDepth() > Rem.CriticalPath) Rem.CriticalPath = SU->getDepth(); } @@ -3923,12 +3954,13 @@ bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand, return false; } -void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) { - ReadyQueue &Q = Top.Available; +void PostGenericScheduler::pickNodeFromQueue(SchedBoundary &Zone, + SchedCandidate &Cand) { + ReadyQueue &Q = Zone.Available; for (SUnit *SU : Q) { SchedCandidate TryCand(Cand.Policy); TryCand.SU = SU; - TryCand.AtTop = true; + TryCand.AtTop = Zone.isTop(); TryCand.initResourceDelta(DAG, SchedModel); if (tryCandidate(Cand, TryCand)) { Cand.setBest(TryCand); @@ -3940,29 +3972,54 @@ void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) { /// Pick the next node to schedule. 
SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) { if (DAG->top() == DAG->bottom()) { - assert(Top.Available.empty() && Top.Pending.empty() && "ReadyQ garbage"); + assert(Top.Available.empty() && Top.Pending.empty() && + Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); return nullptr; } SUnit *SU; do { - SU = Top.pickOnlyChoice(); - if (SU) { - tracePick(Only1, true); + if (RegionPolicy.OnlyBottomUp) { + assert(!RegionPolicy.OnlyTopDown); + SU = Bot.pickOnlyChoice(); + if (SU) { + tracePick(Only1, true); + } else { + CandPolicy NoPolicy; + SchedCandidate BotCand(NoPolicy); + // Set the bottom-up policy based on the state of the current bottom + // zone and the instructions outside the zone, including the top zone. + setPolicy(BotCand.Policy, /*IsPostRA=*/true, Bot, nullptr); + pickNodeFromQueue(Bot, BotCand); + assert(BotCand.Reason != NoCand && "failed to find a candidate"); + tracePick(BotCand); + SU = BotCand.SU; + } + IsTopNode = false; } else { - CandPolicy NoPolicy; - SchedCandidate TopCand(NoPolicy); - // Set the top-down policy based on the state of the current top zone and - // the instructions outside the zone, including the bottom zone. - setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr); - pickNodeFromQueue(TopCand); - assert(TopCand.Reason != NoCand && "failed to find a candidate"); - tracePick(TopCand); - SU = TopCand.SU; + + assert(RegionPolicy.OnlyTopDown); + SU = Top.pickOnlyChoice(); + if (SU) { + tracePick(Only1, true); + } else { + CandPolicy NoPolicy; + SchedCandidate TopCand(NoPolicy); + // Set the top-down policy based on the state of the current top zone + // and the instructions outside the zone, including the bottom zone. 
+ setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr); + pickNodeFromQueue(Top, TopCand); + assert(TopCand.Reason != NoCand && "failed to find a candidate"); + tracePick(TopCand); + SU = TopCand.SU; + } + IsTopNode = true; } } while (SU->isScheduled); - IsTopNode = true; - Top.removeReady(SU); + if (SU->isTopReady()) + Top.removeReady(SU); + if (SU->isBottomReady()) + Bot.removeReady(SU); LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr()); @@ -3972,8 +4029,13 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) { /// Called after ScheduleDAGMI has scheduled an instruction and updated /// scheduled/remaining flags in the DAG nodes. void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) { - SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle()); - Top.bumpNode(SU); + if (IsTopNode) { + SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle()); + Top.bumpNode(SU); + } else { + SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle()); + Bot.bumpNode(SU); + } } ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) { diff --git a/llvm/test/CodeGen/RISCV/misched-postra-direction.mir b/llvm/test/CodeGen/RISCV/misched-postra-direction.mir new file mode 100644 index 0000000000000..841d0e6d65da6 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/misched-postra-direction.mir @@ -0,0 +1,53 @@ +# RUN: llc -mtriple=riscv64 -mcpu=sifive-x280 -run-pass=postmisched -enable-post-misched -debug-only=machine-scheduler -misched-dump-schedule-trace -misched-postra-direction=topdown -o - %s 2>&1 | FileCheck --check-prefix=TOPDOWN %s +# RUN: llc -mtriple=riscv64 -mcpu=sifive-x280 -run-pass=postmisched -enable-post-misched -debug-only=machine-scheduler -misched-dump-schedule-trace -misched-postra-direction=bottomup -o - %s 2>&1 | FileCheck --check-prefix=BOTTOMUP %s + +# REQUIRES: asserts + +--- +name: test +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x11 + + renamable $x12 = 
MUL renamable $x11, renamable $x10 + renamable $x13 = ADD renamable $x11, renamable $x10 + renamable $x14 = DIVW killed renamable $x12, killed renamable $x13 + PseudoRET implicit $x14 +... + +# TOPDOWN: *** Final schedule for %bb.0 *** +# TOPDOWN-NEXT: * Schedule table (TopDown): +# TOPDOWN-NEXT: i: issue +# TOPDOWN-NEXT: x: resource booked +# TOPDOWN-NEXT: Cycle | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | +# TOPDOWN-NEXT: SU(0) | i | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +# TOPDOWN-NEXT: SiFive7PipeAB | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +# TOPDOWN-NEXT: SiFive7PipeB | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +# TOPDOWN-NEXT: SU(1) | i | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +# TOPDOWN-NEXT: SiFive7PipeAB | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +# TOPDOWN-NEXT: SU(2) | | | | i | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +# TOPDOWN-NEXT: SiFive7PipeAB | | | | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +# TOPDOWN-NEXT: SiFive7PipeB | | | | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +# TOPDOWN-NEXT: SiFive7IDiv | | | | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | +# TOPDOWN-NEXT: SU(0): renamable $x12 = MUL renamable $x11, renamable $x10 +# TOPDOWN-NEXT: SU(1): renamable $x13 = ADD renamable $x11, renamable $x10 +# TOPDOWN-NEXT: SU(2): renamable $x14 = DIVW renamable $x12, renamable $x13 + +# BOTTOMUP: *** Final schedule for %bb.0 *** +# BOTTOMUP-NEXT: * Schedule table (BottomUp): +# BOTTOMUP-NEXT: i: issue +# BOTTOMUP-NEXT: x: resource booked +# BOTTOMUP-NEXT: Cycle | 37 | 36 | 35 | 34 | 33 | 32 | 31 | 30 | 29 | 28 
| 27 | 26 | 25 | 24 | 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 | 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | +# BOTTOMUP-NEXT: SU(1) | i | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +# BOTTOMUP-NEXT: SiFive7PipeAB | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +# BOTTOMUP-NEXT: SU(0) | i | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +# BOTTOMUP-NEXT: SiFive7PipeAB | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +# BOTTOMUP-NEXT: SiFive7PipeB | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +# BOTTOMUP-NEXT: SU(2) | | | | i | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +# BOTTOMUP-NEXT: SiFive7PipeAB | | | | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +# BOTTOMUP-NEXT: SiFive7PipeB | | | | x | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +# BOTTOMUP-NEXT: SiFive7IDiv | | | | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | x | +# BOTTOMUP-NEXT: SU(1): renamable $x13 = ADD renamable $x11, renamable $x10 +# BOTTOMUP-NEXT: SU(0): renamable $x12 = MUL renamable $x11, renamable $x10 +# BOTTOMUP-NEXT: SU(2): renamable $x14 = DIVW renamable $x12, renamable $x13 From 770694539bde0f1b706c70d87dd70777c94f178f Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 27 Feb 2024 11:57:47 -0600 Subject: [PATCH 466/546] [libc] Work around incorrect fmin/fmax results for +/-x (#83158) Summary: The IEEE 754 standard as of the 2019 revision states that for fmin -0.0 is always less than 0.0 and for fmax 0.0 is always greater than 0.0. These are currently not respected by the builtin value and thus cause the tests to fail. This patch works around it in the implementation for now by explicitly modifying the sign bit. 
--- libc/src/math/amdgpu/fmax.cpp | 7 + libc/src/math/amdgpu/fmaxf.cpp | 7 + libc/src/math/amdgpu/fmin.cpp | 7 + libc/src/math/amdgpu/fminf.cpp | 7 + libc/src/math/nvptx/fmax.cpp | 7 + libc/src/math/nvptx/fmaxf.cpp | 7 + libc/src/math/nvptx/fmin.cpp | 7 + libc/src/math/nvptx/fminf.cpp | 7 + libc/test/src/math/smoke/CMakeLists.txt | 195 ++++++++++++------------ 9 files changed, 152 insertions(+), 99 deletions(-) diff --git a/libc/src/math/amdgpu/fmax.cpp b/libc/src/math/amdgpu/fmax.cpp index a2c35371d12b6..09624cc6f092a 100644 --- a/libc/src/math/amdgpu/fmax.cpp +++ b/libc/src/math/amdgpu/fmax.cpp @@ -7,11 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/fmax.h" + +#include "src/__support/CPP/bit.h" #include "src/__support/common.h" +#include "src/__support/macros/optimization.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(double, fmax, (double x, double y)) { + // FIXME: The builtin function does not correctly handle the +/-0.0 case. + if (LIBC_UNLIKELY(x == y)) + return cpp::bit_cast(cpp::bit_cast(x) & + cpp::bit_cast(y)); return __builtin_fmax(x, y); } diff --git a/libc/src/math/amdgpu/fmaxf.cpp b/libc/src/math/amdgpu/fmaxf.cpp index 67178b3e27357..f6ed46699a049 100644 --- a/libc/src/math/amdgpu/fmaxf.cpp +++ b/libc/src/math/amdgpu/fmaxf.cpp @@ -7,11 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/fmaxf.h" + +#include "src/__support/CPP/bit.h" #include "src/__support/common.h" +#include "src/__support/macros/optimization.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, fmaxf, (float x, float y)) { + // FIXME: The builtin function does not correctly handle the +/-0.0 case. 
+ if (LIBC_UNLIKELY(x == y)) + return cpp::bit_cast(cpp::bit_cast(x) & + cpp::bit_cast(y)); return __builtin_fmaxf(x, y); } diff --git a/libc/src/math/amdgpu/fmin.cpp b/libc/src/math/amdgpu/fmin.cpp index 7303adcd347ee..8977ff7a066c6 100644 --- a/libc/src/math/amdgpu/fmin.cpp +++ b/libc/src/math/amdgpu/fmin.cpp @@ -7,11 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/fmin.h" + +#include "src/__support/CPP/bit.h" #include "src/__support/common.h" +#include "src/__support/macros/optimization.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(double, fmin, (double x, double y)) { + // FIXME: The builtin function does not correctly handle the +/-0.0 case. + if (LIBC_UNLIKELY(x == y)) + return cpp::bit_cast(cpp::bit_cast(x) | + cpp::bit_cast(y)); return __builtin_fmin(x, y); } diff --git a/libc/src/math/amdgpu/fminf.cpp b/libc/src/math/amdgpu/fminf.cpp index bbf0c677b5e3a..3be55257f6164 100644 --- a/libc/src/math/amdgpu/fminf.cpp +++ b/libc/src/math/amdgpu/fminf.cpp @@ -7,11 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/fminf.h" + +#include "src/__support/CPP/bit.h" #include "src/__support/common.h" +#include "src/__support/macros/optimization.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, fminf, (float x, float y)) { + // FIXME: The builtin function does not correctly handle the +/-0.0 case. 
+ if (LIBC_UNLIKELY(x == y)) + return cpp::bit_cast(cpp::bit_cast(x) | + cpp::bit_cast(y)); return __builtin_fminf(x, y); } diff --git a/libc/src/math/nvptx/fmax.cpp b/libc/src/math/nvptx/fmax.cpp index a2c35371d12b6..09624cc6f092a 100644 --- a/libc/src/math/nvptx/fmax.cpp +++ b/libc/src/math/nvptx/fmax.cpp @@ -7,11 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/fmax.h" + +#include "src/__support/CPP/bit.h" #include "src/__support/common.h" +#include "src/__support/macros/optimization.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(double, fmax, (double x, double y)) { + // FIXME: The builtin function does not correctly handle the +/-0.0 case. + if (LIBC_UNLIKELY(x == y)) + return cpp::bit_cast(cpp::bit_cast(x) & + cpp::bit_cast(y)); return __builtin_fmax(x, y); } diff --git a/libc/src/math/nvptx/fmaxf.cpp b/libc/src/math/nvptx/fmaxf.cpp index 67178b3e27357..f6ed46699a049 100644 --- a/libc/src/math/nvptx/fmaxf.cpp +++ b/libc/src/math/nvptx/fmaxf.cpp @@ -7,11 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/fmaxf.h" + +#include "src/__support/CPP/bit.h" #include "src/__support/common.h" +#include "src/__support/macros/optimization.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, fmaxf, (float x, float y)) { + // FIXME: The builtin function does not correctly handle the +/-0.0 case. 
+ if (LIBC_UNLIKELY(x == y)) + return cpp::bit_cast(cpp::bit_cast(x) & + cpp::bit_cast(y)); return __builtin_fmaxf(x, y); } diff --git a/libc/src/math/nvptx/fmin.cpp b/libc/src/math/nvptx/fmin.cpp index 7303adcd347ee..8977ff7a066c6 100644 --- a/libc/src/math/nvptx/fmin.cpp +++ b/libc/src/math/nvptx/fmin.cpp @@ -7,11 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/fmin.h" + +#include "src/__support/CPP/bit.h" #include "src/__support/common.h" +#include "src/__support/macros/optimization.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(double, fmin, (double x, double y)) { + // FIXME: The builtin function does not correctly handle the +/-0.0 case. + if (LIBC_UNLIKELY(x == y)) + return cpp::bit_cast(cpp::bit_cast(x) | + cpp::bit_cast(y)); return __builtin_fmin(x, y); } diff --git a/libc/src/math/nvptx/fminf.cpp b/libc/src/math/nvptx/fminf.cpp index bbf0c677b5e3a..3be55257f6164 100644 --- a/libc/src/math/nvptx/fminf.cpp +++ b/libc/src/math/nvptx/fminf.cpp @@ -7,11 +7,18 @@ //===----------------------------------------------------------------------===// #include "src/math/fminf.h" + +#include "src/__support/CPP/bit.h" #include "src/__support/common.h" +#include "src/__support/macros/optimization.h" namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, fminf, (float x, float y)) { + // FIXME: The builtin function does not correctly handle the +/-0.0 case. + if (LIBC_UNLIKELY(x == y)) + return cpp::bit_cast(cpp::bit_cast(x) | + cpp::bit_cast(y)); return __builtin_fminf(x, y); } diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 7ef87a0dd2eec..825000e1cb7af 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -1092,112 +1092,109 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) -# FIXME: These tests are currently broken on the GPU. 
-if(NOT LIBC_TARGET_OS_IS_GPU) - add_fp_unittest( - fminf_test - SUITE - libc-math-smoke-tests - SRCS - fminf_test.cpp - HDRS - FMinTest.h - DEPENDS - libc.src.math.fminf - libc.src.__support.FPUtil.fp_bits - ) +add_fp_unittest( + fminf_test + SUITE + libc-math-smoke-tests + SRCS + fminf_test.cpp + HDRS + FMinTest.h + DEPENDS + libc.src.math.fminf + libc.src.__support.FPUtil.fp_bits +) - add_fp_unittest( - fmin_test - SUITE - libc-math-smoke-tests - SRCS - fmin_test.cpp - HDRS - FMinTest.h - DEPENDS - libc.src.math.fmin - libc.src.__support.FPUtil.fp_bits - ) +add_fp_unittest( + fmin_test + SUITE + libc-math-smoke-tests + SRCS + fmin_test.cpp + HDRS + FMinTest.h + DEPENDS + libc.src.math.fmin + libc.src.__support.FPUtil.fp_bits +) - add_fp_unittest( - fminl_test - SUITE - libc-math-smoke-tests - SRCS - fminl_test.cpp - HDRS - FMinTest.h - DEPENDS - libc.src.math.fminl - libc.src.__support.FPUtil.fp_bits - ) +add_fp_unittest( + fminl_test + SUITE + libc-math-smoke-tests + SRCS + fminl_test.cpp + HDRS + FMinTest.h + DEPENDS + libc.src.math.fminl + libc.src.__support.FPUtil.fp_bits +) - add_fp_unittest( - fminf128_test - SUITE - libc-math-smoke-tests - SRCS - fminf128_test.cpp - HDRS - FMinTest.h - DEPENDS - libc.src.math.fminf128 - libc.src.__support.FPUtil.fp_bits - ) +add_fp_unittest( + fminf128_test + SUITE + libc-math-smoke-tests + SRCS + fminf128_test.cpp + HDRS + FMinTest.h + DEPENDS + libc.src.math.fminf128 + libc.src.__support.FPUtil.fp_bits +) - add_fp_unittest( - fmaxf_test - SUITE - libc-math-smoke-tests - SRCS - fmaxf_test.cpp - HDRS - FMaxTest.h - DEPENDS - libc.src.math.fmaxf - libc.src.__support.FPUtil.fp_bits - ) +add_fp_unittest( + fmaxf_test + SUITE + libc-math-smoke-tests + SRCS + fmaxf_test.cpp + HDRS + FMaxTest.h + DEPENDS + libc.src.math.fmaxf + libc.src.__support.FPUtil.fp_bits +) - add_fp_unittest( - fmax_test - SUITE - libc-math-smoke-tests - SRCS - fmax_test.cpp - HDRS - FMaxTest.h - DEPENDS - libc.src.math.fmax - 
libc.src.__support.FPUtil.fp_bits - ) +add_fp_unittest( + fmax_test + SUITE + libc-math-smoke-tests + SRCS + fmax_test.cpp + HDRS + FMaxTest.h + DEPENDS + libc.src.math.fmax + libc.src.__support.FPUtil.fp_bits +) - add_fp_unittest( - fmaxl_test - SUITE - libc-math-smoke-tests - SRCS - fmaxl_test.cpp - HDRS - FMaxTest.h - DEPENDS - libc.src.math.fmaxl - libc.src.__support.FPUtil.fp_bits - ) +add_fp_unittest( + fmaxl_test + SUITE + libc-math-smoke-tests + SRCS + fmaxl_test.cpp + HDRS + FMaxTest.h + DEPENDS + libc.src.math.fmaxl + libc.src.__support.FPUtil.fp_bits +) - add_fp_unittest( - fmaxf128_test - SUITE - libc-math-smoke-tests - SRCS - fmaxf128_test.cpp - HDRS - FMaxTest.h - DEPENDS - libc.src.math.fmaxf128 - libc.src.__support.FPUtil.fp_bits - ) -endif() +add_fp_unittest( + fmaxf128_test + SUITE + libc-math-smoke-tests + SRCS + fmaxf128_test.cpp + HDRS + FMaxTest.h + DEPENDS + libc.src.math.fmaxf128 + libc.src.__support.FPUtil.fp_bits +) add_fp_unittest( sqrtf_test From f4fad827ca2060ebf31b7e26b5ff6604bd18015b Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Tue, 27 Feb 2024 09:59:56 -0800 Subject: [PATCH 467/546] [NFC] Add REQUIRES: asserts to limit the test to debug only. 
(#83145) --- mlir/test/Target/LLVMIR/erase-dangling-constants.mlir | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/test/Target/LLVMIR/erase-dangling-constants.mlir b/mlir/test/Target/LLVMIR/erase-dangling-constants.mlir index b3b5d540ae88f..dbb6755956003 100644 --- a/mlir/test/Target/LLVMIR/erase-dangling-constants.mlir +++ b/mlir/test/Target/LLVMIR/erase-dangling-constants.mlir @@ -1,3 +1,4 @@ +// REQUIRES: asserts // RUN: mlir-translate -mlir-to-llvmir %s -debug-only=llvm-dialect-to-llvm-ir 2>&1 | FileCheck %s // CHECK: Convert initializer for dup_const From 335ac4108dda907e152ac22e03b53c30860157a8 Mon Sep 17 00:00:00 2001 From: DeanSturtevant1 Date: Tue, 27 Feb 2024 13:01:25 -0500 Subject: [PATCH 468/546] Improve readability of "undefined reference" message (#82671) The current message implies a command line flag caused an undefined reference. This of course is wrong and causes confusion. The message now more accurately reflects the true state of affairs. --- lld/ELF/Writer.cpp | 5 +++-- lld/test/ELF/allow-shlib-undefined.s | 16 ++++++++-------- lld/test/ELF/unresolved-symbols.s | 2 +- lld/test/ELF/wrap-shlib-undefined.s | 2 +- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 0bbf43ddf694a..a9292b3b1a224 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -2057,8 +2057,9 @@ template void Writer::finalizeSections() { if (sym->dsoDefined) continue; if (sym->isUndefined() && !sym->isWeak()) { - diagnose("undefined reference due to --no-allow-shlib-undefined: " + - toString(*sym) + "\n>>> referenced by " + toString(file)); + diagnose("undefined reference: " + toString(*sym) + + "\n>>> referenced by " + toString(file) + + " (disallowed by --no-allow-shlib-undefined)"); } else if (sym->isDefined() && sym->computeBinding() == STB_LOCAL) { diagnose("non-exported symbol '" + toString(*sym) + "' in '" + toString(sym->file) + "' is referenced by DSO '" + diff --git 
a/lld/test/ELF/allow-shlib-undefined.s b/lld/test/ELF/allow-shlib-undefined.s index 969e87b69eb85..4b7151c8bc0d5 100644 --- a/lld/test/ELF/allow-shlib-undefined.s +++ b/lld/test/ELF/allow-shlib-undefined.s @@ -43,19 +43,19 @@ # RUN: ld.lld --gc-sections main.o a.so def.so def-hidden.o --fatal-warnings -o /dev/null # CHECK-NOT: error: -# CHECK: error: undefined reference due to --no-allow-shlib-undefined: x1{{$}} -# CHECK-NEXT: >>> referenced by a.so{{$}} +# CHECK: error: undefined reference: x1{{$}} +# CHECK-NEXT: >>> referenced by a.so (disallowed by --no-allow-shlib-undefined){{$}} # CHECK-NOT: {{.}} # CHECK2-NOT: error: -# CHECK2: error: undefined reference due to --no-allow-shlib-undefined: x1 -# CHECK2-NEXT: >>> referenced by a.so -# CHECK2: error: undefined reference due to --no-allow-shlib-undefined: x1 -# CHECK2-NEXT: >>> referenced by b.so +# CHECK2: error: undefined reference: x1 +# CHECK2-NEXT: >>> referenced by a.so (disallowed by --no-allow-shlib-undefined) +# CHECK2: error: undefined reference: x1 +# CHECK2-NEXT: >>> referenced by b.so (disallowed by --no-allow-shlib-undefined) # CHECK2-NOT: {{.}} -# WARN: warning: undefined reference due to --no-allow-shlib-undefined: x1 -# WARN-NEXT: >>> referenced by a.so +# WARN: warning: undefined reference: x1 +# WARN-NEXT: >>> referenced by a.so (disallowed by --no-allow-shlib-undefined) # NONEXPORTED-NOT: error: # NONEXPORTED: error: non-exported symbol 'x1' in 'def-hidden.o' is referenced by DSO 'a.so' diff --git a/lld/test/ELF/unresolved-symbols.s b/lld/test/ELF/unresolved-symbols.s index 68fe1e7065a84..91194d376ca88 100644 --- a/lld/test/ELF/unresolved-symbols.s +++ b/lld/test/ELF/unresolved-symbols.s @@ -36,7 +36,7 @@ ## --unresolved-symbols overrides a previous --allow-shlib-undefined. 
# RUN: not ld.lld %t1.o %t.so -o /dev/null --allow-shlib-undefined --unresolved-symbols=ignore-in-object-files 2>&1 | FileCheck %s --check-prefix=SHLIB -# SHLIB: error: undefined reference due to --no-allow-shlib-undefined: undef +# SHLIB: error: undefined reference: undef ## Ignoring undefines in shared should produce error for symbol from object. # RUN: not ld.lld %t2.o -o /dev/null --unresolved-symbols=ignore-in-shared-libs 2>&1 | \ diff --git a/lld/test/ELF/wrap-shlib-undefined.s b/lld/test/ELF/wrap-shlib-undefined.s index 96c4629cb497e..0692ae30e889b 100644 --- a/lld/test/ELF/wrap-shlib-undefined.s +++ b/lld/test/ELF/wrap-shlib-undefined.s @@ -17,7 +17,7 @@ ## --no-allow-shlib-undefined errors because __real_foo is not defined. # RUN: not ld.lld %t/main.o %t/bar.so -o /dev/null 2>&1 | FileCheck --check-prefix=ERR %s -# ERR: error: undefined reference due to --no-allow-shlib-undefined: __real_foo +# ERR: error: undefined reference: __real_foo ## --wrap=foo defines __real_foo. # RUN: ld.lld %t/main.o %t/bar.so --wrap=foo -o %t2 From 12df1cfdd130d8d2648881b62061153a2732bee4 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Tue, 27 Feb 2024 18:03:10 +0000 Subject: [PATCH 469/546] [LLVM] Remove bogus whitespace from AArch64TargetParser.h --- llvm/include/llvm/TargetParser/AArch64TargetParser.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 7376ac98a2b09..93e9ed46642df 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -116,7 +116,7 @@ enum ArchExtKind : unsigned { AEK_PROFILE = 7, // FEAT_SPE AEK_RAS = 8, // FEAT_RAS, FEAT_RASv1p1 AEK_LSE = 9, // FEAT_LSE - AEK_SVE = 10, // FEAT_SVE + AEK_SVE = 10, // FEAT_SVE AEK_DOTPROD = 11, // FEAT_DotProd AEK_RCPC = 12, // FEAT_LRCPC AEK_RDM = 13, // FEAT_RDM From 0ef66fcc858cc8abb978d83d48b3e7a8b23742c9 Mon Sep 17 
00:00:00 2001 From: Alex Langford Date: Tue, 27 Feb 2024 10:25:56 -0800 Subject: [PATCH 470/546] [lldb] Use CreateOptionParsingError in CommandObjectBreakpoint (#83086) This updates the remaining SetOptionValue methods in CommandObjectBreakpoint to use CreateOptionParsingError. I found a few minor bugs that were fixed during this refactor (e.g. using the wrong flag in an error message). That is one of the benefits of centralizing error message creation. I also found some option parsing code that is written incorrectly. I do not make an attempt to update those here because this PR is primarily about changing existing error handling code, not adding new error handling code. --- lldb/include/lldb/Interpreter/Options.h | 2 + .../Commands/CommandObjectBreakpoint.cpp | 101 +++++++++--------- 2 files changed, 54 insertions(+), 49 deletions(-) diff --git a/lldb/include/lldb/Interpreter/Options.h b/lldb/include/lldb/Interpreter/Options.h index 18a87e49deee5..9a6a17c2793fa 100644 --- a/lldb/include/lldb/Interpreter/Options.h +++ b/lldb/include/lldb/Interpreter/Options.h @@ -368,6 +368,8 @@ static constexpr llvm::StringLiteral g_bool_parsing_error_message = "Failed to parse as boolean"; static constexpr llvm::StringLiteral g_int_parsing_error_message = "Failed to parse as integer"; +static constexpr llvm::StringLiteral g_language_parsing_error_message = + "Unknown language"; } // namespace lldb_private diff --git a/lldb/source/Commands/CommandObjectBreakpoint.cpp b/lldb/source/Commands/CommandObjectBreakpoint.cpp index fc2217608a0bb..cfb0b87a59e64 100644 --- a/lldb/source/Commands/CommandObjectBreakpoint.cpp +++ b/lldb/source/Commands/CommandObjectBreakpoint.cpp @@ -266,6 +266,8 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { Status error; const int short_option = g_breakpoint_set_options[option_idx].short_option; + const char *long_option = + g_breakpoint_set_options[option_idx].long_option; switch (short_option) { case 'a': { @@ -284,13 +286,15 @@ class 
CommandObjectBreakpointSet : public CommandObjectParsed { case 'u': if (option_arg.getAsInteger(0, m_column)) - error.SetErrorStringWithFormat("invalid column number: %s", - option_arg.str().c_str()); + error = + CreateOptionParsingError(option_arg, short_option, long_option, + g_int_parsing_error_message); break; case 'E': { LanguageType language = Language::GetLanguageTypeFromString(option_arg); + llvm::StringRef error_context; switch (language) { case eLanguageTypeC89: case eLanguageTypeC: @@ -308,19 +312,18 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { m_exception_language = eLanguageTypeObjC; break; case eLanguageTypeObjC_plus_plus: - error.SetErrorStringWithFormat( - "Set exception breakpoints separately for c++ and objective-c"); + error_context = + "Set exception breakpoints separately for c++ and objective-c"; break; case eLanguageTypeUnknown: - error.SetErrorStringWithFormat( - "Unknown language type: '%s' for exception breakpoint", - option_arg.str().c_str()); + error_context = "Unknown language type for exception breakpoint"; break; default: - error.SetErrorStringWithFormat( - "Unsupported language type: '%s' for exception breakpoint", - option_arg.str().c_str()); + error_context = "Unsupported language type for exception breakpoint"; } + if (!error_context.empty()) + error = CreateOptionParsingError(option_arg, short_option, + long_option, error_context); } break; case 'f': @@ -336,9 +339,9 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { bool success; m_catch_bp = OptionArgParser::ToBoolean(option_arg, true, &success); if (!success) - error.SetErrorStringWithFormat( - "Invalid boolean value for on-catch option: '%s'", - option_arg.str().c_str()); + error = + CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message); } break; case 'H': @@ -355,23 +358,24 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { m_skip_prologue = eLazyBoolNo; if (!success) - 
error.SetErrorStringWithFormat( - "Invalid boolean value for skip prologue option: '%s'", - option_arg.str().c_str()); + error = + CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message); } break; case 'l': if (option_arg.getAsInteger(0, m_line_num)) - error.SetErrorStringWithFormat("invalid line number: %s.", - option_arg.str().c_str()); + error = + CreateOptionParsingError(option_arg, short_option, long_option, + g_int_parsing_error_message); break; case 'L': m_language = Language::GetLanguageTypeFromString(option_arg); if (m_language == eLanguageTypeUnknown) - error.SetErrorStringWithFormat( - "Unknown language type: '%s' for breakpoint", - option_arg.str().c_str()); + error = + CreateOptionParsingError(option_arg, short_option, long_option, + g_language_parsing_error_message); break; case 'm': { @@ -384,9 +388,9 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { m_move_to_nearest_code = eLazyBoolNo; if (!success) - error.SetErrorStringWithFormat( - "Invalid boolean value for move-to-nearest-code option: '%s'", - option_arg.str().c_str()); + error = + CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message); break; } @@ -404,8 +408,8 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { if (BreakpointID::StringIsBreakpointName(option_arg, error)) m_breakpoint_names.push_back(std::string(option_arg)); else - error.SetErrorStringWithFormat("Invalid breakpoint name: %s", - option_arg.str().c_str()); + error = CreateOptionParsingError( + option_arg, short_option, long_option, "Invalid breakpoint name"); break; } @@ -443,9 +447,9 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { bool success; m_throw_bp = OptionArgParser::ToBoolean(option_arg, true, &success); if (!success) - error.SetErrorStringWithFormat( - "Invalid boolean value for on-throw option: '%s'", - option_arg.str().c_str()); + error = + CreateOptionParsingError(option_arg, 
short_option, long_option, + g_bool_parsing_error_message); } break; case 'X': @@ -457,9 +461,8 @@ class CommandObjectBreakpointSet : public CommandObjectParsed { OptionValueFileColonLine value; Status fcl_err = value.SetValueFromString(option_arg); if (!fcl_err.Success()) { - error.SetErrorStringWithFormat( - "Invalid value for file:line specifier: %s", - fcl_err.AsCString()); + error = CreateOptionParsingError(option_arg, short_option, + long_option, fcl_err.AsCString()); } else { m_filenames.AppendIfUnique(value.GetFileSpec()); m_line_num = value.GetLineNumber(); @@ -1557,6 +1560,7 @@ class BreakpointNameOptionGroup : public OptionGroup { ExecutionContext *execution_context) override { Status error; const int short_option = g_breakpoint_name_options[option_idx].short_option; + const char *long_option = g_breakpoint_name_options[option_idx].long_option; switch (short_option) { case 'N': @@ -1566,15 +1570,13 @@ class BreakpointNameOptionGroup : public OptionGroup { break; case 'B': if (m_breakpoint.SetValueFromString(option_arg).Fail()) - error.SetErrorStringWithFormat( - "unrecognized value \"%s\" for breakpoint", - option_arg.str().c_str()); + error = CreateOptionParsingError(option_arg, short_option, long_option, + g_int_parsing_error_message); break; case 'D': if (m_use_dummy.SetValueFromString(option_arg).Fail()) - error.SetErrorStringWithFormat( - "unrecognized value \"%s\" for use-dummy", - option_arg.str().c_str()); + error = CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message); break; case 'H': m_help_string.SetValueFromString(option_arg); @@ -1617,6 +1619,8 @@ class BreakpointAccessOptionGroup : public OptionGroup { Status error; const int short_option = g_breakpoint_access_options[option_idx].short_option; + const char *long_option = + g_breakpoint_access_options[option_idx].long_option; switch (short_option) { case 'L': { @@ -1625,9 +1629,8 @@ class BreakpointAccessOptionGroup : public OptionGroup { if 
(success) { m_permissions.SetAllowList(value); } else - error.SetErrorStringWithFormat( - "invalid boolean value '%s' passed for -L option", - option_arg.str().c_str()); + error = CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message); } break; case 'A': { bool value, success; @@ -1635,9 +1638,8 @@ class BreakpointAccessOptionGroup : public OptionGroup { if (success) { m_permissions.SetAllowDisable(value); } else - error.SetErrorStringWithFormat( - "invalid boolean value '%s' passed for -L option", - option_arg.str().c_str()); + error = CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message); } break; case 'D': { bool value, success; @@ -1645,9 +1647,8 @@ class BreakpointAccessOptionGroup : public OptionGroup { if (success) { m_permissions.SetAllowDelete(value); } else - error.SetErrorStringWithFormat( - "invalid boolean value '%s' passed for -L option", - option_arg.str().c_str()); + error = CreateOptionParsingError(option_arg, short_option, long_option, + g_bool_parsing_error_message); } break; default: llvm_unreachable("Unimplemented option"); @@ -2141,6 +2142,8 @@ class CommandObjectBreakpointRead : public CommandObjectParsed { ExecutionContext *execution_context) override { Status error; const int short_option = m_getopt_table[option_idx].val; + const char *long_option = + m_getopt_table[option_idx].definition->long_option; switch (short_option) { case 'f': @@ -2150,8 +2153,8 @@ class CommandObjectBreakpointRead : public CommandObjectParsed { Status name_error; if (!BreakpointID::StringIsBreakpointName(llvm::StringRef(option_arg), name_error)) { - error.SetErrorStringWithFormat("Invalid breakpoint name: %s", - name_error.AsCString()); + error = CreateOptionParsingError(option_arg, short_option, + long_option, name_error.AsCString()); } m_names.push_back(std::string(option_arg)); break; From abc693fb4051dfb3a49ba2dcdbc2d164c53f2a51 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: 
Tue, 27 Feb 2024 18:32:15 +0000 Subject: [PATCH 471/546] [AArch64] Skip over shadow space for ARM64EC entry thunk variadic calls (#80994) When in an entry thunk the x64 SP is passed in x4 but this cannot be directly passed through since x64 varargs calls have a 32 byte shadow store at SP followed by the in-stack parameters. ARM64EC varargs calls on the other hand expect x4 to point to the first in-stack parameter. --- .../AArch64/AArch64Arm64ECCallLowering.cpp | 35 ++++++++++++++----- .../AArch64/AArch64CallingConvention.td | 3 ++ .../CodeGen/AArch64/arm64ec-entry-thunks.ll | 2 +- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index a99856dcc9439..f147ded2ab701 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -119,8 +119,8 @@ void AArch64Arm64ECCallLowering::getThunkType(FunctionType *FT, getThunkArgTypes(FT, AttrList, TT, Out, Arm64ArgTypes, X64ArgTypes, HasSretPtr); - Arm64Ty = FunctionType::get(Arm64RetTy, Arm64ArgTypes, - TT == ThunkType::Entry && FT->isVarArg()); + Arm64Ty = FunctionType::get(Arm64RetTy, Arm64ArgTypes, false); + X64Ty = FunctionType::get(X64RetTy, X64ArgTypes, false); } @@ -158,13 +158,13 @@ void AArch64Arm64ECCallLowering::getThunkArgTypes( X64ArgTypes.push_back(I64Ty); } + // x4 + Arm64ArgTypes.push_back(PtrTy); + X64ArgTypes.push_back(PtrTy); + // x5 + Arm64ArgTypes.push_back(I64Ty); if (TT != ThunkType::Entry) { - // x4 - Arm64ArgTypes.push_back(PtrTy); - X64ArgTypes.push_back(PtrTy); - // x5 - Arm64ArgTypes.push_back(I64Ty); - // FIXME: x5 isn't actually passed/used by the x64 side; revisit once we + // FIXME: x5 isn't actually used by the x64 side; revisit once we // have proper isel for varargs X64ArgTypes.push_back(I64Ty); } @@ -473,10 +473,11 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) { bool 
TransformDirectToSRet = X64RetType->isVoidTy() && !RetTy->isVoidTy(); unsigned ThunkArgOffset = TransformDirectToSRet ? 2 : 1; + unsigned PassthroughArgSize = F->isVarArg() ? 5 : Thunk->arg_size(); // Translate arguments to call. SmallVector Args; - for (unsigned i = ThunkArgOffset, e = Thunk->arg_size(); i != e; ++i) { + for (unsigned i = ThunkArgOffset, e = PassthroughArgSize; i != e; ++i) { Value *Arg = Thunk->getArg(i); Type *ArgTy = Arm64Ty->getParamType(i - ThunkArgOffset); if (ArgTy->isArrayTy() || ArgTy->isStructTy() || @@ -493,6 +494,22 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) { Args.push_back(Arg); } + if (F->isVarArg()) { + // The 5th argument to variadic entry thunks is used to model the x64 sp + // which is passed to the thunk in x4, this can be passed to the callee as + // the variadic argument start address after skipping over the 32 byte + // shadow store. + + // The EC thunk CC will assign any argument marked as InReg to x4. + Thunk->addParamAttr(5, Attribute::InReg); + Value *Arg = Thunk->getArg(5); + Arg = IRB.CreatePtrAdd(Arg, IRB.getInt64(0x20)); + Args.push_back(Arg); + + // Pass in a zero variadic argument size (in x5). + Args.push_back(IRB.getInt64(0)); + } + // Call the function passed to the thunk. Value *Callee = Thunk->getArg(0); Callee = IRB.CreateBitCast(Callee, PtrTy); diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 78ea4a5180f70..8e67f0f5c8815 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -213,6 +213,9 @@ def CC_AArch64_Arm64EC_VarArg : CallingConv<[ // address is passed in X9. let Entry = 1 in def CC_AArch64_Arm64EC_Thunk : CallingConv<[ + // ARM64EC-specific: the InReg attribute can be used to access the x64 sp passed into entry thunks in x4 from the IR. 
+ CCIfInReg>>, + // Byval aggregates are passed by pointer CCIfByVal>, diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll index 0083818def151..bb9ba05f7a272 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll @@ -147,7 +147,7 @@ define void @has_varargs(...) nounwind { ; CHECK-NEXT: add x29, sp, #160 ; CHECK-NEXT: .seh_add_fp 160 ; CHECK-NEXT: .seh_endprologue -; CHECK-NEXT: mov x4, sp +; CHECK-NEXT: add x4, x4, #32 ; CHECK-NEXT: mov x5, xzr ; CHECK-NEXT: blr x9 ; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_ret From 2d704f4bf2edb0f9343dac818ab4d29442be9968 Mon Sep 17 00:00:00 2001 From: jimingham Date: Tue, 27 Feb 2024 10:34:01 -0800 Subject: [PATCH 472/546] Start to clean up the process of defining command arguments. (#83097) Partly, there's just a lot of unnecessary boiler plate. It's also possible to define combinations of arguments that make no sense (e.g. eArgRepeatPlus followed by eArgRepeatPlain...) but these are never checked since we just push_back directly into the argument definitions. This commit is step 1 of this cleanup - do the obvious stuff. In it, all the simple homogenous argument lists and the breakpoint/watchpoint ID/Range types, are set with common functions. This is an NFC change, it just centralizes boiler plate. There's no checking yet because you can't get a single argument wrong. The end goal is that all argument definition goes through functions and m_arguments is hidden so that you can't define inconsistent argument sets. 
--- lldb/include/lldb/Interpreter/CommandObject.h | 20 ++- lldb/source/Commands/CommandObjectApropos.cpp | 14 +-- .../Commands/CommandObjectBreakpoint.cpp | 72 ++--------- .../CommandObjectBreakpointCommand.cpp | 42 +------ .../source/Commands/CommandObjectCommands.cpp | 119 ++---------------- .../Commands/CommandObjectDWIMPrint.cpp | 3 +- .../Commands/CommandObjectExpression.cpp | 14 +-- lldb/source/Commands/CommandObjectFrame.cpp | 59 +-------- lldb/source/Commands/CommandObjectHelp.cpp | 13 +- lldb/source/Commands/CommandObjectLog.cpp | 56 +-------- .../source/Commands/CommandObjectPlatform.cpp | 80 ++---------- lldb/source/Commands/CommandObjectPlugin.cpp | 14 +-- lldb/source/Commands/CommandObjectProcess.cpp | 50 ++------ lldb/source/Commands/CommandObjectQuit.cpp | 3 +- .../source/Commands/CommandObjectRegister.cpp | 22 +--- lldb/source/Commands/CommandObjectSession.cpp | 4 +- .../source/Commands/CommandObjectSettings.cpp | 42 +------ lldb/source/Commands/CommandObjectTarget.cpp | 107 +++------------- lldb/source/Commands/CommandObjectThread.cpp | 90 ++----------- .../Commands/CommandObjectThreadUtil.cpp | 6 +- lldb/source/Commands/CommandObjectTrace.cpp | 9 +- lldb/source/Commands/CommandObjectType.cpp | 113 ++--------------- .../Commands/CommandObjectWatchpoint.cpp | 68 ++-------- .../CommandObjectWatchpointCommand.cpp | 42 +------ lldb/source/Interpreter/CommandObject.cpp | 39 ++++-- .../ItaniumABI/ItaniumABILanguageRuntime.cpp | 14 +-- .../AppleObjCRuntime/AppleObjCRuntimeV2.cpp | 28 +---- .../Process/gdb-remote/ProcessGDBRemote.cpp | 6 +- 28 files changed, 162 insertions(+), 987 deletions(-) diff --git a/lldb/include/lldb/Interpreter/CommandObject.h b/lldb/include/lldb/Interpreter/CommandObject.h index a326c6dc38a37..a641a468b49d2 100644 --- a/lldb/include/lldb/Interpreter/CommandObject.h +++ b/lldb/include/lldb/Interpreter/CommandObject.h @@ -207,6 +207,20 @@ class CommandObject : public std::enable_shared_from_this { static const ArgumentTableEntry * 
FindArgumentDataByType(lldb::CommandArgumentType arg_type); + // Sets the argument list for this command to one homogenous argument type, + // with the repeat specified. + void AddSimpleArgumentList( + lldb::CommandArgumentType arg_type, + ArgumentRepetitionType repetition_type = eArgRepeatPlain); + + // Helper function to set BP IDs or ID ranges as the command argument data + // for this command. + // This used to just populate an entry you could add to, but that was never + // used. If we ever need that we can take optional extra args here. + // Use this to define a simple argument list: + enum IDType { eBreakpointArgs = 0, eWatchpointArgs = 1 }; + void AddIDsArgumentData(IDType type); + int GetNumArgumentEntries(); CommandArgumentEntry *GetArgumentEntryAtIndex(int idx); @@ -391,12 +405,6 @@ class CommandObject : public std::enable_shared_from_this { lldb_private::CommandOverrideCallbackWithResult m_command_override_callback; void *m_command_override_baton; bool m_is_user_command = false; - - // Helper function to populate IDs or ID ranges as the command argument data - // to the specified command argument entry. - static void AddIDsArgumentData(CommandArgumentEntry &arg, - lldb::CommandArgumentType ID, - lldb::CommandArgumentType IDRange); }; class CommandObjectParsed : public CommandObject { diff --git a/lldb/source/Commands/CommandObjectApropos.cpp b/lldb/source/Commands/CommandObjectApropos.cpp index 88c214d4fc56a..d663f2bd923fe 100644 --- a/lldb/source/Commands/CommandObjectApropos.cpp +++ b/lldb/source/Commands/CommandObjectApropos.cpp @@ -21,19 +21,7 @@ CommandObjectApropos::CommandObjectApropos(CommandInterpreter &interpreter) : CommandObjectParsed( interpreter, "apropos", "List debugger commands related to a word or subject.", nullptr) { - CommandArgumentEntry arg; - CommandArgumentData search_word_arg; - - // Define the first (and only) variant of this arg. 
- search_word_arg.arg_type = eArgTypeSearchWord; - search_word_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the argument - // entry. - arg.push_back(search_word_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeSearchWord); } CommandObjectApropos::~CommandObjectApropos() = default; diff --git a/lldb/source/Commands/CommandObjectBreakpoint.cpp b/lldb/source/Commands/CommandObjectBreakpoint.cpp index cfb0b87a59e64..fbece865f1131 100644 --- a/lldb/source/Commands/CommandObjectBreakpoint.cpp +++ b/lldb/source/Commands/CommandObjectBreakpoint.cpp @@ -813,12 +813,7 @@ class CommandObjectBreakpointModify : public CommandObjectParsed { "With the exception of -e, -d and -i, passing an " "empty argument clears the modification.", nullptr) { - CommandArgumentEntry arg; - CommandObject::AddIDsArgumentData(arg, eArgTypeBreakpointID, - eArgTypeBreakpointIDRange); - // Add the entry for the first argument for this command to the object's - // arguments vector. - m_arguments.push_back(arg); + CommandObject::AddIDsArgumentData(eBreakpointArgs); m_options.Append(&m_bp_opts, LLDB_OPT_SET_1 | LLDB_OPT_SET_2 | LLDB_OPT_SET_3, @@ -890,12 +885,7 @@ class CommandObjectBreakpointEnable : public CommandObjectParsed { "Enable the specified disabled breakpoint(s). If " "no breakpoints are specified, enable all of them.", nullptr) { - CommandArgumentEntry arg; - CommandObject::AddIDsArgumentData(arg, eArgTypeBreakpointID, - eArgTypeBreakpointIDRange); - // Add the entry for the first argument for this command to the object's - // arguments vector. - m_arguments.push_back(arg); + CommandObject::AddIDsArgumentData(eBreakpointArgs); } ~CommandObjectBreakpointEnable() override = default; @@ -1002,12 +992,7 @@ execution will NOT stop at location 1.1. 
To achieve that, type: "The first command disables all locations for breakpoint 1, \ the second re-enables the first location."); - CommandArgumentEntry arg; - CommandObject::AddIDsArgumentData(arg, eArgTypeBreakpointID, - eArgTypeBreakpointIDRange); - // Add the entry for the first argument for this command to the object's - // arguments vector. - m_arguments.push_back(arg); + CommandObject::AddIDsArgumentData(eBreakpointArgs); } ~CommandObjectBreakpointDisable() override = default; @@ -1098,15 +1083,7 @@ class CommandObjectBreakpointList : public CommandObjectParsed { CommandArgumentData bp_id_arg; // Define the first (and only) variant of this arg. - bp_id_arg.arg_type = eArgTypeBreakpointID; - bp_id_arg.arg_repetition = eArgRepeatOptional; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(bp_id_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeBreakpointID, eArgRepeatOptional); } ~CommandObjectBreakpointList() override = default; @@ -1372,12 +1349,7 @@ class CommandObjectBreakpointDelete : public CommandObjectParsed { "Delete the specified breakpoint(s). If no " "breakpoints are specified, delete them all.", nullptr) { - CommandArgumentEntry arg; - CommandObject::AddIDsArgumentData(arg, eArgTypeBreakpointID, - eArgTypeBreakpointIDRange); - // Add the entry for the first argument for this command to the object's - // arguments vector. - m_arguments.push_back(arg); + CommandObject::AddIDsArgumentData(eBreakpointArgs); } ~CommandObjectBreakpointDelete() override = default; @@ -1677,14 +1649,7 @@ class CommandObjectBreakpointNameConfigure : public CommandObjectParsed { "on the name.", "breakpoint name configure " "") { - // Create the first variant for the first (and only) argument for this - // command. 
- CommandArgumentEntry arg1; - CommandArgumentData id_arg; - id_arg.arg_type = eArgTypeBreakpointName; - id_arg.arg_repetition = eArgRepeatOptional; - arg1.push_back(id_arg); - m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypeBreakpointName, eArgRepeatOptional); m_option_group.Append(&m_bp_opts, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1); m_option_group.Append(&m_access_options, LLDB_OPT_SET_ALL, @@ -1770,14 +1735,7 @@ class CommandObjectBreakpointNameAdd : public CommandObjectParsed { : CommandObjectParsed( interpreter, "add", "Add a name to the breakpoints provided.", "breakpoint name add ") { - // Create the first variant for the first (and only) argument for this - // command. - CommandArgumentEntry arg1; - CommandArgumentData id_arg; - id_arg.arg_type = eArgTypeBreakpointID; - id_arg.arg_repetition = eArgRepeatOptional; - arg1.push_back(id_arg); - m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypeBreakpointID, eArgRepeatOptional); m_option_group.Append(&m_name_options, LLDB_OPT_SET_1, LLDB_OPT_SET_ALL); m_option_group.Finalize(); @@ -1851,14 +1809,7 @@ class CommandObjectBreakpointNameDelete : public CommandObjectParsed { interpreter, "delete", "Delete a name from the breakpoints provided.", "breakpoint name delete ") { - // Create the first variant for the first (and only) argument for this - // command. - CommandArgumentEntry arg1; - CommandArgumentData id_arg; - id_arg.arg_type = eArgTypeBreakpointID; - id_arg.arg_repetition = eArgRepeatOptional; - arg1.push_back(id_arg); - m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypeBreakpointID, eArgRepeatOptional); m_option_group.Append(&m_name_options, LLDB_OPT_SET_1, LLDB_OPT_SET_ALL); m_option_group.Finalize(); @@ -2308,12 +2259,7 @@ class CommandObjectBreakpointWrite : public CommandObjectParsed { "be read in with \"breakpoint read\". 
" "If given no arguments, writes all breakpoints.", nullptr) { - CommandArgumentEntry arg; - CommandObject::AddIDsArgumentData(arg, eArgTypeBreakpointID, - eArgTypeBreakpointIDRange); - // Add the entry for the first argument for this command to the object's - // arguments vector. - m_arguments.push_back(arg); + CommandObject::AddIDsArgumentData(eBreakpointArgs); } ~CommandObjectBreakpointWrite() override = default; diff --git a/lldb/source/Commands/CommandObjectBreakpointCommand.cpp b/lldb/source/Commands/CommandObjectBreakpointCommand.cpp index fefafcd94546a..6ebe6e8a35570 100644 --- a/lldb/source/Commands/CommandObjectBreakpointCommand.cpp +++ b/lldb/source/Commands/CommandObjectBreakpointCommand.cpp @@ -185,19 +185,7 @@ are no syntax errors may indicate that a function was declared but never called. LLDB_OPT_SET_2); m_all_options.Finalize(); - CommandArgumentEntry arg; - CommandArgumentData bp_id_arg; - - // Define the first (and only) variant of this arg. - bp_id_arg.arg_type = eArgTypeBreakpointID; - bp_id_arg.arg_repetition = eArgRepeatOptional; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(bp_id_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeBreakpointID, eArgRepeatOptional); } ~CommandObjectBreakpointCommandAdd() override = default; @@ -449,19 +437,7 @@ class CommandObjectBreakpointCommandDelete : public CommandObjectParsed { : CommandObjectParsed(interpreter, "delete", "Delete the set of commands from a breakpoint.", nullptr) { - CommandArgumentEntry arg; - CommandArgumentData bp_id_arg; - - // Define the first (and only) variant of this arg. - bp_id_arg.arg_type = eArgTypeBreakpointID; - bp_id_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. 
- arg.push_back(bp_id_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeBreakpointID); } ~CommandObjectBreakpointCommandDelete() override = default; @@ -565,19 +541,7 @@ class CommandObjectBreakpointCommandList : public CommandObjectParsed { "List the script or set of commands to be " "executed when the breakpoint is hit.", nullptr, eCommandRequiresTarget) { - CommandArgumentEntry arg; - CommandArgumentData bp_id_arg; - - // Define the first (and only) variant of this arg. - bp_id_arg.arg_type = eArgTypeBreakpointID; - bp_id_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(bp_id_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeBreakpointID); } ~CommandObjectBreakpointCommandList() override = default; diff --git a/lldb/source/Commands/CommandObjectCommands.cpp b/lldb/source/Commands/CommandObjectCommands.cpp index 7c459bdaf3802..f4903e373b086 100644 --- a/lldb/source/Commands/CommandObjectCommands.cpp +++ b/lldb/source/Commands/CommandObjectCommands.cpp @@ -41,19 +41,7 @@ class CommandObjectCommandsSource : public CommandObjectParsed { interpreter, "command source", "Read and execute LLDB commands from the file .", nullptr) { - CommandArgumentEntry arg; - CommandArgumentData file_arg; - - // Define the first (and only) variant of this arg. - file_arg.arg_type = eArgTypeFilename; - file_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(file_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeFilename); } ~CommandObjectCommandsSource() override = default; @@ -614,19 +602,7 @@ class CommandObjectCommandsUnalias : public CommandObjectParsed { interpreter, "command unalias", "Delete one or more custom commands defined by 'command alias'.", nullptr) { - CommandArgumentEntry arg; - CommandArgumentData alias_arg; - - // Define the first (and only) variant of this arg. - alias_arg.arg_type = eArgTypeAliasName; - alias_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(alias_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeAliasName); } ~CommandObjectCommandsUnalias() override = default; @@ -701,19 +677,7 @@ class CommandObjectCommandsDelete : public CommandObjectParsed { interpreter, "command delete", "Delete one or more custom commands defined by 'command regex'.", nullptr) { - CommandArgumentEntry arg; - CommandArgumentData alias_arg; - - // Define the first (and only) variant of this arg. - alias_arg.arg_type = eArgTypeCommandName; - alias_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(alias_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeCommandName); } ~CommandObjectCommandsDelete() override = default; @@ -815,8 +779,7 @@ a number follows 'f':" R"( (lldb) command regex f s/^$/finish/ 's/([0-9]+)/frame select %1/')"); - CommandArgumentData thread_arg{eArgTypeSEDStylePair, eArgRepeatOptional}; - m_arguments.push_back({thread_arg}); + AddSimpleArgumentList(eArgTypeSEDStylePair, eArgRepeatOptional); } ~CommandObjectCommandsAddRegex() override = default; @@ -1944,19 +1907,7 @@ class CommandObjectCommandsScriptImport : public CommandObjectParsed { CommandObjectCommandsScriptImport(CommandInterpreter &interpreter) : CommandObjectParsed(interpreter, "command script import", "Import a scripting module in LLDB.", nullptr) { - CommandArgumentEntry arg1; - CommandArgumentData cmd_arg; - - // Define the first (and only) variant of this arg. - cmd_arg.arg_type = eArgTypeFilename; - cmd_arg.arg_repetition = eArgRepeatPlus; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg1.push_back(cmd_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypeFilename, eArgRepeatPlus); } ~CommandObjectCommandsScriptImport() override = default; @@ -2066,20 +2017,7 @@ class CommandObjectCommandsScriptAdd : public CommandObjectParsed, "command, and the last element will be the new " "command name."), IOHandlerDelegateMultiline("DONE") { - CommandArgumentEntry arg1; - CommandArgumentData cmd_arg; - - // This is one or more command names, which form the path to the command - // you want to add. - cmd_arg.arg_type = eArgTypeCommand; - cmd_arg.arg_repetition = eArgRepeatPlus; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg1.push_back(cmd_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypeCommand, eArgRepeatPlus); } ~CommandObjectCommandsScriptAdd() override = default; @@ -2400,20 +2338,7 @@ class CommandObjectCommandsScriptDelete : public CommandObjectParsed { interpreter, "command script delete", "Delete a scripted command by specifying the path to the command.", nullptr) { - CommandArgumentEntry arg1; - CommandArgumentData cmd_arg; - - // This is a list of command names forming the path to the command - // to be deleted. - cmd_arg.arg_type = eArgTypeCommand; - cmd_arg.arg_repetition = eArgRepeatPlus; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg1.push_back(cmd_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypeCommand, eArgRepeatPlus); } ~CommandObjectCommandsScriptDelete() override = default; @@ -2549,20 +2474,7 @@ class CommandObjectCommandsContainerAdd : public CommandObjectParsed { "Add a container command to lldb. Adding to built-" "in container commands is not allowed.", "command container add [[path1]...] container-name") { - CommandArgumentEntry arg1; - CommandArgumentData cmd_arg; - - // This is one or more command names, which form the path to the command - // you want to add. - cmd_arg.arg_type = eArgTypeCommand; - cmd_arg.arg_repetition = eArgRepeatPlus; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg1.push_back(cmd_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypeCommand, eArgRepeatPlus); } ~CommandObjectCommandsContainerAdd() override = default; @@ -2690,20 +2602,7 @@ class CommandObjectCommandsContainerDelete : public CommandObjectParsed { "Delete a container command previously added to " "lldb.", "command container delete [[path1] ...] 
container-cmd") { - CommandArgumentEntry arg1; - CommandArgumentData cmd_arg; - - // This is one or more command names, which form the path to the command - // you want to add. - cmd_arg.arg_type = eArgTypeCommand; - cmd_arg.arg_repetition = eArgRepeatPlus; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg1.push_back(cmd_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypeCommand, eArgRepeatPlus); } ~CommandObjectCommandsContainerDelete() override = default; diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.cpp b/lldb/source/Commands/CommandObjectDWIMPrint.cpp index fb2cc106ffd2d..b183cb423111f 100644 --- a/lldb/source/Commands/CommandObjectDWIMPrint.cpp +++ b/lldb/source/Commands/CommandObjectDWIMPrint.cpp @@ -37,8 +37,7 @@ CommandObjectDWIMPrint::CommandObjectDWIMPrint(CommandInterpreter &interpreter) "dwim-print [ | ]", eCommandProcessMustBePaused | eCommandTryTargetAPILock) { - CommandArgumentData var_name_arg(eArgTypeVarName, eArgRepeatPlain); - m_arguments.push_back({var_name_arg}); + AddSimpleArgumentList(eArgTypeVarName); m_option_group.Append(&m_format_options, OptionGroupFormat::OPTION_GROUP_FORMAT | diff --git a/lldb/source/Commands/CommandObjectExpression.cpp b/lldb/source/Commands/CommandObjectExpression.cpp index 3a2dc11e1e71c..2319ddd3c80af 100644 --- a/lldb/source/Commands/CommandObjectExpression.cpp +++ b/lldb/source/Commands/CommandObjectExpression.cpp @@ -311,19 +311,7 @@ format." expr unsigned int $foo = 5 expr char c[] = \"foo\"; c[0])"); - CommandArgumentEntry arg; - CommandArgumentData expression_arg; - - // Define the first (and only) variant of this arg. - expression_arg.arg_type = eArgTypeExpression; - expression_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the argument - // entry. 
- arg.push_back(expression_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeExpression); // Add the "--format" and "--gdb-format" m_option_group.Append(&m_format_options, diff --git a/lldb/source/Commands/CommandObjectFrame.cpp b/lldb/source/Commands/CommandObjectFrame.cpp index f092d54ffe993..b1d060b3c6cf2 100644 --- a/lldb/source/Commands/CommandObjectFrame.cpp +++ b/lldb/source/Commands/CommandObjectFrame.cpp @@ -113,19 +113,7 @@ class CommandObjectFrameDiagnose : public CommandObjectParsed { eCommandRequiresThread | eCommandTryTargetAPILock | eCommandProcessMustBeLaunched | eCommandProcessMustBePaused) { - CommandArgumentEntry arg; - CommandArgumentData index_arg; - - // Define the first (and only) variant of this arg. - index_arg.arg_type = eArgTypeFrameIndex; - index_arg.arg_repetition = eArgRepeatOptional; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(index_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeFrameIndex, eArgRepeatOptional); } ~CommandObjectFrameDiagnose() override = default; @@ -269,19 +257,7 @@ class CommandObjectFrameSelect : public CommandObjectParsed { eCommandRequiresThread | eCommandTryTargetAPILock | eCommandProcessMustBeLaunched | eCommandProcessMustBePaused) { - CommandArgumentEntry arg; - CommandArgumentData index_arg; - - // Define the first (and only) variant of this arg. - index_arg.arg_type = eArgTypeFrameIndex; - index_arg.arg_repetition = eArgRepeatOptional; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(index_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeFrameIndex, eArgRepeatOptional); } ~CommandObjectFrameSelect() override = default; @@ -409,19 +385,7 @@ However, 'frame variable' is more efficient, since it uses debug information and memory reads directly, rather than parsing and evaluating an expression, which may even involve JITing and running code in the target program.)"); - CommandArgumentEntry arg; - CommandArgumentData var_name_arg; - - // Define the first (and only) variant of this arg. - var_name_arg.arg_type = eArgTypeVarName; - var_name_arg.arg_repetition = eArgRepeatStar; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(var_name_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeVarName, eArgRepeatStar); m_option_group.Append(&m_option_variable, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1); m_option_group.Append(&m_option_format, @@ -939,8 +903,7 @@ class CommandObjectFrameRecognizerDelete : public CommandObjectParsed { : CommandObjectParsed(interpreter, "frame recognizer delete", "Delete an existing frame recognizer by id.", nullptr) { - CommandArgumentData thread_arg{eArgTypeRecognizerID, eArgRepeatPlain}; - m_arguments.push_back({thread_arg}); + AddSimpleArgumentList(eArgTypeRecognizerID); } ~CommandObjectFrameRecognizerDelete() override = default; @@ -1065,19 +1028,7 @@ class CommandObjectFrameRecognizerInfo : public CommandObjectParsed { interpreter, "frame recognizer info", "Show which frame recognizer is applied a stack frame (if any).", nullptr) { - CommandArgumentEntry arg; - CommandArgumentData index_arg; - - // Define the first (and only) variant of this arg. - index_arg.arg_type = eArgTypeFrameIndex; - index_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. 
- arg.push_back(index_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeFrameIndex); } ~CommandObjectFrameRecognizerInfo() override = default; diff --git a/lldb/source/Commands/CommandObjectHelp.cpp b/lldb/source/Commands/CommandObjectHelp.cpp index ddb006e52d2c5..f1dbd03fe97cb 100644 --- a/lldb/source/Commands/CommandObjectHelp.cpp +++ b/lldb/source/Commands/CommandObjectHelp.cpp @@ -48,20 +48,9 @@ CommandObjectHelp::CommandObjectHelp(CommandInterpreter &interpreter) "commands, or give details " "about a specific command.", "help []") { - CommandArgumentEntry arg; - CommandArgumentData command_arg; - // A list of command names forming a path to the command we want help on. // No names is allowed - in which case we dump the top-level help. - command_arg.arg_type = eArgTypeCommand; - command_arg.arg_repetition = eArgRepeatStar; - - // There is only one variant this argument could be; put it into the argument - // entry. - arg.push_back(command_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeCommand, eArgRepeatStar); } CommandObjectHelp::~CommandObjectHelp() = default; diff --git a/lldb/source/Commands/CommandObjectLog.cpp b/lldb/source/Commands/CommandObjectLog.cpp index 6bfbf98078e6e..48dfd9456a660 100644 --- a/lldb/source/Commands/CommandObjectLog.cpp +++ b/lldb/source/Commands/CommandObjectLog.cpp @@ -288,19 +288,7 @@ class CommandObjectLogList : public CommandObjectParsed { "List the log categories for one or more log " "channels. If none specified, lists them all.", nullptr) { - CommandArgumentEntry arg; - CommandArgumentData channel_arg; - - // Define the first (and only) variant of this arg. - channel_arg.arg_type = eArgTypeLogChannel; - channel_arg.arg_repetition = eArgRepeatStar; - - // There is only one variant this argument could be; put it into the - // argument entry. 
- arg.push_back(channel_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeLogChannel, eArgRepeatStar); } ~CommandObjectLogList() override = default; @@ -335,19 +323,7 @@ class CommandObjectLogDump : public CommandObjectParsed { CommandObjectLogDump(CommandInterpreter &interpreter) : CommandObjectParsed(interpreter, "log dump", "dump circular buffer logs", nullptr) { - CommandArgumentEntry arg1; - CommandArgumentData channel_arg; - - // Define the first (and only) variant of this arg. - channel_arg.arg_type = eArgTypeLogChannel; - channel_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg1.push_back(channel_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypeLogChannel); } ~CommandObjectLogDump() override = default; @@ -444,19 +420,7 @@ class CommandObjectLogTimerEnable : public CommandObjectParsed { : CommandObjectParsed(interpreter, "log timers enable", "enable LLDB internal performance timers", "log timers enable ") { - CommandArgumentEntry arg; - CommandArgumentData depth_arg; - - // Define the first (and only) variant of this arg. - depth_arg.arg_type = eArgTypeCount; - depth_arg.arg_repetition = eArgRepeatOptional; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(depth_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeCount, eArgRepeatOptional); } ~CommandObjectLogTimerEnable() override = default; @@ -559,19 +523,7 @@ class CommandObjectLogTimerIncrement : public CommandObjectParsed { : CommandObjectParsed(interpreter, "log timers increment", "increment LLDB internal performance timers", "log timers increment ") { - CommandArgumentEntry arg; - CommandArgumentData bool_arg; - - // Define the first (and only) variant of this arg. - bool_arg.arg_type = eArgTypeBoolean; - bool_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(bool_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeBoolean); } ~CommandObjectLogTimerIncrement() override = default; diff --git a/lldb/source/Commands/CommandObjectPlatform.cpp b/lldb/source/Commands/CommandObjectPlatform.cpp index b25c391bd4faa..5b18f2b60e92d 100644 --- a/lldb/source/Commands/CommandObjectPlatform.cpp +++ b/lldb/source/Commands/CommandObjectPlatform.cpp @@ -155,8 +155,7 @@ class CommandObjectPlatformSelect : public CommandObjectParsed { { m_option_group.Append(&m_platform_options, LLDB_OPT_SET_ALL, 1); m_option_group.Finalize(); - CommandArgumentData platform_arg{eArgTypePlatform, eArgRepeatPlain}; - m_arguments.push_back({platform_arg}); + AddSimpleArgumentList(eArgTypePlatform); } ~CommandObjectPlatformSelect() override = default; @@ -276,8 +275,7 @@ class CommandObjectPlatformConnect : public CommandObjectParsed { interpreter, "platform connect", "Select the current platform by providing a connection URL.", "platform connect ", 0) { - CommandArgumentData platform_arg{eArgTypeConnectURL, eArgRepeatPlain}; - m_arguments.push_back({platform_arg}); + AddSimpleArgumentList(eArgTypeConnectURL); } ~CommandObjectPlatformConnect() override = default; @@ -418,8 +416,7 @@ class 
CommandObjectPlatformMkDir : public CommandObjectParsed { : CommandObjectParsed(interpreter, "platform mkdir", "Make a new directory on the remote end.", nullptr, 0) { - CommandArgumentData thread_arg{eArgTypeRemotePath, eArgRepeatPlain}; - m_arguments.push_back({thread_arg}); + AddSimpleArgumentList(eArgTypeRemotePath); } ~CommandObjectPlatformMkDir() override = default; @@ -467,8 +464,7 @@ class CommandObjectPlatformFOpen : public CommandObjectParsed { CommandObjectPlatformFOpen(CommandInterpreter &interpreter) : CommandObjectParsed(interpreter, "platform file open", "Open a file on the remote end.", nullptr, 0) { - CommandArgumentData path_arg{eArgTypeRemotePath, eArgRepeatPlain}; - m_arguments.push_back({path_arg}); + AddSimpleArgumentList(eArgTypeRemotePath); } ~CommandObjectPlatformFOpen() override = default; @@ -521,8 +517,7 @@ class CommandObjectPlatformFClose : public CommandObjectParsed { CommandObjectPlatformFClose(CommandInterpreter &interpreter) : CommandObjectParsed(interpreter, "platform file close", "Close a file on the remote end.", nullptr, 0) { - CommandArgumentData path_arg{eArgTypeUnsignedInteger, eArgRepeatPlain}; - m_arguments.push_back({path_arg}); + AddSimpleArgumentList(eArgTypeUnsignedInteger); } ~CommandObjectPlatformFClose() override = default; @@ -564,8 +559,7 @@ class CommandObjectPlatformFRead : public CommandObjectParsed { : CommandObjectParsed(interpreter, "platform file read", "Read data from a file on the remote end.", nullptr, 0) { - CommandArgumentData path_arg{eArgTypeUnsignedInteger, eArgRepeatPlain}; - m_arguments.push_back({path_arg}); + AddSimpleArgumentList(eArgTypeUnsignedInteger); } ~CommandObjectPlatformFRead() override = default; @@ -659,8 +653,7 @@ class CommandObjectPlatformFWrite : public CommandObjectParsed { : CommandObjectParsed(interpreter, "platform file write", "Write data to a file on the remote end.", nullptr, 0) { - CommandArgumentData path_arg{eArgTypeUnsignedInteger, eArgRepeatPlain}; - 
m_arguments.push_back({path_arg}); + AddSimpleArgumentList(eArgTypeUnsignedInteger); } ~CommandObjectPlatformFWrite() override = default; @@ -863,18 +856,7 @@ class CommandObjectPlatformGetSize : public CommandObjectParsed { Get the file size from the remote end with path /the/remote/file/path.)"); - CommandArgumentEntry arg1; - CommandArgumentData file_arg_remote; - - // Define the first (and only) variant of this arg. - file_arg_remote.arg_type = eArgTypeRemoteFilename; - file_arg_remote.arg_repetition = eArgRepeatPlain; - // There is only one variant this argument could be; put it into the - // argument entry. - arg1.push_back(file_arg_remote); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypeRemoteFilename); } ~CommandObjectPlatformGetSize() override = default; @@ -922,18 +904,7 @@ class CommandObjectPlatformGetPermissions : public CommandObjectParsed { Get the file permissions from the remote end with path /the/remote/file/path.)"); - CommandArgumentEntry arg1; - CommandArgumentData file_arg_remote; - - // Define the first (and only) variant of this arg. - file_arg_remote.arg_type = eArgTypeRemoteFilename; - file_arg_remote.arg_repetition = eArgRepeatPlain; - // There is only one variant this argument could be; put it into the - // argument entry. - arg1.push_back(file_arg_remote); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypeRemoteFilename); } ~CommandObjectPlatformGetPermissions() override = default; @@ -980,18 +951,7 @@ class CommandObjectPlatformFileExists : public CommandObjectParsed { Check if /the/remote/file/path exists on the remote end.)"); - CommandArgumentEntry arg1; - CommandArgumentData file_arg_remote; - - // Define the first (and only) variant of this arg. 
- file_arg_remote.arg_type = eArgTypeRemoteFilename; - file_arg_remote.arg_repetition = eArgRepeatPlain; - // There is only one variant this argument could be; put it into the - // argument entry. - arg1.push_back(file_arg_remote); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypeRemoteFilename); } ~CommandObjectPlatformFileExists() override = default; @@ -1093,8 +1053,7 @@ class CommandObjectPlatformProcessLaunch : public CommandObjectParsed { m_all_options.Append(&m_class_options, LLDB_OPT_SET_1 | LLDB_OPT_SET_2, LLDB_OPT_SET_ALL); m_all_options.Finalize(); - CommandArgumentData run_arg_arg{eArgTypeRunArgs, eArgRepeatStar}; - m_arguments.push_back({run_arg_arg}); + AddSimpleArgumentList(eArgTypeRunArgs, eArgRepeatStar); } void @@ -1503,19 +1462,7 @@ class CommandObjectPlatformProcessInfo : public CommandObjectParsed { interpreter, "platform process info", "Get detailed information for one or more process by process ID.", "platform process info [ ...]", 0) { - CommandArgumentEntry arg; - CommandArgumentData pid_args; - - // Define the first (and only) variant of this arg. - pid_args.arg_type = eArgTypePid; - pid_args.arg_repetition = eArgRepeatStar; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(pid_args); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypePid, eArgRepeatStar); } ~CommandObjectPlatformProcessInfo() override = default; @@ -1721,8 +1668,7 @@ class CommandObjectPlatformShell : public CommandObjectRaw { : CommandObjectRaw(interpreter, "platform shell", "Run a shell command on the current platform.", "platform shell ", 0) { - CommandArgumentData thread_arg{eArgTypeNone, eArgRepeatStar}; - m_arguments.push_back({thread_arg}); + AddSimpleArgumentList(eArgTypeNone, eArgRepeatStar); } ~CommandObjectPlatformShell() override = default; diff --git a/lldb/source/Commands/CommandObjectPlugin.cpp b/lldb/source/Commands/CommandObjectPlugin.cpp index da3b5f0518a69..f3108b8a768d2 100644 --- a/lldb/source/Commands/CommandObjectPlugin.cpp +++ b/lldb/source/Commands/CommandObjectPlugin.cpp @@ -19,19 +19,7 @@ class CommandObjectPluginLoad : public CommandObjectParsed { : CommandObjectParsed(interpreter, "plugin load", "Import a dylib that implements an LLDB plugin.", nullptr) { - CommandArgumentEntry arg1; - CommandArgumentData cmd_arg; - - // Define the first (and only) variant of this arg. - cmd_arg.arg_type = eArgTypeFilename; - cmd_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg1.push_back(cmd_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypeFilename); } ~CommandObjectPluginLoad() override = default; diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp index 7cd5ad656f1b0..9ac97eb66b623 100644 --- a/lldb/source/Commands/CommandObjectProcess.cpp +++ b/lldb/source/Commands/CommandObjectProcess.cpp @@ -126,19 +126,7 @@ class CommandObjectProcessLaunch : public CommandObjectProcessLaunchOrAttach { LLDB_OPT_SET_ALL); m_all_options.Finalize(); - CommandArgumentEntry arg; - CommandArgumentData run_args_arg; - - // Define the first (and only) variant of this arg. - run_args_arg.arg_type = eArgTypeRunArgs; - run_args_arg.arg_repetition = eArgRepeatOptional; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(run_args_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeRunArgs, eArgRepeatOptional); } ~CommandObjectProcessLaunch() override = default; @@ -870,8 +858,7 @@ class CommandObjectProcessConnect : public CommandObjectParsed { : CommandObjectParsed(interpreter, "process connect", "Connect to a remote debug service.", "process connect ", 0) { - CommandArgumentData connect_arg{eArgTypeConnectURL, eArgRepeatPlain}; - m_arguments.push_back({connect_arg}); + AddSimpleArgumentList(eArgTypeConnectURL); } ~CommandObjectProcessConnect() override = default; @@ -996,8 +983,7 @@ class CommandObjectProcessLoad : public CommandObjectParsed { eCommandRequiresProcess | eCommandTryTargetAPILock | eCommandProcessMustBeLaunched | eCommandProcessMustBePaused) { - CommandArgumentData file_arg{eArgTypePath, eArgRepeatPlus}; - m_arguments.push_back({file_arg}); + AddSimpleArgumentList(eArgTypePath, eArgRepeatPlus); } ~CommandObjectProcessLoad() override = default; @@ -1070,8 +1056,7 @@ class CommandObjectProcessUnload : public CommandObjectParsed { "process unload ", 
eCommandRequiresProcess | eCommandTryTargetAPILock | eCommandProcessMustBeLaunched | eCommandProcessMustBePaused) { - CommandArgumentData load_idx_arg{eArgTypeUnsignedInteger, eArgRepeatPlain}; - m_arguments.push_back({load_idx_arg}); + AddSimpleArgumentList(eArgTypeUnsignedInteger); } ~CommandObjectProcessUnload() override = default; @@ -1131,19 +1116,7 @@ class CommandObjectProcessSignal : public CommandObjectParsed { interpreter, "process signal", "Send a UNIX signal to the current target process.", nullptr, eCommandRequiresProcess | eCommandTryTargetAPILock) { - CommandArgumentEntry arg; - CommandArgumentData signal_arg; - - // Define the first (and only) variant of this arg. - signal_arg.arg_type = eArgTypeUnixSignal; - signal_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(signal_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeUnixSignal); } ~CommandObjectProcessSignal() override = default; @@ -1274,8 +1247,7 @@ class CommandObjectProcessSaveCore : public CommandObjectParsed { "process save-core [-s corefile-style -p plugin-name] FILE", eCommandRequiresProcess | eCommandTryTargetAPILock | eCommandProcessMustBeLaunched) { - CommandArgumentData file_arg{eArgTypePath, eArgRepeatPlain}; - m_arguments.push_back({file_arg}); + AddSimpleArgumentList(eArgTypePath); } ~CommandObjectProcessSaveCore() override = default; @@ -1559,15 +1531,7 @@ class CommandObjectProcessHandle : public CommandObjectParsed { "by passing the -t option." 
"\nYou can also clear the target modification for a signal" "by passing the -c option"); - CommandArgumentEntry arg; - CommandArgumentData signal_arg; - - signal_arg.arg_type = eArgTypeUnixSignal; - signal_arg.arg_repetition = eArgRepeatStar; - - arg.push_back(signal_arg); - - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeUnixSignal, eArgRepeatStar); } ~CommandObjectProcessHandle() override = default; diff --git a/lldb/source/Commands/CommandObjectQuit.cpp b/lldb/source/Commands/CommandObjectQuit.cpp index d7caf1546fb57..8e7830b2afc63 100644 --- a/lldb/source/Commands/CommandObjectQuit.cpp +++ b/lldb/source/Commands/CommandObjectQuit.cpp @@ -21,8 +21,7 @@ using namespace lldb_private; CommandObjectQuit::CommandObjectQuit(CommandInterpreter &interpreter) : CommandObjectParsed(interpreter, "quit", "Quit the LLDB debugger.", "quit [exit-code]") { - CommandArgumentData exit_code_arg{eArgTypeUnsignedInteger, eArgRepeatPlain}; - m_arguments.push_back({exit_code_arg}); + AddSimpleArgumentList(eArgTypeUnsignedInteger); } CommandObjectQuit::~CommandObjectQuit() = default; diff --git a/lldb/source/Commands/CommandObjectRegister.cpp b/lldb/source/Commands/CommandObjectRegister.cpp index 4ffdde1ee09f9..4e047ccbc10b9 100644 --- a/lldb/source/Commands/CommandObjectRegister.cpp +++ b/lldb/source/Commands/CommandObjectRegister.cpp @@ -50,19 +50,7 @@ class CommandObjectRegisterRead : public CommandObjectParsed { {{CommandArgumentType::eArgTypeFormat, "Specify a format to be used for display. If this " "is set, register fields will not be displayed."}}) { - CommandArgumentEntry arg; - CommandArgumentData register_arg; - - // Define the first (and only) variant of this arg. - register_arg.arg_type = eArgTypeRegisterName; - register_arg.arg_repetition = eArgRepeatStar; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(register_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeRegisterName, eArgRepeatStar); // Add the "--format" m_option_group.Append(&m_format_options, @@ -422,13 +410,7 @@ Fields (*) A table of the names and bit positions of the values contained Fields marked with (*) may not always be present. Some information may be different for the same register when connected to different debug servers.)"); - CommandArgumentData register_arg; - register_arg.arg_type = eArgTypeRegisterName; - register_arg.arg_repetition = eArgRepeatPlain; - - CommandArgumentEntry arg1; - arg1.push_back(register_arg); - m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypeRegisterName); } ~CommandObjectRegisterInfo() override = default; diff --git a/lldb/source/Commands/CommandObjectSession.cpp b/lldb/source/Commands/CommandObjectSession.cpp index 28506d6c59512..c381ba4f74f12 100644 --- a/lldb/source/Commands/CommandObjectSession.cpp +++ b/lldb/source/Commands/CommandObjectSession.cpp @@ -21,9 +21,7 @@ class CommandObjectSessionSave : public CommandObjectParsed { "If no file if specified, transcripts will be " "saved to a temporary file.", "session save [file]") { - CommandArgumentEntry arg1; - arg1.emplace_back(eArgTypePath, eArgRepeatOptional); - m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypePath, eArgRepeatOptional); } ~CommandObjectSessionSave() override = default; diff --git a/lldb/source/Commands/CommandObjectSettings.cpp b/lldb/source/Commands/CommandObjectSettings.cpp index 0cf3d1daf7f52..7bbb0dd567ab1 100644 --- a/lldb/source/Commands/CommandObjectSettings.cpp +++ b/lldb/source/Commands/CommandObjectSettings.cpp @@ -245,19 +245,7 @@ class CommandObjectSettingsShow : public CommandObjectParsed { "Show matching debugger settings and their current " "values. Defaults to showing all settings.", nullptr) { - CommandArgumentEntry arg1; - CommandArgumentData var_name_arg; - - // Define the first (and only) variant of this arg. 
- var_name_arg.arg_type = eArgTypeSettingVariableName; - var_name_arg.arg_repetition = eArgRepeatOptional; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg1.push_back(var_name_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypeSettingVariableName, eArgRepeatOptional); } ~CommandObjectSettingsShow() override = default; @@ -297,19 +285,7 @@ class CommandObjectSettingsWrite : public CommandObjectParsed { "current values to a file that can be read in with " "\"settings read\". Defaults to writing all settings.", nullptr) { - CommandArgumentEntry arg1; - CommandArgumentData var_name_arg; - - // Define the first (and only) variant of this arg. - var_name_arg.arg_type = eArgTypeSettingVariableName; - var_name_arg.arg_repetition = eArgRepeatOptional; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg1.push_back(var_name_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg1); + AddSimpleArgumentList(eArgTypeSettingVariableName, eArgRepeatOptional); } ~CommandObjectSettingsWrite() override = default; @@ -997,19 +973,7 @@ class CommandObjectSettingsClear : public CommandObjectParsed { interpreter, "settings clear", "Clear a debugger setting array, dictionary, or string. " "If '-a' option is specified, it clears all settings.", nullptr) { - CommandArgumentEntry arg; - CommandArgumentData var_name_arg; - - // Define the first (and only) variant of this arg. - var_name_arg.arg_type = eArgTypeSettingVariableName; - var_name_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(var_name_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeSettingVariableName); } ~CommandObjectSettingsClear() override = default; diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index 4e006e4bb0e0f..45265577e8b61 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -229,19 +229,8 @@ class CommandObjectTargetCreate : public CommandObjectParsed { m_remote_file( LLDB_OPT_SET_1, false, "remote-file", 'r', 0, eArgTypeFilename, "Fullpath to the file on the remote host if debugging remotely.") { - CommandArgumentEntry arg; - CommandArgumentData file_arg; - - // Define the first (and only) variant of this arg. - file_arg.arg_type = eArgTypeFilename; - file_arg.arg_repetition = eArgRepeatPlain; - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(file_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeFilename); m_option_group.Append(&m_arch_option, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1); m_option_group.Append(&m_platform_options, LLDB_OPT_SET_ALL, 1); @@ -503,8 +492,7 @@ class CommandObjectTargetSelect : public CommandObjectParsed { : CommandObjectParsed( interpreter, "target select", "Select a target as the current target by target index.", nullptr) { - CommandArgumentData target_arg{eArgTypeTargetID, eArgRepeatPlain}; - m_arguments.push_back({target_arg}); + AddSimpleArgumentList(eArgTypeTargetID); } ~CommandObjectTargetSelect() override = default; @@ -586,8 +574,7 @@ class CommandObjectTargetDelete : public CommandObjectParsed { m_option_group.Append(&m_all_option, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1); m_option_group.Append(&m_cleanup_option, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1); m_option_group.Finalize(); - CommandArgumentData target_arg{eArgTypeTargetID, eArgRepeatStar}; - m_arguments.push_back({target_arg}); + AddSimpleArgumentList(eArgTypeTargetID, eArgRepeatStar); } ~CommandObjectTargetDelete() override = default; @@ -729,19 +716,7 @@ class CommandObjectTargetVariable : public CommandObjectParsed { "A basename or fullpath to a shared library to use in the search " "for global " "variables. This option can be specified multiple times.") { - CommandArgumentEntry arg; - CommandArgumentData var_name_arg; - - // Define the first (and only) variant of this arg. - var_name_arg.arg_type = eArgTypeVarName; - var_name_arg.arg_repetition = eArgRepeatPlus; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(var_name_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeVarName, eArgRepeatPlus); m_option_group.Append(&m_varobj_options, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1); m_option_group.Append(&m_option_variable, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1); @@ -1243,19 +1218,7 @@ class CommandObjectTargetModulesSearchPathsQuery : public CommandObjectParsed { interpreter, "target modules search-paths query", "Transform a path using the first applicable image search path.", nullptr, eCommandRequiresTarget) { - CommandArgumentEntry arg; - CommandArgumentData path_arg; - - // Define the first (and only) variant of this arg. - path_arg.arg_type = eArgTypeDirectoryName; - path_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(path_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeDirectoryName); } ~CommandObjectTargetModulesSearchPathsQuery() override = default; @@ -1881,19 +1844,7 @@ class CommandObjectTargetModulesModuleAutoComplete const char *syntax, uint32_t flags = 0) : CommandObjectParsed(interpreter, name, help, syntax, flags) { - CommandArgumentEntry arg; - CommandArgumentData file_arg; - - // Define the first (and only) variant of this arg. - file_arg.arg_type = eArgTypeFilename; - file_arg.arg_repetition = eArgRepeatStar; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(file_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeFilename, eArgRepeatStar); } ~CommandObjectTargetModulesModuleAutoComplete() override = default; @@ -1918,19 +1869,7 @@ class CommandObjectTargetModulesSourceFileAutoComplete CommandInterpreter &interpreter, const char *name, const char *help, const char *syntax, uint32_t flags) : CommandObjectParsed(interpreter, name, help, syntax, flags) { - CommandArgumentEntry arg; - CommandArgumentData source_file_arg; - - // Define the first (and only) variant of this arg. - source_file_arg.arg_type = eArgTypeSourceFile; - source_file_arg.arg_repetition = eArgRepeatPlus; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(source_file_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeSourceFile, eArgRepeatPlus); } ~CommandObjectTargetModulesSourceFileAutoComplete() override = default; @@ -2234,8 +2173,7 @@ class CommandObjectTargetModulesDumpClangPCMInfo : public CommandObjectParsed { interpreter, "target modules dump pcm-info", "Dump information about the given clang module (pcm).") { // Take a single file argument. 
- CommandArgumentData arg{eArgTypeFilename, eArgRepeatPlain}; - m_arguments.push_back({arg}); + AddSimpleArgumentList(eArgTypeFilename); } ~CommandObjectTargetModulesDumpClangPCMInfo() override = default; @@ -2774,8 +2712,7 @@ class CommandObjectTargetModulesAdd : public CommandObjectParsed { LLDB_OPT_SET_1); m_option_group.Append(&m_symbol_file, LLDB_OPT_SET_ALL, LLDB_OPT_SET_1); m_option_group.Finalize(); - CommandArgumentData module_arg{eArgTypePath, eArgRepeatStar}; - m_arguments.push_back({module_arg}); + AddSimpleArgumentList(eArgTypePath, eArgRepeatStar); } ~CommandObjectTargetModulesAdd() override = default; @@ -3219,8 +3156,7 @@ class CommandObjectTargetModulesList : public CommandObjectParsed { : CommandObjectParsed( interpreter, "target modules list", "List current executable and dependent shared library images.") { - CommandArgumentData module_arg{eArgTypeModule, eArgRepeatStar}; - m_arguments.push_back({module_arg}); + AddSimpleArgumentList(eArgTypeModule, eArgRepeatStar); } ~CommandObjectTargetModulesList() override = default; @@ -3992,19 +3928,7 @@ class CommandObjectTargetModulesLookup : public CommandObjectParsed { "Look up information within executable and " "dependent shared library images.", nullptr, eCommandRequiresTarget) { - CommandArgumentEntry arg; - CommandArgumentData file_arg; - - // Define the first (and only) variant of this arg. - file_arg.arg_type = eArgTypeFilename; - file_arg.arg_repetition = eArgRepeatStar; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(file_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeFilename, eArgRepeatStar); } ~CommandObjectTargetModulesLookup() override = default; @@ -4323,8 +4247,7 @@ class CommandObjectTargetSymbolsAdd : public CommandObjectParsed { m_option_group.Append(&m_current_stack_option, LLDB_OPT_SET_2, LLDB_OPT_SET_2); m_option_group.Finalize(); - CommandArgumentData module_arg{eArgTypeShlibName, eArgRepeatPlain}; - m_arguments.push_back({module_arg}); + AddSimpleArgumentList(eArgTypeShlibName); } ~CommandObjectTargetSymbolsAdd() override = default; @@ -5163,8 +5086,7 @@ class CommandObjectTargetStopHookDelete : public CommandObjectParsed { : CommandObjectParsed(interpreter, "target stop-hook delete", "Delete a stop-hook.", "target stop-hook delete []") { - CommandArgumentData hook_arg{eArgTypeStopHookID, eArgRepeatStar}; - m_arguments.push_back({hook_arg}); + AddSimpleArgumentList(eArgTypeStopHookID, eArgRepeatStar); } ~CommandObjectTargetStopHookDelete() override = default; @@ -5218,8 +5140,7 @@ class CommandObjectTargetStopHookEnableDisable : public CommandObjectParsed { bool enable, const char *name, const char *help, const char *syntax) : CommandObjectParsed(interpreter, name, help, syntax), m_enable(enable) { - CommandArgumentData hook_arg{eArgTypeStopHookID, eArgRepeatStar}; - m_arguments.push_back({hook_arg}); + AddSimpleArgumentList(eArgTypeStopHookID, eArgRepeatStar); } ~CommandObjectTargetStopHookEnableDisable() override = default; diff --git a/lldb/source/Commands/CommandObjectThread.cpp b/lldb/source/Commands/CommandObjectThread.cpp index 52e493b13c61c..9cfff059d6bfa 100644 --- a/lldb/source/Commands/CommandObjectThread.cpp +++ b/lldb/source/Commands/CommandObjectThread.cpp @@ -374,19 +374,7 @@ class CommandObjectThreadStepWithTypeAndScope : public CommandObjectParsed { eCommandProcessMustBePaused), m_step_type(step_type), m_step_scope(step_scope), m_class_options("scripted step") { - CommandArgumentEntry arg; - CommandArgumentData thread_id_arg; - - // 
Define the first (and only) variant of this arg. - thread_id_arg.arg_type = eArgTypeThreadID; - thread_id_arg.arg_repetition = eArgRepeatOptional; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(thread_id_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeThreadID, eArgRepeatOptional); if (step_type == eStepTypeScripted) { m_all_options.Append(&m_class_options, LLDB_OPT_SET_1 | LLDB_OPT_SET_2, @@ -643,19 +631,7 @@ class CommandObjectThreadContinue : public CommandObjectParsed { nullptr, eCommandRequiresThread | eCommandTryTargetAPILock | eCommandProcessMustBeLaunched | eCommandProcessMustBePaused) { - CommandArgumentEntry arg; - CommandArgumentData thread_idx_arg; - - // Define the first (and only) variant of this arg. - thread_idx_arg.arg_type = eArgTypeThreadIndex; - thread_idx_arg.arg_repetition = eArgRepeatPlus; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(thread_idx_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeThreadIndex, eArgRepeatPlus); } ~CommandObjectThreadContinue() override = default; @@ -886,19 +862,7 @@ class CommandObjectThreadUntil : public CommandObjectParsed { nullptr, eCommandRequiresThread | eCommandTryTargetAPILock | eCommandProcessMustBeLaunched | eCommandProcessMustBePaused) { - CommandArgumentEntry arg; - CommandArgumentData line_num_arg; - - // Define the first (and only) variant of this arg. - line_num_arg.arg_type = eArgTypeLineNum; - line_num_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(line_num_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeLineNum); } ~CommandObjectThreadUntil() override = default; @@ -1539,19 +1503,7 @@ class CommandObjectThreadReturn : public CommandObjectRaw { eCommandRequiresFrame | eCommandTryTargetAPILock | eCommandProcessMustBeLaunched | eCommandProcessMustBePaused) { - CommandArgumentEntry arg; - CommandArgumentData expression_arg; - - // Define the first (and only) variant of this arg. - expression_arg.arg_type = eArgTypeExpression; - expression_arg.arg_repetition = eArgRepeatOptional; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(expression_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeExpression, eArgRepeatOptional); } ~CommandObjectThreadReturn() override = default; @@ -1919,19 +1871,7 @@ class CommandObjectThreadPlanDiscard : public CommandObjectParsed { eCommandTryTargetAPILock | eCommandProcessMustBeLaunched | eCommandProcessMustBePaused) { - CommandArgumentEntry arg; - CommandArgumentData plan_index_arg; - - // Define the first (and only) variant of this arg. - plan_index_arg.arg_type = eArgTypeUnsignedInteger; - plan_index_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(plan_index_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeUnsignedInteger); } ~CommandObjectThreadPlanDiscard() override = default; @@ -1992,19 +1932,7 @@ class CommandObjectThreadPlanPrune : public CommandObjectParsed { eCommandTryTargetAPILock | eCommandProcessMustBeLaunched | eCommandProcessMustBePaused) { - CommandArgumentEntry arg; - CommandArgumentData tid_arg; - - // Define the first (and only) variant of this arg. 
- tid_arg.arg_type = eArgTypeThreadID; - tid_arg.arg_repetition = eArgRepeatStar; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(tid_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeThreadID, eArgRepeatStar); } ~CommandObjectThreadPlanPrune() override = default; @@ -2221,8 +2149,7 @@ class CommandObjectTraceDumpFunctionCalls : public CommandObjectParsed { eCommandRequiresProcess | eCommandRequiresThread | eCommandTryTargetAPILock | eCommandProcessMustBeLaunched | eCommandProcessMustBePaused | eCommandProcessMustBeTraced) { - CommandArgumentData thread_arg{eArgTypeThreadIndex, eArgRepeatOptional}; - m_arguments.push_back({thread_arg}); + AddSimpleArgumentList(eArgTypeThreadIndex, eArgRepeatOptional); } ~CommandObjectTraceDumpFunctionCalls() override = default; @@ -2395,8 +2322,7 @@ class CommandObjectTraceDumpInstructions : public CommandObjectParsed { eCommandRequiresProcess | eCommandRequiresThread | eCommandTryTargetAPILock | eCommandProcessMustBeLaunched | eCommandProcessMustBePaused | eCommandProcessMustBeTraced) { - CommandArgumentData thread_arg{eArgTypeThreadIndex, eArgRepeatOptional}; - m_arguments.push_back({thread_arg}); + AddSimpleArgumentList(eArgTypeThreadIndex, eArgRepeatOptional); } ~CommandObjectTraceDumpInstructions() override = default; diff --git a/lldb/source/Commands/CommandObjectThreadUtil.cpp b/lldb/source/Commands/CommandObjectThreadUtil.cpp index d7fa4190a2450..cdc5946547f47 100644 --- a/lldb/source/Commands/CommandObjectThreadUtil.cpp +++ b/lldb/source/Commands/CommandObjectThreadUtil.cpp @@ -21,8 +21,7 @@ CommandObjectIterateOverThreads::CommandObjectIterateOverThreads( const char *syntax, uint32_t flags) : CommandObjectParsed(interpreter, name, help, syntax, flags) { // These commands all take thread ID's as arguments. 
- CommandArgumentData thread_arg{eArgTypeThreadIndex, eArgRepeatStar}; - m_arguments.push_back({thread_arg}); + AddSimpleArgumentList(eArgTypeThreadIndex, eArgRepeatStar); } CommandObjectMultipleThreads::CommandObjectMultipleThreads( @@ -30,8 +29,7 @@ CommandObjectMultipleThreads::CommandObjectMultipleThreads( const char *syntax, uint32_t flags) : CommandObjectParsed(interpreter, name, help, syntax, flags) { // These commands all take thread ID's as arguments. - CommandArgumentData thread_arg{eArgTypeThreadIndex, eArgRepeatStar}; - m_arguments.push_back({thread_arg}); + AddSimpleArgumentList(eArgTypeThreadIndex, eArgRepeatStar); } void CommandObjectIterateOverThreads::DoExecute(Args &command, diff --git a/lldb/source/Commands/CommandObjectTrace.cpp b/lldb/source/Commands/CommandObjectTrace.cpp index e0c74e29aaa6b..5bcbc236301cc 100644 --- a/lldb/source/Commands/CommandObjectTrace.cpp +++ b/lldb/source/Commands/CommandObjectTrace.cpp @@ -89,8 +89,7 @@ class CommandObjectTraceSave : public CommandObjectParsed { eCommandRequiresProcess | eCommandTryTargetAPILock | eCommandProcessMustBeLaunched | eCommandProcessMustBePaused | eCommandProcessMustBeTraced) { - CommandArgumentData bundle_dir{eArgTypeDirectoryName, eArgRepeatPlain}; - m_arguments.push_back({bundle_dir}); + AddSimpleArgumentList(eArgTypeDirectoryName); } void @@ -176,8 +175,7 @@ class CommandObjectTraceLoad : public CommandObjectParsed { interpreter, "trace load", "Load a post-mortem processor trace session from a trace bundle.", "trace load ") { - CommandArgumentData session_file_arg{eArgTypeFilename, eArgRepeatPlain}; - m_arguments.push_back({session_file_arg}); + AddSimpleArgumentList(eArgTypeFilename); } void @@ -332,8 +330,7 @@ class CommandObjectTraceSchema : public CommandObjectParsed { "Show the schema of the given trace plugin.", "trace schema . 
Use the plug-in name " "\"all\" to see all schemas.\n") { - CommandArgumentData plugin_arg{eArgTypeNone, eArgRepeatPlain}; - m_arguments.push_back({plugin_arg}); + AddSimpleArgumentList(eArgTypeNone); } ~CommandObjectTraceSchema() override = default; diff --git a/lldb/source/Commands/CommandObjectType.cpp b/lldb/source/Commands/CommandObjectType.cpp index 036b8e9d9def1..97489bdc2d9c2 100644 --- a/lldb/source/Commands/CommandObjectType.cpp +++ b/lldb/source/Commands/CommandObjectType.cpp @@ -589,15 +589,7 @@ class CommandObjectTypeFormatAdd : public CommandObjectParsed { : CommandObjectParsed(interpreter, "type format add", "Add a new formatting style for a type.", nullptr), m_format_options(eFormatInvalid) { - CommandArgumentEntry type_arg; - CommandArgumentData type_style_arg; - - type_style_arg.arg_type = eArgTypeName; - type_style_arg.arg_repetition = eArgRepeatPlus; - - type_arg.push_back(type_style_arg); - - m_arguments.push_back(type_arg); + AddSimpleArgumentList(eArgTypeName, eArgRepeatPlus); SetHelpLong( R"( @@ -784,15 +776,7 @@ class CommandObjectTypeFormatterDelete : public CommandObjectParsed { : CommandObjectParsed(interpreter, FormatCategoryToString(formatter_kind, false)), m_formatter_kind(formatter_kind) { - CommandArgumentEntry type_arg; - CommandArgumentData type_style_arg; - - type_style_arg.arg_type = eArgTypeName; - type_style_arg.arg_repetition = eArgRepeatPlain; - - type_arg.push_back(type_style_arg); - - m_arguments.push_back(type_arg); + AddSimpleArgumentList(eArgTypeName); const char *kind = FormatCategoryToString(formatter_kind, true); const char *short_kind = FormatCategoryToString(formatter_kind, false); @@ -929,8 +913,7 @@ class CommandObjectTypeFormatterClear : public CommandObjectParsed { const char *name, const char *help) : CommandObjectParsed(interpreter, name, help, nullptr), m_formatter_kind(formatter_kind) { - CommandArgumentData category_arg{eArgTypeName, eArgRepeatOptional}; - m_arguments.push_back({category_arg}); + 
AddSimpleArgumentList(eArgTypeName, eArgRepeatOptional); } ~CommandObjectTypeFormatterClear() override = default; @@ -1045,15 +1028,7 @@ class CommandObjectTypeFormatterList : public CommandObjectParsed { CommandObjectTypeFormatterList(CommandInterpreter &interpreter, const char *name, const char *help) : CommandObjectParsed(interpreter, name, help, nullptr), m_options() { - CommandArgumentEntry type_arg; - CommandArgumentData type_style_arg; - - type_style_arg.arg_type = eArgTypeName; - type_style_arg.arg_repetition = eArgRepeatOptional; - - type_arg.push_back(type_style_arg); - - m_arguments.push_back(type_arg); + AddSimpleArgumentList(eArgTypeName, eArgRepeatOptional); } ~CommandObjectTypeFormatterList() override = default; @@ -1445,15 +1420,7 @@ CommandObjectTypeSummaryAdd::CommandObjectTypeSummaryAdd( : CommandObjectParsed(interpreter, "type summary add", "Add a new summary style for a type.", nullptr), IOHandlerDelegateMultiline("DONE"), m_options(interpreter) { - CommandArgumentEntry type_arg; - CommandArgumentData type_style_arg; - - type_style_arg.arg_type = eArgTypeName; - type_style_arg.arg_repetition = eArgRepeatPlus; - - type_arg.push_back(type_style_arg); - - m_arguments.push_back(type_arg); + AddSimpleArgumentList(eArgTypeName, eArgRepeatPlus); SetHelpLong( R"( @@ -1745,15 +1712,7 @@ class CommandObjectTypeCategoryDefine : public CommandObjectParsed { : CommandObjectParsed(interpreter, "type category define", "Define a new category as a source of formatters.", nullptr) { - CommandArgumentEntry type_arg; - CommandArgumentData type_style_arg; - - type_style_arg.arg_type = eArgTypeName; - type_style_arg.arg_repetition = eArgRepeatPlus; - - type_arg.push_back(type_style_arg); - - m_arguments.push_back(type_arg); + AddSimpleArgumentList(eArgTypeName, eArgRepeatPlus); } ~CommandObjectTypeCategoryDefine() override = default; @@ -1838,15 +1797,7 @@ class CommandObjectTypeCategoryEnable : public CommandObjectParsed { : CommandObjectParsed(interpreter, "type 
category enable", "Enable a category as a source of formatters.", nullptr) { - CommandArgumentEntry type_arg; - CommandArgumentData type_style_arg; - - type_style_arg.arg_type = eArgTypeName; - type_style_arg.arg_repetition = eArgRepeatPlus; - - type_arg.push_back(type_style_arg); - - m_arguments.push_back(type_arg); + AddSimpleArgumentList(eArgTypeName, eArgRepeatPlus); } ~CommandObjectTypeCategoryEnable() override = default; @@ -1897,15 +1848,7 @@ class CommandObjectTypeCategoryDelete : public CommandObjectParsed { : CommandObjectParsed(interpreter, "type category delete", "Delete a category and all associated formatters.", nullptr) { - CommandArgumentEntry type_arg; - CommandArgumentData type_style_arg; - - type_style_arg.arg_type = eArgTypeName; - type_style_arg.arg_repetition = eArgRepeatPlus; - - type_arg.push_back(type_style_arg); - - m_arguments.push_back(type_arg); + AddSimpleArgumentList(eArgTypeName, eArgRepeatPlus); } ~CommandObjectTypeCategoryDelete() override = default; @@ -1996,15 +1939,7 @@ class CommandObjectTypeCategoryDisable : public CommandObjectParsed { : CommandObjectParsed(interpreter, "type category disable", "Disable a category as a source of formatters.", nullptr) { - CommandArgumentEntry type_arg; - CommandArgumentData type_style_arg; - - type_style_arg.arg_type = eArgTypeName; - type_style_arg.arg_repetition = eArgRepeatPlus; - - type_arg.push_back(type_style_arg); - - m_arguments.push_back(type_arg); + AddSimpleArgumentList(eArgTypeName, eArgRepeatPlus); } ~CommandObjectTypeCategoryDisable() override = default; @@ -2050,15 +1985,7 @@ class CommandObjectTypeCategoryList : public CommandObjectParsed { : CommandObjectParsed(interpreter, "type category list", "Provide a list of all existing categories.", nullptr) { - CommandArgumentEntry type_arg; - CommandArgumentData type_style_arg; - - type_style_arg.arg_type = eArgTypeName; - type_style_arg.arg_repetition = eArgRepeatOptional; - - type_arg.push_back(type_style_arg); - - 
m_arguments.push_back(type_arg); + AddSimpleArgumentList(eArgTypeName, eArgRepeatOptional); } ~CommandObjectTypeCategoryList() override = default; @@ -2271,15 +2198,7 @@ CommandObjectTypeSynthAdd::CommandObjectTypeSynthAdd( : CommandObjectParsed(interpreter, "type synthetic add", "Add a new synthetic provider for a type.", nullptr), IOHandlerDelegateMultiline("DONE"), m_options() { - CommandArgumentEntry type_arg; - CommandArgumentData type_style_arg; - - type_style_arg.arg_type = eArgTypeName; - type_style_arg.arg_repetition = eArgRepeatPlus; - - type_arg.push_back(type_style_arg); - - m_arguments.push_back(type_arg); + AddSimpleArgumentList(eArgTypeName, eArgRepeatPlus); } bool CommandObjectTypeSynthAdd::AddSynth(ConstString type_name, @@ -2476,15 +2395,7 @@ class CommandObjectTypeFilterAdd : public CommandObjectParsed { CommandObjectTypeFilterAdd(CommandInterpreter &interpreter) : CommandObjectParsed(interpreter, "type filter add", "Add a new filter for a type.", nullptr) { - CommandArgumentEntry type_arg; - CommandArgumentData type_style_arg; - - type_style_arg.arg_type = eArgTypeName; - type_style_arg.arg_repetition = eArgRepeatPlus; - - type_arg.push_back(type_style_arg); - - m_arguments.push_back(type_arg); + AddSimpleArgumentList(eArgTypeName, eArgRepeatPlus); SetHelpLong( R"( diff --git a/lldb/source/Commands/CommandObjectWatchpoint.cpp b/lldb/source/Commands/CommandObjectWatchpoint.cpp index 5b74b1ae43acc..f123211e72377 100644 --- a/lldb/source/Commands/CommandObjectWatchpoint.cpp +++ b/lldb/source/Commands/CommandObjectWatchpoint.cpp @@ -153,12 +153,7 @@ class CommandObjectWatchpointList : public CommandObjectParsed { interpreter, "watchpoint list", "List all watchpoints at configurable levels of detail.", nullptr, eCommandRequiresTarget) { - CommandArgumentEntry arg; - CommandObject::AddIDsArgumentData(arg, eArgTypeWatchpointID, - eArgTypeWatchpointIDRange); - // Add the entry for the first argument for this command to the object's - // arguments 
vector. - m_arguments.push_back(arg); + CommandObject::AddIDsArgumentData(eWatchpointArgs); } ~CommandObjectWatchpointList() override = default; @@ -276,12 +271,7 @@ class CommandObjectWatchpointEnable : public CommandObjectParsed { "Enable the specified disabled watchpoint(s). If " "no watchpoints are specified, enable all of them.", nullptr, eCommandRequiresTarget) { - CommandArgumentEntry arg; - CommandObject::AddIDsArgumentData(arg, eArgTypeWatchpointID, - eArgTypeWatchpointIDRange); - // Add the entry for the first argument for this command to the object's - // arguments vector. - m_arguments.push_back(arg); + CommandObject::AddIDsArgumentData(eWatchpointArgs); } ~CommandObjectWatchpointEnable() override = default; @@ -350,12 +340,7 @@ class CommandObjectWatchpointDisable : public CommandObjectParsed { "removing it/them. If no watchpoints are " "specified, disable them all.", nullptr, eCommandRequiresTarget) { - CommandArgumentEntry arg; - CommandObject::AddIDsArgumentData(arg, eArgTypeWatchpointID, - eArgTypeWatchpointIDRange); - // Add the entry for the first argument for this command to the object's - // arguments vector. - m_arguments.push_back(arg); + CommandObject::AddIDsArgumentData(eWatchpointArgs); } ~CommandObjectWatchpointDisable() override = default; @@ -429,12 +414,7 @@ class CommandObjectWatchpointDelete : public CommandObjectParsed { "Delete the specified watchpoint(s). If no " "watchpoints are specified, delete them all.", nullptr, eCommandRequiresTarget) { - CommandArgumentEntry arg; - CommandObject::AddIDsArgumentData(arg, eArgTypeWatchpointID, - eArgTypeWatchpointIDRange); - // Add the entry for the first argument for this command to the object's - // arguments vector. - m_arguments.push_back(arg); + CommandObject::AddIDsArgumentData(eWatchpointArgs); } ~CommandObjectWatchpointDelete() override = default; @@ -550,12 +530,7 @@ class CommandObjectWatchpointIgnore : public CommandObjectParsed { "Set ignore count on the specified watchpoint(s). 
" "If no watchpoints are specified, set them all.", nullptr, eCommandRequiresTarget) { - CommandArgumentEntry arg; - CommandObject::AddIDsArgumentData(arg, eArgTypeWatchpointID, - eArgTypeWatchpointIDRange); - // Add the entry for the first argument for this command to the object's - // arguments vector. - m_arguments.push_back(arg); + CommandObject::AddIDsArgumentData(eWatchpointArgs); } ~CommandObjectWatchpointIgnore() override = default; @@ -673,12 +648,7 @@ class CommandObjectWatchpointModify : public CommandObjectParsed { "watchpoint. " "Passing an empty argument clears the modification.", nullptr, eCommandRequiresTarget) { - CommandArgumentEntry arg; - CommandObject::AddIDsArgumentData(arg, eArgTypeWatchpointID, - eArgTypeWatchpointIDRange); - // Add the entry for the first argument for this command to the object's - // arguments vector. - m_arguments.push_back(arg); + CommandObject::AddIDsArgumentData(eWatchpointArgs); } ~CommandObjectWatchpointModify() override = default; @@ -811,18 +781,7 @@ class CommandObjectWatchpointSetVariable : public CommandObjectParsed { " Watches my_global_var for read/write access, with the region to watch \ corresponding to the byte size of the data type."); - CommandArgumentEntry arg; - CommandArgumentData var_name_arg; - - // Define the only variant of this arg. - var_name_arg.arg_type = eArgTypeVarName; - var_name_arg.arg_repetition = eArgRepeatPlain; - - // Push the variant into the argument entry. - arg.push_back(var_name_arg); - - // Push the data for the only argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeVarName); // Absorb the '-w' and '-s' options into our option group. 
m_option_group.Append(&m_option_watchpoint, LLDB_OPT_SET_1, LLDB_OPT_SET_1); @@ -1009,18 +968,7 @@ class CommandObjectWatchpointSetExpression : public CommandObjectRaw { Watches write access for the 1-byte region pointed to by the address 'foo + 32')"); - CommandArgumentEntry arg; - CommandArgumentData expression_arg; - - // Define the only variant of this arg. - expression_arg.arg_type = eArgTypeExpression; - expression_arg.arg_repetition = eArgRepeatPlain; - - // Push the only variant into the argument entry. - arg.push_back(expression_arg); - - // Push the data for the only argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeExpression); // Absorb the '-w' and '-s' options into our option group. m_option_group.Append(&m_option_watchpoint, LLDB_OPT_SET_ALL, diff --git a/lldb/source/Commands/CommandObjectWatchpointCommand.cpp b/lldb/source/Commands/CommandObjectWatchpointCommand.cpp index b1629ceab2709..aaf14540cb282 100644 --- a/lldb/source/Commands/CommandObjectWatchpointCommand.cpp +++ b/lldb/source/Commands/CommandObjectWatchpointCommand.cpp @@ -162,19 +162,7 @@ initialized:" "Final Note: A warning that no watchpoint command was generated when there \ are no syntax errors may indicate that a function was declared but never called."); - CommandArgumentEntry arg; - CommandArgumentData wp_id_arg; - - // Define the first (and only) variant of this arg. - wp_id_arg.arg_type = eArgTypeWatchpointID; - wp_id_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(wp_id_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeWatchpointID); } ~CommandObjectWatchpointCommandAdd() override = default; @@ -455,19 +443,7 @@ class CommandObjectWatchpointCommandDelete : public CommandObjectParsed { : CommandObjectParsed(interpreter, "delete", "Delete the set of commands from a watchpoint.", nullptr, eCommandRequiresTarget) { - CommandArgumentEntry arg; - CommandArgumentData wp_id_arg; - - // Define the first (and only) variant of this arg. - wp_id_arg.arg_type = eArgTypeWatchpointID; - wp_id_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(wp_id_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeWatchpointID); } ~CommandObjectWatchpointCommandDelete() override = default; @@ -522,19 +498,7 @@ class CommandObjectWatchpointCommandList : public CommandObjectParsed { "List the script or set of commands to be executed " "when the watchpoint is hit.", nullptr, eCommandRequiresTarget) { - CommandArgumentEntry arg; - CommandArgumentData wp_id_arg; - - // Define the first (and only) variant of this arg. - wp_id_arg.arg_type = eArgTypeWatchpointID; - wp_id_arg.arg_repetition = eArgRepeatPlain; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(wp_id_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeWatchpointID); } ~CommandObjectWatchpointCommandList() override = default; diff --git a/lldb/source/Interpreter/CommandObject.cpp b/lldb/source/Interpreter/CommandObject.cpp index 93c53e89f7d1a..4634b75c6a33a 100644 --- a/lldb/source/Interpreter/CommandObject.cpp +++ b/lldb/source/Interpreter/CommandObject.cpp @@ -392,6 +392,24 @@ bool CommandObject::ParseOptionsAndNotify(Args &args, return true; } +void CommandObject::AddSimpleArgumentList( + CommandArgumentType arg_type, ArgumentRepetitionType repetition_type) { + + CommandArgumentEntry arg_entry; + CommandArgumentData simple_arg; + + // Define the first (and only) variant of this arg. + simple_arg.arg_type = arg_type; + simple_arg.arg_repetition = repetition_type; + + // There is only one variant this argument could be; put it into the argument + // entry. + arg_entry.push_back(simple_arg); + + // Push the data for the first argument into the m_arguments vector. + m_arguments.push_back(arg_entry); +} + int CommandObject::GetNumArgumentEntries() { return m_arguments.size(); } CommandObject::CommandArgumentEntry * @@ -694,20 +712,24 @@ void CommandObject::GenerateHelpText(Stream &output_strm) { } } -void CommandObject::AddIDsArgumentData(CommandArgumentEntry &arg, - CommandArgumentType ID, - CommandArgumentType IDRange) { +void CommandObject::AddIDsArgumentData(CommandObject::IDType type) { + CommandArgumentEntry arg; CommandArgumentData id_arg; CommandArgumentData id_range_arg; // Create the first variant for the first (and only) argument for this // command. 
- id_arg.arg_type = ID; + switch (type) { + case eBreakpointArgs: + id_arg.arg_type = eArgTypeBreakpointID; + id_range_arg.arg_type = eArgTypeBreakpointIDRange; + break; + case eWatchpointArgs: + id_arg.arg_type = eArgTypeWatchpointID; + id_range_arg.arg_type = eArgTypeWatchpointIDRange; + break; + } id_arg.arg_repetition = eArgRepeatOptional; - - // Create the second variant for the first (and only) argument for this - // command. - id_range_arg.arg_type = IDRange; id_range_arg.arg_repetition = eArgRepeatOptional; // The first (and only) argument for this command could be either an id or an @@ -715,6 +737,7 @@ void CommandObject::AddIDsArgumentData(CommandArgumentEntry &arg, // this command. arg.push_back(id_arg); arg.push_back(id_range_arg); + m_arguments.push_back(arg); } const char *CommandObject::GetArgumentTypeAsCString( diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp index 47b1db16f1e90..7af768aad0bc1 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/ItaniumABI/ItaniumABILanguageRuntime.cpp @@ -419,19 +419,7 @@ class CommandObjectMultiwordItaniumABI_Demangle : public CommandObjectParsed { : CommandObjectParsed( interpreter, "demangle", "Demangle a C++ mangled name.", "language cplusplus demangle [ ...]") { - CommandArgumentEntry arg; - CommandArgumentData index_arg; - - // Define the first (and only) variant of this arg. - index_arg.arg_type = eArgTypeSymbol; - index_arg.arg_repetition = eArgRepeatPlus; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(index_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeSymbol, eArgRepeatPlus); } ~CommandObjectMultiwordItaniumABI_Demangle() override = default; diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp index f380d6e7e6720..3e5ee6f663730 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp @@ -897,19 +897,7 @@ class CommandObjectObjC_ClassTable_Dump : public CommandObjectParsed { eCommandProcessMustBeLaunched | eCommandProcessMustBePaused), m_options() { - CommandArgumentEntry arg; - CommandArgumentData index_arg; - - // Define the first (and only) variant of this arg. - index_arg.arg_type = eArgTypeRegularExpression; - index_arg.arg_repetition = eArgRepeatOptional; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(index_arg); - - // Push the data for the first argument into the m_arguments vector. - m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeRegularExpression, eArgRepeatOptional); } ~CommandObjectObjC_ClassTable_Dump() override = default; @@ -1015,19 +1003,7 @@ class CommandObjectMultiwordObjC_TaggedPointer_Info "language objc tagged-pointer info", eCommandRequiresProcess | eCommandProcessMustBeLaunched | eCommandProcessMustBePaused) { - CommandArgumentEntry arg; - CommandArgumentData index_arg; - - // Define the first (and only) variant of this arg. - index_arg.arg_type = eArgTypeAddress; - index_arg.arg_repetition = eArgRepeatPlus; - - // There is only one variant this argument could be; put it into the - // argument entry. - arg.push_back(index_arg); - - // Push the data for the first argument into the m_arguments vector. 
- m_arguments.push_back(arg); + AddSimpleArgumentList(eArgTypeAddress, eArgRepeatPlus); } ~CommandObjectMultiwordObjC_TaggedPointer_Info() override = default; diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 3dc40ee6bb9ff..51ceb12f1a570 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -5354,8 +5354,7 @@ class CommandObjectProcessGDBRemotePacketXferSize : public CommandObjectParsed { interpreter, "process plugin packet xfer-size", "Maximum size that lldb will try to read/write one one chunk.", nullptr) { - CommandArgumentData max_arg{eArgTypeUnsignedInteger, eArgRepeatPlain}; - m_arguments.push_back({max_arg}); + AddSimpleArgumentList(eArgTypeUnsignedInteger); } ~CommandObjectProcessGDBRemotePacketXferSize() override = default; @@ -5397,8 +5396,7 @@ class CommandObjectProcessGDBRemotePacketSend : public CommandObjectParsed { "be added to the packet prior to sending and " "stripped from the result.", nullptr) { - CommandArgumentData packet_arg{eArgTypeNone, eArgRepeatStar}; - m_arguments.push_back({packet_arg}); + AddSimpleArgumentList(eArgTypeNone, eArgRepeatStar); } ~CommandObjectProcessGDBRemotePacketSend() override = default; From 7b11e2ec39ae01f53d53250551e207583bd51e80 Mon Sep 17 00:00:00 2001 From: AMS21 Date: Tue, 27 Feb 2024 19:35:11 +0100 Subject: [PATCH 473/546] [clang-tidy] Fix `cppcoreguidelines-missing-std-forward` false positive for deleted functions (#83055) Improved check by no longer giving false positives for deleted functions. 
--- .../cppcoreguidelines/MissingStdForwardCheck.cpp | 3 ++- clang-tools-extra/docs/ReleaseNotes.rst | 4 ++++ .../cppcoreguidelines/missing-std-forward.cpp | 15 +++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp index 370de12999ace..c633683570f74 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.cpp @@ -127,7 +127,8 @@ void MissingStdForwardCheck::registerMatchers(MatchFinder *Finder) { hasAncestor(functionDecl().bind("func")), hasAncestor(functionDecl( isDefinition(), equalsBoundNode("func"), ToParam, - unless(hasDescendant(std::move(ForwardCallMatcher)))))), + unless(anyOf(isDeleted(), hasDescendant(std::move( + ForwardCallMatcher))))))), this); } diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 69537964f9bce..3f90e7d63d6b2 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -134,6 +134,10 @@ Changes in existing checks ` check by ignoring local variable with ``[maybe_unused]`` attribute. +- Improved :doc:`cppcoreguidelines-missing-std-forward + ` check by no longer + giving false positives for deleted functions. 
+ - Cleaned up :doc:`cppcoreguidelines-prefer-member-initializer ` by removing enforcement of rule `C.48 diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp index 443f338ba2046..20e43f04180ff 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp @@ -173,3 +173,18 @@ void lambda_value_reference_auxiliary_var(T&& t) { } } // namespace negative_cases + +namespace deleted_functions { + +template +void f(T &&) = delete; + +struct S { + template + S(T &&) = delete; + + template + void operator&(T &&) = delete; +}; + +} // namespace deleted_functions From 563f414e049dc06dcb955f565fcff3c663982ee4 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Tue, 27 Feb 2024 12:35:48 -0600 Subject: [PATCH 474/546] [mlir][AMDGPU] Set uniform-work-group-size=true by default (#79077) GPU kernels generated via typical MLIR mechanisms make the assumption that all workgroups are of uniform size, and so, as in OpenMP, it is appropriate to set the "uniform-work-group-size"="true" attribute on these functions by default. This commit makes that choice. In the event it is needed, this commit adds `rocdl.uniform_work_group_size` as an attribute to be set on LLVM functions that can be used to override the default. 
In addition, add proper failure messages to translation --- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 16 +++++++ .../ROCDL/ROCDLToLLVMIRTranslation.cpp | 45 +++++++++++++++---- mlir/test/Target/LLVMIR/rocdl.mlir | 9 +++- 3 files changed, 61 insertions(+), 9 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 6b170c8d06f43..53e9f2dc6a994 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -28,6 +28,22 @@ def ROCDL_Dialect : Dialect { let hasOperationAttrVerify = 1; let extraClassDeclaration = [{ + /// Get the name of the attribute used to annotate external kernel + /// functions. + static StringRef getKernelFuncAttrName() { return "rocdl.kernel"; } + static constexpr ::llvm::StringLiteral getFlatWorkGroupSizeAttrName() { + return ::llvm::StringLiteral("rocdl.flat_work_group_size"); + } + static constexpr ::llvm::StringLiteral getReqdWorkGroupSizeAttrName() { + return ::llvm::StringLiteral("rocdl.reqd_work_group_size"); + } + /// MLIR's gpu-related infrastructure effectively assume uniform workgroup + /// sizes, so this attribute defaults to "true" on `rocdl.kernel` functions. + /// It is provided here to allow overriding this assumption. + static constexpr ::llvm::StringLiteral getUniformWorkGroupSizeAttrName() { + return ::llvm::StringLiteral("rocdl.uniform_work_group_size"); + } + /// The address space value that represents global memory. static constexpr unsigned kGlobalMemoryAddressSpace = 1; /// The address space value that represents shared memory. 
diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp index 93eb456cdc2c4..94423b35d1ff3 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp @@ -88,7 +88,9 @@ class ROCDLDialectLLVMIRTranslationInterface if (dialect->getKernelAttrHelper().getName() == attribute.getName()) { auto func = dyn_cast(op); if (!func) - return failure(); + return op->emitOpError(Twine(attribute.getName()) + + " is only supported on `llvm.func` operations"); + ; // For GPU kernels, // 1. Insert AMDGPU_KERNEL calling convention. @@ -100,6 +102,13 @@ class ROCDLDialectLLVMIRTranslationInterface if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) { llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256"); } + + // MLIR's GPU kernel APIs all assume and produce uniformly-sized + // workgroups, so the lowering of the `rocdl.kernel` marker encodes this + // assumption. This assumption may be overridden by setting + // `rocdl.uniform_work_group_size` on a given function. 
+ if (!llvmFunc->hasFnAttribute("uniform-work-group-size")) + llvmFunc->addFnAttr("uniform-work-group-size", "true"); } // Override flat-work-group-size // TODO: update clients to rocdl.flat_work_group_size instead, @@ -108,10 +117,12 @@ class ROCDLDialectLLVMIRTranslationInterface attribute.getName()) { auto func = dyn_cast(op); if (!func) - return failure(); + return op->emitOpError(Twine(attribute.getName()) + + " is only supported on `llvm.func` operations"); auto value = dyn_cast(attribute.getValue()); if (!value) - return failure(); + return op->emitOpError(Twine(attribute.getName()) + + " must be an integer"); llvm::Function *llvmFunc = moduleTranslation.lookupFunction(func.getName()); @@ -124,10 +135,12 @@ class ROCDLDialectLLVMIRTranslationInterface attribute.getName()) { auto func = dyn_cast(op); if (!func) - return failure(); + return op->emitOpError(Twine(attribute.getName()) + + " is only supported on `llvm.func` operations"); auto value = dyn_cast(attribute.getValue()); if (!value) - return failure(); + return op->emitOpError(Twine(attribute.getName()) + + " must be a string"); llvm::Function *llvmFunc = moduleTranslation.lookupFunction(func.getName()); @@ -135,16 +148,32 @@ class ROCDLDialectLLVMIRTranslationInterface llvmAttrValue.append(value.getValue()); llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue); } - + if (ROCDL::ROCDLDialect::getUniformWorkGroupSizeAttrName() == + attribute.getName()) { + auto func = dyn_cast(op); + if (!func) + return op->emitOpError(Twine(attribute.getName()) + + " is only supported on `llvm.func` operations"); + auto value = dyn_cast(attribute.getValue()); + if (!value) + return op->emitOpError(Twine(attribute.getName()) + + " must be a boolean"); + llvm::Function *llvmFunc = + moduleTranslation.lookupFunction(func.getName()); + llvmFunc->addFnAttr("uniform-work-group-size", + value.getValue() ? 
"true" : "false"); + } // Set reqd_work_group_size metadata if (dialect->getReqdWorkGroupSizeAttrHelper().getName() == attribute.getName()) { auto func = dyn_cast(op); if (!func) - return failure(); + return op->emitOpError(Twine(attribute.getName()) + + " is only supported on `llvm.func` operations"); auto value = dyn_cast(attribute.getValue()); if (!value) - return failure(); + return op->emitOpError(Twine(attribute.getName()) + + " must be a dense i32 array attribute"); llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext(); SmallVector metadata; llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32); diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 06b78650c8d01..3ea6292c679d9 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -56,6 +56,12 @@ llvm.func @known_block_sizes() llvm.return } +llvm.func @kernel_func_no_uniform_work_groups() attributes {rocdl.kernel, rocdl.uniform_work_group_size = false} { + // CHECK-LABEL: amdgpu_kernel void @kernel_func_no_uniform_work_groups() + // CHECK: #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS:[0-9]+]] + llvm.return +} + llvm.func @rocdl.lane_id() -> i32 { // CHECK: [[mbcntlo:%.+]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) // CHECK-NEXT: call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[mbcntlo]]) @@ -505,8 +511,9 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 { llvm.return %source5 : i32 } -// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" } +// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "uniform-work-group-size"="true" } // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024" // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128" +// CHECK-DAG: attributes #[[$KERNEL_NO_UNIFORM_WORK_GROUPS_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" 
"uniform-work-group-size"="false" } // CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64} // CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2} From e427e934f677567f8184ff900cb4cbdb8cf21a21 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Tue, 27 Feb 2024 12:43:05 -0600 Subject: [PATCH 475/546] [lldb][dap] Avoid concurrent `HandleCommand` calls (#83162) The `EventThreadFunction` can end up calling `HandleCommand` concurrently with the main request processing thread. The underlying API does not appear to be thread safe, so add a narrowly scoped mutex lock to prevent calling it in this place from more than one thread. Fixes #81686. Prior to this, TestDAP_launch.py is 4% flaky. After, it passes in 1000 runs. --- .../API/tools/lldb-dap/commands/TestDAP_commands.py | 2 -- lldb/tools/lldb-dap/LLDBUtils.cpp | 12 +++++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py b/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py index 8dcda75d0a452..226b9385fe719 100644 --- a/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py +++ b/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py @@ -1,5 +1,4 @@ import os -import unittest import dap_server import lldbdap_testcase @@ -7,7 +6,6 @@ from lldbsuite.test.decorators import * -@unittest.skip("https://llvm.org/PR81686") class TestDAP_commands(lldbdap_testcase.DAPTestCaseBase): def test_command_directive_quiet_on_success(self): program = self.getBuildArtifact("a.out") diff --git a/lldb/tools/lldb-dap/LLDBUtils.cpp b/lldb/tools/lldb-dap/LLDBUtils.cpp index 35b7a986a8964..a91cc6718f4df 100644 --- a/lldb/tools/lldb-dap/LLDBUtils.cpp +++ b/lldb/tools/lldb-dap/LLDBUtils.cpp @@ -9,6 +9,8 @@ #include "LLDBUtils.h" #include "DAP.h" +#include + namespace lldb_dap { bool RunLLDBCommands(llvm::StringRef prefix, @@ -37,7 +39,15 @@ bool RunLLDBCommands(llvm::StringRef prefix, } } - interp.HandleCommand(command.str().c_str(), result); + { + // 
Prevent simultaneous calls to HandleCommand, e.g. EventThreadFunction + // may asynchronously call RunExitCommands when we are already calling + // RunTerminateCommands. + static std::mutex handle_command_mutex; + std::lock_guard locker(handle_command_mutex); + interp.HandleCommand(command.str().c_str(), result); + } + const bool got_error = !result.Succeeded(); // The if statement below is assuming we always print out `!` prefixed // lines. The only time we don't print is when we have `quiet_on_success == From f44c3faccaa4bcc9b8b7739a76a52d328fbb8d91 Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Tue, 27 Feb 2024 12:50:22 -0600 Subject: [PATCH 476/546] =?UTF-8?q?Revert=20"[Hexagon]=20Optimize=20post-i?= =?UTF-8?q?ncrement=20load=20and=20stores=20in=20loops.=20(=E2=80=A6=20(#8?= =?UTF-8?q?3151)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …#82418)" This reverts commit d62ca8def395ac165f253fdde1d93725394a4d53. --- llvm/lib/Target/Hexagon/CMakeLists.txt | 1 - llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp | 56 -- llvm/lib/Target/Hexagon/HexagonInstrInfo.h | 2 - llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp | 689 ------------------ .../Target/Hexagon/HexagonTargetMachine.cpp | 13 - .../Hexagon/MCTargetDesc/HexagonBaseInfo.h | 12 +- llvm/test/CodeGen/Hexagon/post-inc-vec.mir | 413 ----------- llvm/test/CodeGen/Hexagon/post_inc_store.mir | 168 ----- .../test/CodeGen/Hexagon/postincopt-crash.mir | 58 -- .../CodeGen/Hexagon/postincopt-dcfetch.mir | 19 - .../CodeGen/Hexagon/valid-offset-loadbsw4.mir | 32 - 11 files changed, 1 insertion(+), 1462 deletions(-) delete mode 100644 llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp delete mode 100644 llvm/test/CodeGen/Hexagon/post-inc-vec.mir delete mode 100644 llvm/test/CodeGen/Hexagon/post_inc_store.mir delete mode 100644 llvm/test/CodeGen/Hexagon/postincopt-crash.mir delete mode 100644 llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir delete mode 100644 
llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt index 2870f0bb6ad32..a22a5c11e6ab3 100644 --- a/llvm/lib/Target/Hexagon/CMakeLists.txt +++ b/llvm/lib/Target/Hexagon/CMakeLists.txt @@ -51,7 +51,6 @@ add_llvm_target(HexagonCodeGen HexagonOptAddrMode.cpp HexagonOptimizeSZextends.cpp HexagonPeephole.cpp - HexagonPostIncOpt.cpp HexagonRDFOpt.cpp HexagonRegisterInfo.cpp HexagonSelectionDAGInfo.cpp diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 91cc9307786b6..619c7dc69f9b2 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1655,13 +1655,6 @@ bool HexagonInstrInfo::isPostIncrement(const MachineInstr &MI) const { return getAddrMode(MI) == HexagonII::PostInc; } -bool HexagonInstrInfo::isPostIncWithImmOffset(const MachineInstr &MI) const { - unsigned BasePos, OffsetPos; - if (!getBaseAndOffsetPosition(MI, BasePos, OffsetPos)) - return false; - return isPostIncrement(MI) && MI.getOperand(OffsetPos).isImm(); -} - // Returns true if an instruction is predicated irrespective of the predicate // sense. For example, all of the following will return true. 
// if (p0) R1 = add(R2, R3) @@ -2443,55 +2436,6 @@ bool HexagonInstrInfo::isLoopN(const MachineInstr &MI) const { Opcode == Hexagon::J2_loop1rext; } -bool HexagonInstrInfo::isCircBufferInstr(const MachineInstr &MI) const { - switch (MI.getOpcode()) { - default: - return false; - case Hexagon::L2_loadalignb_pci: - case Hexagon::L2_loadalignb_pcr: - case Hexagon::L2_loadalignh_pci: - case Hexagon::L2_loadalignh_pcr: - case Hexagon::L2_loadbsw2_pci: - case Hexagon::L2_loadbsw2_pcr: - case Hexagon::L2_loadbsw4_pci: - case Hexagon::L2_loadbsw4_pcr: - case Hexagon::L2_loadbzw2_pci: - case Hexagon::L2_loadbzw2_pcr: - case Hexagon::L2_loadbzw4_pci: - case Hexagon::L2_loadbzw4_pcr: - case Hexagon::L2_loadrb_pci: - case Hexagon::L2_loadrb_pcr: - case Hexagon::L2_loadrd_pci: - case Hexagon::L2_loadrd_pcr: - case Hexagon::L2_loadrh_pci: - case Hexagon::L2_loadrh_pcr: - case Hexagon::L2_loadri_pci: - case Hexagon::L2_loadri_pcr: - case Hexagon::L2_loadrub_pci: - case Hexagon::L2_loadrub_pcr: - case Hexagon::L2_loadruh_pci: - case Hexagon::L2_loadruh_pcr: - case Hexagon::S2_storerbnew_pci: - case Hexagon::S2_storerbnew_pcr: - case Hexagon::S2_storerb_pci: - case Hexagon::S2_storerb_pcr: - case Hexagon::S2_storerd_pci: - case Hexagon::S2_storerd_pcr: - case Hexagon::S2_storerf_pci: - case Hexagon::S2_storerf_pcr: - case Hexagon::S2_storerhnew_pci: - case Hexagon::S2_storerhnew_pcr: - case Hexagon::S2_storerh_pci: - case Hexagon::S2_storerh_pcr: - case Hexagon::S2_storerinew_pci: - case Hexagon::S2_storerinew_pcr: - case Hexagon::S2_storeri_pci: - case Hexagon::S2_storeri_pcr: - return true; - } - return false; -} - bool HexagonInstrInfo::isMemOp(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: return false; diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index 65783c560321a..e496995d3ff12 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -434,8 +434,6 
@@ class HexagonInstrInfo : public HexagonGenInstrInfo { bool predCanBeUsedAsDotNew(const MachineInstr &MI, Register PredReg) const; bool PredOpcodeHasJMP_c(unsigned Opcode) const; bool predOpcodeHasNot(ArrayRef Cond) const; - bool isPostIncWithImmOffset(const MachineInstr &MI) const; - bool isCircBufferInstr(const MachineInstr &MI) const; unsigned getAddrMode(const MachineInstr &MI) const; MachineOperand *getBaseAndOffset(const MachineInstr &MI, int64_t &Offset, diff --git a/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp b/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp deleted file mode 100644 index 4c845f24f76a9..0000000000000 --- a/llvm/lib/Target/Hexagon/HexagonPostIncOpt.cpp +++ /dev/null @@ -1,689 +0,0 @@ -//===-- HexagonPostIncOpt.cpp - Hexagon Post Increment Optimization Pass --===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// Convert post-inc addressing mode into base-offset addressing mode. -// Ex: -// original loop: -// v1 = phi(v0, v3) -// v2,v3 = post_load v1, 4 - -// Often, unroller creates below form of post-increments: -// v1 = phi(v0, v3') -// v2,v3 = post_load v1, 4 -// v2',v3'= post_load v3, 4 - -// This can be optimized in two ways - -// 1. -// v1 = phi(v0, v3') -// v2,v3' = post_load v1, 8 -// v2' = load v3', -4 -// -// 2. -// v1 = phi(v0, v3') -// v2,v3' = post_load v1, 8 -// v2' = load v1, 4 -// -// Option 2 is favored as we can packetize two memory operations in a single -// packet. However, this is not always favorable due to memory dependences -// and in cases where we form a bigger chain of post-increment ops that will -// create more spills as we can not execute post-increment ops with out -// executing base-offset instructions. 
-//===----------------------------------------------------------------------===// -#include "HexagonInstrInfo.h" -#include "HexagonSubtarget.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/ScheduleDAGInstrs.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" -#include "llvm/Support/CodeGen.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "hexagon-postincopt" - -static cl::opt PostIncChainThreshold( - "post-inc-chain-threshold", cl::Hidden, cl::init(4), - cl::desc("Limit the number of post-inc instructions in a chain.")); - -static cl::opt PreferPostIncStore( - "prefer-post-inc-store", cl::Hidden, cl::init(true), - cl::desc("Prefer post-inc store in a list of loads and stores.")); - -namespace llvm { -void initializeHexagonPostIncOptPass(PassRegistry &); -FunctionPass *createHexagonPostIncOpt(); -} // namespace llvm - -namespace { - -class HexagonPostIncOpt : public MachineFunctionPass { - MachineLoopInfo *MLI = nullptr; - const HexagonInstrInfo *HII = nullptr; - const TargetRegisterInfo *TRI = nullptr; - const MachineRegisterInfo *MRI = nullptr; - const HexagonSubtarget *HST = nullptr; - -public: - static char ID; - - HexagonPostIncOpt() : MachineFunctionPass(ID) { - initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - StringRef getPassName() const override { return "Hexagon Post-Inc-Opt Pass"; 
} - - bool runOnMachineFunction(MachineFunction &Fn) override; - -private: - bool translatePostIncsInLoop(MachineBasicBlock &MBB); - void replacePostIncWithBaseOffset(MachineBasicBlock &MBB) const; - void replacePostIncWithBaseOffset(MachineInstr &MI) const; - bool isPostIncInsn(MachineInstr &MI) const; - void foldAdds(MachineBasicBlock &MBB) const; - void updateBaseAndOffset(MachineInstr &MI, MachineInstr &AddMI) const; - void removeDeadInstructions(MachineBasicBlock &MBB) const; - - void generatePostInc(MachineBasicBlock &MBB); - bool canReplaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const; - void replaceWithPostInc(MachineInstr *MI, MachineInstr *AddMI) const; - - bool isValidOffset(const MachineInstr &MI, int64_t Offset) const; - bool isValidPostIncValue(const MachineInstr &MI, int IncVal) const; -}; - -class HexagonPostIncOptSchedDAG : public ScheduleDAGInstrs { - HexagonPostIncOpt &Pass; - -public: - HexagonPostIncOptSchedDAG(HexagonPostIncOpt &P, MachineFunction &MF, - MachineLoopInfo *MLI) - : ScheduleDAGInstrs(MF, MLI, false), Pass(P){}; - void schedule() override; - ScheduleDAGTopologicalSort &getTopo() { return Topo; }; -}; - -} // End anonymous namespace. - -char HexagonPostIncOpt::ID = 0; - -INITIALIZE_PASS_BEGIN(HexagonPostIncOpt, DEBUG_TYPE, - "Hexagon Post-Inc-Opt Pass", false, false) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(HexagonPostIncOpt, DEBUG_TYPE, "Hexagon Post-Inc-Opt Pass", - false, false) - -/// Return true if MIA dominates MIB. -static bool dominates(MachineInstr *MIA, MachineInstr *MIB) { - if (MIA->getParent() != MIB->getParent()) - return false; // Don't know since machine dominator tree is out of date. - - MachineBasicBlock *MBB = MIA->getParent(); - MachineBasicBlock::iterator I = MBB->instr_begin(); - // Iterate over the basic block until MIA or MIB is found. 
- for (; &*I != MIA && &*I != MIB; ++I) - ; - - // MIA dominates MIB if MIA is found first. - return &*I == MIA; -} - -// Return the Phi register value that comes from the loop block. -static unsigned getLoopPhiReg(MachineInstr *Phi, MachineBasicBlock *LoopBB) { - for (unsigned i = 1, e = Phi->getNumOperands(); i != e; i += 2) - if (Phi->getOperand(i + 1).getMBB() == LoopBB) - return Phi->getOperand(i).getReg(); - return UINT_MAX; -} - -static bool isAddWithImmValue(const MachineInstr &MI) { - // FIXME: For now, only deal with adds that have strict immediate values. - // Some A2_addi instructions can be of the form. - // %338:intregs = A2_addi %7:intregs, @_ZL7phs_tbl + 16 - return MI.getOpcode() == Hexagon::A2_addi && MI.getOperand(2).isImm(); -} - -// Compute the number of 'real' instructions in the basic block by -// ignoring terminators. -static unsigned getBasicBlockSize(MachineBasicBlock &MBB) { - unsigned size = 0; - for (auto &I : make_range(MBB.begin(), MBB.getFirstTerminator())) - if (!I.isDebugInstr()) - size++; - return size; -} - -// Setup Post increment Schedule DAG. -static void initPISchedDAG(HexagonPostIncOptSchedDAG &PIDAG, - MachineBasicBlock &MBB) { - PIDAG.startBlock(&MBB); - PIDAG.enterRegion(&MBB, MBB.begin(), MBB.getFirstTerminator(), - getBasicBlockSize(MBB)); - // Build the graph. - PIDAG.schedule(); - // exitRegion() is an empty function in base class. So, safe to call it here. - PIDAG.exitRegion(); -} - -// Check if post-increment candidate has any memory dependence on any -// instruction in the chain. -static bool hasMemoryDependency(SUnit *PostIncSU, - SmallVector &UseList) { - - // FIXME: Fine tune the order dependence. Probably can only consider memory - // related OrderKind. 
- for (auto &Dep : PostIncSU->Succs) - if (Dep.getKind() == SDep::Order) - if (std::find(UseList.begin(), UseList.end(), - Dep.getSUnit()->getInstr()) != UseList.end()) - return true; - - return false; -} - -// Fold an add with immediate into either an add or a load or a store. -void HexagonPostIncOpt::foldAdds(MachineBasicBlock &MBB) const { - LLVM_DEBUG(dbgs() << "#Fold add instructions in this block.\n"); - for (auto &MI : make_range(MBB.getFirstNonPHI(), MBB.getFirstTerminator())) { - if (!isAddWithImmValue(MI)) - continue; - unsigned DefReg = MI.getOperand(0).getReg(); - unsigned AddReg = MI.getOperand(1).getReg(); - int64_t AddImm = MI.getOperand(2).getImm(); - - SmallVector UseList; - // Gather the uses of add instruction's def reg. - for (auto &MO : make_range(MRI->use_begin(DefReg), MRI->use_end())) { - MachineInstr *UseMI = MO.getParent(); - // Deal with only the instuctions that belong to this block. - // If we cross this block, the generation of post-increment logic - // will not be able to transform to post-inc due to dominance. - if (UseMI->getParent() == &MBB) - UseList.push_back(UseMI); - } - - if (UseList.empty()) - continue; - - LLVM_DEBUG({ - dbgs() << "Current instruction considered for folding \n"; - MI.dump(); - }); - - for (auto UseMI : UseList) { - if (isAddWithImmValue(*UseMI)) { - int64_t NewImm = AddImm + UseMI->getOperand(2).getImm(); - // Fold if the new immediate is with in the range. 
- if (HII->isValidOffset(UseMI->getOpcode(), NewImm, TRI, false)) { - LLVM_DEBUG({ - UseMI->dump(); - dbgs() << "\t is folded in to \n"; - }); - UseMI->getOperand(1).setReg(AddReg); - UseMI->getOperand(2).setImm(NewImm); - LLVM_DEBUG(UseMI->dump()); - } - } else if (HII->isBaseImmOffset(*UseMI)) { - LLVM_DEBUG({ - UseMI->dump(); - dbgs() << "\t is folded in to \n"; - }); - updateBaseAndOffset(*UseMI, MI); - LLVM_DEBUG(UseMI->dump()); - } - LLVM_DEBUG(dbgs() << "\n"); - } - } - removeDeadInstructions(MBB); - LLVM_DEBUG(dbgs() << "#End of the fold instructions logic.\n"); -} - -void HexagonPostIncOpt::updateBaseAndOffset(MachineInstr &MI, - MachineInstr &AddMI) const { - assert(HII->isBaseImmOffset(MI)); - unsigned BasePos, OffsetPos; - if (!HII->getBaseAndOffsetPosition(MI, BasePos, OffsetPos)) - return; - - MachineOperand &OffsetOp = MI.getOperand(OffsetPos); - MachineOperand &BaseOp = MI.getOperand(BasePos); - - if (BaseOp.getReg() != AddMI.getOperand(0).getReg()) - return; - - unsigned IncBase = AddMI.getOperand(1).getReg(); - int64_t IncValue = AddMI.getOperand(2).getImm(); - - int64_t NewOffset = OffsetOp.getImm() + IncValue; - if (!isValidOffset(MI, NewOffset)) - return; - - OffsetOp.setImm(NewOffset); - BaseOp.setReg(IncBase); -} - -void HexagonPostIncOpt::removeDeadInstructions(MachineBasicBlock &MBB) const { - // For MBB, check that the value defined by each instruction is used. - // If not, delete it. - for (MachineBasicBlock::reverse_instr_iterator MI = MBB.instr_rbegin(), - ME = MBB.instr_rend(); - MI != ME;) { - // From DeadMachineInstructionElem. Don't delete inline assembly. - if (MI->isInlineAsm()) { - ++MI; - continue; - } - bool SawStore = false; - // Check if it's safe to remove the instruction due to side effects. 
- if (!MI->isSafeToMove(nullptr, SawStore)) { - ++MI; - continue; - } - unsigned Uses = 0; - for (MachineInstr::mop_iterator MOI = MI->operands_begin(), - MOE = MI->operands_end(); - MOI != MOE; ++MOI) { - if (!MOI->isReg() || !MOI->isDef()) - continue; - unsigned reg = MOI->getReg(); - // Assume physical registers are used. - if (Register::isPhysicalRegister(reg)) { - Uses++; - continue; - } - if (MRI->use_begin(reg) != MRI->use_end()) - Uses++; - } - if (!Uses) { - MI++->eraseFromParent(); - continue; - } - ++MI; - } -} - -bool HexagonPostIncOpt::isPostIncInsn(MachineInstr &MI) const { - // Predicated post-increments are not yet handled. (ISel is not generating - // them yet). Circular buffer instructions should not be handled. - return (HII->isPostIncWithImmOffset(MI) && !HII->isPredicated(MI) && - !HII->isCircBufferInstr(MI)); -} - -/// For instructions with a base and offset, return true if the new Offset -/// is a valid value with the correct alignment. -bool HexagonPostIncOpt::isValidOffset(const MachineInstr &MI, - int64_t Offset) const { - if (!HII->isValidOffset(MI.getOpcode(), Offset, TRI, false)) - return false; - unsigned AlignMask = HII->getMemAccessSize(MI) - 1; - return (Offset & AlignMask) == 0; -} - -bool HexagonPostIncOpt::isValidPostIncValue(const MachineInstr &MI, - int IncVal) const { - unsigned AlignMask = HII->getMemAccessSize(MI) - 1; - if ((IncVal & AlignMask) != 0) - return false; - - // Number of total bits in the instruction used to encode Inc value. - unsigned IncBits = 4; - // For HVX instructions, the offset is 3. 
- if (HexagonII::isCVI(MI.getDesc())) - IncBits = 3; - - IncBits += Log2_32(HII->getMemAccessSize(MI)); - if (HII->getMemAccessSize(MI) > 8) - IncBits = 16; - - int MinValidVal = -1U << (IncBits - 1); - int MaxValidVal = ~(-1U << (IncBits - 1)); - return (IncVal >= MinValidVal && IncVal <= MaxValidVal); -} - -void HexagonPostIncOptSchedDAG::schedule() { - AliasAnalysis *AA = &Pass.getAnalysis().getAAResults(); - buildSchedGraph(AA); -} - -// Replace post-increment operations with base+offset counterpart. -void HexagonPostIncOpt::replacePostIncWithBaseOffset( - MachineBasicBlock &MBB) const { - LLVM_DEBUG(dbgs() << "#Replacing post-increment instructions with " - "base+offset counterparts.\n"); - - SmallVector MIList; - for (auto &MI : make_range(MBB.getFirstNonPHI(), MBB.getFirstTerminator())) { - // Check for eligible post-inc candidates. - if (!isPostIncInsn(MI)) - continue; - MIList.push_back(&MI); - } - - for (auto MI : MIList) - replacePostIncWithBaseOffset(*MI); - - LLVM_DEBUG(dbgs() << "#Done with replacing post-increment instructions.\n"); -} - -void HexagonPostIncOpt::replacePostIncWithBaseOffset(MachineInstr &MI) const { - short NewOpcode = HII->changeAddrMode_pi_io(MI.getOpcode()); - if (NewOpcode < 0) - return; - - unsigned BasePos = 0, OffsetPos = 0; - if (!HII->getBaseAndOffsetPosition(MI, BasePos, OffsetPos)) - return; - const MachineOperand &PostIncOffset = MI.getOperand(OffsetPos); - const MachineOperand &PostIncBase = MI.getOperand(BasePos); - - MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); - MachineOperand *PostIncDest; - MachineInstrBuilder MIB; - if (MI.mayLoad()) { - PostIncDest = &MI.getOperand(1); - const MachineOperand &LDValue = MI.getOperand(0); - MIB = BuildMI(MBB, MI, DL, HII->get(NewOpcode)); - MIB.add(LDValue).add(PostIncBase).addImm(0); - } else { - PostIncDest = &MI.getOperand(0); - const MachineOperand &STValue = MI.getOperand(3); - MIB = BuildMI(MBB, MI, DL, HII->get(NewOpcode)); - 
MIB.add(PostIncBase).addImm(0).add(STValue); - } - - // Transfer memoperands. - MIB->cloneMemRefs(*MBB.getParent(), MI); - - // Create an add instruction for the post-inc addition of offset. - MachineInstrBuilder MIBA = BuildMI(MBB, MI, DL, HII->get(Hexagon::A2_addi)); - MIBA.add(*PostIncDest).add(PostIncBase).add(PostIncOffset); - - LLVM_DEBUG({ - dbgs() << "\n"; - MI.dump(); - dbgs() << "\tis tranformed to \n"; - MIB->dump(); - MIBA->dump(); - dbgs() << "\n\n"; - }); - - MI.eraseFromParent(); -} - -void HexagonPostIncOpt::generatePostInc(MachineBasicBlock &MBB) { - LLVM_DEBUG(dbgs() << "# Generate Post-inc and update uses if needed.\n"); - MachineBasicBlock::iterator MII = MBB.getFirstNonPHI(); - MachineBasicBlock::iterator MIE = MBB.instr_begin(); - bool isOK = true; - while (MII != MIE) { - MachineInstr *Phi = &*std::prev(MII); - MII = std::prev(MII); - unsigned LoopVal = getLoopPhiReg(Phi, &MBB); - if (LoopVal == UINT_MAX) - continue; - MachineInstr *LoopInst = MRI->getVRegDef(LoopVal); - if (!isAddWithImmValue(*LoopInst)) - continue; - - if (LoopInst->getOpcode() != Hexagon::A2_addi) - continue; - - unsigned AddReg = LoopInst->getOperand(1).getReg(); - int64_t AddImm = LoopInst->getOperand(2).getImm(); - SmallVector UseList; - MachineInstr *PostIncCandidate = nullptr; - - // Find the probable candidates for Post-increment instruction. 
- SmallVector CandList; - for (auto &MO : make_range(MRI->use_begin(AddReg), MRI->use_end())) { - MachineInstr *UseMI = MO.getParent(); - - if (UseMI == LoopInst) - continue; - - if (!dominates(UseMI, LoopInst)) { - isOK = false; - break; - } - const MachineOperand *BaseOp = nullptr; - int64_t Offset; - bool OffsetIsScalable; - if (!HII->isBaseImmOffset(*UseMI) || - !HII->getMemOperandWithOffset(*UseMI, BaseOp, Offset, - OffsetIsScalable, TRI)) { - isOK = false; - break; - } - int64_t NewOffset = Offset - AddImm; - if (!isValidOffset(*UseMI, NewOffset) || !BaseOp->isReg() || - BaseOp->getReg() != AddReg) { - isOK = false; - break; - } - if (OffsetIsScalable) { - isOK = false; - break; - } - if (Offset == 0) { - // If you have stores in the chain, make sure they are in the beginning - // of the list. Eg: LD, LD, ST, ST will end up as LD, LD, PostInc_ST, - // ST. - if (UseMI->mayStore() && PreferPostIncStore) - CandList.insert(CandList.begin(), UseMI); - else - CandList.push_back(UseMI); - continue; - } - UseList.push_back(UseMI); - } - - if (!isOK) - continue; - - for (auto MI : CandList) { - if (!PostIncCandidate) - PostIncCandidate = MI; - // Push the rest of the list for updation. - else - UseList.push_back(MI); - } - - // If a candidate is found, replace it with the post-inc instruction. - // Also, adjust offset for other uses as needed. - if (!PostIncCandidate || !canReplaceWithPostInc(PostIncCandidate, LoopInst)) - continue; - - // Logic to determine what the base register to be. - // There are two choices: - // 1. New address register after we updated the post-increment candidate. - // v2,v3 = post_load v1, 4 - // v3 is the choice here. - // 2. The base register we used in post-increment candidate. - // v2,v3 = post_load v1, 4 - // v1 is the choice here. - // Use v3 if there is a memory dependence between post-inc instruction and - // any other instruction in the chain. 
- // FIXME: We can do some complex DAG analysis based off height and depth and - // selectively update other instructions in the chain. Use v3 if there are - // more instructions in the chain, otherwise we will end up increasing the - // height of the DAG resulting in more spills. By default we have a - // threshold controlled by the option "post-inc-chain-threshold" which is - // set to 4. v1 is preferred as we can packetize two memory operations in a - // single packet in scalar core. But it heavily depends on the structure of - // DAG. - bool UpdateBaseToNew = false; - - // Do not bother to build a DAG and analyze if the Use list is empty. - if (!UseList.empty()) { - MachineFunction *MF = MBB.getParent(); - // Setup the Post-inc schedule DAG. - HexagonPostIncOptSchedDAG PIDAG(*this, *MF, MLI); - initPISchedDAG(PIDAG, MBB); - SUnit *SU = PIDAG.getSUnit(PostIncCandidate); - if (hasMemoryDependency(SU, UseList) || - UseList.size() >= PostIncChainThreshold) - UpdateBaseToNew = true; - } - - if (UpdateBaseToNew) { - LLVM_DEBUG(dbgs() << "The heuristic determines to update the uses of the " - "base register of post-increment\n"); - for (auto UseMI : UseList) { - if (!dominates(PostIncCandidate, UseMI)) - continue; - unsigned BasePos, OffsetPos; - if (HII->getBaseAndOffsetPosition(*UseMI, BasePos, OffsetPos)) { - // New offset has already been validated; no need to do it again. 
- LLVM_DEBUG({ - UseMI->dump(); - dbgs() << "\t is transformed to \n"; - }); - int64_t NewOffset = UseMI->getOperand(OffsetPos).getImm() - AddImm; - UseMI->getOperand(OffsetPos).setImm(NewOffset); - UseMI->getOperand(BasePos).setReg(LoopVal); - LLVM_DEBUG(UseMI->dump()); - } - } - } - replaceWithPostInc(PostIncCandidate, LoopInst); - } - LLVM_DEBUG(dbgs() << "# End of generation of Post-inc.\n"); -} - -bool HexagonPostIncOpt::canReplaceWithPostInc(MachineInstr *MI, - MachineInstr *AddMI) const { - if (HII->changeAddrMode_io_pi(MI->getOpcode()) < 0) - return false; - assert(AddMI->getOpcode() == Hexagon::A2_addi); - return isValidPostIncValue(*MI, AddMI->getOperand(2).getImm()); -} - -void HexagonPostIncOpt::replaceWithPostInc(MachineInstr *MI, - MachineInstr *AddMI) const { - short NewOpcode = HII->changeAddrMode_io_pi(MI->getOpcode()); - assert(NewOpcode >= 0 && - "Couldn't change base offset to post-increment form"); - - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - const MachineOperand &IncDest = AddMI->getOperand(0); - const MachineOperand &IncBase = AddMI->getOperand(1); - const MachineOperand &IncValue = AddMI->getOperand(2); - MachineInstrBuilder MIB; - LLVM_DEBUG({ - dbgs() << "\n\n"; - MI->dump(); - dbgs() << "\t is tranformed to post-inc form of \n"; - }); - - if (MI->mayLoad()) { - const MachineOperand &LDValue = MI->getOperand(0); - MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode)); - MIB.add(LDValue).add(IncDest).add(IncBase).add(IncValue); - } else { - const MachineOperand &STValue = MI->getOperand(2); - MIB = BuildMI(MBB, *MI, DL, HII->get(NewOpcode)); - MIB.add(IncDest).add(IncBase).add(IncValue).add(STValue); - } - - // Transfer memoperands. 
- MIB->cloneMemRefs(*MBB.getParent(), *MI); - - LLVM_DEBUG({ - MIB->dump(); - dbgs() << "As a result this add instruction is erased.\n"; - AddMI->dump(); - }); - - MI->eraseFromParent(); - AddMI->eraseFromParent(); -} - -bool HexagonPostIncOpt::translatePostIncsInLoop(MachineBasicBlock &MBB) { - // Algorithm: - // 1. Replace all the post-inc instructions with Base+Offset instruction and - // an add instruction in this block. - // 2. Fold all the adds in to respective uses. - // 3. Generate post-increment instructions and update the uses of the base - // register if needed based on constraints. - - replacePostIncWithBaseOffset(MBB); - foldAdds(MBB); - generatePostInc(MBB); - return true; -} - -bool HexagonPostIncOpt::runOnMachineFunction(MachineFunction &MF) { - - // Skip pass if requested. - if (skipFunction(MF.getFunction())) - return false; - - // Get Target Information. - MLI = &getAnalysis(); - HST = &MF.getSubtarget(); - TRI = HST->getRegisterInfo(); - MRI = &MF.getRegInfo(); - HII = HST->getInstrInfo(); - - // Skip this pass for TinyCore. - // Tiny core allwos partial post increment operations - This constraint can - // be imposed inside the pass. In a chain of post-increments, the first can - // be post-increment, rest can be adjusted to base+offset (these are - // inexpensive in most of the cases); - if (HST->isTinyCore()) - return false; - - LLVM_DEBUG({ - dbgs() << "Begin: Hexagon Post-Inc-Opt Pass.\n"; - dbgs() << "Function: " << MF.getName() << "\n"; - }); - bool Change = false; - std::vector MLBB; - for (auto &BB : MF) { - // Check if this Basic Block belongs to any loop. - auto *LI = MLI->getLoopFor(&BB); - // We only deal with inner-most loops that has one block. - if (LI && LI->getBlocks().size() == 1) { - MachineBasicBlock *MBB = LI->getHeader(); - // Do not traverse blocks that are already visited. 
- if (std::find(MLBB.begin(), MLBB.end(), MBB) != MLBB.end()) - continue; - - MLBB.push_back(MBB); - - LLVM_DEBUG(dbgs() << "\n\t Basic Block: " << MBB->getName() << "\n"); - Change |= translatePostIncsInLoop(*MBB); - } - } - LLVM_DEBUG(dbgs() << "End: Hexagon Post-Inc-Opt Pass\n"); - return Change; -} - -FunctionPass *llvm::createHexagonPostIncOpt() { - return new HexagonPostIncOpt(); -} diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 170684276ac3c..7d77286339399 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -129,10 +129,6 @@ static cl::opt EnableInstSimplify("hexagon-instsimplify", cl::Hidden, cl::init(true), cl::desc("Enable instsimplify")); -static cl::opt DisableHexagonPostIncOpt( - "hexagon-postinc-opt", cl::Hidden, - cl::desc("Disable Hexagon post-increment optimization")); - /// HexagonTargetMachineModule - Note that this is used on hosts that /// cannot link in a library unless there are references into the /// library. 
In particular, it seems that it is not possible to get @@ -171,7 +167,6 @@ namespace llvm { void initializeHexagonNewValueJumpPass(PassRegistry&); void initializeHexagonOptAddrModePass(PassRegistry&); void initializeHexagonPacketizerPass(PassRegistry&); - void initializeHexagonPostIncOptPass(PassRegistry &); void initializeHexagonRDFOptPass(PassRegistry&); void initializeHexagonSplitDoubleRegsPass(PassRegistry&); void initializeHexagonTfrCleanupPass(PassRegistry &); @@ -205,7 +200,6 @@ namespace llvm { FunctionPass *createHexagonOptimizeSZextends(); FunctionPass *createHexagonPacketizer(bool Minimal); FunctionPass *createHexagonPeephole(); - FunctionPass *createHexagonPostIncOpt(); FunctionPass *createHexagonRDFOpt(); FunctionPass *createHexagonSplitConst32AndConst64(); FunctionPass *createHexagonSplitDoubleRegs(); @@ -237,7 +231,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() { initializeHexagonNewValueJumpPass(PR); initializeHexagonOptAddrModePass(PR); initializeHexagonPacketizerPass(PR); - initializeHexagonPostIncOptPass(PR); initializeHexagonRDFOptPass(PR); initializeHexagonSplitDoubleRegsPass(PR); initializeHexagonVectorCombineLegacyPass(PR); @@ -266,7 +259,6 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, TLOF(std::make_unique()) { initializeHexagonExpandCondsetsPass(*PassRegistry::getPassRegistry()); initializeHexagonTfrCleanupPass(*PassRegistry::getPassRegistry()); - initializeHexagonPostIncOptPass(*PassRegistry::getPassRegistry()); initAsmInfo(); } @@ -443,11 +435,6 @@ void HexagonPassConfig::addPreRegAlloc() { if (!DisableHardwareLoops) addPass(createHexagonHardwareLoops()); } - - if (TM->getOptLevel() >= CodeGenOptLevel::Aggressive) - if (!DisableHexagonPostIncOpt) - addPass(createHexagonPostIncOpt()); - if (TM->getOptLevel() >= CodeGenOptLevel::Default) addPass(&MachinePipelinerID); } diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h 
b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h index 98404121bda02..ca982696b0600 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h @@ -18,7 +18,6 @@ #include "HexagonDepITypes.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" -#include "llvm/MC/MCInstrDesc.h" namespace llvm { @@ -49,7 +48,7 @@ namespace HexagonII { // MCInstrDesc TSFlags // *** Must match HexagonInstrFormat*.td *** - enum HexagonTSFlagsVal { + enum { // This 7-bit field describes the insn type. TypePos = 0, TypeMask = 0x7f, @@ -174,11 +173,6 @@ namespace HexagonII { hasUnaryRestrictionMask = 0x1, }; - inline unsigned getTSFlags(const MCInstrDesc &MID, HexagonTSFlagsVal Pos, - unsigned Mask) { - return (MID.TSFlags >> Pos) & Mask; - } - // *** The code above must match HexagonInstrFormat*.td *** // // Hexagon specific MO operand flag mask. @@ -281,10 +275,6 @@ namespace HexagonII { INST_ICLASS_ALU32_3 = 0xf0000000 }; - inline bool isCVI(const MCInstrDesc &MID) { - return getTSFlags(MID, isCVIPos, isCVIMask) != 0; - } - LLVM_ATTRIBUTE_UNUSED static unsigned getMemAccessSizeInBytes(MemAccessSize S) { switch (S) { diff --git a/llvm/test/CodeGen/Hexagon/post-inc-vec.mir b/llvm/test/CodeGen/Hexagon/post-inc-vec.mir deleted file mode 100644 index 3788dc3fecd89..0000000000000 --- a/llvm/test/CodeGen/Hexagon/post-inc-vec.mir +++ /dev/null @@ -1,413 +0,0 @@ -#RUN: llc -march=hexagon -run-pass hexagon-postincopt %s -o - | FileCheck %s - -# Test that we do not generate two post-increment vector load/store -# in the loop. 
-# CHECK: J2_loop0r -# CHECK: V6_vS32b_pi -# CHECK-NOT: = V6_vL32b_pi -# CHECK: V6_vL32b_ai -# CHECK: V6_vL32b_ai -# CHECK: V6_vS32b_ai -# CHECK: ENDLOOP0 - ---- | - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) - declare <1024 x i1> @llvm.hexagon.V6.pred.scalar2v2.128B(i32) #0 - - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write) - declare void @llvm.hexagon.V6.vS32b.qpred.ai.128B(<1024 x i1>, ptr, <32 x i32>) #1 - - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) - declare <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32>, <32 x i32>) #0 - - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) - declare <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32>) #0 - - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) - declare <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32>) #0 - - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) - declare <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32) #0 - - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) - declare <32 x i32> @llvm.hexagon.V6.vasrhbsat.128B(<32 x i32>, <32 x i32>, i32) #0 - - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) - declare <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32>, <32 x i32>) #0 - - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) - declare <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32>, i32) #0 - - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) - declare <32 x i32> @llvm.hexagon.V6.vavgh.128B(<32 x i32>, <32 x i32>) #0 - - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) - declare <64 x i32> @llvm.hexagon.V6.vmpabusv.128B(<64 x i32>, <64 x i32>) #0 - - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) - declare void @llvm.assume(i1 noundef) #2 - - ; Function 
Attrs: noinline nounwind - define void @blah(i32 %0, i32 %1, ptr noalias %2, ptr noalias nocapture readonly %3, ptr noalias nocapture readonly %4, ptr nocapture readnone %5, ptr nocapture readnone %6, i32 %7, i32 %8, ptr nocapture readonly %9, ptr nocapture readonly %10) local_unnamed_addr #3 { - entry: - %11 = call i32 @llvm.hexagon.S2.extractu(i32 %0, i32 23, i32 9) - %12 = shl i32 %11, 7 - %mul16.i = mul nsw i32 %12, %1 - %add.i = add nsw i32 %1, 1 - %mul17.i = mul nsw i32 %add.i, %12 - %cmp184.i = icmp slt i32 %mul16.i, %mul17.i - br i1 %cmp184.i, label %for.body.lr.ph.i, label %for.end.i - - for.body.lr.ph.i: ; preds = %entry - %13 = tail call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> , <32 x i32> ) #5 - %14 = tail call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> zeroinitializer, <32 x i32> zeroinitializer) #5 - %15 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 32) #5 - %cgep = getelementptr i8, ptr %2, i32 %mul16.i - %cgep8 = getelementptr i8, ptr %4, i32 %mul16.i - %cgep9 = getelementptr i8, ptr %3, i32 %mul16.i - br label %for.body.i - - for.body.i: ; preds = %for.body.i, %for.body.lr.ph.i - %lsr.iv6 = phi ptr [ %cgep12, %for.body.i ], [ %cgep9, %for.body.lr.ph.i ] - %lsr.iv3 = phi ptr [ %cgep11, %for.body.i ], [ %cgep8, %for.body.lr.ph.i ] - %lsr.iv = phi ptr [ %cgep10, %for.body.i ], [ %cgep, %for.body.lr.ph.i ] - %elemIdx.05.i = phi i32 [ %mul16.i, %for.body.lr.ph.i ], [ %add19.i, %for.body.i ] - %16 = load <32 x i32>, ptr %lsr.iv6, align 128 - %17 = load <32 x i32>, ptr %lsr.iv3, align 128 - %18 = tail call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> %17, <32 x i32> %16) #5 - %19 = tail call <64 x i32> @llvm.hexagon.V6.vmpabusv.128B(<64 x i32> %13, <64 x i32> %18) #5 - %20 = tail call <64 x i32> @llvm.hexagon.V6.vmpabusv.128B(<64 x i32> %14, <64 x i32> %18) #5 - %21 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %19) #5 - %22 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %20) #5 - %23 = 
tail call <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32> %22, i32 7) #5 - %24 = tail call <32 x i32> @llvm.hexagon.V6.vavgh.128B(<32 x i32> %21, <32 x i32> %23) #5 - %25 = tail call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %19) #5 - %26 = tail call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %20) #5 - %27 = tail call <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32> %26, i32 7) #5 - %28 = tail call <32 x i32> @llvm.hexagon.V6.vavgh.128B(<32 x i32> %25, <32 x i32> %27) #5 - %29 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %24, <32 x i32> %15) #5 - %30 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %28, <32 x i32> %15) #5 - %31 = tail call <32 x i32> @llvm.hexagon.V6.vasrhbsat.128B(<32 x i32> %29, <32 x i32> %30, i32 4) #5 - store <32 x i32> %31, ptr %lsr.iv, align 128 - %add19.i = add nsw i32 %elemIdx.05.i, 128 - %cmp18.i = icmp slt i32 %add19.i, %mul17.i - %cgep10 = getelementptr i8, ptr %lsr.iv, i32 128 - %cgep11 = getelementptr i8, ptr %lsr.iv3, i32 128 - %cgep12 = getelementptr i8, ptr %lsr.iv6, i32 128 - br i1 %cmp18.i, label %for.body.i, label %for.end.i - - for.end.i: ; preds = %for.body.i, %entry - ret void - } - - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) - declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #4 - - ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) - declare i32 @llvm.hexagon.S2.extractu(i32, i32 immarg, i32 immarg) #0 - - attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } - attributes #1 = { nocallback nofree nosync nounwind willreturn memory(write) } - attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) } - attributes #3 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" 
"no-nans-fp-math"="false" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+hvx-length128b,+hvxv68,+v68,-long-calls,-small-data" "unsafe-fp-math"="false" "use-soft-float"="false" } - attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } - attributes #5 = { nounwind } - -... ---- -name: blah -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHCatchret: false -hasEHScopes: false -hasEHFunclets: false -isOutlined: false -debugInstrRef: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: intregs, preferred-register: '' } - - { id: 1, class: intregs, preferred-register: '' } - - { id: 2, class: hvxwr, preferred-register: '' } - - { id: 3, class: hvxwr, preferred-register: '' } - - { id: 4, class: hvxvr, preferred-register: '' } - - { id: 5, class: intregs, preferred-register: '' } - - { id: 6, class: intregs, preferred-register: '' } - - { id: 7, class: intregs, preferred-register: '' } - - { id: 8, class: intregs, preferred-register: '' } - - { id: 9, class: intregs, preferred-register: '' } - - { id: 10, class: intregs, preferred-register: '' } - - { id: 11, class: intregs, preferred-register: '' } - - { id: 12, class: intregs, preferred-register: '' } - - { id: 13, class: intregs, preferred-register: '' } - - { id: 14, class: intregs, preferred-register: '' } - - { id: 15, class: intregs, preferred-register: '' } - - { id: 16, class: intregs, preferred-register: '' } - - { id: 17, class: intregs, preferred-register: '' } - - { id: 18, class: intregs, preferred-register: '' } - - { id: 19, class: intregs, preferred-register: '' } - - { id: 20, class: intregs, preferred-register: '' } - - { id: 21, class: intregs, preferred-register: '' } - - 
{ id: 22, class: intregs, preferred-register: '' } - - { id: 23, class: intregs, preferred-register: '' } - - { id: 24, class: intregs, preferred-register: '' } - - { id: 25, class: predregs, preferred-register: '' } - - { id: 26, class: predregs, preferred-register: '' } - - { id: 27, class: hvxvr, preferred-register: '' } - - { id: 28, class: intregs, preferred-register: '' } - - { id: 29, class: hvxvr, preferred-register: '' } - - { id: 30, class: intregs, preferred-register: '' } - - { id: 31, class: hvxvr, preferred-register: '' } - - { id: 32, class: intregs, preferred-register: '' } - - { id: 33, class: hvxvr, preferred-register: '' } - - { id: 34, class: hvxvr, preferred-register: '' } - - { id: 35, class: hvxwr, preferred-register: '' } - - { id: 36, class: hvxwr, preferred-register: '' } - - { id: 37, class: hvxwr, preferred-register: '' } - - { id: 38, class: hvxvr, preferred-register: '' } - - { id: 39, class: hvxvr, preferred-register: '' } - - { id: 40, class: intregs, preferred-register: '' } - - { id: 41, class: hvxvr, preferred-register: '' } - - { id: 42, class: hvxvr, preferred-register: '' } - - { id: 43, class: hvxvr, preferred-register: '' } - - { id: 44, class: hvxvr, preferred-register: '' } - - { id: 45, class: hvxvr, preferred-register: '' } - - { id: 46, class: hvxvr, preferred-register: '' } - - { id: 47, class: hvxvr, preferred-register: '' } - - { id: 48, class: hvxvr, preferred-register: '' } - - { id: 49, class: intregslow8, preferred-register: '' } - - { id: 50, class: hvxvr, preferred-register: '' } - - { id: 51, class: predregs, preferred-register: '' } - - { id: 52, class: intregs, preferred-register: '' } - - { id: 53, class: intregs, preferred-register: '' } - - { id: 54, class: intregs, preferred-register: '' } - - { id: 55, class: intregs, preferred-register: '' } - - { id: 56, class: intregs, preferred-register: '' } - - { id: 57, class: intregs, preferred-register: '' } - - { id: 58, class: intregs, preferred-register: '' } 
- - { id: 59, class: intregs, preferred-register: '' } - - { id: 60, class: intregs, preferred-register: '' } - - { id: 61, class: hvxvr, preferred-register: '' } - - { id: 62, class: intregs, preferred-register: '' } - - { id: 63, class: hvxvr, preferred-register: '' } - - { id: 64, class: intregs, preferred-register: '' } - - { id: 65, class: hvxwr, preferred-register: '' } - - { id: 66, class: hvxwr, preferred-register: '' } - - { id: 67, class: hvxwr, preferred-register: '' } - - { id: 68, class: hvxvr, preferred-register: '' } - - { id: 69, class: hvxvr, preferred-register: '' } - - { id: 70, class: hvxvr, preferred-register: '' } - - { id: 71, class: hvxvr, preferred-register: '' } - - { id: 72, class: hvxvr, preferred-register: '' } - - { id: 73, class: hvxvr, preferred-register: '' } - - { id: 74, class: hvxvr, preferred-register: '' } - - { id: 75, class: intregs, preferred-register: '' } - - { id: 76, class: intregs, preferred-register: '' } - - { id: 77, class: intregs, preferred-register: '' } - - { id: 78, class: intregs, preferred-register: '' } - - { id: 79, class: hvxvr, preferred-register: '' } - - { id: 80, class: intregs, preferred-register: '' } - - { id: 81, class: hvxvr, preferred-register: '' } - - { id: 82, class: intregs, preferred-register: '' } - - { id: 83, class: hvxwr, preferred-register: '' } - - { id: 84, class: hvxwr, preferred-register: '' } - - { id: 85, class: hvxwr, preferred-register: '' } - - { id: 86, class: hvxvr, preferred-register: '' } - - { id: 87, class: hvxvr, preferred-register: '' } - - { id: 88, class: hvxvr, preferred-register: '' } - - { id: 89, class: hvxvr, preferred-register: '' } - - { id: 90, class: hvxvr, preferred-register: '' } - - { id: 91, class: hvxvr, preferred-register: '' } - - { id: 92, class: hvxvr, preferred-register: '' } - - { id: 93, class: intregs, preferred-register: '' } - - { id: 94, class: intregs, preferred-register: '' } - - { id: 95, class: intregs, preferred-register: '' } - - { id: 
96, class: intregs, preferred-register: '' } - - { id: 97, class: predregs, preferred-register: '' } - - { id: 98, class: predregs, preferred-register: '' } -liveins: - - { reg: '$r0', virtual-reg: '%16' } - - { reg: '$r1', virtual-reg: '%17' } - - { reg: '$r2', virtual-reg: '%18' } - - { reg: '$r3', virtual-reg: '%19' } - - { reg: '$r4', virtual-reg: '%20' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 4 - adjustsStack: false - hasCalls: false - stackProtector: '' - functionContext: '' - maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: '' - restorePoint: '' -fixedStack: - - { id: 0, type: default, offset: 24, size: 4, alignment: 8, stack-id: default, - isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 1, type: default, offset: 20, size: 4, alignment: 4, stack-id: default, - isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 2, type: default, offset: 16, size: 4, alignment: 8, stack-id: default, - isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 3, type: default, offset: 12, size: 4, alignment: 4, stack-id: default, - isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - - { id: 4, type: default, offset: 8, size: 4, alignment: 8, stack-id: default, - isImmutable: true, isAliased: 
false, callee-saved-register: '', callee-saved-restored: true, - debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } -stack: [] -entry_values: [] -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: {} -body: | - bb.0.entry: - successors: %bb.1(0x40000000), %bb.3(0x40000000) - liveins: $r0, $r1, $r2, $r3, $r4 - - %20:intregs = COPY $r4 - %19:intregs = COPY $r3 - %18:intregs = COPY $r2 - %17:intregs = COPY $r1 - %16:intregs = COPY $r0 - %22:intregs = S2_extractu %16, 23, 9 - %23:intregs = S2_asl_i_r %22, 7 - %0:intregs = nsw M2_mpyi %23, %17 - %24:intregs = nsw A2_addi %17, 1 - %1:intregs = nsw M2_mpyi %24, %23 - %25:predregs = C2_cmpgt %1, %0 - J2_jumpf %25, %bb.3, implicit-def dead $pc - J2_jump %bb.1, implicit-def dead $pc - - bb.1.for.body.lr.ph.i: - successors: %bb.4(0x40000000), %bb.6(0x40000000) - - %28:intregs = A2_tfrsi 269488144 - %27:hvxvr = V6_lvsplatw %28 - %30:intregs = A2_tfrsi 1077952576 - %29:hvxvr = V6_lvsplatw %30 - %2:hvxwr = REG_SEQUENCE %29, %subreg.vsub_hi, %27, %subreg.vsub_lo - %31:hvxvr = V6_vd0 - %3:hvxwr = REG_SEQUENCE %31, %subreg.vsub_hi, %31, %subreg.vsub_lo - %32:intregs = A2_tfrsi 32 - %4:hvxvr = V6_lvsplath %32 - %5:intregs = A2_add %18, %0 - %6:intregs = A2_add %20, %0 - %7:intregs = A2_add %19, %0 - %40:intregs = A2_tfrsi 7 - %49:intregslow8 = A2_tfrsi 4 - %52:intregs = A2_sub %1, %0 - %53:intregs = A2_addi %52, 127 - %54:intregs = S2_lsr_i_r %53, 7 - %55:intregs = COPY %54 - %56:intregs = S2_lsr_i_r %55, 1 - %57:intregs = A2_andir %55, 1 - %97:predregs = C2_cmpgtui %56, 0 - J2_jumpf %97, %bb.6, implicit-def $pc - J2_jump %bb.4, implicit-def $pc - - bb.4: - successors: %bb.5(0x80000000) - - J2_loop0r %bb.5, %56, implicit-def $lc0, implicit-def $sa0, implicit-def $usr - J2_jump %bb.5, implicit-def $pc - - bb.5: - successors: %bb.5(0x40000000), %bb.6(0x40000000) - - %58:intregs = PHI %7, %bb.4, %80, %bb.5 - %59:intregs = PHI %6, %bb.4, %82, %bb.5 - %60:intregs = PHI %5, %bb.4, 
%93, %bb.5 - %61:hvxvr, %62:intregs = V6_vL32b_pi %58, 128 :: (load (s1024) from %ir.lsr.iv6) - %63:hvxvr, %64:intregs = V6_vL32b_pi %59, 128 :: (load (s1024) from %ir.lsr.iv3) - %65:hvxwr = REG_SEQUENCE %63, %subreg.vsub_hi, %61, %subreg.vsub_lo - %66:hvxwr = V6_vmpabusv %2, %65 - %67:hvxwr = V6_vmpabusv %3, %65 - %68:hvxvr = V6_vasrh %67.vsub_hi, %40 - %69:hvxvr = V6_vavgh %66.vsub_hi, %68 - %70:hvxvr = V6_vasrh %67.vsub_lo, %40 - %71:hvxvr = V6_vavgh %66.vsub_lo, %70 - %72:hvxvr = V6_vaddhsat %69, %4 - %73:hvxvr = V6_vaddhsat %71, %4 - %74:hvxvr = V6_vasrhbsat %72, %73, %49 - %75:intregs = V6_vS32b_pi %60, 128, %74 :: (store (s1024) into %ir.lsr.iv) - %79:hvxvr, %80:intregs = V6_vL32b_pi %62, 128 :: (load (s1024) from %ir.lsr.iv6 + 128) - %81:hvxvr, %82:intregs = V6_vL32b_pi %64, 128 :: (load (s1024) from %ir.lsr.iv3 + 128) - %83:hvxwr = REG_SEQUENCE %81, %subreg.vsub_hi, %79, %subreg.vsub_lo - %84:hvxwr = V6_vmpabusv %2, %83 - %85:hvxwr = V6_vmpabusv %3, %83 - %86:hvxvr = V6_vasrh %85.vsub_hi, %40 - %87:hvxvr = V6_vavgh %84.vsub_hi, %86 - %88:hvxvr = V6_vasrh %85.vsub_lo, %40 - %89:hvxvr = V6_vavgh %84.vsub_lo, %88 - %90:hvxvr = V6_vaddhsat %87, %4 - %91:hvxvr = V6_vaddhsat %89, %4 - %92:hvxvr = V6_vasrhbsat %90, %91, %49 - %93:intregs = V6_vS32b_pi %75, 128, %92 :: (store (s1024) into %ir.lsr.iv + 128) - ENDLOOP0 %bb.5, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 - J2_jump %bb.6, implicit-def $pc - - bb.6: - successors: %bb.7(0x40000000), %bb.8(0x40000000) - - %94:intregs = PHI %7, %bb.1, %80, %bb.5 - %95:intregs = PHI %6, %bb.1, %82, %bb.5 - %96:intregs = PHI %5, %bb.1, %93, %bb.5 - %98:predregs = C2_cmpgtui %57, 0 - J2_jumpf %98, %bb.8, implicit-def $pc - J2_jump %bb.7, implicit-def $pc - - bb.7: - successors: %bb.2(0x80000000) - - J2_jump %bb.2, implicit-def $pc - - bb.2.for.body.i (machine-block-address-taken): - successors: %bb.8(0x04000000) - - %33:hvxvr, %15:intregs = V6_vL32b_pi %94, 128 :: (load (s1024) from %ir.lsr.iv6) - 
%34:hvxvr, %14:intregs = V6_vL32b_pi %95, 128 :: (load (s1024) from %ir.lsr.iv3) - %35:hvxwr = REG_SEQUENCE %34, %subreg.vsub_hi, %33, %subreg.vsub_lo - %36:hvxwr = V6_vmpabusv %2, %35 - %37:hvxwr = V6_vmpabusv %3, %35 - %41:hvxvr = V6_vasrh %37.vsub_hi, %40 - %42:hvxvr = V6_vavgh %36.vsub_hi, %41 - %45:hvxvr = V6_vasrh %37.vsub_lo, %40 - %46:hvxvr = V6_vavgh %36.vsub_lo, %45 - %47:hvxvr = V6_vaddhsat %42, %4 - %48:hvxvr = V6_vaddhsat %46, %4 - %50:hvxvr = V6_vasrhbsat %47, %48, %49 - %13:intregs = V6_vS32b_pi %96, 128, %50 :: (store (s1024) into %ir.lsr.iv) - J2_jump %bb.8, implicit-def $pc - - bb.8: - successors: %bb.3(0x80000000) - - J2_jump %bb.3, implicit-def $pc - - bb.3.for.end.i: - PS_jmpret $r31, implicit-def dead $pc - -... diff --git a/llvm/test/CodeGen/Hexagon/post_inc_store.mir b/llvm/test/CodeGen/Hexagon/post_inc_store.mir deleted file mode 100644 index 3e3f51ac9114d..0000000000000 --- a/llvm/test/CodeGen/Hexagon/post_inc_store.mir +++ /dev/null @@ -1,168 +0,0 @@ -#RUN: llc -march=hexagon -run-pass hexagon-postincopt %s -o - | FileCheck %s - -# Test that we convert a post-inc load and store to a regular load and post-inc -# store. 
-# CHECK: J2_loop0r -# CHECK-NOT: = L2_loadruh_pi -# CHECK: L2_loadruh_io -# CHECK: S2_storerh_pi -# CHECK: ENDLOOP0 - ---- | - ; Function Attrs: nofree norecurse nounwind - define dso_local void @blam(i32 %arg, ptr nocapture %arg1, i16 signext %arg2) local_unnamed_addr #0 { - bb: - %icmp = icmp eq i32 %arg, 0 - br i1 %icmp, label %bb13, label %bb3 - - bb3: ; preds = %bb, %bb10 - %phi = phi i32 [ %add11, %bb10 ], [ 0, %bb ] - %mul = mul i32 %phi, %arg - %cgep = getelementptr i16, ptr %arg1, i32 %mul - br label %bb4 - - bb4: ; preds = %bb4, %bb3 - %lsr.iv = phi i32 [ %lsr.iv.next, %bb4 ], [ %arg, %bb3 ] - %phi5 = phi ptr [ %cgep, %bb3 ], [ %cgep1, %bb4 ] - %load = load i16, ptr %phi5, align 2 - %add = add i16 %load, %arg2 - store i16 %add, ptr %phi5, align 2 - %lsr.iv.next = add i32 %lsr.iv, -1 - %icmp8 = icmp eq i32 %lsr.iv.next, 0 - %cgep1 = getelementptr i16, ptr %phi5, i32 1 - br i1 %icmp8, label %bb10, label %bb4 - - bb10: ; preds = %bb4 - %add11 = add nuw i32 %phi, 1 - %icmp12 = icmp eq i32 %add11, %arg - br i1 %icmp12, label %bb13, label %bb3 - - bb13: ; preds = %bb10, %bb - ret void - } - - attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+v68,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" } - -... 
---- -name: blam -alignment: 16 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -failedISel: false -tracksRegLiveness: true -hasWinCFI: false -callsEHReturn: false -callsUnwindInit: false -hasEHCatchret: false -hasEHScopes: false -hasEHFunclets: false -isOutlined: false -debugInstrRef: false -failsVerification: false -tracksDebugUserValues: false -registers: - - { id: 0, class: intregs, preferred-register: '' } - - { id: 1, class: intregs, preferred-register: '' } - - { id: 2, class: intregs, preferred-register: '' } - - { id: 3, class: intregs, preferred-register: '' } - - { id: 4, class: intregs, preferred-register: '' } - - { id: 5, class: intregs, preferred-register: '' } - - { id: 6, class: intregs, preferred-register: '' } - - { id: 7, class: intregs, preferred-register: '' } - - { id: 8, class: intregs, preferred-register: '' } - - { id: 9, class: intregs, preferred-register: '' } - - { id: 10, class: intregs, preferred-register: '' } - - { id: 11, class: intregs, preferred-register: '' } - - { id: 12, class: predregs, preferred-register: '' } - - { id: 13, class: intregs, preferred-register: '' } - - { id: 14, class: intregs, preferred-register: '' } - - { id: 15, class: intregs, preferred-register: '' } - - { id: 16, class: predregs, preferred-register: '' } - - { id: 17, class: predregs, preferred-register: '' } - - { id: 18, class: predregs, preferred-register: '' } - - { id: 19, class: predregs, preferred-register: '' } - - { id: 20, class: intregs, preferred-register: '' } - - { id: 21, class: intregs, preferred-register: '' } -liveins: - - { reg: '$r0', virtual-reg: '%7' } - - { reg: '$r1', virtual-reg: '%8' } - - { reg: '$r2', virtual-reg: '%9' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 1 - adjustsStack: false - hasCalls: false - stackProtector: '' - functionContext: '' - 
maxCallFrameSize: 4294967295 - cvBytesOfCalleeSavedRegisters: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - hasTailCall: false - localFrameSize: 0 - savePoint: '' - restorePoint: '' -fixedStack: [] -stack: [] -entry_values: [] -callSites: [] -debugValueSubstitutions: [] -constants: [] -machineFunctionInfo: {} -body: | - bb.0.bb: - successors: %bb.4(0x30000000), %bb.5(0x50000000) - liveins: $r0, $r1, $r2 - - %9:intregs = COPY $r2 - %8:intregs = COPY $r1 - %7:intregs = COPY $r0 - %21:intregs = COPY %7 - %20:intregs = COPY %7 - %12:predregs = C2_cmpeqi %7, 0 - J2_jumpt %12, %bb.4, implicit-def $pc - - bb.5: - successors: %bb.1(0x80000000) - - %11:intregs = A2_tfrsi 0 - J2_loop1r %bb.1, %21, implicit-def $lc1, implicit-def $sa1 - - bb.1.bb3 (machine-block-address-taken): - successors: %bb.2(0x80000000) - - %0:intregs = PHI %11, %bb.5, %6, %bb.3 - %13:intregs = M2_mpyi %0, %7 - %1:intregs = S2_addasl_rrri %8, %13, 1 - J2_loop0r %bb.2, %20, implicit-def $lc0, implicit-def $sa0, implicit-def $usr - - bb.2.bb4 (machine-block-address-taken): - successors: %bb.3(0x04000000), %bb.2(0x7c000000) - - %3:intregs = PHI %1, %bb.1, %5, %bb.2 - %14:intregs = L2_loadruh_io %3, 0 :: (load (s16) from %ir.phi5) - %15:intregs = A2_add %14, %9 - %5:intregs = S2_storerh_pi %3, 2, %15 :: (store (s16) into %ir.phi5) - ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 - J2_jump %bb.3, implicit-def dead $pc - - bb.3.bb10: - successors: %bb.4(0x04000000), %bb.1(0x7c000000) - - %6:intregs = nuw A2_addi %0, 1 - ENDLOOP1 %bb.1, implicit-def $pc, implicit-def $lc1, implicit $sa1, implicit $lc1 - J2_jump %bb.4, implicit-def dead $pc - - bb.4.bb13: - PS_jmpret $r31, implicit-def dead $pc - -... 
diff --git a/llvm/test/CodeGen/Hexagon/postincopt-crash.mir b/llvm/test/CodeGen/Hexagon/postincopt-crash.mir deleted file mode 100644 index e22053421791d..0000000000000 --- a/llvm/test/CodeGen/Hexagon/postincopt-crash.mir +++ /dev/null @@ -1,58 +0,0 @@ -# RUN: llc -march=hexagon -run-pass=hexagon-postincopt %s -o /dev/null -# REQUIRES: asserts -# Test that we do not hit unreachable code dealt with L4_ior_memoph_io. - -... ---- -name: foo -alignment: 4 -tracksRegLiveness: true -body: | - bb.0.entry: - successors: %bb.4(0x30000000), %bb.5(0x50000000) - liveins: $r0, $r1, $r2 - - %9:intregs = COPY $r2 - %8:intregs = COPY $r1 - %7:intregs = COPY $r0 - %21:intregs = COPY %7 - %20:intregs = COPY %7 - %12:predregs = C2_cmpeqi %7, 0 - J2_jumpt %12, %bb.4, implicit-def $pc - - bb.5: - successors: %bb.1(0x80000000) - - %11:intregs = A2_tfrsi 0 - J2_loop1r %bb.1, %21, implicit-def $lc1, implicit-def $sa1 - - bb.1: - successors: %bb.2(0x80000000) - - %0:intregs = PHI %11, %bb.5, %6, %bb.3 - %13:intregs = M2_mpyi %0, %7 - %1:intregs = S2_addasl_rrri %8, %13, 1 - J2_loop0r %bb.2, %20, implicit-def $lc0, implicit-def $sa0, implicit-def $usr - - bb.2: - successors: %bb.3(0x04000000), %bb.2(0x7c000000) - - %3:intregs = PHI %1, %bb.1, %5, %bb.2 - %14:intregs = L2_loadruh_io %3, 0 - L4_ior_memoph_io %3:intregs, 0, 21 - %15:intregs = A2_add %14, %9 - %5:intregs = S2_storerh_pi %3, 2, %15 - ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0 - J2_jump %bb.3, implicit-def dead $pc - - bb.3: - successors: %bb.4(0x04000000), %bb.1(0x7c000000) - - %6:intregs = nuw A2_addi %0, 1 - ENDLOOP1 %bb.1, implicit-def $pc, implicit-def $lc1, implicit $sa1, implicit $lc1 - J2_jump %bb.4, implicit-def dead $pc - - bb.4: - PS_jmpret $r31, implicit-def dead $pc - -... 
diff --git a/llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir b/llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir deleted file mode 100644 index 27d653c99f7b8..0000000000000 --- a/llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir +++ /dev/null @@ -1,19 +0,0 @@ -# RUN: llc -march=hexagon -run-pass hexagon-postincopt %s -o - | FileCheck %s -# Check that this doesn't crash. -# CHECK: Y2_dcfetchbo - -name: fred -tracksRegLiveness: true -body: | - bb.0: - successors: %bb.1 - %0:intregs = IMPLICIT_DEF - - bb.1: - successors: %bb.1 - - %1:intregs = PHI %0:intregs, %bb.0, %2:intregs, %bb.1 - Y2_dcfetchbo %1:intregs, 0 - %2:intregs = A2_addi %1:intregs, 1 - J2_jump %bb.1, implicit-def dead $pc -... diff --git a/llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir b/llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir deleted file mode 100644 index fca42d547dfbc..0000000000000 --- a/llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir +++ /dev/null @@ -1,32 +0,0 @@ -# RUN: llc -march=hexagon -run-pass hexagon-postincopt -o - %s | FileCheck %s -# REQUIRES: asserts - -# Check that this doesn't crash: -# CHECK: L2_loadbsw4_io - ---- -name: fred -tracksRegLiveness: true -liveins: - - { reg: '$r0', virtual-reg: '%0' } -body: | - bb.0: - successors: %bb.1(0x80000000) - liveins: $r0 - - %0:intregs = COPY $r0 - %1:intregs = A2_tfrsi 240 - %2:doubleregs = IMPLICIT_DEF - %3:doubleregs = IMPLICIT_DEF - - bb.1: - successors: %bb.1(0x80000000) - - %4:intregs = PHI %1, %bb.0, %5, %bb.1 - %6:doubleregs = L2_loadbsw4_io %4, 0 - %7:doubleregs = M2_vrmac_s0 %2, %6, %3 - S2_storeri_io %0, 0, %7.isub_lo - %5:intregs = nuw A2_addi %4, 256 - J2_jump %bb.1, implicit-def dead $pc - -... From 13fd4bf4e53391aab3cdfd922e9ceb4ad1225d1e Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Tue, 27 Feb 2024 19:56:00 +0100 Subject: [PATCH 477/546] [llvm-ar][Archive] Use getDefaultTargetTriple instead of host triple for the fallback archive format. 
(#82888) --- .../ClangOffloadPackager.cpp | 2 +- llvm/docs/CommandGuide/llvm-ar.rst | 5 +++-- llvm/docs/ReleaseNotes.rst | 4 ++++ llvm/include/llvm/Object/Archive.h | 2 +- llvm/lib/Object/Archive.cpp | 4 ++-- llvm/lib/Object/ArchiveWriter.cpp | 2 +- llvm/tools/llvm-ar/llvm-ar.cpp | 16 +++++++--------- 7 files changed, 19 insertions(+), 16 deletions(-) diff --git a/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp b/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp index c36a5aa58cee5..c6d5b31ab512c 100644 --- a/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp +++ b/clang/tools/clang-offload-packager/ClangOffloadPackager.cpp @@ -197,7 +197,7 @@ static Error unbundleImages() { if (Error E = writeArchive( Args["file"], Members, SymtabWritingMode::NormalSymtab, - Archive::getDefaultKindForHost(), true, false, nullptr)) + Archive::getDefaultKind(), true, false, nullptr)) return E; } else if (Args.count("file")) { if (Extracted.size() > 1) diff --git a/llvm/docs/CommandGuide/llvm-ar.rst b/llvm/docs/CommandGuide/llvm-ar.rst index f643b214bcc83..03d5b9e41ada3 100644 --- a/llvm/docs/CommandGuide/llvm-ar.rst +++ b/llvm/docs/CommandGuide/llvm-ar.rst @@ -262,8 +262,9 @@ Other .. option:: --format= This option allows for default, gnu, darwin or bsd ```` to be selected. - When creating an ``archive``, ```` will default to that of the host - machine. + When creating an ``archive`` with the default ````, :program:``llvm-ar`` + will attempt to infer it from the input files and fallback to the default + toolchain target if unable to do so. .. option:: -h, --help diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 5be00d9d5a589..8a3a0ec66ed87 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -143,6 +143,10 @@ Changes to the LLVM tools files using reference types and GC are also supported (but also only for functions, globals, and data, and only for listing symbols and names). 
+* llvm-ar now utilizes LLVM_DEFAULT_TARGET_TRIPLE to determine the archive format + if it's not specified with the ``--format`` argument and cannot be inferred from + input files. + Changes to LLDB --------------------------------- diff --git a/llvm/include/llvm/Object/Archive.h b/llvm/include/llvm/Object/Archive.h index 3dd99a46507a2..f71630054dc63 100644 --- a/llvm/include/llvm/Object/Archive.h +++ b/llvm/include/llvm/Object/Archive.h @@ -338,7 +338,7 @@ class Archive : public Binary { Kind kind() const { return (Kind)Format; } bool isThin() const { return IsThin; } - static object::Archive::Kind getDefaultKindForHost(); + static object::Archive::Kind getDefaultKind(); child_iterator child_begin(Error &Err, bool SkipInternal = true) const; child_iterator child_end() const; diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp index e447e5b23316f..9000e9aa81ff4 100644 --- a/llvm/lib/Object/Archive.cpp +++ b/llvm/lib/Object/Archive.cpp @@ -969,8 +969,8 @@ Archive::Archive(MemoryBufferRef Source, Error &Err) Err = Error::success(); } -object::Archive::Kind Archive::getDefaultKindForHost() { - Triple HostTriple(sys::getProcessTriple()); +object::Archive::Kind Archive::getDefaultKind() { + Triple HostTriple(sys::getDefaultTargetTriple()); return HostTriple.isOSDarwin() ? object::Archive::K_DARWIN : (HostTriple.isOSAIX() ? 
object::Archive::K_AIXBIG diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index 155926a8c5949..96e4ec1ee0b77 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -90,7 +90,7 @@ object::Archive::Kind NewArchiveMember::detectKindFromObject() const { } } - return object::Archive::getDefaultKindForHost(); + return object::Archive::getDefaultKind(); } Expected diff --git a/llvm/tools/llvm-ar/llvm-ar.cpp b/llvm/tools/llvm-ar/llvm-ar.cpp index c8800303bc1e4..81cb2a21daf1f 100644 --- a/llvm/tools/llvm-ar/llvm-ar.cpp +++ b/llvm/tools/llvm-ar/llvm-ar.cpp @@ -670,7 +670,7 @@ Expected> getAsBinary(const Archive::Child &C, } template static bool isValidInBitMode(const A &Member) { - if (object::Archive::getDefaultKindForHost() != object::Archive::K_AIXBIG) + if (object::Archive::getDefaultKind() != object::Archive::K_AIXBIG) return true; LLVMContext Context; Expected> BinOrErr = getAsBinary(Member, &Context); @@ -1036,10 +1036,10 @@ static void performWriteOperation(ArchiveOperation Operation, } } else if (NewMembersP) Kind = !NewMembersP->empty() ? NewMembersP->front().detectKindFromObject() - : object::Archive::getDefaultKindForHost(); + : object::Archive::getDefaultKind(); else Kind = !NewMembers.empty() ? NewMembers.front().detectKindFromObject() - : object::Archive::getDefaultKindForHost(); + : object::Archive::getDefaultKind(); break; case GNU: Kind = object::Archive::K_GNU; @@ -1331,7 +1331,7 @@ static int ar_main(int argc, char **argv) { // Get BitMode from enviorment variable "OBJECT_MODE" for AIX OS, if // specified. 
- if (object::Archive::getDefaultKindForHost() == object::Archive::K_AIXBIG) { + if (object::Archive::getDefaultKind() == object::Archive::K_AIXBIG) { BitMode = getBitMode(getenv("OBJECT_MODE")); if (BitMode == BitModeTy::Unknown) BitMode = BitModeTy::Bit32; @@ -1392,8 +1392,7 @@ static int ar_main(int argc, char **argv) { continue; if (strncmp(*ArgIt, "-X", 2) == 0) { - if (object::Archive::getDefaultKindForHost() == - object::Archive::K_AIXBIG) { + if (object::Archive::getDefaultKind() == object::Archive::K_AIXBIG) { Match = *(*ArgIt + 2) != '\0' ? *ArgIt + 2 : *(++ArgIt); BitMode = getBitMode(Match); if (BitMode == BitModeTy::Unknown) @@ -1432,8 +1431,7 @@ static int ranlib_main(int argc, char **argv) { cl::PrintVersionMessage(); return 0; } else if (arg.front() == 'X') { - if (object::Archive::getDefaultKindForHost() == - object::Archive::K_AIXBIG) { + if (object::Archive::getDefaultKind() == object::Archive::K_AIXBIG) { HasAIXXOption = true; arg.consume_front("X"); const char *Xarg = arg.data(); @@ -1464,7 +1462,7 @@ static int ranlib_main(int argc, char **argv) { } } - if (object::Archive::getDefaultKindForHost() == object::Archive::K_AIXBIG) { + if (object::Archive::getDefaultKind() == object::Archive::K_AIXBIG) { // If not specify -X option, get BitMode from enviorment variable // "OBJECT_MODE" for AIX OS if specify. if (!HasAIXXOption) { From 8e3b60540c81354b321018a47bf159ca9b52d776 Mon Sep 17 00:00:00 2001 From: Michael Jones <71531609+michaelrj-google@users.noreply.github.com> Date: Tue, 27 Feb 2024 11:03:20 -0800 Subject: [PATCH 478/546] [libc] Add fixed point support to printf (#82707) This patch adds the r, R, k, and K conversion specifiers to printf, with accompanying tests. They are guarded behind the LIBC_COPT_PRINTF_DISABLE_FIXED_POINT flag as well as automatic fixed point support detection. 
--- libc/config/config.json | 4 + libc/docs/configure.rst | 1 + libc/docs/dev/printf_behavior.rst | 12 + libc/fuzzing/stdio/CMakeLists.txt | 14 + libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp | 133 ++++++++ libc/src/stdio/printf_core/CMakeLists.txt | 4 + libc/src/stdio/printf_core/converter.cpp | 8 + libc/src/stdio/printf_core/converter_atlas.h | 5 + libc/src/stdio/printf_core/converter_utils.h | 3 + libc/src/stdio/printf_core/core_structs.h | 24 +- libc/src/stdio/printf_core/fixed_converter.h | 309 ++++++++++++++++++ .../stdio/printf_core/float_dec_converter.h | 3 - libc/src/stdio/printf_core/parser.h | 65 ++++ libc/src/stdio/printf_core/printf_config.h | 7 + libc/test/src/stdio/CMakeLists.txt | 3 + libc/test/src/stdio/sprintf_test.cpp | 211 ++++++++++++ .../llvm-project-overlay/libc/BUILD.bazel | 14 +- 17 files changed, 807 insertions(+), 13 deletions(-) create mode 100644 libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp create mode 100644 libc/src/stdio/printf_core/fixed_converter.h diff --git a/libc/config/config.json b/libc/config/config.json index 6a208cc556611..b73c47b1a14bc 100644 --- a/libc/config/config.json +++ b/libc/config/config.json @@ -15,6 +15,10 @@ "LIBC_CONF_PRINTF_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE": { "value": false, "doc": "Use large table for better printf long double performance." + }, + "LIBC_CONF_PRINTF_DISABLE_FIXED_POINT": { + "value": false, + "doc": "Disable printing fixed point values in printf and friends." } }, "string": { diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst index 9997dde6cf899..a177550647bd9 100644 --- a/libc/docs/configure.rst +++ b/libc/docs/configure.rst @@ -26,6 +26,7 @@ overrides in ``config//config.json`` and ``config///co to learn about the defaults for your platform and target. * **"printf" options** + - ``LIBC_CONF_PRINTF_DISABLE_FIXED_POINT``: Disable printing fixed point values in printf and friends. 
- ``LIBC_CONF_PRINTF_DISABLE_FLOAT``: Disable printing floating point values in printf and friends. - ``LIBC_CONF_PRINTF_DISABLE_INDEX_MODE``: Disable index mode in the printf format string. - ``LIBC_CONF_PRINTF_DISABLE_WRITE_INT``: Disable handling of %n in printf format string. diff --git a/libc/docs/dev/printf_behavior.rst b/libc/docs/dev/printf_behavior.rst index 7128c738d1924..00d6c83f4b0d3 100644 --- a/libc/docs/dev/printf_behavior.rst +++ b/libc/docs/dev/printf_behavior.rst @@ -62,6 +62,13 @@ When set, this flag disables support for floating point numbers and all their conversions (%a, %f, %e, %g); any floating point number conversion will be treated as invalid. This reduces code size. +LIBC_COPT_PRINTF_DISABLE_FIXED_POINT +------------------------------------ +When set, this flag disables support for fixed point numbers and all their +conversions (%r, %k); any fixed point number conversion will be treated as +invalid. This reduces code size. This has no effect if the current compiler does +not support fixed point numbers. + LIBC_COPT_PRINTF_NO_NULLPTR_CHECKS ---------------------------------- When set, this flag disables the nullptr checks in %n and %s. @@ -191,3 +198,8 @@ original conversion. The %p conversion will display a null pointer as if it was the string "(nullptr)" passed to a "%s" conversion, with all other options remaining the same as the original conversion. + +The %r, %R, %k, and %K fixed point number format specifiers are accepted as +defined in ISO/IEC TR 18037 (the fixed point number extension). These are +available when the compiler is detected as having support for fixed point +numbers and the LIBC_COPT_PRINTF_DISABLE_FIXED_POINT flag is not set. 
diff --git a/libc/fuzzing/stdio/CMakeLists.txt b/libc/fuzzing/stdio/CMakeLists.txt index 22de67d42747f..8f89baa702000 100644 --- a/libc/fuzzing/stdio/CMakeLists.txt +++ b/libc/fuzzing/stdio/CMakeLists.txt @@ -15,3 +15,17 @@ add_libc_fuzzer( libc.src.stdio.snprintf libc.src.__support.FPUtil.fp_bits ) + +if(LIBC_COMPILER_HAS_FIXED_POINT) + add_libc_fuzzer( + printf_fixed_conv_fuzz + NEED_MPFR + SRCS + printf_fixed_conv_fuzz.cpp + DEPENDS + libc.src.stdio.snprintf + libc.src.__support.fixed_point.fx_bits + COMPILE_OPTIONS + -ffixed-point # TODO: add -ffixed-point to fuzz tests automatically + ) +endif() diff --git a/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp b/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp new file mode 100644 index 0000000000000..b4a8621891203 --- /dev/null +++ b/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp @@ -0,0 +1,133 @@ +//===-- printf_fixed_conv_fuzz.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Fuzzing test for llvm-libc printf %f/e/g/a implementations. 
+/// +//===----------------------------------------------------------------------===// +#include "src/stdio/snprintf.h" + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/fixed_point/fx_rep.h" + +#include +#include + +#include "utils/MPFRWrapper/mpfr_inc.h" + +constexpr int MAX_SIZE = 10000; + +inline bool simple_streq(char *first, char *second, int length) { + for (int i = 0; i < length; ++i) + if (first[i] != second[i]) + return false; + + return true; +} + +inline int clamp(int num, int max) { + if (num > max) + return max; + if (num < -max) + return -max; + return num; +} + +enum class TestResult { + Success, + BufferSizeFailed, + LengthsDiffer, + StringsNotEqual, +}; + +template +inline TestResult test_vals(const char *fmt, uint64_t num, int prec, + int width) { + typename LIBC_NAMESPACE::fixed_point::FXRep::StorageType raw_num = num; + + auto raw_num_bits = LIBC_NAMESPACE::fixed_point::FXBits(raw_num); + + // This needs to be a float with enough bits of precision to hold the fixed + // point number. + static_assert(sizeof(long double) > sizeof(long accum)); + + // build a long double that is equivalent to the fixed point number. + long double ld_num = + static_cast(raw_num_bits.get_integral()) + + (static_cast(raw_num_bits.get_fraction()) / + static_cast(1ll << raw_num_bits.get_exponent())); + + if (raw_num_bits.get_sign()) + ld_num = -ld_num; + + // Call snprintf on a nullptr to get the buffer size. + int buffer_size = LIBC_NAMESPACE::snprintf(nullptr, 0, fmt, width, prec, num); + + if (buffer_size < 0) + return TestResult::BufferSizeFailed; + + char *test_buff = new char[buffer_size + 1]; + char *reference_buff = new char[buffer_size + 1]; + + int test_result = 0; + int reference_result = 0; + + test_result = LIBC_NAMESPACE::snprintf(test_buff, buffer_size + 1, fmt, width, + prec, num); + + // The fixed point format is defined to be %f equivalent. 
+ reference_result = mpfr_snprintf(reference_buff, buffer_size + 1, "%*.*Lf", + width, prec, ld_num); + + // All of these calls should return that they wrote the same amount. + if (test_result != reference_result || test_result != buffer_size) + return TestResult::LengthsDiffer; + + if (!simple_streq(test_buff, reference_buff, buffer_size)) + return TestResult::StringsNotEqual; + + delete[] test_buff; + delete[] reference_buff; + return TestResult::Success; +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + // const uint8_t raw_data[] = {0x8d,0x43,0x40,0x0,0x0,0x0,}; + // data = raw_data; + // size = sizeof(raw_data); + int prec = 0; + int width = 0; + + LIBC_NAMESPACE::fixed_point::FXRep::StorageType raw_num = 0; + + // Copy as many bytes of data as will fit into num, prec, and with. Any extras + // are ignored. + for (size_t cur = 0; cur < size; ++cur) { + if (cur < sizeof(raw_num)) { + raw_num = (raw_num << 8) + data[cur]; + } else if (cur < sizeof(raw_num) + sizeof(prec)) { + prec = (prec << 8) + data[cur]; + } else if (cur < sizeof(raw_num) + sizeof(prec) + sizeof(width)) { + width = (width << 8) + data[cur]; + } + } + + width = clamp(width, MAX_SIZE); + prec = clamp(prec, MAX_SIZE); + + TestResult result; + result = test_vals("%*.*lk", raw_num, prec, width); + if (result != TestResult::Success) + __builtin_trap(); + + result = test_vals("%*.*lK", raw_num, prec, width); + if (result != TestResult::Success) + __builtin_trap(); + + return 0; +} diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt index 9b81a7abb6cfd..02819ea25ea05 100644 --- a/libc/src/stdio/printf_core/CMakeLists.txt +++ b/libc/src/stdio/printf_core/CMakeLists.txt @@ -10,6 +10,9 @@ endif() if(LIBC_CONF_PRINTF_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE) list(APPEND printf_config_copts "-DLIBC_COPT_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE") endif() +if(LIBC_CONF_PRINTF_DISABLE_FIXED_POINT) + list(APPEND printf_config_copts 
"-DLIBC_COPT_PRINTF_DISABLE_FIXED_POINT") +endif() if(printf_config_copts) list(PREPEND printf_config_copts "COMPILE_OPTIONS") endif() @@ -76,6 +79,7 @@ add_object_library( float_inf_nan_converter.h float_hex_converter.h float_dec_converter.h + fixed_converter.h #TODO: Check if this should be disabled when fixed unavail DEPENDS .writer .core_structs diff --git a/libc/src/stdio/printf_core/converter.cpp b/libc/src/stdio/printf_core/converter.cpp index 52412aef3c5c1..613d693c3cfcb 100644 --- a/libc/src/stdio/printf_core/converter.cpp +++ b/libc/src/stdio/printf_core/converter.cpp @@ -9,6 +9,7 @@ #include "src/stdio/printf_core/converter.h" #include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/printf_config.h" #include "src/stdio/printf_core/writer.h" // This option allows for replacing all of the conversion functions with custom @@ -75,6 +76,13 @@ int convert(Writer *writer, const FormatSection &to_conv) { case 'G': return convert_float_dec_auto(writer, to_conv); #endif // LIBC_COPT_PRINTF_DISABLE_FLOAT +#ifdef LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT + case 'r': + case 'R': + case 'k': + case 'K': + return convert_fixed(writer, to_conv); +#endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT #ifndef LIBC_COPT_PRINTF_DISABLE_WRITE_INT case 'n': return convert_write_int(writer, to_conv); diff --git a/libc/src/stdio/printf_core/converter_atlas.h b/libc/src/stdio/printf_core/converter_atlas.h index 6471f3f2955b7..2189ed11a551e 100644 --- a/libc/src/stdio/printf_core/converter_atlas.h +++ b/libc/src/stdio/printf_core/converter_atlas.h @@ -31,6 +31,11 @@ #include "src/stdio/printf_core/float_hex_converter.h" #endif // LIBC_COPT_PRINTF_DISABLE_FLOAT +#ifdef LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT +// defines convert_fixed +#include "src/stdio/printf_core/fixed_converter.h" +#endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT + #ifndef LIBC_COPT_PRINTF_DISABLE_WRITE_INT #include "src/stdio/printf_core/write_int_converter.h" #endif // 
LIBC_COPT_PRINTF_DISABLE_WRITE_INT diff --git a/libc/src/stdio/printf_core/converter_utils.h b/libc/src/stdio/printf_core/converter_utils.h index 54f0a870d0ac4..948fe816e9b76 100644 --- a/libc/src/stdio/printf_core/converter_utils.h +++ b/libc/src/stdio/printf_core/converter_utils.h @@ -51,6 +51,9 @@ LIBC_INLINE uintmax_t apply_length_modifier(uintmax_t num, LengthModifier lm) { return result; \ } +// This is used to represent which direction the number should be rounded. +enum class RoundDirection { Up, Down, Even }; + } // namespace printf_core } // namespace LIBC_NAMESPACE diff --git a/libc/src/stdio/printf_core/core_structs.h b/libc/src/stdio/printf_core/core_structs.h index 7634d45568ab8..d3718b49d1b13 100644 --- a/libc/src/stdio/printf_core/core_structs.h +++ b/libc/src/stdio/printf_core/core_structs.h @@ -10,7 +10,9 @@ #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_CORE_STRUCTS_H #include "src/__support/CPP/string_view.h" +#include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/stdio/printf_core/printf_config.h" #include #include @@ -77,7 +79,13 @@ struct FormatSection { } }; -enum PrimaryType : uint8_t { Unknown = 0, Float = 1, Pointer = 2, Integer = 3 }; +enum PrimaryType : uint8_t { + Unknown = 0, + Float = 1, + Pointer = 2, + Integer = 3, + FixedPoint = 4, +}; // TypeDesc stores the information about a type that is relevant to printf in // a relatively compact manner. @@ -95,9 +103,16 @@ template LIBC_INLINE constexpr TypeDesc type_desc_from_type() { } else { constexpr bool IS_POINTER = cpp::is_pointer_v; constexpr bool IS_FLOAT = cpp::is_floating_point_v; - return TypeDesc{sizeof(T), IS_POINTER ? PrimaryType::Pointer - : IS_FLOAT ? PrimaryType::Float - : PrimaryType::Integer}; +#ifdef LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT + constexpr bool IS_FIXED_POINT = cpp::is_fixed_point_v; +#else + constexpr bool IS_FIXED_POINT = false; +#endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT + + return TypeDesc{sizeof(T), IS_POINTER ? 
PrimaryType::Pointer + : IS_FLOAT ? PrimaryType::Float + : IS_FIXED_POINT ? PrimaryType::FixedPoint + : PrimaryType::Integer}; } } @@ -109,6 +124,7 @@ constexpr int FILE_WRITE_ERROR = -1; constexpr int FILE_STATUS_ERROR = -2; constexpr int NULLPTR_WRITE_ERROR = -3; constexpr int INT_CONVERSION_ERROR = -4; +constexpr int FIXED_POINT_CONVERSION_ERROR = -5; } // namespace printf_core } // namespace LIBC_NAMESPACE diff --git a/libc/src/stdio/printf_core/fixed_converter.h b/libc/src/stdio/printf_core/fixed_converter.h new file mode 100644 index 0000000000000..de69c603be6b6 --- /dev/null +++ b/libc/src/stdio/printf_core/fixed_converter.h @@ -0,0 +1,309 @@ +//===-- Fixed Point Converter for printf ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FIXED_CONVERTER_H +#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FIXED_CONVERTER_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/CPP/string_view.h" +#include "src/__support/fixed_point/fx_bits.h" +#include "src/__support/fixed_point/fx_rep.h" +#include "src/__support/integer_to_string.h" +#include "src/__support/libc_assert.h" +#include "src/stdio/printf_core/converter_utils.h" +#include "src/stdio/printf_core/core_structs.h" +#include "src/stdio/printf_core/writer.h" + +#include +#include + +namespace LIBC_NAMESPACE { +namespace printf_core { + +// This is just for assertions. It will be compiled out for release builds. 
+LIBC_INLINE constexpr uint32_t const_ten_exp(uint32_t exponent) { + uint32_t result = 1; + LIBC_ASSERT(exponent < 11); + for (uint32_t i = 0; i < exponent; ++i) + result *= 10; + + return result; +} + +#define READ_FX_BITS(TYPE) \ + do { \ + auto fixed_bits = fixed_point::FXBits( \ + fixed_point::FXRep::StorageType(to_conv.conv_val_raw)); \ + integral = fixed_bits.get_integral(); \ + fractional = fixed_bits.get_fraction(); \ + exponent = fixed_bits.get_exponent(); \ + is_negative = fixed_bits.get_sign(); \ + } while (false) + +#define APPLY_FX_LENGTH_MODIFIER(LENGTH_MODIFIER) \ + do { \ + if (to_conv.conv_name == 'r') { \ + READ_FX_BITS(LENGTH_MODIFIER fract); \ + } else if (to_conv.conv_name == 'R') { \ + READ_FX_BITS(unsigned LENGTH_MODIFIER fract); \ + } else if (to_conv.conv_name == 'k') { \ + READ_FX_BITS(LENGTH_MODIFIER accum); \ + } else if (to_conv.conv_name == 'K') { \ + READ_FX_BITS(unsigned LENGTH_MODIFIER accum); \ + } else { \ + LIBC_ASSERT(false && "Invalid conversion name passed to convert_fixed"); \ + return FIXED_POINT_CONVERSION_ERROR; \ + } \ + } while (false) + +LIBC_INLINE int convert_fixed(Writer *writer, const FormatSection &to_conv) { + // Long accum should be the largest type, so we can store all the smaller + // numbers in things sized for it. + using LARep = fixed_point::FXRep; + using StorageType = LARep::StorageType; + + // All of the letters will be defined relative to variable a, which will be + // the appropriate case based on the name of the conversion. This converts any + // conversion name into the letter 'a' with the appropriate case. 
+ const char a = (to_conv.conv_name & 32) | 'A'; + FormatFlags flags = to_conv.flags; + + bool is_negative; + int exponent; + StorageType integral; + StorageType fractional; + + // r = fract + // k = accum + // lowercase = signed + // uppercase = unsigned + // h = short + // l = long + // any other length modifier has no effect + + if (to_conv.length_modifier == LengthModifier::h) { + APPLY_FX_LENGTH_MODIFIER(short); + } else if (to_conv.length_modifier == LengthModifier::l) { + APPLY_FX_LENGTH_MODIFIER(long); + } else { + APPLY_FX_LENGTH_MODIFIER(); + } + + LIBC_ASSERT(static_cast(exponent) <= + (sizeof(StorageType) - sizeof(uint32_t)) * CHAR_BIT && + "StorageType must be large enough to hold the fractional " + "component multiplied by a 32 bit number."); + + // If to_conv doesn't specify a precision, the precision defaults to 6. + const size_t precision = to_conv.precision < 0 ? 6 : to_conv.precision; + bool has_decimal_point = + (precision > 0) || ((flags & FormatFlags::ALTERNATE_FORM) != 0); + + // The number of non-zero digits below the decimal point for a negative power + // of 2 in base 10 is equal to the magnitude of the power of 2. + + // A quick proof: + // Let p be any positive integer. + // Let e = 2^(-p) + // Let t be a positive integer such that e * 10^t is an integer. + // By definition: The smallest allowed value of t must be equal to the number + // of non-zero digits below the decimal point in e. + // If we evaluate e * 10^t we get the following: + // e * 10^t = 2^(-p) * 10*t = 2^(-p) * 2^t * 5^t = 5^t * 2^(t-p) + // For 5^t * 2^(t-p) to be an integer, both exponents must be non-negative, + // since 5 and 2 are coprime. + // The smallest value of t such that t-p is non-negative is p. + // Therefor, the number of non-zero digits below the decimal point for a given + // negative power of 2 "p" is equal to the value of p. 
+ + constexpr size_t MAX_FRACTION_DIGITS = LARep::FRACTION_LEN; + + char fraction_digits[MAX_FRACTION_DIGITS]; + + size_t valid_fraction_digits = 0; + + // TODO: Factor this part out + while (fractional > 0) { + uint32_t cur_digits = 0; + // 10^9 is used since it's the largest power of 10 that fits in a uint32_t + constexpr uint32_t TEN_EXP_NINE = 1000000000; + constexpr size_t DIGITS_PER_BLOCK = 9; + + // Multiply by 10^9, then grab the digits above the decimal point, then + // clear those digits in fractional. + fractional = fractional * TEN_EXP_NINE; + cur_digits = static_cast(fractional >> exponent); + fractional = fractional % (StorageType(1) << exponent); + + // we add TEN_EXP_NINE to force leading zeroes to show up, then we skip the + // first digit in the loop. + const IntegerToString cur_fractional_digits(cur_digits + + TEN_EXP_NINE); + for (size_t i = 0; + i < DIGITS_PER_BLOCK && valid_fraction_digits < MAX_FRACTION_DIGITS; + ++i, ++valid_fraction_digits) + fraction_digits[valid_fraction_digits] = + cur_fractional_digits.view()[i + 1]; + + if (valid_fraction_digits >= MAX_FRACTION_DIGITS) { + LIBC_ASSERT(fractional == 0 && "If the fraction digit buffer is full, " + "there should be no remaining digits."); + /* + A visual explanation of what this assert is checking: + + 32 digits (max for 32 bit fract) + +------------------------------++--+--- must be zero + | || | + 123456789012345678901234567890120000 + | || || || | + +-------++-------++-------++-------+ + 9 digit blocks + */ + LIBC_ASSERT(cur_digits % const_ten_exp( + DIGITS_PER_BLOCK - + (MAX_FRACTION_DIGITS % DIGITS_PER_BLOCK)) == + 0 && + "Digits after the MAX_FRACTION_DIGITS should all be zero."); + valid_fraction_digits = MAX_FRACTION_DIGITS; + } + } + + if (precision < valid_fraction_digits) { + // Handle rounding. Just do round to nearest, tie to even since it's + // unspecified. 
+ RoundDirection round; + char first_digit_after = fraction_digits[precision]; + if (first_digit_after > '5') { + round = RoundDirection::Up; + } else if (first_digit_after < '5') { + round = RoundDirection::Down; + } else { + // first_digit_after == '5' + // need to check the remaining digits, but default to even. + round = RoundDirection::Even; + for (size_t cur_digit_index = precision + 1; + cur_digit_index + 1 < valid_fraction_digits; ++cur_digit_index) { + if (fraction_digits[cur_digit_index] != '0') { + round = RoundDirection::Up; + break; + } + } + } + + // If we need to actually perform rounding, do so. + if (round == RoundDirection::Up || round == RoundDirection::Even) { + bool keep_rounding = true; + int digit_to_round = static_cast(precision) - 1; + for (; digit_to_round >= 0 && keep_rounding; --digit_to_round) { + keep_rounding = false; + char cur_digit = fraction_digits[digit_to_round]; + // if the digit should not be rounded up + if (round == RoundDirection::Even && ((cur_digit - '0') % 2) == 0) { + // break out of the loop + break; + } + fraction_digits[digit_to_round] += 1; + + // if the digit was a 9, instead replace with a 0. + if (cur_digit == '9') { + fraction_digits[digit_to_round] = '0'; + keep_rounding = true; + } + } + + // if every digit below the decimal point was rounded up but we need to + // keep rounding + if (keep_rounding && + (round == RoundDirection::Up || + (round == RoundDirection::Even && ((integral % 2) == 1)))) { + // add one to the integral portion to round it up. + ++integral; + } + } + + valid_fraction_digits = precision; + } + + const IntegerToString integral_str(integral); + + // these are signed to prevent underflow due to negative values. The + // eventual values will always be non-negative. 
+ size_t trailing_zeroes = 0; + int padding; + + // If the precision is greater than the actual result, pad with 0s + if (precision > valid_fraction_digits) + trailing_zeroes = precision - (valid_fraction_digits); + + constexpr cpp::string_view DECIMAL_POINT("."); + + char sign_char = 0; + + // Check if the conv name is uppercase + if (a == 'A') { + // These flags are only for signed conversions, so this removes them if the + // conversion is unsigned. + flags = FormatFlags(flags & + ~(FormatFlags::FORCE_SIGN | FormatFlags::SPACE_PREFIX)); + } + + if (is_negative) + sign_char = '-'; + else if ((flags & FormatFlags::FORCE_SIGN) == FormatFlags::FORCE_SIGN) + sign_char = '+'; // FORCE_SIGN has precedence over SPACE_PREFIX + else if ((flags & FormatFlags::SPACE_PREFIX) == FormatFlags::SPACE_PREFIX) + sign_char = ' '; + + padding = static_cast(to_conv.min_width - (sign_char > 0 ? 1 : 0) - + integral_str.size() - + static_cast(has_decimal_point) - + valid_fraction_digits - trailing_zeroes); + if (padding < 0) + padding = 0; + + if ((flags & FormatFlags::LEFT_JUSTIFIED) == FormatFlags::LEFT_JUSTIFIED) { + // The pattern is (sign), integral, (.), (fraction), (zeroes), (spaces) + if (sign_char > 0) + RET_IF_RESULT_NEGATIVE(writer->write(sign_char)); + RET_IF_RESULT_NEGATIVE(writer->write(integral_str.view())); + if (has_decimal_point) + RET_IF_RESULT_NEGATIVE(writer->write(DECIMAL_POINT)); + if (valid_fraction_digits > 0) + RET_IF_RESULT_NEGATIVE( + writer->write({fraction_digits, valid_fraction_digits})); + if (trailing_zeroes > 0) + RET_IF_RESULT_NEGATIVE(writer->write('0', trailing_zeroes)); + if (padding > 0) + RET_IF_RESULT_NEGATIVE(writer->write(' ', padding)); + } else { + // The pattern is (spaces), (sign), (zeroes), integral, (.), (fraction), + // (zeroes) + if ((padding > 0) && + ((flags & FormatFlags::LEADING_ZEROES) != FormatFlags::LEADING_ZEROES)) + RET_IF_RESULT_NEGATIVE(writer->write(' ', padding)); + if (sign_char > 0) + 
RET_IF_RESULT_NEGATIVE(writer->write(sign_char)); + if ((padding > 0) && + ((flags & FormatFlags::LEADING_ZEROES) == FormatFlags::LEADING_ZEROES)) + RET_IF_RESULT_NEGATIVE(writer->write('0', padding)); + RET_IF_RESULT_NEGATIVE(writer->write(integral_str.view())); + if (has_decimal_point) + RET_IF_RESULT_NEGATIVE(writer->write(DECIMAL_POINT)); + if (valid_fraction_digits > 0) + RET_IF_RESULT_NEGATIVE( + writer->write({fraction_digits, valid_fraction_digits})); + if (trailing_zeroes > 0) + RET_IF_RESULT_NEGATIVE(writer->write('0', trailing_zeroes)); + } + return WRITE_OK; +} + +} // namespace printf_core +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FIXED_CONVERTER_H diff --git a/libc/src/stdio/printf_core/float_dec_converter.h b/libc/src/stdio/printf_core/float_dec_converter.h index b54526d371086..a6c68329e6602 100644 --- a/libc/src/stdio/printf_core/float_dec_converter.h +++ b/libc/src/stdio/printf_core/float_dec_converter.h @@ -45,9 +45,6 @@ constexpr uint32_t MAX_BLOCK = 999999999; // constexpr uint32_t MAX_BLOCK = 999999999999999999; constexpr char DECIMAL_POINT = '.'; -// This is used to represent which direction the number should be rounded. 
-enum class RoundDirection { Up, Down, Even }; - LIBC_INLINE RoundDirection get_round_direction(int last_digit, bool truncated, fputil::Sign sign) { switch (fputil::quick_get_round()) { diff --git a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h index 1e7d2e58c924a..0876116a0bac8 100644 --- a/libc/src/stdio/printf_core/parser.h +++ b/libc/src/stdio/printf_core/parser.h @@ -9,13 +9,19 @@ #ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PARSER_H #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PARSER_H +#include "include/llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/optional.h" +#include "src/__support/CPP/type_traits.h" #include "src/__support/str_to_integer.h" #include "src/stdio/printf_core/core_structs.h" #include "src/stdio/printf_core/printf_config.h" #include +#ifdef LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT +#include "src/__support/fixed_point/fx_rep.h" +#endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT + namespace LIBC_NAMESPACE { namespace printf_core { @@ -28,6 +34,14 @@ template <> struct int_type_of { template <> struct int_type_of { using type = fputil::FPBits::StorageType; }; + +#ifdef LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT +template +struct int_type_of, T>> { + using type = typename fixed_point::FXRep::StorageType; +}; +#endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT + template using int_type_of_v = typename int_type_of::type; #ifndef LIBC_COPT_PRINTF_DISABLE_INDEX_MODE @@ -206,6 +220,25 @@ template class Parser { } break; #endif // LIBC_COPT_PRINTF_DISABLE_FLOAT +#ifdef LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT + // Capitalization represents sign, but we only need to get the right + // bitwidth here so we ignore that. + case ('r'): + case ('R'): + // all fract sizes we support are less than 32 bits, and currently doing + // va_args with fixed point types just doesn't work. + // TODO: Move to fixed point types once va_args supports it. 
+ WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, uint32_t, conv_index); + break; + case ('k'): + case ('K'): + if (lm == LengthModifier::l) { + WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, uint64_t, conv_index); + } else { + WRITE_ARG_VAL_SIMPLEST(section.conv_val_raw, uint32_t, conv_index); + } + break; +#endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT #ifndef LIBC_COPT_PRINTF_DISABLE_WRITE_INT case ('n'): #endif // LIBC_COPT_PRINTF_DISABLE_WRITE_INT @@ -399,6 +432,22 @@ template class Parser { else if (cur_type_desc == type_desc_from_type()) args_cur.template next_var(); #endif // LIBC_COPT_PRINTF_DISABLE_FLOAT +#ifdef LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT + // Floating point numbers may be stored separately from the other + // arguments. + else if (cur_type_desc == type_desc_from_type()) + args_cur.template next_var(); + else if (cur_type_desc == type_desc_from_type()) + args_cur.template next_var(); + else if (cur_type_desc == type_desc_from_type()) + args_cur.template next_var(); + else if (cur_type_desc == type_desc_from_type()) + args_cur.template next_var(); + else if (cur_type_desc == type_desc_from_type()) + args_cur.template next_var(); + else if (cur_type_desc == type_desc_from_type()) + args_cur.template next_var(); +#endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT // pointers may be stored separately from normal values. else if (cur_type_desc == type_desc_from_type()) args_cur.template next_var(); @@ -528,6 +577,22 @@ template class Parser { conv_size = type_desc_from_type(); break; #endif // LIBC_COPT_PRINTF_DISABLE_FLOAT +#ifdef LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT + // Capitalization represents sign, but we only need to get the right + // bitwidth here so we ignore that. 
+ case ('r'): + case ('R'): + conv_size = type_desc_from_type(); + break; + case ('k'): + case ('K'): + if (lm == LengthModifier::l) { + conv_size = type_desc_from_type(); + } else { + conv_size = type_desc_from_type(); + } + break; +#endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT #ifndef LIBC_COPT_PRINTF_DISABLE_WRITE_INT case ('n'): #endif // LIBC_COPT_PRINTF_DISABLE_WRITE_INT diff --git a/libc/src/stdio/printf_core/printf_config.h b/libc/src/stdio/printf_core/printf_config.h index e1d9654f3affe..8a48abdd170ec 100644 --- a/libc/src/stdio/printf_core/printf_config.h +++ b/libc/src/stdio/printf_core/printf_config.h @@ -29,6 +29,13 @@ #define LIBC_COPT_PRINTF_INDEX_ARR_LEN 128 #endif +// If fixed point is available and the user hasn't explicitly opted out, then +// enable fixed point. +#if defined(LIBC_COMPILER_HAS_FIXED_POINT) && \ + !defined(LIBC_COPT_PRINTF_DISABLE_FIXED_POINT) +#define LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT +#endif + // TODO(michaelrj): Provide a proper interface for these options. 
// LIBC_COPT_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE // LIBC_COPT_FLOAT_TO_STR_USE_DYADIC_FLOAT diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index 93c21aa994ef4..6e1c86e070a82 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -121,6 +121,9 @@ endif() if(LIBC_CONF_PRINTF_DISABLE_WRITE_INT) list(APPEND sprintf_test_copts "-DLIBC_COPT_PRINTF_DISABLE_WRITE_INT") endif() +if(LIBC_CONF_PRINTF_DISABLE_FIXED_POINT) + list(APPEND sprintf_test_copts "-DLIBC_COPT_PRINTF_DISABLE_FIXED_POINT") +endif() add_fp_unittest( sprintf_test diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp index 186b37e2898af..b9f402027e7fc 100644 --- a/libc/test/src/stdio/sprintf_test.cpp +++ b/libc/test/src/stdio/sprintf_test.cpp @@ -3201,6 +3201,217 @@ TEST_F(LlvmLibcSPrintfTest, FloatAutoLongDoubleConv) { #endif // LIBC_COPT_PRINTF_DISABLE_FLOAT +#if defined(LIBC_COMPILER_HAS_FIXED_POINT) && \ + !defined(LIBC_COPT_PRINTF_DISABLE_FIXED_POINT) +TEST_F(LlvmLibcSPrintfTest, FixedConv) { + + // These numeric tests are potentially a little weak, but the fuzz test is + // more thorough than my handwritten tests tend to be. + + // TODO: Replace hex literals with their appropriate fixed point literals. 
+ + written = LIBC_NAMESPACE::sprintf(buff, "%k", 0x0); // 0.0 + ASSERT_STREQ_LEN(written, buff, "0.000000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%k", 0x80000000); // -0.0 + ASSERT_STREQ_LEN(written, buff, "-0.000000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%r", 0xffff); // -fract max + ASSERT_STREQ_LEN(written, buff, "-0.999969"); + + written = LIBC_NAMESPACE::sprintf(buff, "%R", 0xffff); // unsigned fract max + ASSERT_STREQ_LEN(written, buff, "0.999985"); + + written = LIBC_NAMESPACE::sprintf(buff, "%k", 0xffffffff); // -accum max + ASSERT_STREQ_LEN(written, buff, "-65535.999969"); + + written = + LIBC_NAMESPACE::sprintf(buff, "%K", 0xffffffff); // unsigned accum max + ASSERT_STREQ_LEN(written, buff, "65535.999985"); + + written = LIBC_NAMESPACE::sprintf(buff, "%r", 0x7fff); // fract max + ASSERT_STREQ_LEN(written, buff, "0.999969"); + + written = LIBC_NAMESPACE::sprintf(buff, "%k", 0x7fffffff); // accum max + ASSERT_STREQ_LEN(written, buff, "65535.999969"); + + // Length Modifier Tests. 
+ + written = LIBC_NAMESPACE::sprintf(buff, "%hk", 0x0); // 0.0 + ASSERT_STREQ_LEN(written, buff, "0.000000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%hk", 0xffff); // -short accum max + ASSERT_STREQ_LEN(written, buff, "-255.992188"); + + written = LIBC_NAMESPACE::sprintf(buff, "%hr", 0x0); // 0.0 + ASSERT_STREQ_LEN(written, buff, "0.000000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%hr", 0xff); // -short fract max + ASSERT_STREQ_LEN(written, buff, "-0.992188"); + + written = LIBC_NAMESPACE::sprintf(buff, "%hK", 0x0); // 0.0 + ASSERT_STREQ_LEN(written, buff, "0.000000"); + + written = + LIBC_NAMESPACE::sprintf(buff, "%hK", 0xffff); // unsigned short accum max + ASSERT_STREQ_LEN(written, buff, "255.996094"); + + written = LIBC_NAMESPACE::sprintf(buff, "%hR", 0x0); // 0.0 + ASSERT_STREQ_LEN(written, buff, "0.000000"); + + written = + LIBC_NAMESPACE::sprintf(buff, "%hR", 0xff); // unsigned short fract max + ASSERT_STREQ_LEN(written, buff, "0.996094"); + + written = LIBC_NAMESPACE::sprintf(buff, "%lk", 0x0); // 0.0 + ASSERT_STREQ_LEN(written, buff, "0.000000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%lk", + 0xffffffffffffffff); //-long accum max + ASSERT_STREQ_LEN(written, buff, "-4294967296.000000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%lr", 0x0); // 0.0 + ASSERT_STREQ_LEN(written, buff, "0.000000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%lr", + 0xffffffff); //-long fract max + ASSERT_STREQ_LEN(written, buff, "-1.000000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%lK", 0x0); // 0.0 + ASSERT_STREQ_LEN(written, buff, "0.000000"); + + written = + LIBC_NAMESPACE::sprintf(buff, "%lK", + 0xffffffffffffffff); // unsigned long accum max + ASSERT_STREQ_LEN(written, buff, "4294967296.000000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%lR", 0x0); // 0.0 + ASSERT_STREQ_LEN(written, buff, "0.000000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%lR", + 0xffffffff); // unsigned long fract max + ASSERT_STREQ_LEN(written, buff, "1.000000"); + 
+ // Min Width Tests. + + written = LIBC_NAMESPACE::sprintf(buff, "%10k", 0x0000a000); // 1.25 + ASSERT_STREQ_LEN(written, buff, " 1.250000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%10k", 0x8000a000); //-1.25 + ASSERT_STREQ_LEN(written, buff, " -1.250000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%8k", 0x0000a000); // 1.25 + ASSERT_STREQ_LEN(written, buff, "1.250000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%9k", 0x8000a000); //-1.25 + ASSERT_STREQ_LEN(written, buff, "-1.250000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%4k", 0x0000a000); // 1.25 + ASSERT_STREQ_LEN(written, buff, "1.250000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%4k", 0x8000a000); //-1.25 + ASSERT_STREQ_LEN(written, buff, "-1.250000"); + + // Precision Tests. + + written = + LIBC_NAMESPACE::sprintf(buff, "%.16K", 0xFFFFFFFF); // unsigned accum max + ASSERT_STREQ_LEN(written, buff, "65535.9999847412109375"); + + written = LIBC_NAMESPACE::sprintf( + buff, "%.32lK", 0xFFFFFFFFFFFFFFFF); // unsigned long accum max + ASSERT_STREQ_LEN(written, buff, + "4294967295.99999999976716935634613037109375"); + + written = + LIBC_NAMESPACE::sprintf(buff, "%.0K", 0xFFFFFFFF); // unsigned accum max + ASSERT_STREQ_LEN(written, buff, "65536"); + + written = LIBC_NAMESPACE::sprintf(buff, "%.0R", 0xFFFF); // unsigned fract max + ASSERT_STREQ_LEN(written, buff, "1"); + + // Flag Tests. + + written = LIBC_NAMESPACE::sprintf(buff, "%+k", 0x0000a000); // 1.25 + ASSERT_STREQ_LEN(written, buff, "+1.250000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%+k", 0x8000a000); //-1.25 + ASSERT_STREQ_LEN(written, buff, "-1.250000"); + + written = LIBC_NAMESPACE::sprintf(buff, "% k", 0x0000a000); // 1.25 + ASSERT_STREQ_LEN(written, buff, " 1.250000"); + + written = LIBC_NAMESPACE::sprintf(buff, "% k", 0x8000a000); //-1.25 + ASSERT_STREQ_LEN(written, buff, "-1.250000"); + + // unsigned variants ignore sign flags. 
+ written = LIBC_NAMESPACE::sprintf(buff, "%+K", 0x00014000); // 1.25 + ASSERT_STREQ_LEN(written, buff, "1.250000"); + + written = LIBC_NAMESPACE::sprintf(buff, "% K", 0x00014000); // 1.25 + ASSERT_STREQ_LEN(written, buff, "1.250000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%-10k", 0x0000c000); // 1.5 + ASSERT_STREQ_LEN(written, buff, "1.500000 "); + + written = LIBC_NAMESPACE::sprintf(buff, "%#.k", 0x00008000); // 1.0 + ASSERT_STREQ_LEN(written, buff, "1."); + + written = LIBC_NAMESPACE::sprintf(buff, "%#.0k", 0x0000c000); // 1.5 + ASSERT_STREQ_LEN(written, buff, "2."); + + written = LIBC_NAMESPACE::sprintf(buff, "%010k", 0x0000c000); // 1.5 + ASSERT_STREQ_LEN(written, buff, "001.500000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%010k", 0x8000c000); //-1.5 + ASSERT_STREQ_LEN(written, buff, "-01.500000"); + + written = LIBC_NAMESPACE::sprintf(buff, "%+- #0k", 0); // 0.0 + ASSERT_STREQ_LEN(written, buff, "+0.000000"); + + // Combined Tests. + + written = LIBC_NAMESPACE::sprintf(buff, "%10.2k", 0x0004feb8); // 9.99 + ASSERT_STREQ_LEN(written, buff, " 9.99"); + + written = LIBC_NAMESPACE::sprintf(buff, "%5.1k", 0x0004feb8); // 9.99 + ASSERT_STREQ_LEN(written, buff, " 10.0"); + + written = LIBC_NAMESPACE::sprintf(buff, "%-10.2k", 0x0004feb8); // 9.99 + ASSERT_STREQ_LEN(written, buff, "9.99 "); + + written = LIBC_NAMESPACE::sprintf(buff, "%-5.1k", 0x0004feb8); // 9.99 + ASSERT_STREQ_LEN(written, buff, "10.0 "); + + written = LIBC_NAMESPACE::sprintf(buff, "%-5.1k", 0x00000001); // accum min + ASSERT_STREQ_LEN(written, buff, "0.0 "); + + written = LIBC_NAMESPACE::sprintf(buff, "%30k", 0x7fffffff); // accum max + ASSERT_STREQ_LEN(written, buff, " 65535.999969"); + + written = LIBC_NAMESPACE::sprintf(buff, "%-30k", 0x7fffffff); // accum max + ASSERT_STREQ_LEN(written, buff, "65535.999969 "); + + written = LIBC_NAMESPACE::sprintf(buff, "%20.2lK", + 0x3b9ac9ffFD70A3D7); // 999999999.99 + ASSERT_STREQ_LEN(written, buff, " 999999999.99"); + + written = 
LIBC_NAMESPACE::sprintf(buff, "%20.1lK", + 0x3b9ac9ffFD70A3D7); // 999999999.99 + ASSERT_STREQ_LEN(written, buff, " 1000000000.0"); + + written = LIBC_NAMESPACE::sprintf(buff, "%12.3R %-12.3k", 0x1999, + 0x00800000); // 0.1, 256.0 + ASSERT_STREQ_LEN(written, buff, " 0.100 256.000 "); + + written = + LIBC_NAMESPACE::sprintf(buff, "%+-#12.3lk % 012.3k", 0x000000001013a92a, + 0x02740000); // 0.126, 1256.0 + ASSERT_STREQ_LEN(written, buff, "+0.126 0001256.000"); +} +#endif // defined(LIBC_COMPILER_HAS_FIXED_POINT) && + // !defined(LIBC_COPT_PRINTF_DISABLE_FIXED_POINT) + #ifndef LIBC_COPT_PRINTF_DISABLE_WRITE_INT TEST(LlvmLibcSPrintfTest, WriteIntConv) { char buff[64]; diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 09c53c9e8a131..a1a5b7fe9bf40 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -2982,20 +2982,21 @@ libc_function( ################################ stdio targets ################################# libc_support_library( - name = "printf_core_structs", - hdrs = ["src/stdio/printf_core/core_structs.h"], + name = "printf_config", + hdrs = ["src/stdio/printf_core/printf_config.h"], defines = PRINTF_COPTS, deps = [ - ":__support_cpp_string_view", - ":__support_fputil_fp_bits", ], ) libc_support_library( - name = "printf_config", - hdrs = ["src/stdio/printf_core/printf_config.h"], + name = "printf_core_structs", + hdrs = ["src/stdio/printf_core/core_structs.h"], defines = PRINTF_COPTS, deps = [ + ":__support_cpp_string_view", + ":__support_fputil_fp_bits", + ":printf_config", ], ) @@ -3081,6 +3082,7 @@ libc_support_library( ":__support_libc_assert", ":__support_uint", ":__support_uint128", + ":printf_config", ":printf_core_structs", ":printf_writer", ], From 16e74fd48988ac95551d0f64e1b36f78a82a89a2 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Tue, 27 Feb 2024 11:07:40 -0800 Subject: [PATCH 479/546] Reland 
"[TypeProf][InstrPGO] Introduce raw and instr profile format change for type profiling." (#82711) New change on top of [reviewed patch](https://github.com/llvm/llvm-project/pull/81691) are [in commits after this one](https://github.com/llvm/llvm-project/pull/82711/commits/d0757f46b3e3865b5f7c552bc0744309a363e0ac). Previous commits are restored from the remote branch with timestamps. 1. Fix build breakage for non-ELF platforms, by defining the missing functions {`__llvm_profile_begin_vtables`, `__llvm_profile_end_vtables`, `__llvm_profile_begin_vtabnames `, `__llvm_profile_end_vtabnames`} everywhere. * Tested on mac laptop (for darwins) and Windows. Specifically, functions in `InstrProfilingPlatformWindows.c` returns `NULL` to make it more explicit that type prof isn't supported; see comments for the reason. * For the rest (AIX, other), mostly follow existing examples (like this [one](https://github.com/llvm/llvm-project/commit/f95b2f1acf1171abb0d00089fd4c9238753847e3)) 2. Rename `__llvm_prf_vtabnames` -> `__llvm_prf_vns` for shorter section name, and make returned pointers [const](https://github.com/llvm/llvm-project/pull/82711/commits/a825d2a4ec00f07772a373091a702f149c3b0c34#diff-4de780ce726d76b7abc9d3353aef95013e7b21e7bda01be8940cc6574fb0b5ffR120-R121) **Original Description** * Raw profile format - Header: records the byte size of compressed vtable names, and the number of profiled vtable entries (call it `VTableProfData`). Header also records padded bytes of each section. - Payload: adds a section for compressed vtable names, and a section to store `VTableProfData`. Both sections are padded so the size is a multiple of 8. * Indexed profile format - Header: records the byte offset of compressed vtable names. - Payload: adds a section to store compressed vtable names. This section is used by `llvm-profdata` to show the list of vtables profiled for an instrumented site. 
[The originally reviewed patch](https://github.com/llvm/llvm-project/pull/66825) will have profile reader/write change and llvm-profdata change. - To ensure this PR has all the necessary profile format change along with profile version bump, created a copy of the originally reviewed patch in https://github.com/llvm/llvm-project/pull/80761. The copy doesn't have profile format change, but it has the set of tests which covers type profile generation, profile read and profile merge. Tests pass there. rfc in https://discourse.llvm.org/t/rfc-dynamic-type-profiling-and-optimizations-in-llvm/74600 --------- Co-authored-by: modiking --- compiler-rt/include/profile/InstrProfData.inc | 54 +++++++++- compiler-rt/lib/profile/InstrProfiling.h | 35 +++++-- .../lib/profile/InstrProfilingBuffer.c | 96 +++++++++++++++--- compiler-rt/lib/profile/InstrProfilingFile.c | 11 +- .../lib/profile/InstrProfilingInternal.h | 8 +- compiler-rt/lib/profile/InstrProfilingMerge.c | 23 ++++- .../lib/profile/InstrProfilingPlatformAIX.c | 14 ++- .../profile/InstrProfilingPlatformDarwin.c | 23 +++++ .../lib/profile/InstrProfilingPlatformLinux.c | 21 ++++ .../lib/profile/InstrProfilingPlatformOther.c | 15 +++ .../profile/InstrProfilingPlatformWindows.c | 19 ++++ .../lib/profile/InstrProfilingWriter.c | 37 +++++-- .../profile/instrprof-write-buffer-internal.c | 6 +- llvm/include/llvm/ProfileData/InstrProf.h | 17 +++- .../llvm/ProfileData/InstrProfData.inc | 54 +++++++++- .../llvm/ProfileData/InstrProfReader.h | 13 +++ llvm/lib/ProfileData/InstrProf.cpp | 11 +- llvm/lib/ProfileData/InstrProfReader.cpp | 44 +++++++- llvm/lib/ProfileData/InstrProfWriter.cpp | 43 ++++++-- .../InstrProfiling/coverage.ll | 8 +- .../thinlto_indirect_call_promotion.profraw | Bin 528 -> 544 bytes .../Transforms/PGOProfile/comdat_internal.ll | 4 +- .../llvm-profdata/Inputs/c-general.profraw | Bin 2016 -> 2032 bytes .../llvm-profdata/Inputs/compressed.profraw | Bin 1968 -> 1984 bytes .../thinlto_indirect_call_promotion.profraw | 
Bin 0 -> 528 bytes .../llvm-profdata/binary-ids-padding.test | 6 +- .../llvm-profdata/large-binary-id-size.test | 4 +- ...alformed-not-space-for-another-header.test | 6 +- .../malformed-num-counters-zero.test | 6 +- .../malformed-ptr-to-counter-array.test | 6 +- .../misaligned-binary-ids-size.test | 4 +- .../mismatched-raw-profile-header.test | 2 + .../tools/llvm-profdata/raw-32-bits-be.test | 11 +- .../tools/llvm-profdata/raw-32-bits-le.test | 10 +- .../tools/llvm-profdata/raw-64-bits-be.test | 10 +- .../tools/llvm-profdata/raw-64-bits-le.test | 10 +- .../tools/llvm-profdata/raw-two-profiles.test | 8 +- 37 files changed, 541 insertions(+), 98 deletions(-) create mode 100644 llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index fce407f547f3d..e9866d94b762c 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -96,6 +96,25 @@ INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \ #undef INSTR_PROF_DATA /* INSTR_PROF_DATA end. */ +/* For a virtual table object, record the name hash to associate profiled + * addresses with global variables, and record {starting address, size in bytes} + * to map the profiled virtual table (which usually have an offset from the + * starting address) back to a virtual table object. 
*/ +#ifndef INSTR_PROF_VTABLE_DATA +#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) +#else +#define INSTR_PROF_VTABLE_DATA_DEFINED +#endif +INSTR_PROF_VTABLE_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), \ + VTableNameHash, ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \ + IndexedInstrProf::ComputeHash(PGOVTableName))) +INSTR_PROF_VTABLE_DATA(const IntPtrT, llvm::PointerType::getUnqual(Ctx), \ + VTablePointer, VTableAddr) +INSTR_PROF_VTABLE_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), VTableSize, \ + ConstantInt::get(llvm::Type::getInt32Ty(Ctx), \ + VTableSizeVal)) +#undef INSTR_PROF_VTABLE_DATA +/* INSTR_PROF_VTABLE_DATA end. */ /* This is an internal data structure used by value profiler. It * is defined here to allow serialization code sharing by LLVM @@ -147,6 +166,8 @@ INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, INSTR_PROF_RAW_HEADER(uint64_t, BitmapDelta, (uintptr_t)BitmapBegin - (uintptr_t)DataBegin) INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) +INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) #undef INSTR_PROF_RAW_HEADER /* INSTR_PROF_RAW_HEADER end */ @@ -188,13 +209,26 @@ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx)) VALUE_PROF_KIND(IPVK_IndirectCallTarget, 0, "indirect call target") /* For memory intrinsic functions size profiling. */ VALUE_PROF_KIND(IPVK_MemOPSize, 1, "memory intrinsic functions size") +/* For virtual table address profiling, the address point of the virtual table + * (i.e., the address contained in objects pointing to a virtual table) are + * profiled. Note this may not be the address of the per C++ class virtual table + * object (e.g., there might be an offset). + * + * The profiled addresses are stored in raw profile, together with the following + * two types of information. + * 1. 
The (starting and ending) addresses of per C++ class virtual table objects. + * 2. The (compressed) virtual table object names. + * RawInstrProfReader converts profiled virtual table addresses to virtual table + * objects' MD5 hash. + */ +VALUE_PROF_KIND(IPVK_VTableTarget, 2, "The profiled address point of the vtable") /* These two kinds must be the last to be * declared. This is to make sure the string * array created with the template can be * indexed with the kind value. */ VALUE_PROF_KIND(IPVK_First, IPVK_IndirectCallTarget, "first") -VALUE_PROF_KIND(IPVK_Last, IPVK_MemOPSize, "last") +VALUE_PROF_KIND(IPVK_Last, IPVK_VTableTarget, "last") #undef VALUE_PROF_KIND /* VALUE_PROF_KIND end */ @@ -284,12 +318,18 @@ INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \ INSTR_PROF_SECT_ENTRY(IPSK_name, \ INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON), \ INSTR_PROF_NAME_COFF, "__DATA,") +INSTR_PROF_SECT_ENTRY(IPSK_vname, \ + INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON), \ + INSTR_PROF_VNAME_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vals, \ INSTR_PROF_QUOTE(INSTR_PROF_VALS_COMMON), \ INSTR_PROF_VALS_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vnodes, \ INSTR_PROF_QUOTE(INSTR_PROF_VNODES_COMMON), \ INSTR_PROF_VNODES_COFF, "__DATA,") +INSTR_PROF_SECT_ENTRY(IPSK_vtab, \ + INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON), \ + INSTR_PROF_VTAB_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_covmap, \ INSTR_PROF_QUOTE(INSTR_PROF_COVMAP_COMMON), \ INSTR_PROF_COVMAP_COFF, "__LLVM_COV,") @@ -668,9 +708,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129 /* Raw profile format version (start from 1). */ -#define INSTR_PROF_RAW_VERSION 9 +#define INSTR_PROF_RAW_VERSION 10 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 11 +#define INSTR_PROF_INDEX_VERSION 12 /* Coverage mapping format version (start from 0). 
*/ #define INSTR_PROF_COVMAP_VERSION 6 @@ -708,10 +748,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, than WIN32 */ #define INSTR_PROF_DATA_COMMON __llvm_prf_data #define INSTR_PROF_NAME_COMMON __llvm_prf_names +#define INSTR_PROF_VNAME_COMMON __llvm_prf_vns #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts #define INSTR_PROF_BITS_COMMON __llvm_prf_bits #define INSTR_PROF_VALS_COMMON __llvm_prf_vals #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds +#define INSTR_PROF_VTAB_COMMON __llvm_prf_vtab #define INSTR_PROF_COVMAP_COMMON __llvm_covmap #define INSTR_PROF_COVFUN_COMMON __llvm_covfun #define INSTR_PROF_COVDATA_COMMON __llvm_covdata @@ -722,10 +764,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, */ #define INSTR_PROF_DATA_COFF ".lprfd$M" #define INSTR_PROF_NAME_COFF ".lprfn$M" +#define INSTR_PROF_VNAME_COFF ".lprfvn$M" #define INSTR_PROF_CNTS_COFF ".lprfc$M" #define INSTR_PROF_BITS_COFF ".lprfb$M" #define INSTR_PROF_VALS_COFF ".lprfv$M" #define INSTR_PROF_VNODES_COFF ".lprfnd$M" +#define INSTR_PROF_VTAB_COFF ".lprfvt$M" #define INSTR_PROF_COVMAP_COFF ".lcovmap$M" #define INSTR_PROF_COVFUN_COFF ".lcovfun$M" /* Since cov data and cov names sections are not allocated, we don't need to @@ -741,6 +785,8 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_NAME_SECT_NAME INSTR_PROF_NAME_COFF #define INSTR_PROF_CNTS_SECT_NAME INSTR_PROF_CNTS_COFF #define INSTR_PROF_BITS_SECT_NAME INSTR_PROF_BITS_COFF +#define INSTR_PROF_VTAB_SECT_NAME INSTR_PROF_VTAB_COFF +#define INSTR_PROF_VNAME_SECT_NAME INSTR_PROF_VNAME_COFF /* Array of pointers. Each pointer points to a list * of value nodes associated with one value site. 
*/ @@ -758,6 +804,8 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_NAME_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON) #define INSTR_PROF_CNTS_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON) #define INSTR_PROF_BITS_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON) +#define INSTR_PROF_VTAB_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON) +#define INSTR_PROF_VNAME_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON) /* Array of pointers. Each pointer points to a list * of value nodes associated with one value site. */ diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index 705b54fddcd7b..d424a22c212c3 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -49,6 +49,12 @@ typedef struct ValueProfNode { #include "profile/InstrProfData.inc" } ValueProfNode; +typedef void *IntPtrT; +typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) VTableProfData { +#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) Type Name; +#include "profile/InstrProfData.inc" +} VTableProfData; + /*! * \brief Return 1 if profile counters are continuously synced to the raw * profile via an mmap(). 
This is in contrast to the default mode, in which @@ -103,12 +109,16 @@ const __llvm_profile_data *__llvm_profile_begin_data(void); const __llvm_profile_data *__llvm_profile_end_data(void); const char *__llvm_profile_begin_names(void); const char *__llvm_profile_end_names(void); +const char *__llvm_profile_begin_vtabnames(void); +const char *__llvm_profile_end_vtabnames(void); char *__llvm_profile_begin_counters(void); char *__llvm_profile_end_counters(void); char *__llvm_profile_begin_bitmap(void); char *__llvm_profile_end_bitmap(void); ValueProfNode *__llvm_profile_begin_vnodes(); ValueProfNode *__llvm_profile_end_vnodes(); +const VTableProfData *__llvm_profile_begin_vtables(); +const VTableProfData *__llvm_profile_end_vtables(); uint32_t *__llvm_profile_begin_orderfile(); /*! @@ -252,20 +262,31 @@ uint64_t __llvm_profile_get_num_bitmap_bytes(const char *Begin, /*! \brief Get the size of the profile name section in bytes. */ uint64_t __llvm_profile_get_name_size(const char *Begin, const char *End); -/* ! \brief Given the sizes of the data and counter information, return the - * number of padding bytes before and after the counters, and after the names, - * in the raw profile. +/*! \brief Get the number of virtual table profile data entries */ +uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin, + const VTableProfData *End); + +/*! \brief Get the size of virtual table profile data in bytes. */ +uint64_t __llvm_profile_get_vtable_section_size(const VTableProfData *Begin, + const VTableProfData *End); + +/* ! \brief Given the sizes of the data and counter information, computes the + * number of padding bytes before and after the counter section, as well as the + * number of padding bytes after other setions in the raw profile. + * Returns -1 upon errors and 0 upon success. Output parameters should be used + * iff return value is 0. * * Note: When mmap() mode is disabled, no padding bytes before/after counters * are needed. 
However, in mmap() mode, the counter section in the raw profile * must be page-aligned: this API computes the number of padding bytes * needed to achieve that. */ -void __llvm_profile_get_padding_sizes_for_counters( +int __llvm_profile_get_padding_sizes_for_counters( uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes, - uint64_t NamesSize, uint64_t *PaddingBytesBeforeCounters, - uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmap, - uint64_t *PaddingBytesAfterNames); + uint64_t NamesSize, uint64_t VTableSize, uint64_t VNameSize, + uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters, + uint64_t *PaddingBytesAfterBitmap, uint64_t *PaddingBytesAfterNames, + uint64_t *PaddingBytesAfterVTable, uint64_t *PaddingBytesAfterVNames); /*! * \brief Set the flag that profile data has been dumped to the file. diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c index af52804b2b532..7c5c26f4d113b 100644 --- a/compiler-rt/lib/profile/InstrProfilingBuffer.c +++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c @@ -51,16 +51,29 @@ uint64_t __llvm_profile_get_size_for_buffer(void) { const char *BitmapEnd = __llvm_profile_end_bitmap(); const char *NamesBegin = __llvm_profile_begin_names(); const char *NamesEnd = __llvm_profile_end_names(); + const VTableProfData *VTableBegin = __llvm_profile_begin_vtables(); + const VTableProfData *VTableEnd = __llvm_profile_end_vtables(); + const char *VNamesBegin = __llvm_profile_begin_vtabnames(); + const char *VNamesEnd = __llvm_profile_end_vtabnames(); return __llvm_profile_get_size_for_buffer_internal( DataBegin, DataEnd, CountersBegin, CountersEnd, BitmapBegin, BitmapEnd, - NamesBegin, NamesEnd); + NamesBegin, NamesEnd, VTableBegin, VTableEnd, VNamesBegin, VNamesEnd); } COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_num_data(const __llvm_profile_data *Begin, const __llvm_profile_data *End) { intptr_t BeginI = (intptr_t)Begin, EndI 
= (intptr_t)End; + // `sizeof(__llvm_profile_data) - 1` is required in the numerator when + // [Begin, End] represents an inclusive range. + // For ELF, [Begin, End) represents the address of linker-inserted + // symbols `__start__` and `__stop_`. + // Thereby, `End` is one byte past the inclusive range, and + // `sizeof(__llvm_profile_data) - 1` is not necessary in the numerator to get + // the correct number of profile data. + // FIXME: Consider removing `sizeof(__llvm_profile_data) - 1` if this is true + // across platforms. return ((EndI + sizeof(__llvm_profile_data) - 1) - BeginI) / sizeof(__llvm_profile_data); } @@ -71,6 +84,26 @@ uint64_t __llvm_profile_get_data_size(const __llvm_profile_data *Begin, return __llvm_profile_get_num_data(Begin, End) * sizeof(__llvm_profile_data); } +// Counts the number of `VTableProfData` elements within the range of [Begin, +// End). Caller should guarantee that End points to one byte past the inclusive +// range. +// FIXME: Add a compiler-rt test to make sure the number of vtables in the +// raw profile is the same as the number of vtable elements in the instrumented +// binary. +COMPILER_RT_VISIBILITY +uint64_t __llvm_profile_get_num_vtable(const VTableProfData *Begin, + const VTableProfData *End) { + // Convert pointers to intptr_t to use integer arithmetic. 
+ intptr_t EndI = (intptr_t)End, BeginI = (intptr_t)Begin; + return (EndI - BeginI) / sizeof(VTableProfData); +} + +COMPILER_RT_VISIBILITY +uint64_t __llvm_profile_get_vtable_section_size(const VTableProfData *Begin, + const VTableProfData *End) { + return (intptr_t)(End) - (intptr_t)(Begin); +} + COMPILER_RT_VISIBILITY size_t __llvm_profile_counter_entry_size(void) { if (__llvm_profile_get_version() & VARIANT_MASK_BYTE_COVERAGE) return sizeof(uint8_t); @@ -119,11 +152,13 @@ static int needsCounterPadding(void) { } COMPILER_RT_VISIBILITY -void __llvm_profile_get_padding_sizes_for_counters( +int __llvm_profile_get_padding_sizes_for_counters( uint64_t DataSize, uint64_t CountersSize, uint64_t NumBitmapBytes, - uint64_t NamesSize, uint64_t *PaddingBytesBeforeCounters, - uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterBitmapBytes, - uint64_t *PaddingBytesAfterNames) { + uint64_t NamesSize, uint64_t VTableSize, uint64_t VNameSize, + uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters, + uint64_t *PaddingBytesAfterBitmapBytes, uint64_t *PaddingBytesAfterNames, + uint64_t *PaddingBytesAfterVTable, uint64_t *PaddingBytesAfterVName) { + // Counter padding is needed only if continuous mode is enabled. if (!needsCounterPadding()) { *PaddingBytesBeforeCounters = 0; *PaddingBytesAfterCounters = @@ -131,9 +166,19 @@ void __llvm_profile_get_padding_sizes_for_counters( *PaddingBytesAfterBitmapBytes = __llvm_profile_get_num_padding_bytes(NumBitmapBytes); *PaddingBytesAfterNames = __llvm_profile_get_num_padding_bytes(NamesSize); - return; + if (PaddingBytesAfterVTable != NULL) + *PaddingBytesAfterVTable = + __llvm_profile_get_num_padding_bytes(VTableSize); + if (PaddingBytesAfterVName != NULL) + *PaddingBytesAfterVName = __llvm_profile_get_num_padding_bytes(VNameSize); + return 0; } + // Value profiling not supported in continuous mode at profile-write time. + // Return -1 to alert the incompatibility. 
+ if (VTableSize != 0 || VNameSize != 0) + return -1; + // In continuous mode, the file offsets for headers and for the start of // counter sections need to be page-aligned. *PaddingBytesBeforeCounters = @@ -142,13 +187,22 @@ void __llvm_profile_get_padding_sizes_for_counters( *PaddingBytesAfterBitmapBytes = calculateBytesNeededToPageAlign(NumBitmapBytes); *PaddingBytesAfterNames = calculateBytesNeededToPageAlign(NamesSize); + // Set these two variables to zero to avoid uninitialized variables + // even if VTableSize and VNameSize are known to be zero. + if (PaddingBytesAfterVTable != NULL) + *PaddingBytesAfterVTable = 0; + if (PaddingBytesAfterVName != NULL) + *PaddingBytesAfterVName = 0; + return 0; } COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_size_for_buffer_internal( const __llvm_profile_data *DataBegin, const __llvm_profile_data *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, - const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd) { + const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd, + const VTableProfData *VTableBegin, const VTableProfData *VTableEnd, + const char *VNamesBegin, const char *VNamesEnd) { /* Match logic in __llvm_profile_write_buffer(). */ const uint64_t NamesSize = (NamesEnd - NamesBegin) * sizeof(char); uint64_t DataSize = __llvm_profile_get_data_size(DataBegin, DataEnd); @@ -156,20 +210,29 @@ uint64_t __llvm_profile_get_size_for_buffer_internal( __llvm_profile_get_counters_size(CountersBegin, CountersEnd); const uint64_t NumBitmapBytes = __llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd); + const uint64_t VTableSize = + __llvm_profile_get_vtable_section_size(VTableBegin, VTableEnd); + const uint64_t VNameSize = + __llvm_profile_get_name_size(VNamesBegin, VNamesEnd); /* Determine how much padding is needed before/after the counters and after * the names. 
*/ uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters, - PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes; + PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes, + PaddingBytesAfterVTable, PaddingBytesAfterVNames; __llvm_profile_get_padding_sizes_for_counters( - DataSize, CountersSize, NumBitmapBytes, NamesSize, - &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, - &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames); + DataSize, CountersSize, NumBitmapBytes, NamesSize, 0 /* VTableSize */, + 0 /* VNameSize */, &PaddingBytesBeforeCounters, + &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes, + &PaddingBytesAfterNames, &PaddingBytesAfterVTable, + &PaddingBytesAfterVNames); return sizeof(__llvm_profile_header) + __llvm_write_binary_ids(NULL) + DataSize + PaddingBytesBeforeCounters + CountersSize + PaddingBytesAfterCounters + NumBitmapBytes + - PaddingBytesAfterBitmapBytes + NamesSize + PaddingBytesAfterNames; + PaddingBytesAfterBitmapBytes + NamesSize + PaddingBytesAfterNames + + VTableSize + PaddingBytesAfterVTable + VNameSize + + PaddingBytesAfterVNames; } COMPILER_RT_VISIBILITY @@ -191,7 +254,10 @@ COMPILER_RT_VISIBILITY int __llvm_profile_write_buffer_internal( const char *NamesBegin, const char *NamesEnd) { ProfDataWriter BufferWriter; initBufferWriter(&BufferWriter, Buffer); - return lprofWriteDataImpl(&BufferWriter, DataBegin, DataEnd, CountersBegin, - CountersEnd, BitmapBegin, BitmapEnd, 0, NamesBegin, - NamesEnd, 0); + // Set virtual table arguments to NULL since they are not supported yet. 
+ return lprofWriteDataImpl( + &BufferWriter, DataBegin, DataEnd, CountersBegin, CountersEnd, + BitmapBegin, BitmapEnd, /*VPDataReader=*/0, NamesBegin, NamesEnd, + /*VTableBegin=*/NULL, /*VTableEnd=*/NULL, /*VNamesBegin=*/NULL, + /*VNamesEnd=*/NULL, /*SkipNameDataWrite=*/0); } diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index f3b457d786e6b..e4d99ef4872bd 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -137,15 +137,18 @@ static int mmapForContinuousMode(uint64_t CurrentFileOffset, FILE *File) { DataBegin, PageSize); return 1; } + int Fileno = fileno(File); /* Determine how much padding is needed before/after the counters and * after the names. */ uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters, - PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes; + PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes, + PaddingBytesAfterVTable, PaddingBytesAfterVNames; __llvm_profile_get_padding_sizes_for_counters( - DataSize, CountersSize, NumBitmapBytes, NamesSize, - &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, - &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames); + DataSize, CountersSize, NumBitmapBytes, NamesSize, /*VTableSize=*/0, + /*VNameSize=*/0, &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, + &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames, + &PaddingBytesAfterVTable, &PaddingBytesAfterVNames); uint64_t PageAlignedCountersLength = CountersSize + PaddingBytesAfterCounters; uint64_t FileOffsetToCounters = CurrentFileOffset + diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.h b/compiler-rt/lib/profile/InstrProfilingInternal.h index 03ed67fcfa766..d5bd0e41fb129 100644 --- a/compiler-rt/lib/profile/InstrProfilingInternal.h +++ b/compiler-rt/lib/profile/InstrProfilingInternal.h @@ -22,7 +22,9 @@ uint64_t __llvm_profile_get_size_for_buffer_internal( const __llvm_profile_data *DataBegin, const 
__llvm_profile_data *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, - const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd); + const char *BitmapEnd, const char *NamesBegin, const char *NamesEnd, + const VTableProfData *VTableBegin, const VTableProfData *VTableEnd, + const char *VNamesBegin, const char *VNamesEnd); /*! * \brief Write instrumentation data to the given buffer, given explicit @@ -156,7 +158,9 @@ int lprofWriteDataImpl(ProfDataWriter *Writer, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, VPDataReaderType *VPDataReader, const char *NamesBegin, - const char *NamesEnd, int SkipNameDataWrite); + const char *NamesEnd, const VTableProfData *VTableBegin, + const VTableProfData *VTableEnd, const char *VNamesBegin, + const char *VNamesEnd, int SkipNameDataWrite); /* Merge value profile data pointed to by SrcValueProfData into * in-memory profile counters pointed by to DstData. */ diff --git a/compiler-rt/lib/profile/InstrProfilingMerge.c b/compiler-rt/lib/profile/InstrProfilingMerge.c index b5850e99ee37d..c0706b73e1668 100644 --- a/compiler-rt/lib/profile/InstrProfilingMerge.c +++ b/compiler-rt/lib/profile/InstrProfilingMerge.c @@ -107,6 +107,26 @@ static uintptr_t signextIfWin64(void *V) { #endif } +// Skip names section, vtable profile data section and vtable names section +// for runtime profile merge. To merge runtime addresses from multiple +// profiles collected from the same instrumented binary, the binary should be +// loaded at fixed base address (e.g., build with -no-pie, or run with ASLR +// disabled). In this set-up these three sections remain unchanged. 
+static uint64_t +getDistanceFromCounterToValueProf(const __llvm_profile_header *const Header) { + const uint64_t VTableSectionSize = + Header->NumVTables * sizeof(VTableProfData); + const uint64_t PaddingBytesAfterVTableSection = + __llvm_profile_get_num_padding_bytes(VTableSectionSize); + const uint64_t VNamesSize = Header->VNamesSize; + const uint64_t PaddingBytesAfterVNamesSize = + __llvm_profile_get_num_padding_bytes(VNamesSize); + return Header->NamesSize + + __llvm_profile_get_num_padding_bytes(Header->NamesSize) + + VTableSectionSize + PaddingBytesAfterVTableSection + VNamesSize + + PaddingBytesAfterVNamesSize; +} + COMPILER_RT_VISIBILITY int __llvm_profile_merge_from_buffer(const char *ProfileData, uint64_t ProfileSize) { @@ -137,8 +157,7 @@ int __llvm_profile_merge_from_buffer(const char *ProfileData, SrcBitmapStart = SrcCountersEnd; SrcNameStart = SrcBitmapStart + Header->NumBitmapBytes; SrcValueProfDataStart = - SrcNameStart + Header->NamesSize + - __llvm_profile_get_num_padding_bytes(Header->NamesSize); + SrcNameStart + getDistanceFromCounterToValueProf(Header); if (SrcNameStart < SrcCountersStart || SrcNameStart < SrcBitmapStart) return 1; diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c b/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c index 002bec164d7e8..b9d51b698b414 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c @@ -175,7 +175,8 @@ void __llvm_profile_register_names_function(void *NamesStart, uint64_t NamesSize) {} // The __start_SECNAME and __stop_SECNAME symbols (for SECNAME \in -// {"__llvm_prf_cnts", "__llvm_prf_data", "__llvm_prf_name", "__llvm_prf_vnds"}) +// {"__llvm_prf_cnts", "__llvm_prf_data", "__llvm_prf_name", "__llvm_prf_vnds", +// "__llvm_prf_vns", "__llvm_prf_vtab"}) // are always live when linking on AIX, regardless if the .o's being linked // reference symbols from the profile library (for example when no files were // compiled with 
-fprofile-generate). That's because these symbols are kept @@ -197,6 +198,10 @@ static int dummy_vnds[0] COMPILER_RT_SECTION( COMPILER_RT_SEG INSTR_PROF_VNODES_SECT_NAME); static int dummy_orderfile[0] COMPILER_RT_SECTION( COMPILER_RT_SEG INSTR_PROF_ORDERFILE_SECT_NAME); +static int dummy_vname[0] COMPILER_RT_SECTION( + COMPILER_RT_SEG INSTR_PROF_VNAME_SECT_NAME); +static int dummy_vtab[0] COMPILER_RT_SECTION( + COMPILER_RT_SEG INSTR_PROF_VTAB_SECT_NAME); // To avoid GC'ing of the dummy variables by the linker, reference them in an // array and reference the array in the runtime registration code @@ -206,9 +211,10 @@ static int dummy_orderfile[0] COMPILER_RT_SECTION( #pragma GCC diagnostic ignored "-Wcast-qual" #endif COMPILER_RT_VISIBILITY -void *__llvm_profile_keep[] = {(void *)&dummy_cnts, (void *)&dummy_bits, - (void *)&dummy_data, (void *)&dummy_name, - (void *)&dummy_vnds, (void *)&dummy_orderfile}; +void *__llvm_profile_keep[] = {(void *)&dummy_cnts, (void *)&dummy_bits, + (void *)&dummy_data, (void *)&dummy_name, + (void *)&dummy_vnds, (void *)&dummy_orderfile, + (void *)&dummy_vname, (void *)&dummy_vtab}; #ifdef __GNUC__ #pragma GCC diagnostic pop #endif diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c b/compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c index 2154d242a8174..6adc7f328cbf7 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformDarwin.c @@ -36,6 +36,17 @@ extern char COMPILER_RT_VISIBILITY extern char BitmapEnd __asm("section$end$__DATA$" INSTR_PROF_BITS_SECT_NAME); COMPILER_RT_VISIBILITY +extern VTableProfData + VTableProfStart __asm("section$start$__DATA$" INSTR_PROF_VTAB_SECT_NAME); +COMPILER_RT_VISIBILITY +extern VTableProfData + VTableProfEnd __asm("section$end$__DATA$" INSTR_PROF_VTAB_SECT_NAME); +COMPILER_RT_VISIBILITY +extern char + VNameStart __asm("section$start$__DATA$" INSTR_PROF_VNAME_SECT_NAME); +COMPILER_RT_VISIBILITY +extern char VNameEnd 
__asm("section$end$__DATA$" INSTR_PROF_VNAME_SECT_NAME); +COMPILER_RT_VISIBILITY extern uint32_t OrderFileStart __asm("section$start$__DATA$" INSTR_PROF_ORDERFILE_SECT_NAME); @@ -65,6 +76,18 @@ char *__llvm_profile_begin_bitmap(void) { return &BitmapStart; } COMPILER_RT_VISIBILITY char *__llvm_profile_end_bitmap(void) { return &BitmapEnd; } COMPILER_RT_VISIBILITY +const VTableProfData *__llvm_profile_begin_vtables(void) { + return &VTableProfStart; +} +COMPILER_RT_VISIBILITY +const VTableProfData *__llvm_profile_end_vtables(void) { + return &VTableProfEnd; +} +COMPILER_RT_VISIBILITY +const char *__llvm_profile_begin_vtabnames(void) { return &VNameStart; } +COMPILER_RT_VISIBILITY +const char *__llvm_profile_end_vtabnames(void) { return &VNameEnd; } +COMPILER_RT_VISIBILITY uint32_t *__llvm_profile_begin_orderfile(void) { return &OrderFileStart; } COMPILER_RT_VISIBILITY diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c index 19266ab6c6fb8..b766436497b74 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c @@ -24,8 +24,12 @@ #define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON) #define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON) #define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON) +#define PROF_VNAME_START INSTR_PROF_SECT_START(INSTR_PROF_VNAME_COMMON) +#define PROF_VNAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VNAME_COMMON) #define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_COMMON) #define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON) +#define PROF_VTABLE_START INSTR_PROF_SECT_START(INSTR_PROF_VTAB_COMMON) +#define PROF_VTABLE_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VTAB_COMMON) #define PROF_BITS_START INSTR_PROF_SECT_START(INSTR_PROF_BITS_COMMON) #define PROF_BITS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_BITS_COMMON) #define PROF_ORDERFILE_START 
INSTR_PROF_SECT_START(INSTR_PROF_ORDERFILE_COMMON) @@ -41,6 +45,10 @@ extern __llvm_profile_data PROF_DATA_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_CNTS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_CNTS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern VTableProfData PROF_VTABLE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern VTableProfData PROF_VTABLE_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern char PROF_VNAME_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern char PROF_VNAME_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_BITS_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_BITS_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern uint32_t PROF_ORDERFILE_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; @@ -63,6 +71,19 @@ COMPILER_RT_VISIBILITY const char *__llvm_profile_begin_names(void) { COMPILER_RT_VISIBILITY const char *__llvm_profile_end_names(void) { return &PROF_NAME_STOP; } +COMPILER_RT_VISIBILITY const char *__llvm_profile_begin_vtabnames(void) { + return &PROF_VNAME_START; +} +COMPILER_RT_VISIBILITY const char *__llvm_profile_end_vtabnames(void) { + return &PROF_VNAME_STOP; +} +COMPILER_RT_VISIBILITY const VTableProfData * +__llvm_profile_begin_vtables(void) { + return &PROF_VTABLE_START; +} +COMPILER_RT_VISIBILITY const VTableProfData *__llvm_profile_end_vtables(void) { + return &PROF_VTABLE_STOP; +} COMPILER_RT_VISIBILITY char *__llvm_profile_begin_counters(void) { return &PROF_CNTS_START; } diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformOther.c b/compiler-rt/lib/profile/InstrProfilingPlatformOther.c index 5319ca813b43f..aa79a5641ceca 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformOther.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformOther.c @@ -18,8 +18,12 @@ static const __llvm_profile_data *DataFirst = NULL; static const __llvm_profile_data *DataLast = NULL; +static const VTableProfData *VTableProfDataFirst = NULL; +static 
const VTableProfData *VTableProfDataLast = NULL; static const char *NamesFirst = NULL; static const char *NamesLast = NULL; +static const char *VNamesFirst = NULL; +static const char *VNamesLast = NULL; static char *CountersFirst = NULL; static char *CountersLast = NULL; static uint32_t *OrderFileFirst = NULL; @@ -80,11 +84,22 @@ COMPILER_RT_VISIBILITY const __llvm_profile_data *__llvm_profile_begin_data(void) { return DataFirst; } COMPILER_RT_VISIBILITY const __llvm_profile_data *__llvm_profile_end_data(void) { return DataLast; } +COMPILER_RT_VISIBILITY const VTableProfData * +__llvm_profile_begin_vtables(void) { + return VTableProfDataFirst; +} +COMPILER_RT_VISIBILITY const VTableProfData *__llvm_profile_end_vtables(void) { + return VTableProfDataLast; +} COMPILER_RT_VISIBILITY const char *__llvm_profile_begin_names(void) { return NamesFirst; } COMPILER_RT_VISIBILITY const char *__llvm_profile_end_names(void) { return NamesLast; } COMPILER_RT_VISIBILITY +const char *__llvm_profile_begin_vtabnames(void) { return VNamesFirst; } +COMPILER_RT_VISIBILITY +const char *__llvm_profile_end_vtabnames(void) { return VNamesLast; } +COMPILER_RT_VISIBILITY char *__llvm_profile_begin_counters(void) { return CountersFirst; } COMPILER_RT_VISIBILITY char *__llvm_profile_end_counters(void) { return CountersLast; } diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c b/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c index 0751b28f81d0a..b9642ca7f6810 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformWindows.c @@ -6,6 +6,8 @@ |* \*===----------------------------------------------------------------------===*/ +#include + #include "InstrProfiling.h" #include "InstrProfilingInternal.h" @@ -59,9 +61,26 @@ const __llvm_profile_data *__llvm_profile_begin_data(void) { } const __llvm_profile_data *__llvm_profile_end_data(void) { return &DataEnd; } +// Type profiling isn't implemented under MSVC 
ABI, so return NULL (rather than +// implementing linker magic on Windows) to make it more explicit. To elaborate, +// the current type profiling implementation maps a profiled vtable address to a +// vtable variable through vtables mangled name. Under MSVC ABI, the variable +// name for vtables might not be the mangled name (see +// MicrosoftCXXABI::getAddrOfVTable in MicrosoftCXXABI.cpp for more details on +// how a vtable name is computed). Note the mangled name is still in the vtable +// IR (just not variable name) for mapping purpose, but more implementation work +// is required. +const VTableProfData *__llvm_profile_begin_vtables(void) { return NULL; } +const VTableProfData *__llvm_profile_end_vtables(void) { return NULL; } + const char *__llvm_profile_begin_names(void) { return &NamesStart + 1; } const char *__llvm_profile_end_names(void) { return &NamesEnd; } +// Type profiling isn't supported on Windows, so return NULL to make it more +// explicit. +const char *__llvm_profile_begin_vtabnames(void) { return NULL; } +const char *__llvm_profile_end_vtabnames(void) { return NULL; } + char *__llvm_profile_begin_counters(void) { return &CountersStart + 1; } char *__llvm_profile_end_counters(void) { return &CountersEnd; } char *__llvm_profile_begin_bitmap(void) { return &BitmapStart + 1; } diff --git a/compiler-rt/lib/profile/InstrProfilingWriter.c b/compiler-rt/lib/profile/InstrProfilingWriter.c index 4d767d1385148..8816a71155511 100644 --- a/compiler-rt/lib/profile/InstrProfilingWriter.c +++ b/compiler-rt/lib/profile/InstrProfilingWriter.c @@ -250,9 +250,14 @@ COMPILER_RT_VISIBILITY int lprofWriteData(ProfDataWriter *Writer, const char *BitmapEnd = __llvm_profile_end_bitmap(); const char *NamesBegin = __llvm_profile_begin_names(); const char *NamesEnd = __llvm_profile_end_names(); + const VTableProfData *VTableBegin = __llvm_profile_begin_vtables(); + const VTableProfData *VTableEnd = __llvm_profile_end_vtables(); + const char *VNamesBegin = 
__llvm_profile_begin_vtabnames(); + const char *VNamesEnd = __llvm_profile_end_vtabnames(); return lprofWriteDataImpl(Writer, DataBegin, DataEnd, CountersBegin, CountersEnd, BitmapBegin, BitmapEnd, VPDataReader, - NamesBegin, NamesEnd, SkipNameDataWrite); + NamesBegin, NamesEnd, VTableBegin, VTableEnd, + VNamesBegin, VNamesEnd, SkipNameDataWrite); } COMPILER_RT_VISIBILITY int @@ -261,7 +266,9 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, VPDataReaderType *VPDataReader, const char *NamesBegin, - const char *NamesEnd, int SkipNameDataWrite) { + const char *NamesEnd, const VTableProfData *VTableBegin, + const VTableProfData *VTableEnd, const char *VNamesBegin, + const char *VNamesEnd, int SkipNameDataWrite) { /* Calculate size of sections. */ const uint64_t DataSectionSize = __llvm_profile_get_data_size(DataBegin, DataEnd); @@ -273,6 +280,12 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, const uint64_t NumBitmapBytes = __llvm_profile_get_num_bitmap_bytes(BitmapBegin, BitmapEnd); const uint64_t NamesSize = __llvm_profile_get_name_size(NamesBegin, NamesEnd); + const uint64_t NumVTables = + __llvm_profile_get_num_vtable(VTableBegin, VTableEnd); + const uint64_t VTableSectionSize = + __llvm_profile_get_vtable_section_size(VTableBegin, VTableEnd); + const uint64_t VNamesSize = + __llvm_profile_get_name_size(VNamesBegin, VNamesEnd); /* Create the header. */ __llvm_profile_header Header; @@ -280,11 +293,15 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, /* Determine how much padding is needed before/after the counters and after * the names. 
*/ uint64_t PaddingBytesBeforeCounters, PaddingBytesAfterCounters, - PaddingBytesAfterNames, PaddingBytesAfterBitmapBytes; - __llvm_profile_get_padding_sizes_for_counters( - DataSectionSize, CountersSectionSize, NumBitmapBytes, NamesSize, - &PaddingBytesBeforeCounters, &PaddingBytesAfterCounters, - &PaddingBytesAfterBitmapBytes, &PaddingBytesAfterNames); + PaddingBytesAfterBitmapBytes, PaddingBytesAfterNames, + PaddingBytesAfterVTable, PaddingBytesAfterVNames; + if (__llvm_profile_get_padding_sizes_for_counters( + DataSectionSize, CountersSectionSize, NumBitmapBytes, NamesSize, + VTableSectionSize, VNamesSize, &PaddingBytesBeforeCounters, + &PaddingBytesAfterCounters, &PaddingBytesAfterBitmapBytes, + &PaddingBytesAfterNames, &PaddingBytesAfterVTable, + &PaddingBytesAfterVNames) == -1) + return -1; { /* Initialize header structure. */ @@ -323,7 +340,11 @@ lprofWriteDataImpl(ProfDataWriter *Writer, const __llvm_profile_data *DataBegin, {BitmapBegin, sizeof(uint8_t), NumBitmapBytes, 0}, {NULL, sizeof(uint8_t), PaddingBytesAfterBitmapBytes, 1}, {SkipNameDataWrite ? NULL : NamesBegin, sizeof(uint8_t), NamesSize, 0}, - {NULL, sizeof(uint8_t), PaddingBytesAfterNames, 1}}; + {NULL, sizeof(uint8_t), PaddingBytesAfterNames, 1}, + {VTableBegin, sizeof(uint8_t), VTableSectionSize, 0}, + {NULL, sizeof(uint8_t), PaddingBytesAfterVTable, 1}, + {SkipNameDataWrite ? 
NULL : VNamesBegin, sizeof(uint8_t), VNamesSize, 0}, + {NULL, sizeof(uint8_t), PaddingBytesAfterVNames, 1}}; if (Writer->Write(Writer, IOVecData, sizeof(IOVecData) / sizeof(*IOVecData))) return -1; diff --git a/compiler-rt/test/profile/instrprof-write-buffer-internal.c b/compiler-rt/test/profile/instrprof-write-buffer-internal.c index d9670f739ca98..2c1c29ac0c588 100644 --- a/compiler-rt/test/profile/instrprof-write-buffer-internal.c +++ b/compiler-rt/test/profile/instrprof-write-buffer-internal.c @@ -31,7 +31,8 @@ char *__llvm_profile_end_bitmap(void); uint64_t __llvm_profile_get_size_for_buffer_internal( const void *DataBegin, const void *DataEnd, const char *CountersBegin, const char *CountersEnd, const char *BitmapBegin, const char *BitmapEnd, - const char *NamesBegin, const char *NamesEnd); + const char *NamesBegin, const char *NamesEnd, const void *VTableBegin, + const void *VTableEnd, const char *VNamesBegin, const char *VNamesEnd); int __llvm_profile_write_buffer_internal( char *Buffer, const void *DataBegin, const void *DataEnd, @@ -45,7 +46,8 @@ int main(int argc, const char *argv[]) { __llvm_profile_begin_data(), __llvm_profile_end_data(), __llvm_profile_begin_counters(), __llvm_profile_end_counters(), __llvm_profile_begin_bitmap(), __llvm_profile_end_bitmap(), - __llvm_profile_begin_names(), __llvm_profile_end_names()); + __llvm_profile_begin_names(), __llvm_profile_end_names(), NULL, NULL, + NULL, NULL); char *buf = malloc(bufsize); int ret = __llvm_profile_write_buffer_internal( diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h index a928ba6961f36..25ec06a739202 100644 --- a/llvm/include/llvm/ProfileData/InstrProf.h +++ b/llvm/include/llvm/ProfileData/InstrProf.h @@ -831,6 +831,7 @@ struct InstrProfRecord { struct ValueProfData { std::vector IndirectCallSites; std::vector MemOPSizes; + std::vector VTableTargets; }; std::unique_ptr ValueData; @@ -853,6 +854,8 @@ struct InstrProfRecord { return 
ValueData->IndirectCallSites; case IPVK_MemOPSize: return ValueData->MemOPSizes; + case IPVK_VTableTarget: + return ValueData->VTableTargets; default: llvm_unreachable("Unknown value kind!"); } @@ -1036,7 +1039,9 @@ enum ProfVersion { Version10 = 10, // An additional field is used for bitmap bytes. Version11 = 11, - // The current version is 11. + // VTable profiling, + Version12 = 12, + // The current version is 12. CurrentVersion = INSTR_PROF_INDEX_VERSION }; const uint64_t Version = ProfVersion::CurrentVersion; @@ -1057,6 +1062,7 @@ struct Header { uint64_t MemProfOffset; uint64_t BinaryIdOffset; uint64_t TemporalProfTracesOffset; + uint64_t VTableNamesOffset; // New fields should only be added at the end to ensure that the size // computation is correct. The methods below need to be updated to ensure that // the new field is read correctly. @@ -1193,8 +1199,13 @@ template <> inline uint64_t getMagic() { // It should also match the synthesized type in // Transforms/Instrumentation/InstrProfiling.cpp:getOrCreateRegionCounters. template struct alignas(8) ProfileData { - #define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Type Name; - #include "llvm/ProfileData/InstrProfData.inc" +#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Type Name; +#include "llvm/ProfileData/InstrProfData.inc" +}; + +template struct alignas(8) VTableProfileData { +#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Init) Type Name; +#include "llvm/ProfileData/InstrProfData.inc" }; // File header structure of the LLVM profile data in raw format. diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index fce407f547f3d..e9866d94b762c 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -96,6 +96,25 @@ INSTR_PROF_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), NumBitmapBytes, \ #undef INSTR_PROF_DATA /* INSTR_PROF_DATA end. 
*/ +/* For a virtual table object, record the name hash to associate profiled + * addresses with global variables, and record {starting address, size in bytes} + * to map the profiled virtual table (which usually have an offset from the + * starting address) back to a virtual table object. */ +#ifndef INSTR_PROF_VTABLE_DATA +#define INSTR_PROF_VTABLE_DATA(Type, LLVMType, Name, Initializer) +#else +#define INSTR_PROF_VTABLE_DATA_DEFINED +#endif +INSTR_PROF_VTABLE_DATA(const uint64_t, llvm::Type::getInt64Ty(Ctx), \ + VTableNameHash, ConstantInt::get(llvm::Type::getInt64Ty(Ctx), \ + IndexedInstrProf::ComputeHash(PGOVTableName))) +INSTR_PROF_VTABLE_DATA(const IntPtrT, llvm::PointerType::getUnqual(Ctx), \ + VTablePointer, VTableAddr) +INSTR_PROF_VTABLE_DATA(const uint32_t, llvm::Type::getInt32Ty(Ctx), VTableSize, \ + ConstantInt::get(llvm::Type::getInt32Ty(Ctx), \ + VTableSizeVal)) +#undef INSTR_PROF_VTABLE_DATA +/* INSTR_PROF_VTABLE_DATA end. */ /* This is an internal data structure used by value profiler. It * is defined here to allow serialization code sharing by LLVM @@ -147,6 +166,8 @@ INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, INSTR_PROF_RAW_HEADER(uint64_t, BitmapDelta, (uintptr_t)BitmapBegin - (uintptr_t)DataBegin) INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) +INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) #undef INSTR_PROF_RAW_HEADER /* INSTR_PROF_RAW_HEADER end */ @@ -188,13 +209,26 @@ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx)) VALUE_PROF_KIND(IPVK_IndirectCallTarget, 0, "indirect call target") /* For memory intrinsic functions size profiling. */ VALUE_PROF_KIND(IPVK_MemOPSize, 1, "memory intrinsic functions size") +/* For virtual table address profiling, the address point of the virtual table + * (i.e., the address contained in objects pointing to a virtual table) are + * profiled. 
Note this may not be the address of the per C++ class virtual table + * object (e.g., there might be an offset). + * + * The profiled addresses are stored in raw profile, together with the following + * two types of information. + * 1. The (starting and ending) addresses of per C++ class virtual table objects. + * 2. The (compressed) virtual table object names. + * RawInstrProfReader converts profiled virtual table addresses to virtual table + * objects' MD5 hash. + */ +VALUE_PROF_KIND(IPVK_VTableTarget, 2, "The profiled address point of the vtable") /* These two kinds must be the last to be * declared. This is to make sure the string * array created with the template can be * indexed with the kind value. */ VALUE_PROF_KIND(IPVK_First, IPVK_IndirectCallTarget, "first") -VALUE_PROF_KIND(IPVK_Last, IPVK_MemOPSize, "last") +VALUE_PROF_KIND(IPVK_Last, IPVK_VTableTarget, "last") #undef VALUE_PROF_KIND /* VALUE_PROF_KIND end */ @@ -284,12 +318,18 @@ INSTR_PROF_SECT_ENTRY(IPSK_bitmap, \ INSTR_PROF_SECT_ENTRY(IPSK_name, \ INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON), \ INSTR_PROF_NAME_COFF, "__DATA,") +INSTR_PROF_SECT_ENTRY(IPSK_vname, \ + INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON), \ + INSTR_PROF_VNAME_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vals, \ INSTR_PROF_QUOTE(INSTR_PROF_VALS_COMMON), \ INSTR_PROF_VALS_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_vnodes, \ INSTR_PROF_QUOTE(INSTR_PROF_VNODES_COMMON), \ INSTR_PROF_VNODES_COFF, "__DATA,") +INSTR_PROF_SECT_ENTRY(IPSK_vtab, \ + INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON), \ + INSTR_PROF_VTAB_COFF, "__DATA,") INSTR_PROF_SECT_ENTRY(IPSK_covmap, \ INSTR_PROF_QUOTE(INSTR_PROF_COVMAP_COMMON), \ INSTR_PROF_COVMAP_COFF, "__LLVM_COV,") @@ -668,9 +708,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, (uint64_t)'f' << 16 | (uint64_t)'R' << 8 | (uint64_t)129 /* Raw profile format version (start from 1). 
*/ -#define INSTR_PROF_RAW_VERSION 9 +#define INSTR_PROF_RAW_VERSION 10 /* Indexed profile format version (start from 1). */ -#define INSTR_PROF_INDEX_VERSION 11 +#define INSTR_PROF_INDEX_VERSION 12 /* Coverage mapping format version (start from 0). */ #define INSTR_PROF_COVMAP_VERSION 6 @@ -708,10 +748,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, than WIN32 */ #define INSTR_PROF_DATA_COMMON __llvm_prf_data #define INSTR_PROF_NAME_COMMON __llvm_prf_names +#define INSTR_PROF_VNAME_COMMON __llvm_prf_vns #define INSTR_PROF_CNTS_COMMON __llvm_prf_cnts #define INSTR_PROF_BITS_COMMON __llvm_prf_bits #define INSTR_PROF_VALS_COMMON __llvm_prf_vals #define INSTR_PROF_VNODES_COMMON __llvm_prf_vnds +#define INSTR_PROF_VTAB_COMMON __llvm_prf_vtab #define INSTR_PROF_COVMAP_COMMON __llvm_covmap #define INSTR_PROF_COVFUN_COMMON __llvm_covfun #define INSTR_PROF_COVDATA_COMMON __llvm_covdata @@ -722,10 +764,12 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, */ #define INSTR_PROF_DATA_COFF ".lprfd$M" #define INSTR_PROF_NAME_COFF ".lprfn$M" +#define INSTR_PROF_VNAME_COFF ".lprfvn$M" #define INSTR_PROF_CNTS_COFF ".lprfc$M" #define INSTR_PROF_BITS_COFF ".lprfb$M" #define INSTR_PROF_VALS_COFF ".lprfv$M" #define INSTR_PROF_VNODES_COFF ".lprfnd$M" +#define INSTR_PROF_VTAB_COFF ".lprfvt$M" #define INSTR_PROF_COVMAP_COFF ".lcovmap$M" #define INSTR_PROF_COVFUN_COFF ".lcovfun$M" /* Since cov data and cov names sections are not allocated, we don't need to @@ -741,6 +785,8 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_NAME_SECT_NAME INSTR_PROF_NAME_COFF #define INSTR_PROF_CNTS_SECT_NAME INSTR_PROF_CNTS_COFF #define INSTR_PROF_BITS_SECT_NAME INSTR_PROF_BITS_COFF +#define INSTR_PROF_VTAB_SECT_NAME INSTR_PROF_VTAB_COFF +#define INSTR_PROF_VNAME_SECT_NAME INSTR_PROF_VNAME_COFF /* Array of pointers. Each pointer points to a list * of value nodes associated with one value site. 
*/ @@ -758,6 +804,8 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_NAME_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_NAME_COMMON) #define INSTR_PROF_CNTS_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_CNTS_COMMON) #define INSTR_PROF_BITS_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_BITS_COMMON) +#define INSTR_PROF_VTAB_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_VTAB_COMMON) +#define INSTR_PROF_VNAME_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_VNAME_COMMON) /* Array of pointers. Each pointer points to a list * of value nodes associated with one value site. */ diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h index 87f15639a2c3c..cfde5d3fc77d6 100644 --- a/llvm/include/llvm/ProfileData/InstrProfReader.h +++ b/llvm/include/llvm/ProfileData/InstrProfReader.h @@ -326,12 +326,16 @@ class RawInstrProfReader : public InstrProfReader { uint64_t NamesDelta; const RawInstrProf::ProfileData *Data; const RawInstrProf::ProfileData *DataEnd; + const RawInstrProf::VTableProfileData *VTableBegin = nullptr; + const RawInstrProf::VTableProfileData *VTableEnd = nullptr; const char *CountersStart; const char *CountersEnd; const char *BitmapStart; const char *BitmapEnd; const char *NamesStart; const char *NamesEnd; + const char *VNamesStart = nullptr; + const char *VNamesEnd = nullptr; // After value profile is all read, this pointer points to // the header of next profile data (if exists) const uint8_t *ValueDataStart; @@ -656,6 +660,15 @@ class IndexedInstrProfReader : public InstrProfReader { std::unique_ptr MemProfRecordTable; /// MemProf frame profile data on-disk indexed via frame id. std::unique_ptr MemProfFrameTable; + /// VTableNamePtr points to the beginning of compressed vtable names. + /// When a symtab is constructed from profiles by llvm-profdata, the list of + /// names could be decompressed based on `VTableNamePtr` and + /// `CompressedVTableNamesLen`. 
+ /// A compiler that reads indexed profiles could construct symtab from module + /// IR so it doesn't need the decompressed names. + const char *VTableNamePtr = nullptr; + /// The length of compressed vtable names. + uint64_t CompressedVTableNamesLen = 0; /// Total size of binary ids. uint64_t BinaryIdsSize{0}; /// Start address of binary id length and data pairs. diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 2eeeff987399d..b9afee413853e 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -1533,9 +1533,12 @@ Expected

Header::readFromBuffer(const unsigned char *Buffer) { // When a new field is added in the header add a case statement here to // populate it. static_assert( - IndexedInstrProf::ProfVersion::CurrentVersion == Version11, + IndexedInstrProf::ProfVersion::CurrentVersion == Version12, "Please update the reading code below if a new field has been added, " "if not add a case statement to fall through to the latest version."); + case 12ull: + H.VTableNamesOffset = read(Buffer, offsetOf(&Header::VTableNamesOffset)); + [[fallthrough]]; case 11ull: [[fallthrough]]; case 10ull: @@ -1561,10 +1564,14 @@ size_t Header::size() const { // When a new field is added to the header add a case statement here to // compute the size as offset of the new field + size of the new field. This // relies on the field being added to the end of the list. - static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version11, + static_assert(IndexedInstrProf::ProfVersion::CurrentVersion == Version12, "Please update the size computation below if a new field has " "been added to the header, if not add a case statement to " "fall through to the latest version."); + case 12ull: + return offsetOf(&Header::VTableNamesOffset) + + sizeof(Header::VTableNamesOffset); + [[fallthrough]]; case 11ull: [[fallthrough]]; case 10ull: diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 0d8d43daae960..31b742bca14d6 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -366,6 +366,11 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) { return E; Value = IndexedInstrProf::ComputeHash(VD.first); } + } else if (ValueKind == IPVK_VTableTarget) { + if (InstrProfSymtab::isExternalSymbol(VD.first)) + Value = 0; + else + Value = IndexedInstrProf::ComputeHash(VD.first); } else { READ_NUM(VD.first, Value); } @@ -582,10 +587,17 @@ Error RawInstrProfReader::readHeader( auto NumBitmapBytes = 
swap(Header.NumBitmapBytes); auto PaddingBytesAfterBitmapBytes = swap(Header.PaddingBytesAfterBitmapBytes); auto NamesSize = swap(Header.NamesSize); + auto VTableNameSize = swap(Header.VNamesSize); + auto NumVTables = swap(Header.NumVTables); ValueKindLast = swap(Header.ValueKindLast); auto DataSize = NumData * sizeof(RawInstrProf::ProfileData); - auto PaddingSize = getNumPaddingBytes(NamesSize); + auto PaddingBytesAfterNames = getNumPaddingBytes(NamesSize); + auto PaddingBytesAfterVTableNames = getNumPaddingBytes(VTableNameSize); + + auto VTableSectionSize = + NumVTables * sizeof(RawInstrProf::VTableProfileData); + auto PaddingBytesAfterVTableProfData = getNumPaddingBytes(VTableSectionSize); // Profile data starts after profile header and binary ids if exist. ptrdiff_t DataOffset = sizeof(RawInstrProf::Header) + BinaryIdSize; @@ -594,7 +606,12 @@ Error RawInstrProfReader::readHeader( CountersOffset + CountersSize + PaddingBytesAfterCounters; ptrdiff_t NamesOffset = BitmapOffset + NumBitmapBytes + PaddingBytesAfterBitmapBytes; - ptrdiff_t ValueDataOffset = NamesOffset + NamesSize + PaddingSize; + ptrdiff_t VTableProfDataOffset = + NamesOffset + NamesSize + PaddingBytesAfterNames; + ptrdiff_t VTableNameOffset = VTableProfDataOffset + VTableSectionSize + + PaddingBytesAfterVTableProfData; + ptrdiff_t ValueDataOffset = + VTableNameOffset + VTableNameSize + PaddingBytesAfterVTableNames; auto *Start = reinterpret_cast(&Header); if (Start + ValueDataOffset > DataBuffer->getBufferEnd()) @@ -614,8 +631,14 @@ Error RawInstrProfReader::readHeader( Data = reinterpret_cast *>( Start + DataOffset); DataEnd = Data + NumData; + VTableBegin = + reinterpret_cast *>( + Start + VTableProfDataOffset); + VTableEnd = VTableBegin + NumVTables; NamesStart = Start + NamesOffset; NamesEnd = NamesStart + NamesSize; + VNamesStart = Start + VTableNameOffset; + VNamesEnd = VNamesStart + VTableNameSize; } CountersStart = Start + CountersOffset; @@ -1260,6 +1283,23 @@ Error 
IndexedInstrProfReader::readHeader() { "corrupted binary ids"); } + if (GET_VERSION(Header->formatVersion()) >= 12) { + uint64_t VTableNamesOffset = + endian::byte_swap( + Header->VTableNamesOffset); + const unsigned char *Ptr = Start + VTableNamesOffset; + + CompressedVTableNamesLen = + support::endian::readNext(Ptr); + + // Writer first writes the length of compressed string, and then the actual + // content. + VTableNamePtr = (const char *)Ptr; + if (VTableNamePtr > (const char *)DataBuffer->getBufferEnd()) + return make_error(instrprof_error::truncated); + } + if (GET_VERSION(Header->formatVersion()) >= 10 && Header->formatVersion() & VARIANT_MASK_TEMPORAL_PROF) { uint64_t TemporalProfTracesOffset = diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index d65f8fe50313d..3e0a0e0d70116 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -455,12 +455,11 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { Header.MemProfOffset = 0; Header.BinaryIdOffset = 0; Header.TemporalProfTracesOffset = 0; - int N = sizeof(IndexedInstrProf::Header) / sizeof(uint64_t); + Header.VTableNamesOffset = 0; - // Only write out all the fields except 'HashOffset', 'MemProfOffset', - // 'BinaryIdOffset' and `TemporalProfTracesOffset`. We need to remember the - // offset of these fields to allow back patching later. - for (int I = 0; I < N - 4; I++) + // Only write out the first four fields. We need to remember the offset of the + // remaining fields to allow back patching later. + for (int I = 0; I < 4; I++) OS.write(reinterpret_cast(&Header)[I]); // Save the location of Header.HashOffset field in \c OS. @@ -484,6 +483,9 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { uint64_t TemporalProfTracesOffset = OS.tell(); OS.write(0); + uint64_t VTableNamesOffset = OS.tell(); + OS.write(0); + // Reserve space to write profile summary data. 
uint32_t NumEntries = ProfileSummaryBuilder::DefaultCutoffs.size(); uint32_t SummarySize = Summary::getSize(Summary::NumKinds, NumEntries); @@ -604,6 +606,31 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { OS.writeByte(0); } + uint64_t VTableNamesSectionStart = OS.tell(); + + // Use a dummy (and uncompressed) string as compressed vtable names and get + // the necessary profile format change in place for version 12. + // TODO: Store the list of vtable names in InstrProfWriter and use the + // real compressed name. + std::string CompressedVTableNames = "VTableNames"; + + uint64_t CompressedStringLen = CompressedVTableNames.length(); + + // Record the length of compressed string. + OS.write(CompressedStringLen); + + // Write the chars in compressed strings. + for (auto &c : CompressedVTableNames) + OS.writeByte(static_cast(c)); + + // Pad up to a multiple of 8. + // InstrProfReader would read bytes according to 'CompressedStringLen'. + uint64_t PaddedLength = alignTo(CompressedStringLen, 8); + + for (uint64_t K = CompressedStringLen; K < PaddedLength; K++) { + OS.writeByte(0); + } + uint64_t TemporalProfTracesSectionStart = 0; if (static_cast(ProfileKind & InstrProfKind::TemporalProfile)) { TemporalProfTracesSectionStart = OS.tell(); @@ -647,6 +674,7 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) { // Patch the Header.TemporalProfTracesOffset (=0 for profiles without // traces). {TemporalProfTracesOffset, &TemporalProfTracesSectionStart, 1}, + {VTableNamesOffset, &VTableNamesSectionStart, 1}, // Patch the summary data. 
{SummaryOffset, reinterpret_cast(TheSummary.get()), (int)(SummarySize / sizeof(uint64_t))}, @@ -699,7 +727,8 @@ Error InstrProfWriter::validateRecord(const InstrProfRecord &Func) { std::unique_ptr VD = Func.getValueForSite(VK, S); DenseSet SeenValues; for (uint32_t I = 0; I < ND; I++) - if ((VK != IPVK_IndirectCallTarget) && !SeenValues.insert(VD[I].Value).second) + if ((VK != IPVK_IndirectCallTarget && VK != IPVK_VTableTarget) && + !SeenValues.insert(VD[I].Value).second) return make_error(instrprof_error::invalid_prof); } } @@ -747,7 +776,7 @@ void InstrProfWriter::writeRecordInText(StringRef Name, uint64_t Hash, OS << ND << "\n"; std::unique_ptr VD = Func.getValueForSite(VK, S); for (uint32_t I = 0; I < ND; I++) { - if (VK == IPVK_IndirectCallTarget) + if (VK == IPVK_IndirectCallTarget || VK == IPVK_VTableTarget) OS << Symtab.getFuncOrVarNameIfDefined(VD[I].Value) << ":" << VD[I].Count << "\n"; else diff --git a/llvm/test/Instrumentation/InstrProfiling/coverage.ll b/llvm/test/Instrumentation/InstrProfiling/coverage.ll index bbf895ea4b34e..08cbcaa962b76 100644 --- a/llvm/test/Instrumentation/InstrProfiling/coverage.ll +++ b/llvm/test/Instrumentation/InstrProfiling/coverage.ll @@ -5,12 +5,12 @@ target triple = "aarch64-unknown-linux-gnu" @__profn_foo = private constant [3 x i8] c"foo" ; CHECK: @__profc_foo = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1 -; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64) -; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64), +; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64) +; BINARY: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { 
i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_foo to i64), @__profn_bar = private constant [3 x i8] c"bar" ; CHECK: @__profc_bar = private global [1 x i8] c"\FF", section "__llvm_prf_cnts", comdat, align 1 -; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64) -; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64), +; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64) +; BINARY: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 ptrtoint (ptr @__profc_bar to i64), ; CHECK: @__llvm_prf_nm = {{.*}} section "__llvm_prf_names" ; BINARY: @__llvm_prf_nm ={{.*}} section "__llvm_covnames" diff --git a/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw b/llvm/test/Transforms/PGOProfile/Inputs/thinlto_indirect_call_promotion.profraw index 5efda10bb98a941c04b6846db05d3691bc36aac0..3daa98f937b691880ffff203c9426bfacddf749d 100644 GIT binary patch delta 133 zcmbQhvVeuNu_!ISs37M**F;W##f(djpGdFz|9^9xwDglu1`NP7F;ks2U=>hu;#6za s1Tf>OHE#ik0aQLiPe%I5WLZXI)&n4s$)Sw16~Kysa*R;Jz`Bw604`-Eq5uE@ delta 117 zcmZ3$GJ%D&u_!ISs37M*=R{6_L67IVA1SZ;|9^9yv+SKv1_s87mFlblGl86mORZTI rz>KHXyapf!P9f diff --git a/llvm/test/Transforms/PGOProfile/comdat_internal.ll b/llvm/test/Transforms/PGOProfile/comdat_internal.ll index 8c6942c0f527b..1bad0db1b4762 100644 --- a/llvm/test/Transforms/PGOProfile/comdat_internal.ll +++ b/llvm/test/Transforms/PGOProfile/comdat_internal.ll @@ -13,9 +13,9 @@ $foo = comdat any ; CHECK: @__llvm_profile_raw_version = hidden constant i64 {{[0-9]+}}, comdat ; CHECK-NOT: __profn__stdin__foo ; CHECK: @__profc__stdin__foo.[[#FOO_HASH]] = private global [1 x i64] 
zeroinitializer, section "__llvm_prf_cnts", comdat, align 8 -; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [2 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null +; CHECK: @__profd__stdin__foo.[[#FOO_HASH]] = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 [[#FOO_HASH]], i64 sub (i64 ptrtoint (ptr @__profc__stdin__foo.742261418966908927 to i64), i64 ptrtoint (ptr @__profd__stdin__foo.742261418966908927 to i64)), i64 0, ptr null ; CHECK-NOT: @foo -; CHECK-SAME: , ptr null, i32 1, [2 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8 +; CHECK-SAME: , ptr null, i32 1, [3 x i16] zeroinitializer, i32 0 }, section "__llvm_prf_data", comdat($__profc__stdin__foo.[[#FOO_HASH]]), align 8 ; CHECK: @__llvm_prf_nm ; CHECK: @llvm.compiler.used diff --git a/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw b/llvm/test/tools/llvm-profdata/Inputs/c-general.profraw index 9cd225587c92511e99f3497ce1d5f47c6fc5f0af..a3e884343942ebc70ba95ab4ee006630b6816d80 100644 GIT binary patch delta 40 ycmV+@0N4NE5AY8OfpTVVa&T<_3Xus<4&W)M0UE0R|DByz{^eDZP6HaTaBv4~Q4!$) delta 39 vcmeys|A3#fu_!ISs37M*=R{6_K?|$bHJ=*(|Lz`(e%(w!XuMI#TR diff --git a/llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw b/llvm/test/tools/llvm-profdata/Inputs/thinlto_indirect_call_promotion.profraw new file mode 100644 index 0000000000000000000000000000000000000000..84707ba2070a92b8683010d9daaef747df35f9ac GIT binary patch literal 528 zcmZoHO3N=Q$obF700xW@ih+Rz#(>i3d^BkWXQ;q~{}8~jee0hktN#DrJkOIkI+TF{ zX0YI^%?f`vOg;fr_5L!KFBeQb%shva5cM!VOdpINJ<~YH=c-N(O#cd~eK7d|0{XA2 zYFH&6%DWHJCbaDydjXpM1gQQWo?dWwGr?0yS0{Tm3_5AzQ$ z+Q7KtR(HRVzu%dYp1!6!$!AXbT=MqY*4O{3u}gA_;W2kfsb$ZfsH;9ZvV7_@)#;23 
r{WSu+S$HaLo%TI*hM9pynsFJ}wH81UW(Uaqj8G0Nd|-00@P_dLvBrhT literal 0 HcmV?d00001 diff --git a/llvm/test/tools/llvm-profdata/binary-ids-padding.test b/llvm/test/tools/llvm-profdata/binary-ids-padding.test index eda63203a304a..61881b69cfd5c 100644 --- a/llvm/test/tools/llvm-profdata/binary-ids-padding.test +++ b/llvm/test/tools/llvm-profdata/binary-ids-padding.test @@ -10,10 +10,12 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) +// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw // There will be 2 20-byte binary IDs, so the total Binary IDs size will be 64 bytes. // 2 * 8 binary ID sizes // + 2 * 20 binary IDs (of size 20) @@ -32,6 +34,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Binary IDs - There are only two in this case that are 20 bytes. 
RUN: printf '\24\0\0\0\0\0\0\0' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/large-binary-id-size.test b/llvm/test/tools/llvm-profdata/large-binary-id-size.test index 38b838e0d100a..316a9a4c9df4c 100644 --- a/llvm/test/tools/llvm-profdata/large-binary-id-size.test +++ b/llvm/test/tools/llvm-profdata/large-binary-id-size.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\40\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -12,6 +12,8 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Check for a corrupted size being too large past the end of the file. RUN: printf '\7\7\7\7\7\7\7\7' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test index c967e850dbe35..8b686d5c50cb7 100644 --- a/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test +++ b/llvm/test/tools/llvm-profdata/malformed-not-space-for-another-header.test @@ -10,10 +10,12 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) +// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> 
%t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -26,6 +28,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Data Section // diff --git a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test index 2e747f81a6bfa..089afad420622 100644 --- a/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test +++ b/llvm/test/tools/llvm-profdata/malformed-num-counters-zero.test @@ -10,10 +10,12 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) +// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -26,6 +28,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Data Section // diff --git a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test index 3c23bc7dd0f7f..e404ba4210cc1 100644 --- a/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test +++ 
b/llvm/test/tools/llvm-profdata/malformed-ptr-to-counter-array.test @@ -10,10 +10,12 @@ // INSTR_PROF_RAW_HEADER(uint64_t, CountersDelta, (uintptr_t)CountersBegin) // INSTR_PROF_RAW_HEADER(uint64_t, BitmaskDelta, (uintptr_t)BitmaskBegin) // INSTR_PROF_RAW_HEADER(uint64_t, NamesDelta, (uintptr_t)NamesBegin) +// INSTR_PROF_RAW_HEADER(uint64_t, VNamesSize, VNamesSize) +// INSTR_PROF_RAW_HEADER(uint64_t, NumVTables, NumVTables) // INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last) RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -26,6 +28,8 @@ RUN: printf '\0\0\6\0\1\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\6\0\2\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Data Section // diff --git a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test index 4a5c42843ff4d..ee54bfb978567 100644 --- a/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test +++ b/llvm/test/tools/llvm-profdata/misaligned-binary-ids-size.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t.profraw // We should fail on this because the binary IDs is not a multiple of 8 bytes. 
RUN: printf '\77\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw @@ -10,6 +10,8 @@ RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t.profraw // Binary IDs - There are only two in this case that are 20 bytes. RUN: printf '\24\0\0\0\0\0\0\0' >> %t.profraw diff --git a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test index 2a92575ee3407..dfa163f1f3439 100644 --- a/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test +++ b/llvm/test/tools/llvm-profdata/mismatched-raw-profile-header.test @@ -15,6 +15,8 @@ RUN: printf '\0\0\0\0\0\0\0\20' >> %t RUN: printf '\0\0\0\1\0\4\0\0' >> %t RUN: printf '\0\0\0\2\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: not llvm-profdata show %t -o /dev/null 2>&1 | FileCheck %s diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test index 8220361df6cfa..63782c8b94d4a 100644 --- a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test +++ b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test @@ -1,5 +1,6 @@ +// Header RUN: printf '\377lprofR\201' > %t -RUN: printf '\0\0\0\0\0\0\0\11' >> %t +RUN: printf '\0\0\0\0\0\0\0\12' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,6 +13,8 @@ RUN: printf '\0\0\0\0\1\0\0\0' >> %t RUN: printf '\0\0\0\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\2\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\134\370\302\114\333\030\275\254' >> %t RUN: printf '\0\0\0\0\0\0\0\1' >> %t @@ -20,9 +23,8 @@ 
RUN: printf '\3\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\3' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\344\023\165\112\031\035\265\067' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t @@ -31,9 +33,8 @@ RUN: printf '\2\xff\xff\xd3' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\2' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\023' >> %t RUN: printf '\0\0\0\0\0\0\0\067' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test index 9352ae132380d..e9569bec1178b 100644 --- a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test +++ b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test @@ -1,5 +1,5 @@ RUN: printf '\201Rforpl\377' > %t -RUN: printf '\11\0\0\0\0\0\0\0' >> %t +RUN: printf '\12\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\2\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,6 +12,8 @@ RUN: printf '\0\0\0\1\0\0\0\0' >> %t RUN: printf '\0\0\0\3\0\0\0\0' >> %t RUN: printf '\0\0\0\2\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\254\275\030\333\114\302\370\134' >> %t RUN: printf '\1\0\0\0\0\0\0\0' >> %t @@ -20,9 +22,8 @@ RUN: printf '\0\0\0\3' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\3\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\067\265\035\031\112\165\023\344' >> %t RUN: printf '\02\0\0\0\0\0\0\0' >> %t @@ -31,9 +32,8 @@ RUN: printf '\xd3\xff\xff\2' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf 
'\2\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\023\0\0\0\0\0\0\0' >> %t RUN: printf '\067\0\0\0\0\0\0\0' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test index c3e995add6ff2..0bc579eec58ab 100644 --- a/llvm/test/tools/llvm-profdata/raw-64-bits-be.test +++ b/llvm/test/tools/llvm-profdata/raw-64-bits-be.test @@ -1,5 +1,5 @@ RUN: printf '\377lprofr\201' > %t -RUN: printf '\0\0\0\0\0\0\0\11' >> %t +RUN: printf '\0\0\0\0\0\0\0\12' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\2' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,6 +12,8 @@ RUN: printf '\0\0\0\1\0\4\0\0' >> %t RUN: printf '\0\0\0\3\0\4\0\0' >> %t RUN: printf '\0\0\0\2\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\134\370\302\114\333\030\275\254' >> %t RUN: printf '\0\0\0\0\0\0\0\1' >> %t @@ -20,9 +22,8 @@ RUN: printf '\0\0\0\3\0\4\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\3' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\344\023\165\112\031\035\265\067' >> %t RUN: printf '\0\0\0\0\0\0\0\02' >> %t @@ -31,9 +32,8 @@ RUN: printf '\0\0\0\3\0\3\xff\xc3' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\02' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\023' >> %t RUN: printf '\0\0\0\0\0\0\0\067' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test index 0b3ef2a89abe5..ca9ea54c3f014 100644 --- 
a/llvm/test/tools/llvm-profdata/raw-64-bits-le.test +++ b/llvm/test/tools/llvm-profdata/raw-64-bits-le.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t -RUN: printf '\11\0\0\0\0\0\0\0' >> %t +RUN: printf '\12\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\2\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t @@ -12,6 +12,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t RUN: printf '\0\0\4\0\3\0\0\0' >> %t RUN: printf '\0\0\4\0\2\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\254\275\030\333\114\302\370\134' >> %t RUN: printf '\1\0\0\0\0\0\0\0' >> %t @@ -20,9 +22,8 @@ RUN: printf '\0\0\4\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\3\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\067\265\035\031\112\165\023\344' >> %t RUN: printf '\02\0\0\0\0\0\0\0' >> %t @@ -31,9 +32,8 @@ RUN: printf '\xc3\xff\3\0\3\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\02\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\0\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0' >> %t RUN: printf '\023\0\0\0\0\0\0\0' >> %t RUN: printf '\067\0\0\0\0\0\0\0' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-two-profiles.test b/llvm/test/tools/llvm-profdata/raw-two-profiles.test index f4a9aa8e1bbc3..70a4210dea9f8 100644 --- a/llvm/test/tools/llvm-profdata/raw-two-profiles.test +++ b/llvm/test/tools/llvm-profdata/raw-two-profiles.test @@ -1,5 +1,5 @@ RUN: printf '\201rforpl\377' > %t-foo.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t-foo.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> 
%t-foo.profraw @@ -12,6 +12,8 @@ RUN: printf '\0\0\4\0\1\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\4\0\2\0\0\0' >> %t-foo.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\254\275\030\333\114\302\370\134' >> %t-foo.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-foo.profraw @@ -26,7 +28,7 @@ RUN: printf '\023\0\0\0\0\0\0\0' >> %t-foo.profraw RUN: printf '\3\0foo\0\0\0' >> %t-foo.profraw RUN: printf '\201rforpl\377' > %t-bar.profraw -RUN: printf '\11\0\0\0\0\0\0\0' >> %t-bar.profraw +RUN: printf '\12\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\1\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw @@ -39,6 +41,8 @@ RUN: printf '\0\0\6\0\1\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\6\0\2\0\0\0' >> %t-bar.profraw RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw +RUN: printf '\0\0\0\0\0\0\0\0' >> %t-bar.profraw RUN: printf '\067\265\035\031\112\165\023\344' >> %t-bar.profraw RUN: printf '\02\0\0\0\0\0\0\0' >> %t-bar.profraw From 40ba1f60e9f4b186d71272d4bc23b5af6204244d Mon Sep 17 00:00:00 2001 From: PiJoules <6019989+PiJoules@users.noreply.github.com> Date: Tue, 27 Feb 2024 11:09:38 -0800 Subject: [PATCH 480/546] [clang] Update -Wformat warnings for fixed-point format specifiers (#82855) ISO/IEC TR 18037 defines %r, %R, %k, and %K for fixed point format specifiers. -Wformat should not warn on these when they are provided. 
--- clang/include/clang/AST/ASTContext.h | 4 + clang/include/clang/AST/FormatString.h | 11 ++ clang/lib/AST/ASTContext.cpp | 36 ++++++ clang/lib/AST/FormatString.cpp | 25 +++++ clang/lib/AST/PrintfFormatString.cpp | 85 +++++++++++++- clang/test/Sema/format-fixed-point.c | 148 +++++++++++++++++++++++++ 6 files changed, 306 insertions(+), 3 deletions(-) create mode 100644 clang/test/Sema/format-fixed-point.c diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 12ce9af1e53f6..ff6b64c7f72d5 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -2981,6 +2981,10 @@ class ASTContext : public RefCountedBase { // corresponding saturated type for a given fixed point type. QualType getCorrespondingSaturatedType(QualType Ty) const; + // Per ISO N1169, this method accepts fixed point types and returns the + // corresponding non-saturated type for a given fixed point type. + QualType getCorrespondingUnsaturatedType(QualType Ty) const; + // This method accepts fixed point types and returns the corresponding signed // type. Unlike getCorrespondingUnsignedType(), this only accepts unsigned // fixed point types because there are unsigned integer types like bool and diff --git a/clang/include/clang/AST/FormatString.h b/clang/include/clang/AST/FormatString.h index 5c4ad9baaef60..e2232fb4a4715 100644 --- a/clang/include/clang/AST/FormatString.h +++ b/clang/include/clang/AST/FormatString.h @@ -171,6 +171,14 @@ class ConversionSpecifier { ZArg, // MS extension + // ISO/IEC TR 18037 (fixed-point) specific specifiers. + kArg, // %k for signed accum types + KArg, // %K for unsigned accum types + rArg, // %r for signed fract types + RArg, // %R for unsigned fract types + FixedPointArgBeg = kArg, + FixedPointArgEnd = RArg, + // Objective-C specific specifiers. 
ObjCObjArg, // '@' ObjCBeg = ObjCObjArg, @@ -237,6 +245,9 @@ class ConversionSpecifier { bool isDoubleArg() const { return kind >= DoubleArgBeg && kind <= DoubleArgEnd; } + bool isFixedPointArg() const { + return kind >= FixedPointArgBeg && kind <= FixedPointArgEnd; + } const char *toString() const; diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index c475c841233c5..5a8fae76a43a4 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -13314,6 +13314,42 @@ QualType ASTContext::getCommonSugaredType(QualType X, QualType Y, return R; } +QualType ASTContext::getCorrespondingUnsaturatedType(QualType Ty) const { + assert(Ty->isFixedPointType()); + + if (Ty->isUnsaturatedFixedPointType()) + return Ty; + + switch (Ty->castAs()->getKind()) { + default: + llvm_unreachable("Not a saturated fixed point type!"); + case BuiltinType::SatShortAccum: + return ShortAccumTy; + case BuiltinType::SatAccum: + return AccumTy; + case BuiltinType::SatLongAccum: + return LongAccumTy; + case BuiltinType::SatUShortAccum: + return UnsignedShortAccumTy; + case BuiltinType::SatUAccum: + return UnsignedAccumTy; + case BuiltinType::SatULongAccum: + return UnsignedLongAccumTy; + case BuiltinType::SatShortFract: + return ShortFractTy; + case BuiltinType::SatFract: + return FractTy; + case BuiltinType::SatLongFract: + return LongFractTy; + case BuiltinType::SatUShortFract: + return UnsignedShortFractTy; + case BuiltinType::SatUFract: + return UnsignedFractTy; + case BuiltinType::SatULongFract: + return UnsignedLongFractTy; + } +} + QualType ASTContext::getCorrespondingSaturatedType(QualType Ty) const { assert(Ty->isFixedPointType()); diff --git a/clang/lib/AST/FormatString.cpp b/clang/lib/AST/FormatString.cpp index c5d14b4af7ff1..0c80ad109ccbb 100644 --- a/clang/lib/AST/FormatString.cpp +++ b/clang/lib/AST/FormatString.cpp @@ -403,6 +403,10 @@ ArgType::matchesType(ASTContext &C, QualType argTy) const { else if (ETy->isUnscopedEnumerationType()) argTy 
= ETy->getDecl()->getIntegerType(); } + + if (argTy->isSaturatedFixedPointType()) + argTy = C.getCorrespondingUnsaturatedType(argTy); + argTy = C.getCanonicalType(argTy).getUnqualifiedType(); if (T == argTy) @@ -761,6 +765,16 @@ const char *ConversionSpecifier::toString() const { // MS specific specifiers. case ZArg: return "Z"; + + // ISO/IEC TR 18037 (fixed-point) specific specifiers. + case rArg: + return "r"; + case RArg: + return "R"; + case kArg: + return "k"; + case KArg: + return "K"; } return nullptr; } @@ -825,6 +839,9 @@ bool FormatSpecifier::hasValidLengthModifier(const TargetInfo &Target, if (LO.OpenCL && CS.isDoubleArg()) return !VectorNumElts.isInvalid(); + if (CS.isFixedPointArg()) + return true; + if (Target.getTriple().isOSMSVCRT()) { switch (CS.getKind()) { case ConversionSpecifier::cArg: @@ -877,6 +894,9 @@ bool FormatSpecifier::hasValidLengthModifier(const TargetInfo &Target, return true; } + if (CS.isFixedPointArg()) + return true; + switch (CS.getKind()) { case ConversionSpecifier::bArg: case ConversionSpecifier::BArg: @@ -1043,6 +1063,11 @@ bool FormatSpecifier::hasStandardConversionSpecifier( case ConversionSpecifier::UArg: case ConversionSpecifier::ZArg: return false; + case ConversionSpecifier::rArg: + case ConversionSpecifier::RArg: + case ConversionSpecifier::kArg: + case ConversionSpecifier::KArg: + return LangOpt.FixedPoint; } llvm_unreachable("Invalid ConversionSpecifier Kind!"); } diff --git a/clang/lib/AST/PrintfFormatString.cpp b/clang/lib/AST/PrintfFormatString.cpp index 3b09ca40bd2a5..fec8ce13e8c44 100644 --- a/clang/lib/AST/PrintfFormatString.cpp +++ b/clang/lib/AST/PrintfFormatString.cpp @@ -348,6 +348,8 @@ static PrintfSpecifierResult ParsePrintfSpecifier(FormatStringHandler &H, case 'r': if (isFreeBSDKPrintf) k = ConversionSpecifier::FreeBSDrArg; // int + else if (LO.FixedPoint) + k = ConversionSpecifier::rArg; break; case 'y': if (isFreeBSDKPrintf) @@ -373,6 +375,20 @@ static PrintfSpecifierResult 
ParsePrintfSpecifier(FormatStringHandler &H, if (Target.getTriple().isOSMSVCRT()) k = ConversionSpecifier::ZArg; break; + // ISO/IEC TR 18037 (fixed-point) specific. + // NOTE: 'r' is handled up above since FreeBSD also supports %r. + case 'k': + if (LO.FixedPoint) + k = ConversionSpecifier::kArg; + break; + case 'K': + if (LO.FixedPoint) + k = ConversionSpecifier::KArg; + break; + case 'R': + if (LO.FixedPoint) + k = ConversionSpecifier::RArg; + break; } // Check to see if we used the Objective-C modifier flags with @@ -627,6 +643,9 @@ ArgType PrintfSpecifier::getScalarArgType(ASTContext &Ctx, } } + if (CS.isFixedPointArg() && !Ctx.getLangOpts().FixedPoint) + return ArgType::Invalid(); + switch (CS.getKind()) { case ConversionSpecifier::sArg: if (LM.getKind() == LengthModifier::AsWideChar) { @@ -658,6 +677,50 @@ ArgType PrintfSpecifier::getScalarArgType(ASTContext &Ctx, return ArgType::CPointerTy; case ConversionSpecifier::ObjCObjArg: return ArgType::ObjCPointerTy; + case ConversionSpecifier::kArg: + switch (LM.getKind()) { + case LengthModifier::None: + return Ctx.AccumTy; + case LengthModifier::AsShort: + return Ctx.ShortAccumTy; + case LengthModifier::AsLong: + return Ctx.LongAccumTy; + default: + return ArgType::Invalid(); + } + case ConversionSpecifier::KArg: + switch (LM.getKind()) { + case LengthModifier::None: + return Ctx.UnsignedAccumTy; + case LengthModifier::AsShort: + return Ctx.UnsignedShortAccumTy; + case LengthModifier::AsLong: + return Ctx.UnsignedLongAccumTy; + default: + return ArgType::Invalid(); + } + case ConversionSpecifier::rArg: + switch (LM.getKind()) { + case LengthModifier::None: + return Ctx.FractTy; + case LengthModifier::AsShort: + return Ctx.ShortFractTy; + case LengthModifier::AsLong: + return Ctx.LongFractTy; + default: + return ArgType::Invalid(); + } + case ConversionSpecifier::RArg: + switch (LM.getKind()) { + case LengthModifier::None: + return Ctx.UnsignedFractTy; + case LengthModifier::AsShort: + return 
Ctx.UnsignedShortFractTy; + case LengthModifier::AsLong: + return Ctx.UnsignedLongFractTy; + default: + return ArgType::Invalid(); + } default: break; } @@ -955,6 +1018,8 @@ bool PrintfSpecifier::hasValidPlusPrefix() const { case ConversionSpecifier::AArg: case ConversionSpecifier::FreeBSDrArg: case ConversionSpecifier::FreeBSDyArg: + case ConversionSpecifier::rArg: + case ConversionSpecifier::kArg: return true; default: @@ -966,7 +1031,7 @@ bool PrintfSpecifier::hasValidAlternativeForm() const { if (!HasAlternativeForm) return true; - // Alternate form flag only valid with the bBoxXaAeEfFgG conversions + // Alternate form flag only valid with the bBoxXaAeEfFgGrRkK conversions switch (CS.getKind()) { case ConversionSpecifier::bArg: case ConversionSpecifier::BArg: @@ -984,6 +1049,10 @@ bool PrintfSpecifier::hasValidAlternativeForm() const { case ConversionSpecifier::GArg: case ConversionSpecifier::FreeBSDrArg: case ConversionSpecifier::FreeBSDyArg: + case ConversionSpecifier::rArg: + case ConversionSpecifier::RArg: + case ConversionSpecifier::kArg: + case ConversionSpecifier::KArg: return true; default: @@ -995,7 +1064,7 @@ bool PrintfSpecifier::hasValidLeadingZeros() const { if (!HasLeadingZeroes) return true; - // Leading zeroes flag only valid with the bBdiouxXaAeEfFgG conversions + // Leading zeroes flag only valid with the bBdiouxXaAeEfFgGrRkK conversions switch (CS.getKind()) { case ConversionSpecifier::bArg: case ConversionSpecifier::BArg: @@ -1018,6 +1087,10 @@ bool PrintfSpecifier::hasValidLeadingZeros() const { case ConversionSpecifier::GArg: case ConversionSpecifier::FreeBSDrArg: case ConversionSpecifier::FreeBSDyArg: + case ConversionSpecifier::rArg: + case ConversionSpecifier::RArg: + case ConversionSpecifier::kArg: + case ConversionSpecifier::KArg: return true; default: @@ -1044,6 +1117,8 @@ bool PrintfSpecifier::hasValidSpacePrefix() const { case ConversionSpecifier::AArg: case ConversionSpecifier::FreeBSDrArg: case ConversionSpecifier::FreeBSDyArg: + 
case ConversionSpecifier::rArg: + case ConversionSpecifier::kArg: return true; default: @@ -1089,7 +1164,7 @@ bool PrintfSpecifier::hasValidPrecision() const { if (Precision.getHowSpecified() == OptionalAmount::NotSpecified) return true; - // Precision is only valid with the bBdiouxXaAeEfFgGsP conversions + // Precision is only valid with the bBdiouxXaAeEfFgGsPrRkK conversions switch (CS.getKind()) { case ConversionSpecifier::bArg: case ConversionSpecifier::BArg: @@ -1114,6 +1189,10 @@ bool PrintfSpecifier::hasValidPrecision() const { case ConversionSpecifier::FreeBSDrArg: case ConversionSpecifier::FreeBSDyArg: case ConversionSpecifier::PArg: + case ConversionSpecifier::rArg: + case ConversionSpecifier::RArg: + case ConversionSpecifier::kArg: + case ConversionSpecifier::KArg: return true; default: diff --git a/clang/test/Sema/format-fixed-point.c b/clang/test/Sema/format-fixed-point.c new file mode 100644 index 0000000000000..47b22f1a7a5f3 --- /dev/null +++ b/clang/test/Sema/format-fixed-point.c @@ -0,0 +1,148 @@ +// RUN: %clang_cc1 -ffixed-point -fsyntax-only -verify -Wformat -isystem %S/Inputs %s +// RUN: %clang_cc1 -fsyntax-only -verify -Wformat -isystem %S/Inputs %s -DWITHOUT_FIXED_POINT + +int printf(const char *restrict, ...); + +short s; +unsigned short us; +int i; +unsigned int ui; +long l; +unsigned long ul; +float fl; +double d; +char c; +unsigned char uc; + +#ifndef WITHOUT_FIXED_POINT +short _Fract sf; +_Fract f; +long _Fract lf; +unsigned short _Fract usf; +unsigned _Fract uf; +unsigned long _Fract ulf; +short _Accum sa; +_Accum a; +long _Accum la; +unsigned short _Accum usa; +unsigned _Accum ua; +unsigned long _Accum ula; +_Sat short _Fract sat_sf; +_Sat _Fract sat_f; +_Sat long _Fract sat_lf; +_Sat unsigned short _Fract sat_usf; +_Sat unsigned _Fract sat_uf; +_Sat unsigned long _Fract sat_ulf; +_Sat short _Accum sat_sa; +_Sat _Accum sat_a; +_Sat long _Accum sat_la; +_Sat unsigned short _Accum sat_usa; +_Sat unsigned _Accum sat_ua; +_Sat unsigned long 
_Accum sat_ula; + +void test_invalid_args(void) { + /// None of these should match against a fixed point type. + printf("%r", s); // expected-warning{{format specifies type '_Fract' but the argument has type 'short'}} + printf("%r", us); // expected-warning{{format specifies type '_Fract' but the argument has type 'unsigned short'}} + printf("%r", i); // expected-warning{{format specifies type '_Fract' but the argument has type 'int'}} + printf("%r", ui); // expected-warning{{format specifies type '_Fract' but the argument has type 'unsigned int'}} + printf("%r", l); // expected-warning{{format specifies type '_Fract' but the argument has type 'long'}} + printf("%r", ul); // expected-warning{{format specifies type '_Fract' but the argument has type 'unsigned long'}} + printf("%r", fl); // expected-warning{{format specifies type '_Fract' but the argument has type 'float'}} + printf("%r", d); // expected-warning{{format specifies type '_Fract' but the argument has type 'double'}} + printf("%r", c); // expected-warning{{format specifies type '_Fract' but the argument has type 'char'}} + printf("%r", uc); // expected-warning{{format specifies type '_Fract' but the argument has type 'unsigned char'}} +} + +void test_fixed_point_specifiers(void) { + printf("%r", f); + printf("%R", uf); + printf("%k", a); + printf("%K", ua); + + /// Test different sizes. 
+ printf("%r", sf); // expected-warning{{format specifies type '_Fract' but the argument has type 'short _Fract'}} + printf("%r", lf); // expected-warning{{format specifies type '_Fract' but the argument has type 'long _Fract'}} + printf("%R", usf); // expected-warning{{format specifies type 'unsigned _Fract' but the argument has type 'unsigned short _Fract'}} + printf("%R", ulf); // expected-warning{{format specifies type 'unsigned _Fract' but the argument has type 'unsigned long _Fract'}} + printf("%k", sa); // expected-warning{{format specifies type '_Accum' but the argument has type 'short _Accum'}} + printf("%k", la); // expected-warning{{format specifies type '_Accum' but the argument has type 'long _Accum'}} + printf("%K", usa); // expected-warning{{format specifies type 'unsigned _Accum' but the argument has type 'unsigned short _Accum'}} + printf("%K", ula); // expected-warning{{format specifies type 'unsigned _Accum' but the argument has type 'unsigned long _Accum'}} + + /// Test signs. + printf("%r", uf); // expected-warning{{format specifies type '_Fract' but the argument has type 'unsigned _Fract'}} + printf("%R", f); // expected-warning{{format specifies type 'unsigned _Fract' but the argument has type '_Fract'}} + printf("%k", ua); // expected-warning{{format specifies type '_Accum' but the argument has type 'unsigned _Accum'}} + printf("%K", a); // expected-warning{{format specifies type 'unsigned _Accum' but the argument has type '_Accum'}} + + /// Test between types. 
+ printf("%r", a); // expected-warning{{format specifies type '_Fract' but the argument has type '_Accum'}} + printf("%R", ua); // expected-warning{{format specifies type 'unsigned _Fract' but the argument has type 'unsigned _Accum'}} + printf("%k", f); // expected-warning{{format specifies type '_Accum' but the argument has type '_Fract'}} + printf("%K", uf); // expected-warning{{format specifies type 'unsigned _Accum' but the argument has type 'unsigned _Fract'}} + + /// Test saturated types. + printf("%r", sat_f); + printf("%R", sat_uf); + printf("%k", sat_a); + printf("%K", sat_ua); +} + +void test_length_modifiers_and_flags(void) { + printf("%hr", sf); + printf("%lr", lf); + printf("%hR", usf); + printf("%lR", ulf); + printf("%hk", sa); + printf("%lk", la); + printf("%hK", usa); + printf("%lK", ula); + + printf("%hr", sat_sf); + printf("%lr", sat_lf); + printf("%hR", sat_usf); + printf("%lR", sat_ulf); + printf("%hk", sat_sa); + printf("%lk", sat_la); + printf("%hK", sat_usa); + printf("%lK", sat_ula); + + printf("%10r", f); + printf("%10.10r", f); + printf("%010r", f); + printf("%-10r", f); + printf("%.10r", f); + printf("%+r", f); + printf("% r", f); + printf("%#r", f); + printf("%#.r", f); + printf("%#.0r", f); + + /// Test some invalid length modifiers. + printf("%zr", f); // expected-warning{{length modifier 'z' results in undefined behavior or no effect with 'r' conversion specifier}} + printf("%llr", f); // expected-warning{{length modifier 'll' results in undefined behavior or no effect with 'r' conversion specifier}} + printf("%hhr", f); // expected-warning{{length modifier 'hh' results in undefined behavior or no effect with 'r' conversion specifier}} + + // + on an unsigned fixed point type. 
+ printf("%+hR", usf); // expected-warning{{flag '+' results in undefined behavior with 'R' conversion specifier}} + printf("%+R", uf); // expected-warning{{flag '+' results in undefined behavior with 'R' conversion specifier}} + printf("%+lR", ulf); // expected-warning{{flag '+' results in undefined behavior with 'R' conversion specifier}} + printf("%+hK", usa); // expected-warning{{flag '+' results in undefined behavior with 'K' conversion specifier}} + printf("%+K", ua); // expected-warning{{flag '+' results in undefined behavior with 'K' conversion specifier}} + printf("%+lK", ula); // expected-warning{{flag '+' results in undefined behavior with 'K' conversion specifier}} + printf("% hR", usf); // expected-warning{{flag ' ' results in undefined behavior with 'R' conversion specifier}} + printf("% R", uf); // expected-warning{{flag ' ' results in undefined behavior with 'R' conversion specifier}} + printf("% lR", ulf); // expected-warning{{flag ' ' results in undefined behavior with 'R' conversion specifier}} + printf("% hK", usa); // expected-warning{{flag ' ' results in undefined behavior with 'K' conversion specifier}} + printf("% K", ua); // expected-warning{{flag ' ' results in undefined behavior with 'K' conversion specifier}} + printf("% lK", ula); // expected-warning{{flag ' ' results in undefined behavior with 'K' conversion specifier}} +} +#else +void test_fixed_point_specifiers_no_printf() { + printf("%k", i); // expected-warning{{invalid conversion specifier 'k'}} + printf("%K", i); // expected-warning{{invalid conversion specifier 'K'}} + printf("%r", i); // expected-warning{{invalid conversion specifier 'r'}} + printf("%R", i); // expected-warning{{invalid conversion specifier 'R'}} +} +#endif // WITHOUT_FIXED_POINT From 19181f24e516ce1cb58cd5ba27515eb968ae22d9 Mon Sep 17 00:00:00 2001 From: Alex Richardson Date: Tue, 27 Feb 2024 11:08:47 -0800 Subject: [PATCH 481/546] [compiler-rt] Add missing include to sanitizer_stackdepot_test.cpp Without this 
change I am seeing build failures due to missing std::next_permutation since my standard library does implicitly pull in anymore. --- .../lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp index 479e4a0c184f7..e810122a824f6 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "sanitizer_common/sanitizer_stackdepot.h" +#include #include #include #include From 9ca8db352d22444feabd859380252f13826a8aff Mon Sep 17 00:00:00 2001 From: Micah Weston Date: Tue, 27 Feb 2024 14:13:00 -0500 Subject: [PATCH 482/546] [SHT_LLVM_BB_ADDR_MAP] Adds pretty printing of BFI and BPI for PGO Analysis Map in tools. (#82292) Primary change is to add a flag `--pretty-pgo-analysis-map` to llvm-readobj and llvm-objdump that prints block frequencies and branch probabilities in the same manner as BFI and BPI respectively. This can be helpful if you are manually inspecting the outputs from the tools. In order to print, I moved the `printBlockFreqImpl` function from Analysis to Support and renamed it to `printRelativeBlockFreq`. 
--- llvm/docs/CommandGuide/llvm-objdump.rst | 16 +++++- llvm/docs/CommandGuide/llvm-readobj.rst | 11 ++++ .../llvm/Analysis/BlockFrequencyInfoImpl.h | 3 - llvm/include/llvm/Support/BlockFrequency.h | 4 ++ llvm/lib/Analysis/BlockFrequencyInfo.cpp | 2 +- llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp | 15 ----- .../lib/CodeGen/MachineBlockFrequencyInfo.cpp | 2 +- llvm/lib/Support/BlockFrequency.cpp | 17 ++++++ .../llvm-objdump/X86/elf-pgoanalysismap.yaml | 56 +++++++++++++------ .../ELF/bb-addr-map-pgo-analysis-map.test | 28 ++++++---- llvm/tools/llvm-objdump/ObjdumpOpts.td | 4 ++ llvm/tools/llvm-objdump/llvm-objdump.cpp | 29 +++++++--- llvm/tools/llvm-readobj/ELFDumper.cpp | 35 +++++++----- llvm/tools/llvm-readobj/ObjDumper.h | 4 +- llvm/tools/llvm-readobj/Opts.td | 1 + llvm/tools/llvm-readobj/llvm-readobj.cpp | 8 ++- 16 files changed, 164 insertions(+), 71 deletions(-) diff --git a/llvm/docs/CommandGuide/llvm-objdump.rst b/llvm/docs/CommandGuide/llvm-objdump.rst index 959452a74b23e..7f8def756c696 100644 --- a/llvm/docs/CommandGuide/llvm-objdump.rst +++ b/llvm/docs/CommandGuide/llvm-objdump.rst @@ -271,7 +271,12 @@ OPTIONS When printing a PC-relative global symbol reference, print it as an offset from the leading symbol. - When a bb-address-map section is present (i.e., the object file is built with ``-fbasic-block-sections=labels``), labels are retrieved from that section instead. + When a bb-address-map section is present (i.e., the object file is built with + ``-fbasic-block-sections=labels``), labels are retrieved from that section + instead. If a pgo-analysis-map is present alongside the bb-address-map, any + available analyses are printed after the relevant block label. By default, + any analysis with a special representation (i.e. BlockFrequency, + BranchProbability, etc) are printed as raw hex values. Only works with PowerPC objects or X86 linked images. @@ -291,6 +296,15 @@ OPTIONS cmp eax, dword ptr jge +.. 
option:: --pretty-pgo-analysis-map + + When using :option:`--symbolize-operands` with bb-address-map and + pgo-analysis-map, print analyses using the same format as their analysis + passes would. An example of pretty format would be printing block frequencies + relative to the entry block, the same as BFI. + + Only works when :option:`--symbolize-operands` is enabled. + .. option:: --triple= Target triple to disassemble for, see ``--version`` for available targets. diff --git a/llvm/docs/CommandGuide/llvm-readobj.rst b/llvm/docs/CommandGuide/llvm-readobj.rst index 6d78a03872344..09dabb28cfa71 100644 --- a/llvm/docs/CommandGuide/llvm-readobj.rst +++ b/llvm/docs/CommandGuide/llvm-readobj.rst @@ -164,6 +164,17 @@ The following options are implemented only for the ELF file format. Display the contents of the basic block address map section(s), which contain the address of each function, along with the relative offset of each basic block. + When pgo analysis maps are present, all analyses are printed as their raw + value. + +.. option:: --pretty-pgo-analysis-map + + When pgo analysis maps are present in the basic block address map section(s), + analyses with special formats (i.e. BlockFrequency, BranchProbability, etc) + are printed using the same format as their respective analysis pass. + + Requires :option:`--bb-addr-map` to have an effect. + .. option:: --demangle, -C Display demangled symbol names in the output. 
diff --git a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h index 8acb75e872541..4aa922635c374 100644 --- a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -539,9 +539,6 @@ class BlockFrequencyInfoImplBase { } }; -void printBlockFreqImpl(raw_ostream &OS, BlockFrequency EntryFreq, - BlockFrequency Freq); - namespace bfi_detail { template struct TypeMap {}; diff --git a/llvm/include/llvm/Support/BlockFrequency.h b/llvm/include/llvm/Support/BlockFrequency.h index 8b172ee486aab..aeab99615a951 100644 --- a/llvm/include/llvm/Support/BlockFrequency.h +++ b/llvm/include/llvm/Support/BlockFrequency.h @@ -19,6 +19,7 @@ namespace llvm { +class raw_ostream; class BranchProbability; // This class represents Block Frequency as a 64-bit value. @@ -119,6 +120,9 @@ class BlockFrequency { } }; +void printRelativeBlockFreq(raw_ostream &OS, BlockFrequency EntryFreq, + BlockFrequency Freq); + } // namespace llvm #endif diff --git a/llvm/lib/Analysis/BlockFrequencyInfo.cpp b/llvm/lib/Analysis/BlockFrequencyInfo.cpp index 96c9bfa0e372c..ebad8388cbe41 100644 --- a/llvm/lib/Analysis/BlockFrequencyInfo.cpp +++ b/llvm/lib/Analysis/BlockFrequencyInfo.cpp @@ -284,7 +284,7 @@ void BlockFrequencyInfo::verifyMatch(BlockFrequencyInfo &Other) const { Printable llvm::printBlockFreq(const BlockFrequencyInfo &BFI, BlockFrequency Freq) { return Printable([&BFI, Freq](raw_ostream &OS) { - printBlockFreqImpl(OS, BFI.getEntryFreq(), Freq); + printRelativeBlockFreq(OS, BFI.getEntryFreq(), Freq); }); } diff --git a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp index ae08d56ef098a..9f6e53ba15b6a 100644 --- a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -634,21 +634,6 @@ BlockFrequencyInfoImplBase::getLoopName(const LoopData &Loop) const { return getBlockName(Loop.getHeader()) + 
(Loop.isIrreducible() ? "**" : "*"); } -void llvm::printBlockFreqImpl(raw_ostream &OS, BlockFrequency EntryFreq, - BlockFrequency Freq) { - if (Freq == BlockFrequency(0)) { - OS << "0"; - return; - } - if (EntryFreq == BlockFrequency(0)) { - OS << ""; - return; - } - Scaled64 Block(Freq.getFrequency(), 0); - Scaled64 Entry(EntryFreq.getFrequency(), 0); - OS << Block / Entry; -} - void IrreducibleGraph::addNodesInLoop(const BFIBase::LoopData &OuterLoop) { Start = OuterLoop.getHeader(); Nodes.reserve(OuterLoop.Nodes.size()); diff --git a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp index 7ee72e2144263..cbebdd87398e4 100644 --- a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp +++ b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp @@ -280,7 +280,7 @@ BlockFrequency MachineBlockFrequencyInfo::getEntryFreq() const { Printable llvm::printBlockFreq(const MachineBlockFrequencyInfo &MBFI, BlockFrequency Freq) { return Printable([&MBFI, Freq](raw_ostream &OS) { - printBlockFreqImpl(OS, MBFI.getEntryFreq(), Freq); + printRelativeBlockFreq(OS, MBFI.getEntryFreq(), Freq); }); } diff --git a/llvm/lib/Support/BlockFrequency.cpp b/llvm/lib/Support/BlockFrequency.cpp index 329f1e12cdc29..7d5498e7cb997 100644 --- a/llvm/lib/Support/BlockFrequency.cpp +++ b/llvm/lib/Support/BlockFrequency.cpp @@ -13,6 +13,8 @@ #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/ScaledNumber.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -45,3 +47,18 @@ std::optional BlockFrequency::mul(uint64_t Factor) const { return {}; return BlockFrequency(ResultFrequency); } + +void llvm::printRelativeBlockFreq(raw_ostream &OS, BlockFrequency EntryFreq, + BlockFrequency Freq) { + if (Freq == BlockFrequency(0)) { + OS << "0"; + return; + } + if (EntryFreq == BlockFrequency(0)) { + OS << ""; + return; + } + ScaledNumber Block(Freq.getFrequency(), 0); + 
ScaledNumber Entry(EntryFreq.getFrequency(), 0); + OS << Block / Entry; +} diff --git a/llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml b/llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml index 732fab3e2a378..4d1e5408d86d4 100644 --- a/llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml +++ b/llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml @@ -47,7 +47,9 @@ Symbols: # RUN: yaml2obj %s --docnum=2 -o %t2 # RUN: llvm-objdump %t2 -d --symbolize-operands --no-show-raw-insn --no-leading-addr | \ -# RUN: FileCheck %s --check-prefix=ENTRYCOUNT-BLOCKFREQ +# RUN: FileCheck --match-full-lines --strict-whitespace %s --check-prefix=ENTRYCOUNT-BLOCKFREQ +# RUN: llvm-objdump %t2 -d --symbolize-operands --pretty-pgo-analysis-map --no-show-raw-insn --no-leading-addr | \ +# RUN: FileCheck --match-full-lines --strict-whitespace %s --check-prefix=ENTRYCOUNT-BLOCKFREQ-PRETTY --- !ELF FileHeader: @@ -98,18 +100,28 @@ Symbols: Section: .text.foo Value: 0x0 -# ENTRYCOUNT-BLOCKFREQ: : -# ENTRYCOUNT-BLOCKFREQ: (Entry count: 1000, Frequency: 1000): -# ENTRYCOUNT-BLOCKFREQ: (Frequency: 133): -# ENTRYCOUNT-BLOCKFREQ: (Frequency: 18): -# ENTRYCOUNT-BLOCKFREQ: (Frequency: 1000): +# ENTRYCOUNT-BLOCKFREQ:: +# ENTRYCOUNT-BLOCKFREQ: (Entry count: 1000, Frequency: 1000): +# ENTRYCOUNT-BLOCKFREQ: (Frequency: 133): +# ENTRYCOUNT-BLOCKFREQ: (Frequency: 18): +# ENTRYCOUNT-BLOCKFREQ: (Frequency: 1000): + +# ENTRYCOUNT-BLOCKFREQ-PRETTY:: +# ENTRYCOUNT-BLOCKFREQ-PRETTY: (Entry count: 1000, Frequency: 1.0): +# ENTRYCOUNT-BLOCKFREQ-PRETTY: (Frequency: 0.133): +# ENTRYCOUNT-BLOCKFREQ-PRETTY: (Frequency: 0.018): +# ENTRYCOUNT-BLOCKFREQ-PRETTY: (Frequency: 1.0): ## Check the case where we have entry points, block frequency, and branch ## proabability information. 
# RUN: yaml2obj %s --docnum=3 -o %t3 # RUN: llvm-objdump %t3 -d --symbolize-operands --no-show-raw-insn --no-leading-addr | \ -# RUN: FileCheck %s --check-prefix=ENTRY-FREQ-PROB +# RUN: FileCheck --match-full-lines --strict-whitespace %s --check-prefix=ENTRY-FREQ-PROB +# RUN: llvm-objdump %t3 -d --symbolize-operands --pretty-pgo-analysis-map --no-show-raw-insn --no-leading-addr | \ +# RUN: FileCheck --match-full-lines --strict-whitespace %s --check-prefix=ENTRY-FREQ-PROB-PRETTY +# RUN: llvm-objdump %t3 -d --pretty-pgo-analysis-map --no-show-raw-insn --no-leading-addr 2>&1 | \ +# RUN: FileCheck %s --check-prefix=MISSING-SYMBOLIZE-OPERANDS --- !ELF FileHeader: @@ -154,21 +166,21 @@ Sections: - BBFreq: 1000 Successors: - ID: 1 - BrProb: 0x22222222 + BrProb: 0x10000000 - ID: 2 - BrProb: 0x33333333 + BrProb: 0x15000000 - ID: 3 - BrProb: 0xaaaaaaaa + BrProb: 0x50000000 - BBFreq: 133 Successors: - ID: 2 - BrProb: 0x11111111 + BrProb: 0x10000000 - ID: 3 - BrProb: 0xeeeeeeee + BrProb: 0x70000000 - BBFreq: 18 Successors: - ID: 3 - BrProb: 0xffffffff + BrProb: 0x80000000 - BBFreq: 1000 Successors: [] Symbols: @@ -176,8 +188,16 @@ Symbols: Section: .text.foo Value: 0x0 -# ENTRY-FREQ-PROB: : -# ENTRY-FREQ-PROB: (Entry count: 1000, Frequency: 1000, Successors: BB1:22222222, BB2:33333333, BB3:aaaaaaaa): -# ENTRY-FREQ-PROB: (Frequency: 133, Successors: BB2:11111111, BB3:eeeeeeee): -# ENTRY-FREQ-PROB: (Frequency: 18, Successors: BB3:ffffffff): -# ENTRY-FREQ-PROB: (Frequency: 1000): +# ENTRY-FREQ-PROB:: +# ENTRY-FREQ-PROB: (Entry count: 1000, Frequency: 1000, Successors: BB1:10000000, BB2:15000000, BB3:50000000): +# ENTRY-FREQ-PROB: (Frequency: 133, Successors: BB2:10000000, BB3:70000000): +# ENTRY-FREQ-PROB: (Frequency: 18, Successors: BB3:80000000): +# ENTRY-FREQ-PROB: (Frequency: 1000): + +# ENTRY-FREQ-PROB-PRETTY:: +# ENTRY-FREQ-PROB-PRETTY: (Entry count: 1000, Frequency: 1.0, Successors: BB1:[0x10000000 / 0x80000000 = 12.50%], BB2:[0x15000000 / 0x80000000 = 16.41%], 
BB3:[0x50000000 / 0x80000000 = 62.50%]): +# ENTRY-FREQ-PROB-PRETTY: (Frequency: 0.133, Successors: BB2:[0x10000000 / 0x80000000 = 12.50%], BB3:[0x70000000 / 0x80000000 = 87.50%]): +# ENTRY-FREQ-PROB-PRETTY: (Frequency: 0.018, Successors: BB3:[0x80000000 / 0x80000000 = 100.00%]): +# ENTRY-FREQ-PROB-PRETTY: (Frequency: 1.0): + +# MISSING-SYMBOLIZE-OPERANDS: warning: --symbolize-operands must be enabled for --pretty-pgo-analysis-map to have an effect diff --git a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test index e5a9400c670c0..5faafd4d83b2f 100644 --- a/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test +++ b/llvm/test/tools/llvm-readobj/ELF/bb-addr-map-pgo-analysis-map.test @@ -3,17 +3,19 @@ ## Check 64-bit: # RUN: yaml2obj %s -DBITS=64 -DADDR=0x999999999 -o %t1.x64.o -# RUN: llvm-readobj %t1.x64.o --bb-addr-map 2>&1 | FileCheck %s -DADDR=0x999999999 -DFILE=%t1.x64.o --check-prefix=CHECK +# RUN: llvm-readobj %t1.x64.o --bb-addr-map 2>&1 | FileCheck --match-full-lines %s -DADDR=0x999999999 -DFILE=%t1.x64.o --check-prefixes=CHECK,RAW +# RUN: llvm-readobj %t1.x64.o --bb-addr-map --pretty-pgo-analysis-map 2>&1 | FileCheck --match-full-lines %s -DADDR=0x999999999 -DFILE=%t1.x64.o --check-prefixes=CHECK,PRETTY # RUN: llvm-readelf %t1.x64.o --bb-addr-map | FileCheck %s --check-prefix=GNU +# RUN: llvm-readobj %t1.x64.o --pretty-pgo-analysis-map 2>&1 | FileCheck %s --check-prefix=PRETTY-NO-BAM ## Check 32-bit: # RUN: yaml2obj %s -DBITS=32 -o %t1.x32.o -# RUN: llvm-readobj %t1.x32.o --bb-addr-map 2>&1 | FileCheck -DADDR=0x11111 %s -DFILE=%t1.x32.o --check-prefix=CHECK +# RUN: llvm-readobj %t1.x32.o --bb-addr-map 2>&1 | FileCheck --match-full-lines -DADDR=0x11111 %s -DFILE=%t1.x32.o --check-prefixes=CHECK,RAW # RUN: llvm-readelf %t1.x32.o --bb-addr-map | FileCheck %s --check-prefix=GNU ## Check that a malformed section can be handled. 
# RUN: yaml2obj %s -DBITS=32 -DSIZE=24 -o %t2.o -# RUN: llvm-readobj %t2.o --bb-addr-map 2>&1 | FileCheck %s -DOFFSET=0x00000018 -DFILE=%t2.o --check-prefix=TRUNCATED +# RUN: llvm-readobj %t2.o --bb-addr-map 2>&1 | FileCheck --match-full-lines %s -DOFFSET=0x00000018 -DFILE=%t2.o --check-prefix=TRUNCATED ## Check that missing features can be handled. # RUN: yaml2obj %s -DBITS=32 -DFEATURE=0x2 -o %t3.o @@ -22,7 +24,7 @@ # CHECK: BBAddrMap [ # CHECK-NEXT: Function { # CHECK-NEXT: At: [[ADDR]] -# CHECK-NEXT: warning: '[[FILE]]': could not identify function symbol for address ([[ADDR]]) in SHT_LLVM_BB_ADDR_MAP section with index 3 +# CHECK-NEXT: {{.*}}: warning: '[[FILE]]': could not identify function symbol for address ([[ADDR]]) in SHT_LLVM_BB_ADDR_MAP section with index 3 # CHECK-NEXT: Name: # CHECK-NEXT: BB Ranges [ # CHECK-NEXT: { @@ -55,16 +57,19 @@ # CHECK-NEXT: FuncEntryCount: 100 # CHECK-NEXT: PGO BB entries [ # CHECK-NEXT: { -# CHECK-NEXT: Frequency: 100 +# RAW-NEXT: Frequency: 100 +# PRETTY-NEXT: Frequency: 1.0 # CHECK-NEXT: Successors [ # CHECK-NEXT: { # CHECK-NEXT: ID: 2 -# CHECK-NEXT: Probability: 0xFFFFFFFF +# RAW-NEXT: Probability: 0x80000000 +# PRETTY-NEXT: Probability: 0x80000000 / 0x80000000 = 100.00% # CHECK-NEXT: } # CHECK-NEXT: ] # CHECK-NEXT: } # CHECK-NEXT: { -# CHECK-NEXT: Frequency: 100 +# RAW-NEXT: Frequency: 100 +# PRETTY-NEXT: Frequency: 1.0 # CHECK-NEXT: Successors [ # CHECK-NEXT: ] # CHECK-NEXT: } @@ -95,7 +100,8 @@ # CHECK-NEXT: FuncEntryCount: 8888 # CHECK-NEXT: PGO BB entries [ # CHECK-NEXT: { -# CHECK-NEXT: Frequency: 9000 +# RAW-NEXT: Frequency: 9000 +# PRETTY-NEXT: Frequency: 1.0 # CHECK-NEXT: } # CHECK-NEXT: ] # CHECK-NEXT: } @@ -104,8 +110,10 @@ # GNU: GNUStyle::printBBAddrMaps not implemented +# PRETTY-NO-BAM: warning: --bb-addr-map must be enabled for --pretty-pgo-analysis-map to have an effect + # TRUNCATED: BBAddrMap [ -# TRUNCATED-NEXT: warning: '[[FILE]]': unable to dump SHT_LLVM_BB_ADDR_MAP section with index 3: unable to 
decode LEB128 at offset [[OFFSET]]: malformed uleb128, extends past end +# TRUNCATED-NEXT: {{.*}}: warning: '[[FILE]]': unable to dump SHT_LLVM_BB_ADDR_MAP section with index 3: unable to decode LEB128 at offset [[OFFSET]]: malformed uleb128, extends past end # TRUNCATED-NEXT: ] ## Check that the other valid section is properly dumped. # TRUNCATED-NEXT: BBAddrMap [ @@ -192,7 +200,7 @@ Sections: - BBFreq: 100 Successors: - ID: 2 - BrProb: 0xFFFFFFFF + BrProb: 0x80000000 - BBFreq: 100 Successors: [] - FuncEntryCount: 8888 diff --git a/llvm/tools/llvm-objdump/ObjdumpOpts.td b/llvm/tools/llvm-objdump/ObjdumpOpts.td index c1dec5ced89d3..c3764c6e97534 100644 --- a/llvm/tools/llvm-objdump/ObjdumpOpts.td +++ b/llvm/tools/llvm-objdump/ObjdumpOpts.td @@ -210,6 +210,10 @@ def : Flag<["-"], "t">, Alias, HelpText<"Alias for --syms">; def symbolize_operands : Flag<["--"], "symbolize-operands">, HelpText<"Symbolize instruction operands when disassembling">; +def pretty_pgo_analysis_map : Flag<["--"], "pretty-pgo-analysis-map">, + HelpText<"Display PGO analysis values with " + "formatting rather than raw numbers">; + def dynamic_syms : Flag<["--"], "dynamic-syms">, HelpText<"Display the contents of the dynamic symbol table">; def : Flag<["-"], "T">, Alias, diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 948a5d74e1ab2..78cf67b1e630b 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -188,8 +188,10 @@ class BBAddrMapFunctionEntry { const BBAddrMap &getAddrMap() const { return AddrMap; } // Returns the PGO string associated with the entry of index `PGOBBEntryIndex` - // in `PGOMap`. - std::string constructPGOLabelString(size_t PGOBBEntryIndex) const { + // in `PGOMap`. If PrettyPGOAnalysis is true, prints BFI as relative frequency + // and BPI as percentage. Otherwise raw values are displayed. 
+ std::string constructPGOLabelString(size_t PGOBBEntryIndex, + bool PrettyPGOAnalysis) const { if (!PGOMap.FeatEnable.hasPGOAnalysis()) return ""; std::string PGOString; @@ -211,7 +213,12 @@ class BBAddrMapFunctionEntry { PGOMap.BBEntries[PGOBBEntryIndex]; if (PGOMap.FeatEnable.BBFreq) { - PGOSS << "Frequency: " << Twine(PGOBBEntry.BlockFreq.getFrequency()); + PGOSS << "Frequency: "; + if (PrettyPGOAnalysis) + printRelativeBlockFreq(PGOSS, PGOMap.BBEntries.front().BlockFreq, + PGOBBEntry.BlockFreq); + else + PGOSS << Twine(PGOBBEntry.BlockFreq.getFrequency()); if (PGOMap.FeatEnable.BrProb && PGOBBEntry.Successors.size() > 0) { PGOSS << ", "; } @@ -220,9 +227,12 @@ class BBAddrMapFunctionEntry { PGOSS << "Successors: "; interleaveComma( PGOBBEntry.Successors, PGOSS, - [&PGOSS](const PGOAnalysisMap::PGOBBEntry::SuccessorEntry &SE) { + [&](const PGOAnalysisMap::PGOBBEntry::SuccessorEntry &SE) { PGOSS << "BB" << SE.ID << ":"; - PGOSS.write_hex(SE.Prob.getNumerator()); + if (PrettyPGOAnalysis) + PGOSS << "[" << SE.Prob << "]"; + else + PGOSS.write_hex(SE.Prob.getNumerator()); }); } } @@ -331,6 +341,7 @@ static bool HasStopAddressFlag; bool objdump::SymbolTable; static bool SymbolizeOperands; +static bool PrettyPGOAnalysisMap; static bool DynamicSymbolTable; std::string objdump::TripleName; bool objdump::UnwindInfo; @@ -1410,8 +1421,8 @@ static void collectBBAddrMapLabels( std::string LabelString = ("BB" + Twine(BBEntry.ID)).str(); Labels[BBAddress].push_back( - {LabelString, - FunctionMap->constructPGOLabelString(NumBBEntriesBeforeRange + I)}); + {LabelString, FunctionMap->constructPGOLabelString( + NumBBEntriesBeforeRange + I, PrettyPGOAnalysisMap)}); } } @@ -3473,6 +3484,10 @@ static void parseObjdumpOptions(const llvm::opt::InputArgList &InputArgs) { HasStopAddressFlag = InputArgs.hasArg(OBJDUMP_stop_address_EQ); SymbolTable = InputArgs.hasArg(OBJDUMP_syms); SymbolizeOperands = InputArgs.hasArg(OBJDUMP_symbolize_operands); + PrettyPGOAnalysisMap = 
InputArgs.hasArg(OBJDUMP_pretty_pgo_analysis_map); + if (PrettyPGOAnalysisMap && !SymbolizeOperands) + reportCmdLineWarning("--symbolize-operands must be enabled for " + "--pretty-pgo-analysis-map to have an effect"); DynamicSymbolTable = InputArgs.hasArg(OBJDUMP_dynamic_syms); TripleName = InputArgs.getLastArgValue(OBJDUMP_triple_EQ).str(); UnwindInfo = InputArgs.hasArg(OBJDUMP_unwind_info); diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 4be678df44125..e78732353cc87 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -593,7 +593,7 @@ template class GNUELFDumper : public ELFDumper { void printVersionDefinitionSection(const Elf_Shdr *Sec) override; void printVersionDependencySection(const Elf_Shdr *Sec) override; void printCGProfile() override; - void printBBAddrMaps() override; + void printBBAddrMaps(bool PrettyPGOAnalysis) override; void printAddrsig() override; void printNotes() override; void printELFLinkerOptions() override; @@ -704,7 +704,7 @@ template class LLVMELFDumper : public ELFDumper { void printVersionDefinitionSection(const Elf_Shdr *Sec) override; void printVersionDependencySection(const Elf_Shdr *Sec) override; void printCGProfile() override; - void printBBAddrMaps() override; + void printBBAddrMaps(bool PrettyPGOAnalysis) override; void printAddrsig() override; void printNotes() override; void printELFLinkerOptions() override; @@ -5036,7 +5036,8 @@ template void GNUELFDumper::printCGProfile() { OS << "GNUStyle::printCGProfile not implemented\n"; } -template void GNUELFDumper::printBBAddrMaps() { +template +void GNUELFDumper::printBBAddrMaps(bool /*PrettyPGOAnalysis*/) { OS << "GNUStyle::printBBAddrMaps not implemented\n"; } @@ -7526,7 +7527,8 @@ template void LLVMELFDumper::printCGProfile() { } } -template void LLVMELFDumper::printBBAddrMaps() { +template +void LLVMELFDumper::printBBAddrMaps(bool PrettyPGOAnalysis) { bool IsRelocatable = 
this->Obj.getHeader().e_type == ELF::ET_REL; using Elf_Shdr = typename ELFT::Shdr; auto IsMatch = [](const Elf_Shdr &Sec) -> bool { @@ -7605,21 +7607,28 @@ template void LLVMELFDumper::printBBAddrMaps() { for (const PGOAnalysisMap::PGOBBEntry &PBBE : PAM.BBEntries) { DictScope L(W); - /// FIXME: currently we just emit the raw frequency, it may be - /// better to provide an option to scale it by the first entry - /// frequence using BlockFrequency::Scaled64 number - if (PAM.FeatEnable.BBFreq) - W.printNumber("Frequency", PBBE.BlockFreq.getFrequency()); + if (PAM.FeatEnable.BBFreq) { + if (PrettyPGOAnalysis) { + std::string BlockFreqStr; + raw_string_ostream SS(BlockFreqStr); + printRelativeBlockFreq(SS, PAM.BBEntries.front().BlockFreq, + PBBE.BlockFreq); + W.printString("Frequency", BlockFreqStr); + } else { + W.printNumber("Frequency", PBBE.BlockFreq.getFrequency()); + } + } if (PAM.FeatEnable.BrProb) { ListScope L(W, "Successors"); for (const auto &Succ : PBBE.Successors) { DictScope L(W); W.printNumber("ID", Succ.ID); - /// FIXME: currently we just emit the raw numerator of the - /// probably, it may be better to provide an option to emit it - /// as a percentage or other prettied representation - W.printHex("Probability", Succ.Prob.getNumerator()); + if (PrettyPGOAnalysis) { + W.printObject("Probability", Succ.Prob); + } else { + W.printHex("Probability", Succ.Prob.getNumerator()); + } } } } diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h index 3958dd3a33333..cd744e3bbfb71 100644 --- a/llvm/tools/llvm-readobj/ObjDumper.h +++ b/llvm/tools/llvm-readobj/ObjDumper.h @@ -129,7 +129,9 @@ class ObjDumper { virtual void printGroupSections() {} virtual void printHashHistograms() {} virtual void printCGProfile() {} - virtual void printBBAddrMaps() {} + // If PrettyPGOAnalysis is true, prints BFI as relative frequency and BPI as + // percentage. Otherwise raw values are displayed. 
+ virtual void printBBAddrMaps(bool PrettyPGOAnalysis) {} virtual void printAddrsig() {} virtual void printNotes() {} virtual void printELFLinkerOptions() {} diff --git a/llvm/tools/llvm-readobj/Opts.td b/llvm/tools/llvm-readobj/Opts.td index 018facc278e89..1e9cde6b2e87c 100644 --- a/llvm/tools/llvm-readobj/Opts.td +++ b/llvm/tools/llvm-readobj/Opts.td @@ -19,6 +19,7 @@ def all : FF<"all", "Equivalent to setting: --file-header, --program-headers, -- "--section-groups and --histogram">; def arch_specific : FF<"arch-specific", "Display architecture-specific information">; def bb_addr_map : FF<"bb-addr-map", "Display the BB address map section">; +def pretty_pgo_analysis_map : FF<"pretty-pgo-analysis-map", "Display PGO analysis values with formatting rather than raw numbers">; def cg_profile : FF<"cg-profile", "Display call graph profile section">; def decompress : FF<"decompress", "Dump decompressed section content when used with -x or -p">; defm demangle : BB<"demangle", "Demangle symbol names", "Do not demangle symbol names (default)">; diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp index 979433d69011c..a0b5765660165 100644 --- a/llvm/tools/llvm-readobj/llvm-readobj.cpp +++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp @@ -95,6 +95,7 @@ static bool Addrsig; static bool All; static bool ArchSpecificInfo; static bool BBAddrMap; +static bool PrettyPGOAnalysisMap; bool ExpandRelocs; static bool CGProfile; static bool Decompress; @@ -212,6 +213,11 @@ static void parseOptions(const opt::InputArgList &Args) { opts::All = Args.hasArg(OPT_all); opts::ArchSpecificInfo = Args.hasArg(OPT_arch_specific); opts::BBAddrMap = Args.hasArg(OPT_bb_addr_map); + opts::PrettyPGOAnalysisMap = Args.hasArg(OPT_pretty_pgo_analysis_map); + if (opts::PrettyPGOAnalysisMap && !opts::BBAddrMap) + WithColor::warning(errs(), ToolName) + << "--bb-addr-map must be enabled for --pretty-pgo-analysis-map to " + "have an effect\n"; opts::CGProfile = 
Args.hasArg(OPT_cg_profile); opts::Decompress = Args.hasArg(OPT_decompress); opts::Demangle = Args.hasFlag(OPT_demangle, OPT_no_demangle, false); @@ -466,7 +472,7 @@ static void dumpObject(ObjectFile &Obj, ScopedPrinter &Writer, if (opts::CGProfile) Dumper->printCGProfile(); if (opts::BBAddrMap) - Dumper->printBBAddrMaps(); + Dumper->printBBAddrMaps(opts::PrettyPGOAnalysisMap); if (opts::Addrsig) Dumper->printAddrsig(); if (opts::Notes) From d23ef9ef3685eb42ebf719bc28cfe2e4651932fc Mon Sep 17 00:00:00 2001 From: Sirraide Date: Tue, 27 Feb 2024 20:19:44 +0100 Subject: [PATCH 483/546] [Clang] [Sema] Handle placeholders in '.*' expressions (#83103) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When analysing whether we should handle a binary expression as an overloaded operator call or a builtin operator, we were calling `checkPlaceholderForOverload()`, which takes care of any placeholders that are not overload sets—which would usually make sense since those need to be handled as part of overload resolution. Unfortunately, we were also doing that for `.*`, which is not overloadable, and then proceeding to create a builtin operator anyway, which would crash if the RHS happened to be an unresolved overload set (due hitting an assertion in `CreateBuiltinBinOp()`—specifically, in one of its callees—in the `.*` case that makes sure its arguments aren’t placeholders). This pr instead makes it so we check for *all* placeholders early if the operator is `.*`. It’s worth noting that, 1. In the `.*` case, we now additionally also check for *any* placeholders (not just non-overload-sets) in the LHS; this shouldn’t make a difference, however—at least I couldn’t think of a way to trigger the assertion with an overload set as the LHS of `.*`; it is worth noting that the assertion in question would also complain if the LHS happened to be of placeholder type, though. 2. 
There is another case in which we also don’t perform overload resolution—namely `=` if the LHS is not of class or enumeration type after handling non-overload-set placeholders—as in the `.*` case, but similarly to 1., I first couldn’t think of a way of getting this case to crash, and secondly, `CreateBuiltinBinOp()` doesn’t seem to care about placeholders in the LHS or RHS in the `=` case (from what I can tell, it, or rather one of its callees, only checks that the LHS is not a pseudo-object type, but those will have already been handled by the call to `checkPlaceholderForOverload()` by the time we get to this function), so I don’t think this case suffers from the same problem. This fixes #53815. --------- Co-authored-by: Aaron Ballman --- clang/docs/ReleaseNotes.rst | 2 ++ clang/lib/Sema/SemaOverload.cpp | 22 +++++++++++++++++----- clang/test/SemaCXX/gh53815.cpp | 21 +++++++++++++++++++++ 3 files changed, 40 insertions(+), 5 deletions(-) create mode 100644 clang/test/SemaCXX/gh53815.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c60c5682dbd82..7e16b9f0c67db 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -288,6 +288,8 @@ Bug Fixes to C++ Support templates when determining the primary template of an explicit specialization. - Fixed a crash in Microsoft compatibility mode where unqualified dependent base class lookup searches the bases of an incomplete class. +- Fix a crash when an unresolved overload set is encountered on the RHS of a ``.*`` operator. 
+ (`#53815 `_) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index ecad2b9681655..7d38043890ca2 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -14571,6 +14571,23 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc, CurFPFeatureOverrides()); } + // If this is the .* operator, which is not overloadable, just + // create a built-in binary operator. + if (Opc == BO_PtrMemD) { + auto CheckPlaceholder = [&](Expr *&Arg) { + ExprResult Res = CheckPlaceholderExpr(Arg); + if (Res.isUsable()) + Arg = Res.get(); + return !Res.isUsable(); + }; + + // CreateBuiltinBinOp() doesn't like it if we tell it to create a '.*' + // expression that contains placeholders (in either the LHS or RHS). + if (CheckPlaceholder(Args[0]) || CheckPlaceholder(Args[1])) + return ExprError(); + return CreateBuiltinBinOp(OpLoc, Opc, Args[0], Args[1]); + } + // Always do placeholder-like conversions on the RHS. if (checkPlaceholderForOverload(*this, Args[1])) return ExprError(); @@ -14590,11 +14607,6 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc, if (Opc == BO_Assign && !Args[0]->getType()->isOverloadableType()) return CreateBuiltinBinOp(OpLoc, Opc, Args[0], Args[1]); - // If this is the .* operator, which is not overloadable, just - // create a built-in binary operator. - if (Opc == BO_PtrMemD) - return CreateBuiltinBinOp(OpLoc, Opc, Args[0], Args[1]); - // Build the overload set. 
OverloadCandidateSet CandidateSet(OpLoc, OverloadCandidateSet::CSK_Operator, OverloadCandidateSet::OperatorRewriteInfo( diff --git a/clang/test/SemaCXX/gh53815.cpp b/clang/test/SemaCXX/gh53815.cpp new file mode 100644 index 0000000000000..326c911c7bfaf --- /dev/null +++ b/clang/test/SemaCXX/gh53815.cpp @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++20 %s +// expected-no-diagnostics + +// Check that we don't crash due to forgetting to check for placeholders +// in the RHS of '.*'. + +template +static bool has_explicitly_named_overload() { + return requires { Fn().*&Fn::operator(); }; +} + +int main() { + has_explicitly_named_overload(); +} + +template +constexpr bool has_explicitly_named_overload_2() { + return requires { Fn().*&Fn::operator(); }; +} + +static_assert(!has_explicitly_named_overload_2()); From b3189b13b274a3411f939574aa573a7656bf372b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 27 Feb 2024 11:23:17 -0800 Subject: [PATCH 484/546] [flang][cuda] CUF kernel loop directive (#82836) This patch introduces a new operation to represent the CUDA Fortran kernel loop directive. This operation is modeled as a LoopLikeOp operation in a similar way to acc.loop. The CUFKernelDoConstruct parse tree node is also placed correctly in the PFTBuilder to be available in PFT evaluations. Lowering from the flang parse-tree to MLIR is also done. 
--- flang/include/flang/Lower/PFTBuilder.h | 5 +- .../include/flang/Optimizer/Dialect/FIROps.td | 27 ++++ flang/lib/Lower/Bridge.cpp | 129 ++++++++++++++++++ flang/lib/Optimizer/Dialect/FIROps.cpp | 97 +++++++++++++ .../Lower/CUDA/cuda-kernel-loop-directive.cuf | 51 +++++++ 5 files changed, 307 insertions(+), 2 deletions(-) create mode 100644 flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf diff --git a/flang/include/flang/Lower/PFTBuilder.h b/flang/include/flang/Lower/PFTBuilder.h index c2b0fdbf357cd..9913f584133fa 100644 --- a/flang/include/flang/Lower/PFTBuilder.h +++ b/flang/include/flang/Lower/PFTBuilder.h @@ -138,7 +138,8 @@ using Directives = std::tuple; + parser::OpenMPDeclarativeConstruct, parser::OmpEndLoopDirective, + parser::CUFKernelDoConstruct>; using DeclConstructs = std::tuple; @@ -178,7 +179,7 @@ static constexpr bool isNopConstructStmt{common::HasMember< template static constexpr bool isExecutableDirective{common::HasMember< A, std::tuple>}; + parser::OpenMPConstruct, parser::CUFKernelDoConstruct>>}; template static constexpr bool isFunctionLike{common::HasMember< diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index 08239230f793f..db5e5f4bc682e 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -3127,4 +3127,31 @@ def fir_BoxOffsetOp : fir_Op<"box_offset", [NoMemoryEffect]> { ]; } +def fir_CUDAKernelOp : fir_Op<"cuda_kernel", [AttrSizedOperandSegments, + DeclareOpInterfaceMethods]> { + + let arguments = (ins + Variadic:$grid, // empty means `*` + Variadic:$block, // empty means `*` + Optional:$stream, + Variadic:$lowerbound, + Variadic:$upperbound, + Variadic:$step, + OptionalAttr:$n + ); + + let regions = (region AnyRegion:$region); + + let assemblyFormat = [{ + `<` `<` `<` custom($grid, type($grid)) `,` + custom($block, type($block)) + ( `,` `stream` `=` $stream^ )? 
`>` `>` `>` + custom($region, $lowerbound, type($lowerbound), + $upperbound, type($upperbound), $step, type($step)) + attr-dict + }]; + + let hasVerifier = 1; +} + #endif diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 83555e7cd82e7..f865b53f74deb 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -2474,6 +2474,135 @@ class FirConverter : public Fortran::lower::AbstractConverter { // Handled by genFIR(const Fortran::parser::OpenACCDeclarativeConstruct &) } + void genFIR(const Fortran::parser::CUFKernelDoConstruct &kernel) { + localSymbols.pushScope(); + const Fortran::parser::CUFKernelDoConstruct::Directive &dir = + std::get(kernel.t); + + mlir::Location loc = genLocation(dir.source); + + Fortran::lower::StatementContext stmtCtx; + + unsigned nestedLoops = 1; + + const auto &nLoops = + std::get>(dir.t); + if (nLoops) + nestedLoops = *Fortran::semantics::GetIntValue(*nLoops); + + mlir::IntegerAttr n; + if (nestedLoops > 1) + n = builder->getIntegerAttr(builder->getI64Type(), nestedLoops); + + const std::list &grid = std::get<1>(dir.t); + const std::list &block = std::get<2>(dir.t); + const std::optional &stream = + std::get<3>(dir.t); + + llvm::SmallVector gridValues; + for (const Fortran::parser::ScalarIntExpr &expr : grid) + gridValues.push_back(fir::getBase( + genExprValue(*Fortran::semantics::GetExpr(expr), stmtCtx))); + llvm::SmallVector blockValues; + for (const Fortran::parser::ScalarIntExpr &expr : block) + blockValues.push_back(fir::getBase( + genExprValue(*Fortran::semantics::GetExpr(expr), stmtCtx))); + mlir::Value streamValue; + if (stream) + streamValue = fir::getBase( + genExprValue(*Fortran::semantics::GetExpr(*stream), stmtCtx)); + + const auto &outerDoConstruct = + std::get>(kernel.t); + + llvm::SmallVector locs; + locs.push_back(loc); + llvm::SmallVector lbs, ubs, steps; + + mlir::Type idxTy = builder->getIndexType(); + + llvm::SmallVector ivTypes; + llvm::SmallVector ivLocs; + llvm::SmallVector 
ivValues; + for (unsigned i = 0; i < nestedLoops; ++i) { + const Fortran::parser::LoopControl *loopControl; + Fortran::lower::pft::Evaluation *loopEval = + &getEval().getFirstNestedEvaluation(); + + mlir::Location crtLoc = loc; + if (i == 0) { + loopControl = &*outerDoConstruct->GetLoopControl(); + crtLoc = + genLocation(Fortran::parser::FindSourceLocation(outerDoConstruct)); + } else { + auto *doCons = loopEval->getIf(); + assert(doCons && "expect do construct"); + loopControl = &*doCons->GetLoopControl(); + crtLoc = genLocation(Fortran::parser::FindSourceLocation(*doCons)); + } + + locs.push_back(crtLoc); + + const Fortran::parser::LoopControl::Bounds *bounds = + std::get_if(&loopControl->u); + assert(bounds && "Expected bounds on the loop construct"); + + Fortran::semantics::Symbol &ivSym = + bounds->name.thing.symbol->GetUltimate(); + ivValues.push_back(getSymbolAddress(ivSym)); + + lbs.push_back(builder->createConvert( + crtLoc, idxTy, + fir::getBase(genExprValue(*Fortran::semantics::GetExpr(bounds->lower), + stmtCtx)))); + ubs.push_back(builder->createConvert( + crtLoc, idxTy, + fir::getBase(genExprValue(*Fortran::semantics::GetExpr(bounds->upper), + stmtCtx)))); + if (bounds->step) + steps.push_back(fir::getBase( + genExprValue(*Fortran::semantics::GetExpr(bounds->step), stmtCtx))); + else // If `step` is not present, assume it is `1`. 
+ steps.push_back(builder->createIntegerConstant(loc, idxTy, 1)); + + ivTypes.push_back(idxTy); + ivLocs.push_back(crtLoc); + if (i < nestedLoops - 1) + loopEval = &*std::next(loopEval->getNestedEvaluations().begin()); + } + + auto op = builder->create( + loc, gridValues, blockValues, streamValue, lbs, ubs, steps, n); + builder->createBlock(&op.getRegion(), op.getRegion().end(), ivTypes, + ivLocs); + mlir::Block &b = op.getRegion().back(); + builder->setInsertionPointToStart(&b); + + for (auto [arg, value] : llvm::zip( + op.getLoopRegions().front()->front().getArguments(), ivValues)) { + mlir::Value convArg = + builder->createConvert(loc, fir::unwrapRefType(value.getType()), arg); + builder->create(loc, convArg, value); + } + + builder->create(loc); + builder->setInsertionPointToStart(&b); + + Fortran::lower::pft::Evaluation *crtEval = &getEval(); + if (crtEval->lowerAsStructured()) { + crtEval = &crtEval->getFirstNestedEvaluation(); + for (int64_t i = 1; i < nestedLoops; i++) + crtEval = &*std::next(crtEval->getNestedEvaluations().begin()); + } + + // Generate loop body + for (Fortran::lower::pft::Evaluation &e : crtEval->getNestedEvaluations()) + genFIR(e); + + builder->setInsertionPointAfter(op); + localSymbols.popScope(); + } + void genFIR(const Fortran::parser::OpenMPConstruct &omp) { mlir::OpBuilder::InsertPoint insertPt = builder->saveInsertionPoint(); genOpenMPConstruct(*this, localSymbols, bridge.getSemanticsContext(), diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 0a534cdb3c487..9bb10a42a3997 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -3866,6 +3866,103 @@ mlir::LogicalResult fir::DeclareOp::verify() { return fortranVar.verifyDeclareLikeOpImpl(getMemref()); } +llvm::SmallVector fir::CUDAKernelOp::getLoopRegions() { + return {&getRegion()}; +} + +mlir::ParseResult parseCUFKernelValues( + mlir::OpAsmParser &parser, + llvm::SmallVectorImpl &values, + 
llvm::SmallVectorImpl &types) { + if (mlir::succeeded(parser.parseOptionalStar())) + return mlir::success(); + + if (parser.parseOptionalLParen()) { + if (mlir::failed(parser.parseCommaSeparatedList( + mlir::AsmParser::Delimiter::None, [&]() { + if (parser.parseOperand(values.emplace_back())) + return mlir::failure(); + return mlir::success(); + }))) + return mlir::failure(); + if (parser.parseRParen()) + return mlir::failure(); + } else { + if (parser.parseOperand(values.emplace_back())) + return mlir::failure(); + return mlir::success(); + } + return mlir::success(); +} + +void printCUFKernelValues(mlir::OpAsmPrinter &p, mlir::Operation *op, + mlir::ValueRange values, mlir::TypeRange types) { + if (values.empty()) + p << "*"; + + if (values.size() > 1) + p << "("; + llvm::interleaveComma(values, p, [&p](mlir::Value v) { p << v; }); + if (values.size() > 1) + p << ")"; +} + +mlir::ParseResult parseCUFKernelLoopControl( + mlir::OpAsmParser &parser, mlir::Region ®ion, + llvm::SmallVectorImpl &lowerbound, + llvm::SmallVectorImpl &lowerboundType, + llvm::SmallVectorImpl &upperbound, + llvm::SmallVectorImpl &upperboundType, + llvm::SmallVectorImpl &step, + llvm::SmallVectorImpl &stepType) { + + llvm::SmallVector inductionVars; + if (parser.parseLParen() || + parser.parseArgumentList(inductionVars, + mlir::OpAsmParser::Delimiter::None, + /*allowType=*/true) || + parser.parseRParen() || parser.parseEqual() || parser.parseLParen() || + parser.parseOperandList(lowerbound, inductionVars.size(), + mlir::OpAsmParser::Delimiter::None) || + parser.parseColonTypeList(lowerboundType) || parser.parseRParen() || + parser.parseKeyword("to") || parser.parseLParen() || + parser.parseOperandList(upperbound, inductionVars.size(), + mlir::OpAsmParser::Delimiter::None) || + parser.parseColonTypeList(upperboundType) || parser.parseRParen() || + parser.parseKeyword("step") || parser.parseLParen() || + parser.parseOperandList(step, inductionVars.size(), + mlir::OpAsmParser::Delimiter::None) 
|| + parser.parseColonTypeList(stepType) || parser.parseRParen()) + return mlir::failure(); + return parser.parseRegion(region, inductionVars); +} + +void printCUFKernelLoopControl( + mlir::OpAsmPrinter &p, mlir::Operation *op, mlir::Region ®ion, + mlir::ValueRange lowerbound, mlir::TypeRange lowerboundType, + mlir::ValueRange upperbound, mlir::TypeRange upperboundType, + mlir::ValueRange steps, mlir::TypeRange stepType) { + mlir::ValueRange regionArgs = region.front().getArguments(); + if (!regionArgs.empty()) { + p << "("; + llvm::interleaveComma( + regionArgs, p, [&p](mlir::Value v) { p << v << " : " << v.getType(); }); + p << ") = (" << lowerbound << " : " << lowerboundType << ") to (" + << upperbound << " : " << upperboundType << ") " + << " step (" << steps << " : " << stepType << ") "; + } + p.printRegion(region, /*printEntryBlockArgs=*/false); +} + +mlir::LogicalResult fir::CUDAKernelOp::verify() { + if (getLowerbound().size() != getUpperbound().size() || + getLowerbound().size() != getStep().size()) + return emitOpError( + "expect same number of values in lowerbound, upperbound and step"); + + return mlir::success(); +} + //===----------------------------------------------------------------------===// // FIROpsDialect //===----------------------------------------------------------------------===// diff --git a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf new file mode 100644 index 0000000000000..db628fe756b95 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf @@ -0,0 +1,51 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test lowering of CUDA kernel loop directive. + +subroutine sub1() + integer :: i, j + integer, parameter :: n = 100 + real :: a(n), b(n) + real :: c(n,n), d(n,n) + +! CHECK-LABEL: func.func @_QPsub1() +! 
CHECK: %[[IV:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub1Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) + + !$cuf kernel do <<< 1, 2 >>> + do i = 1, n + a(i) = a(i) * b(i) + end do + +! CHECK: %[[LB:.*]] = fir.convert %c1{{.*}} : (i32) -> index +! CHECK: %[[UB:.*]] = fir.convert %c100{{.*}} : (i32) -> index +! CHECK: %[[STEP:.*]] = arith.constant 1 : index +! CHECK: fir.cuda_kernel<<<%c1_i32, %c2_i32>>> (%[[ARG0:.*]] : index) = (%[[LB]] : index) to (%[[UB]] : index) step (%[[STEP]] : index) +! CHECK-NOT: fir.do_loop +! CHECK: %[[ARG0_I32:.*]] = fir.convert %[[ARG0]] : (index) -> i32 +! CHECK: fir.store %[[ARG0_I32]] to %[[IV]]#1 : !fir.ref + + + !$cuf kernel do <<< *, * >>> + do i = 1, n + a(i) = a(i) * b(i) + end do + +! CHECK: fir.cuda_kernel<<<*, *>>> (%{{.*}} : index) = (%{{.*}} : index) to (%{{.*}} : index) step (%{{.*}} : index) + + !$cuf kernel do(2) <<< 1, (256,1) >>> + do i = 1, n + do j = 1, n + c(i,j) = c(i,j) * d(i,j) + end do + end do + +! CHECK: fir.cuda_kernel<<<%c1{{.*}}, (%c256{{.*}}, %c1{{.*}})>>> (%{{.*}} : index, %{{.*}} : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index) step (%{{.*}}, %{{.*}} : index, index) +! CHECK: {n = 2 : i64} + +! TODO: currently these trigger error in the parser +! !$cuf kernel do(2) <<< (1,*), (256,1) >>> +! !$cuf kernel do(2) <<< (*,*), (32,4) >>> +end + + + From f42e321b9fb54300c4450e699cd3cc453e994b58 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 27 Feb 2024 19:45:43 +0000 Subject: [PATCH 485/546] [AArch64] Use FMOVDr for clearing upper bits (#83107) This adds some tablegen patterns for generating FMOVDr from concat(X, zeroes), as the FMOV will implicitly zero the upper bits of the register. An extra AArch64MIPeepholeOpt is needed to make sure we can remove the FMOV in the same way we would remove the insert code. 
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 48 +++++++++---------- .../Target/AArch64/AArch64MIPeepholeOpt.cpp | 21 ++++++++ .../implicitly-set-zero-high-64-bits.ll | 28 +++-------- llvm/test/CodeGen/AArch64/vecreduce-add.ll | 8 ++-- 4 files changed, 55 insertions(+), 50 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index e73bc0d89e4c9..b01a8cd00025f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6667,31 +6667,29 @@ def : Pat<(vector_extract (v8bf16 V128:$Rn), VectorIndexH:$idx), // All concat_vectors operations are canonicalised to act on i64 vectors for // AArch64. In the general case we need an instruction, which had just as well be // INS. -class ConcatPat - : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)), - (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1, - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>; - -def : ConcatPat; -def : ConcatPat; -def : ConcatPat; -def : ConcatPat; -def : ConcatPat; -def : ConcatPat; -def : ConcatPat; -def : ConcatPat; - -// If the high lanes are undef, though, we can just ignore them: -class ConcatUndefPat - : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)), - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>; - -def : ConcatUndefPat; -def : ConcatUndefPat; -def : ConcatUndefPat; -def : ConcatUndefPat; -def : ConcatUndefPat; -def : ConcatUndefPat; +multiclass ConcatPat { + def : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)), + (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1, + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>; + + // If the high lanes are zero we can instead emit a d->d register mov, which + // will implicitly clear the upper bits. 
+ def : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), immAllZerosV)), + (SUBREG_TO_REG (i64 0), (FMOVDr V64:$Rn), dsub)>; + + // If the high lanes are undef we can just ignore them: + def : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>; +} + +defm : ConcatPat; +defm : ConcatPat; +defm : ConcatPat; +defm : ConcatPat; +defm : ConcatPat; +defm : ConcatPat; +defm : ConcatPat; +defm : ConcatPat; //---------------------------------------------------------------------------- // AdvSIMD across lanes instructions diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 87aa3b98d9382..6865850cf04fe 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -127,6 +127,7 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass { bool visitINSERT(MachineInstr &MI); bool visitINSviGPR(MachineInstr &MI, unsigned Opc); bool visitINSvi64lane(MachineInstr &MI); + bool visitFMOVDr(MachineInstr &MI); bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -670,6 +671,23 @@ bool AArch64MIPeepholeOpt::visitINSvi64lane(MachineInstr &MI) { return true; } +bool AArch64MIPeepholeOpt::visitFMOVDr(MachineInstr &MI) { + // An FMOVDr sets the high 64-bits to zero implicitly, similar to ORR for GPR. + MachineInstr *Low64MI = MRI->getUniqueVRegDef(MI.getOperand(1).getReg()); + if (!Low64MI || !is64bitDefwithZeroHigh64bit(Low64MI, MRI)) + return false; + + // Let's remove MIs for high 64-bits. 
+ Register OldDef = MI.getOperand(0).getReg(); + Register NewDef = MI.getOperand(1).getReg(); + MRI->constrainRegClass(NewDef, MRI->getRegClass(OldDef)); + MRI->replaceRegWith(OldDef, NewDef); + LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n"); + MI.eraseFromParent(); + + return true; +} + bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -748,6 +766,9 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) { case AArch64::INSvi64lane: Changed |= visitINSvi64lane(MI); break; + case AArch64::FMOVDr: + Changed |= visitFMOVDr(MI); + break; } } } diff --git a/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll b/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll index 1eb9eab1c21e6..a949eaac5cfa2 100644 --- a/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll +++ b/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll @@ -137,9 +137,7 @@ entry: define <16 x i8> @insertzero_v8i8(<8 x i8> %a) { ; CHECK-LABEL: insertzero_v8i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: fmov d0, d0 ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> @@ -149,9 +147,7 @@ entry: define <8 x i16> @insertzero_v4i16(<4 x i16> %a) { ; CHECK-LABEL: insertzero_v4i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: fmov d0, d0 ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> zeroinitializer, <8 x i32> @@ -161,9 +157,7 @@ entry: define <4 x i32> @insertzero_v2i32(<2 x i32> %a) { ; CHECK-LABEL: insertzero_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; 
CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: fmov d0, d0 ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> zeroinitializer, <4 x i32> @@ -173,9 +167,7 @@ entry: define <2 x i64> @insertzero_v1i64(<1 x i64> %a) { ; CHECK-LABEL: insertzero_v1i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: fmov d0, d0 ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <1 x i64> %a, <1 x i64> zeroinitializer, <2 x i32> @@ -185,9 +177,7 @@ entry: define <8 x half> @insertzero_v4f16(<4 x half> %a) { ; CHECK-LABEL: insertzero_v4f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: fmov d0, d0 ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <8 x i32> @@ -210,9 +200,7 @@ entry: define <4 x float> @insertzero_v2f32(<2 x float> %a) { ; CHECK-LABEL: insertzero_v2f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: fmov d0, d0 ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <4 x i32> @@ -222,9 +210,7 @@ entry: define <2 x double> @insertzero_v1f64(<1 x double> %a) { ; CHECK-LABEL: insertzero_v1f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: fmov d0, d0 ; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <1 x double> %a, <1 x double> zeroinitializer, <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 86dd1bdd511eb..66b49466cc736 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ 
b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -2182,8 +2182,8 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) { ; CHECK-GI-DOT-NEXT: ldr b5, [x0, #15] ; CHECK-GI-DOT-NEXT: mov v2.b[14], v6.b[0] ; CHECK-GI-DOT-NEXT: ldr b6, [x1, #15] -; CHECK-GI-DOT-NEXT: mov v3.d[1], v0.d[0] -; CHECK-GI-DOT-NEXT: mov v4.d[1], v0.d[0] +; CHECK-GI-DOT-NEXT: fmov d3, d3 +; CHECK-GI-DOT-NEXT: fmov d4, d4 ; CHECK-GI-DOT-NEXT: mov v1.b[15], v5.b[0] ; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 ; CHECK-GI-DOT-NEXT: mov v2.b[15], v6.b[0] @@ -2760,8 +2760,8 @@ define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) { ; CHECK-GI-DOT-NEXT: ldr b5, [x0, #15] ; CHECK-GI-DOT-NEXT: mov v2.b[14], v6.b[0] ; CHECK-GI-DOT-NEXT: ldr b6, [x1, #15] -; CHECK-GI-DOT-NEXT: mov v3.d[1], v0.d[0] -; CHECK-GI-DOT-NEXT: mov v4.d[1], v0.d[0] +; CHECK-GI-DOT-NEXT: fmov d3, d3 +; CHECK-GI-DOT-NEXT: fmov d4, d4 ; CHECK-GI-DOT-NEXT: mov v1.b[15], v5.b[0] ; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000 ; CHECK-GI-DOT-NEXT: mov v2.b[15], v6.b[0] From fd4204464905be198b158a0f30e5c06cc4ce3686 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 27 Feb 2024 13:51:01 -0600 Subject: [PATCH 486/546] [libc] Re-Enable GPU tests and fix math exception handling (#83172) Summary: A lot of these tests failed previously and were disabled. However we have fixed some things since then and many of these seem to pass. Additionally, the last remaining math tests that failed seemed to be due to the exception handling. For now we just set it to be 'errno'. These pass locally when tested on a gfx1030, gfx90a, and sm_89 architecture. Hopefully these pass correctly on the sm_60 bot as I've had things fail on that one only before. 
--- libc/include/llvm-libc-macros/math-macros.h | 2 + libc/test/src/__support/CMakeLists.txt | 23 +- libc/test/src/math/CMakeLists.txt | 281 ++++++++++---------- libc/test/src/math/smoke/CMakeLists.txt | 118 ++++---- libc/test/src/stdlib/CMakeLists.txt | 50 ++-- libc/test/src/time/CMakeLists.txt | 26 +- 6 files changed, 236 insertions(+), 264 deletions(-) diff --git a/libc/include/llvm-libc-macros/math-macros.h b/libc/include/llvm-libc-macros/math-macros.h index 0a23647319f4d..9f8edd954b7e0 100644 --- a/libc/include/llvm-libc-macros/math-macros.h +++ b/libc/include/llvm-libc-macros/math-macros.h @@ -32,6 +32,8 @@ #define math_errhandling 0 #elif defined(__NO_MATH_ERRNO__) #define math_errhandling (MATH_ERREXCEPT) +#elif defined(__NVPTX__) || defined(__AMDGPU__) +#define math_errhandling (MATH_ERRNO) #else #define math_errhandling (MATH_ERRNO | MATH_ERREXCEPT) #endif diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index c5634866f839b..7200ac276fe50 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -1,17 +1,14 @@ add_custom_target(libc-support-tests) -# FIXME: These tests are currently broken on the GPU. 
-if(NOT LIBC_TARGET_OS_IS_GPU) - add_libc_test( - blockstore_test - SUITE - libc-support-tests - SRCS - blockstore_test.cpp - DEPENDS - libc.src.__support.blockstore - ) -endif() +add_libc_test( + blockstore_test + SUITE + libc-support-tests + SRCS + blockstore_test.cpp + DEPENDS + libc.src.__support.blockstore +) add_libc_test( endian_test @@ -42,8 +39,6 @@ add_libc_test( DEPENDS libc.src.__support.high_precision_decimal libc.src.__support.uint128 - # FIXME Test segfaults on gfx90a GPU - UNIT_TEST_ONLY ) add_libc_test( diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 81d2e1e55b552..ad7dfdb3dfd9e 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -758,40 +758,37 @@ add_fp_unittest( libc.src.__support.FPUtil.basic_operations ) -# FIXME: These tests are currently broken for NVPTX. -if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) - add_fp_unittest( - ilogb_test - SUITE - libc-math-unittests - SRCS - ilogb_test.cpp - HDRS - ILogbTest.h - DEPENDS - libc.include.math - libc.src.math.ilogb - libc.src.__support.CPP.limits - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.manipulation_functions - ) +add_fp_unittest( + ilogb_test + SUITE + libc-math-unittests + SRCS + ilogb_test.cpp + HDRS + ILogbTest.h + DEPENDS + libc.include.math + libc.src.math.ilogb + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.manipulation_functions +) - add_fp_unittest( - ilogbf_test - SUITE - libc-math-unittests - SRCS - ilogbf_test.cpp - HDRS - ILogbTest.h - DEPENDS - libc.include.math - libc.src.math.ilogbf - libc.src.__support.CPP.limits - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.manipulation_functions - ) -endif() +add_fp_unittest( + ilogbf_test + SUITE + libc-math-unittests + SRCS + ilogbf_test.cpp + HDRS + ILogbTest.h + DEPENDS + libc.include.math + libc.src.math.ilogbf + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + 
libc.src.__support.FPUtil.manipulation_functions +) add_fp_unittest( ilogbl_test @@ -989,92 +986,89 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) -# FIXME: These tests are currently broken on the GPU. -if(NOT LIBC_TARGET_OS_IS_GPU) - add_fp_unittest( - fminf_test - SUITE - libc-math-unittests - SRCS - fminf_test.cpp - HDRS - FMinTest.h - DEPENDS - libc.include.math - libc.src.math.fminf - libc.src.__support.FPUtil.fp_bits - ) +add_fp_unittest( + fminf_test + SUITE + libc-math-unittests + SRCS + fminf_test.cpp + HDRS + FMinTest.h + DEPENDS + libc.include.math + libc.src.math.fminf + libc.src.__support.FPUtil.fp_bits +) - add_fp_unittest( - fmin_test - SUITE - libc-math-unittests - SRCS - fmin_test.cpp - HDRS - FMinTest.h - DEPENDS - libc.include.math - libc.src.math.fmin - libc.src.__support.FPUtil.fp_bits - ) +add_fp_unittest( + fmin_test + SUITE + libc-math-unittests + SRCS + fmin_test.cpp + HDRS + FMinTest.h + DEPENDS + libc.include.math + libc.src.math.fmin + libc.src.__support.FPUtil.fp_bits +) - add_fp_unittest( - fminl_test - SUITE - libc-math-unittests - SRCS - fminl_test.cpp - HDRS - FMinTest.h - DEPENDS - libc.include.math - libc.src.math.fminl - libc.src.__support.FPUtil.fp_bits - ) +add_fp_unittest( + fminl_test + SUITE + libc-math-unittests + SRCS + fminl_test.cpp + HDRS + FMinTest.h + DEPENDS + libc.include.math + libc.src.math.fminl + libc.src.__support.FPUtil.fp_bits +) - add_fp_unittest( - fmaxf_test - SUITE - libc-math-unittests - SRCS - fmaxf_test.cpp - HDRS - FMaxTest.h - DEPENDS - libc.include.math - libc.src.math.fmaxf - libc.src.__support.FPUtil.fp_bits - ) +add_fp_unittest( + fmaxf_test + SUITE + libc-math-unittests + SRCS + fmaxf_test.cpp + HDRS + FMaxTest.h + DEPENDS + libc.include.math + libc.src.math.fmaxf + libc.src.__support.FPUtil.fp_bits +) - add_fp_unittest( - fmax_test - SUITE - libc-math-unittests - SRCS - fmax_test.cpp - HDRS - FMaxTest.h - DEPENDS - libc.include.math - libc.src.math.fmax - 
libc.src.__support.FPUtil.fp_bits - ) +add_fp_unittest( + fmax_test + SUITE + libc-math-unittests + SRCS + fmax_test.cpp + HDRS + FMaxTest.h + DEPENDS + libc.include.math + libc.src.math.fmax + libc.src.__support.FPUtil.fp_bits +) - add_fp_unittest( - fmaxl_test - SUITE - libc-math-unittests - SRCS - fmaxl_test.cpp - HDRS - FMaxTest.h - DEPENDS - libc.include.math - libc.src.math.fmaxl - libc.src.__support.FPUtil.fp_bits - ) -endif() +add_fp_unittest( + fmaxl_test + SUITE + libc-math-unittests + SRCS + fmaxl_test.cpp + HDRS + FMaxTest.h + DEPENDS + libc.include.math + libc.src.math.fmaxl + libc.src.__support.FPUtil.fp_bits +) add_fp_unittest( sqrtf_test @@ -1234,38 +1228,35 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) -# FIXME: These tests are currently spurious for NVPTX. -if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) - add_fp_unittest( - nextafter_test - SUITE - libc-math-unittests - SRCS - nextafter_test.cpp - HDRS - NextAfterTest.h - DEPENDS - libc.include.math - libc.src.math.nextafter - libc.src.__support.FPUtil.basic_operations - libc.src.__support.FPUtil.fp_bits - ) +add_fp_unittest( + nextafter_test + SUITE + libc-math-unittests + SRCS + nextafter_test.cpp + HDRS + NextAfterTest.h + DEPENDS + libc.include.math + libc.src.math.nextafter + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fp_bits +) - add_fp_unittest( - nextafterf_test - SUITE - libc-math-unittests - SRCS - nextafterf_test.cpp - HDRS - NextAfterTest.h - DEPENDS - libc.include.math - libc.src.math.nextafterf - libc.src.__support.FPUtil.basic_operations - libc.src.__support.FPUtil.fp_bits - ) -endif() +add_fp_unittest( + nextafterf_test + SUITE + libc-math-unittests + SRCS + nextafterf_test.cpp + HDRS + NextAfterTest.h + DEPENDS + libc.include.math + libc.src.math.nextafterf + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fp_bits +) add_fp_unittest( nextafterl_test diff --git a/libc/test/src/math/smoke/CMakeLists.txt 
b/libc/test/src/math/smoke/CMakeLists.txt index 825000e1cb7af..be1810944495b 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -758,38 +758,35 @@ add_fp_unittest( libc.src.math.frexpf128 ) -# FIXME: These tests are currently broken for NVPTX. -if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) - add_fp_unittest( - ilogb_test - SUITE - libc-math-smoke-tests - SRCS - ilogb_test.cpp - HDRS - ILogbTest.h - DEPENDS - libc.src.math.ilogb - libc.src.__support.CPP.limits - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.manipulation_functions - ) +add_fp_unittest( + ilogb_test + SUITE + libc-math-smoke-tests + SRCS + ilogb_test.cpp + HDRS + ILogbTest.h + DEPENDS + libc.src.math.ilogb + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.manipulation_functions +) - add_fp_unittest( - ilogbf_test - SUITE - libc-math-smoke-tests - SRCS - ilogbf_test.cpp - HDRS - ILogbTest.h - DEPENDS - libc.src.math.ilogbf - libc.src.__support.CPP.limits - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.manipulation_functions - ) -endif() +add_fp_unittest( + ilogbf_test + SUITE + libc-math-smoke-tests + SRCS + ilogbf_test.cpp + HDRS + ILogbTest.h + DEPENDS + libc.src.math.ilogbf + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.manipulation_functions +) add_fp_unittest( ilogbl_test @@ -1417,38 +1414,35 @@ add_fp_unittest( UNIT_TEST_ONLY ) -# FIXME: These tests are currently spurious for NVPTX. 
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) - add_fp_unittest( - nextafter_test - SUITE - libc-math-smoke-tests - SRCS - nextafter_test.cpp - HDRS - NextAfterTest.h - DEPENDS - libc.include.math - libc.src.math.nextafter - libc.src.__support.FPUtil.basic_operations - libc.src.__support.FPUtil.fp_bits - ) +add_fp_unittest( + nextafter_test + SUITE + libc-math-smoke-tests + SRCS + nextafter_test.cpp + HDRS + NextAfterTest.h + DEPENDS + libc.include.math + libc.src.math.nextafter + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fp_bits +) - add_fp_unittest( - nextafterf_test - SUITE - libc-math-smoke-tests - SRCS - nextafterf_test.cpp - HDRS - NextAfterTest.h - DEPENDS - libc.include.math - libc.src.math.nextafterf - libc.src.__support.FPUtil.basic_operations - libc.src.__support.FPUtil.fp_bits - ) -endif() +add_fp_unittest( + nextafterf_test + SUITE + libc-math-smoke-tests + SRCS + nextafterf_test.cpp + HDRS + NextAfterTest.h + DEPENDS + libc.include.math + libc.src.math.nextafterf + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fp_bits +) add_fp_unittest( nextafterl_test diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index 5826cfe8d4ca3..5488a61c4ef18 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -54,20 +54,17 @@ add_libc_test( libc.src.stdlib.atoll ) -# This fails on NVPTX where the output value is one-off of the expected value. 
-if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) - add_fp_unittest( - strtod_test - SUITE - libc-stdlib-tests - SRCS - strtod_test.cpp - DEPENDS - libc.src.errno.errno - libc.src.stdlib.strtod - libc.src.__support.FPUtil.fenv_impl - ) -endif() +add_fp_unittest( + strtod_test + SUITE + libc-stdlib-tests + SRCS + strtod_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.stdlib.strtod + libc.src.__support.FPUtil.fenv_impl +) add_fp_unittest( strtof_test @@ -126,20 +123,17 @@ add_libc_test( .strtol_test_support ) -# This fails on NVPTX where the output value is one-off of the expected value. -if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) - add_libc_test( - strtold_test - SUITE - libc-stdlib-tests - SRCS - strtold_test.cpp - DEPENDS - libc.src.errno.errno - libc.src.__support.uint128 - libc.src.stdlib.strtold - ) -endif() +add_libc_test( + strtold_test + SUITE + libc-stdlib-tests + SRCS + strtold_test.cpp + DEPENDS + libc.src.errno.errno + libc.src.__support.uint128 + libc.src.stdlib.strtold +) add_libc_test( strtoll_test diff --git a/libc/test/src/time/CMakeLists.txt b/libc/test/src/time/CMakeLists.txt index ebb0998feb23e..51cacef0a62fe 100644 --- a/libc/test/src/time/CMakeLists.txt +++ b/libc/test/src/time/CMakeLists.txt @@ -102,21 +102,17 @@ add_libc_unittest( libc.src.__support.CPP.limits ) -# Sleeping is not supported on older NVPTX architectures. 
-set(unsupported_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62") -if (NOT ("${LIBC_GPU_TARGET_ARCHITECTURE}" IN_LIST unsupported_architectures)) - add_libc_test( - nanosleep_test - SUITE - libc_time_unittests - SRCS - nanosleep_test.cpp - DEPENDS - libc.include.time - libc.src.time.nanosleep - libc.src.errno.errno - ) -endif() +add_libc_test( + nanosleep_test + SUITE + libc_time_unittests + SRCS + nanosleep_test.cpp + DEPENDS + libc.include.time + libc.src.time.nanosleep + libc.src.errno.errno +) add_libc_unittest( time_test From a76c524adc57220253b34e166f59ed19634e28f5 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 27 Feb 2024 19:57:18 +0000 Subject: [PATCH 487/546] [gn build] Port f44c3faccaa4 --- llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn index 747ca8f9c91d3..b966b74842670 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/Hexagon/BUILD.gn @@ -75,7 +75,6 @@ static_library("LLVMHexagonCodeGen") { "HexagonOptAddrMode.cpp", "HexagonOptimizeSZextends.cpp", "HexagonPeephole.cpp", - "HexagonPostIncOpt.cpp", "HexagonRDFOpt.cpp", "HexagonRegisterInfo.cpp", "HexagonSelectionDAGInfo.cpp", From 0d1f95760b07a31293ccc82086306833326b70a4 Mon Sep 17 00:00:00 2001 From: Peiming Liu <36770114+PeimingLiu@users.noreply.github.com> Date: Tue, 27 Feb 2024 12:05:28 -0800 Subject: [PATCH 488/546] =?UTF-8?q?[mlir][sparse]=20support=20type=20conve?= =?UTF-8?q?rsion=20from=20batched=20sparse=20tensors=20to=E2=80=A6=20(#831?= =?UTF-8?q?63)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … memrefs. 
--- .../mlir/Dialect/SparseTensor/IR/Enums.h | 2 +- .../SparseTensor/IR/SparseTensorAttrDefs.td | 2 ++ .../IR/SparseTensorStorageLayout.h | 14 ++++---- .../SparseTensor/IR/SparseTensorType.h | 8 +++++ .../SparseTensor/IR/SparseTensorDialect.cpp | 21 ++++++++---- .../Transforms/SparseTensorCodegen.cpp | 5 ++- mlir/test/Dialect/SparseTensor/codegen.mlir | 34 +++++++++++++++++++ 7 files changed, 69 insertions(+), 17 deletions(-) diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h index cc134e7d953ec..9e79b6aca1c9b 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h @@ -342,7 +342,7 @@ struct LevelType { /// Check if the `LevelType` needs coordinates array. constexpr bool isWithCrdLT() const { // All sparse levels has coordinate array. - return !isa(); + return !isa(); } std::string toMLIRString() const { diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td index ca98665256be5..5d1db2323f95f 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td @@ -374,6 +374,8 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", /// is non-null (since no fixed result is valid for every dense-tensor). ::mlir::sparse_tensor::Level getLvlRank() const; + uint64_t getBatchLvlRank() const; + // // lvlTypes methods. 
// diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorStorageLayout.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorStorageLayout.h index 27dc39609cdad..ce34ae43d1c18 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorStorageLayout.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorStorageLayout.h @@ -30,15 +30,15 @@ namespace sparse_tensor { /// ; if dense: /// /// ; if compressed: -/// memref positions ; positions for level l -/// memref coordinates ; coordinates for level l -/// ; if loose-compressed: -/// memref positions ; lo/hi position pairs for level l -/// memref coordinates ; coordinates for level l +/// memref<[batch] x ? x pos> positions ; positions for level l +/// memref<[batch] x ? x crd> coordinates ; coordinates for level l +/// ; if loose-[batch] x compressed: +/// memref<[batch] x ? x pos> positions ; lo/hi pos pairs for level l +/// memref<[batch] x ? x crd> coordinates ; coordinates for level l /// ; if singleton/2-out-of-4: -/// memref coordinates ; coordinates for level l +/// memref<[batch] x ? x crd> coordinates ; coordinates for level l /// -/// memref values ; values +/// memref<[batch] x ? x eltType> values ; values /// /// struct sparse_tensor.storage_specifier { /// array lvlSizes ; sizes/cardinalities for each level diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h index 1a090ddb782fd..c93a4fcd922c2 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorType.h @@ -253,6 +253,14 @@ class SparseTensorType { CrdTransDirectionKind::dim2lvl); } + /// Returns the Level-shape. 
+ SmallVector getBatchLvlShape() const { + auto lvlShape = getEncoding().tranlateShape(getDimShape(), + CrdTransDirectionKind::dim2lvl); + lvlShape.truncate(getEncoding().getBatchLvlRank()); + return lvlShape; + } + /// Returns the type with an identity mapping. RankedTensorType getDemappedType() const { return RankedTensorType::get(getLvlShape(), getElementType(), diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index fd0ed26fbde07..69c3413f35ea9 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -126,13 +126,16 @@ void sparse_tensor::foreachFieldAndTypeInSparseTensor( const Type posType = stt.getPosType(); const Type eltType = stt.getElementType(); + SmallVector memrefShape = stt.getBatchLvlShape(); + memrefShape.push_back(ShapedType::kDynamic); + const Type specType = StorageSpecifierType::get(stt.getEncoding()); - // memref positions - const Type posMemType = MemRefType::get({ShapedType::kDynamic}, posType); - // memref coordinates - const Type crdMemType = MemRefType::get({ShapedType::kDynamic}, crdType); - // memref values - const Type valMemType = MemRefType::get({ShapedType::kDynamic}, eltType); + // memref<[batch] x ? x pos> positions + const Type posMemType = MemRefType::get(memrefShape, posType); + // memref<[batch] x ? x crd> coordinates + const Type crdMemType = MemRefType::get(memrefShape, crdType); + // memref<[batch] x ? 
x eltType> values + const Type valMemType = MemRefType::get(memrefShape, eltType); StorageLayout(stt).foreachField([specType, posMemType, crdMemType, valMemType, callback](FieldIndex fieldIdx, @@ -336,6 +339,12 @@ SparseTensorEncodingAttr SparseTensorEncodingAttr::withoutDimSlices() const { return withDimSlices(ArrayRef{}); } +uint64_t SparseTensorEncodingAttr::getBatchLvlRank() const { + ArrayRef lvlTypes = getLvlTypes(); + auto lastBatch = std::find_if(lvlTypes.rbegin(), lvlTypes.rend(), isBatchLT); + return std::distance(lastBatch, lvlTypes.rend()); +} + bool SparseTensorEncodingAttr::isAllDense() const { return !getImpl() || llvm::all_of(getLvlTypes(), isDenseLT); } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp index 0ccb11f3a6b85..d5eec4ae67e79 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp @@ -1293,7 +1293,7 @@ struct SparseAssembleOpConverter : public OpConversionPattern { Value tensor = fKind == SparseTensorFieldKind::ValMemRef ? op.getValues() : op.getLevels()[fIdx]; - + // TODO: handle batch. TypedValue mem = genToMemref(rewriter, loc, tensor); if (mem.getType().getRank() > 1) { // Flattens the buffer to rank 1. @@ -1322,9 +1322,8 @@ struct SparseAssembleOpConverter : public OpConversionPattern { for (Level lvl = 0, lvlRank = stt.getLvlRank(); lvl < lvlRank; lvl++) { assert(!ShapedType::isDynamic(stt.getDimShape()[lvl])); - // FIXME: dim/lvl confusion! // Sets up the level size. - auto lvlSize = constantIndex(rewriter, loc, stt.getDimShape()[lvl]); + auto lvlSize = constantIndex(rewriter, loc, stt.getLvlShape()[lvl]); desc.setLvlSize(rewriter, loc, lvl, lvlSize); // We use a single AOS array to store the trailing COO, so there is only // one memory size to set for the entire COO section. 
diff --git a/mlir/test/Dialect/SparseTensor/codegen.mlir b/mlir/test/Dialect/SparseTensor/codegen.mlir index c1a976c84fecc..b63762485c961 100644 --- a/mlir/test/Dialect/SparseTensor/codegen.mlir +++ b/mlir/test/Dialect/SparseTensor/codegen.mlir @@ -34,6 +34,10 @@ map = (d0, d1) -> (d1 : dense, d0 : compressed) }> +#BCSR = #sparse_tensor.encoding<{ + map = (d0, d1, d2, d3) -> (d0: batch, d1: batch, d2 : dense, d3 : compressed) +}> + #DCSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : compressed, d1 : compressed), crdWidth = 64, @@ -182,6 +186,36 @@ func.func @sparse_csr(%arg0: tensor) { return } +// CHECK-LABEL: func @sparse_bcsr_0( +// CHECK-SAME: %[[A1:.*0]]: memref, +// CHECK-SAME: %[[A2:.*1]]: memref, +// CHECK-SAME: %[[A3:.*]]: memref, +// CHECK-SAME: %[[A4:.*]]: !sparse_tensor.storage_specifier +// CHECK: return +func.func @sparse_bcsr_0(%arg0: tensor) { + return +} + +// CHECK-LABEL: func @sparse_bcsr_1( +// CHECK-SAME: %[[A1:.*0]]: memref, +// CHECK-SAME: %[[A2:.*1]]: memref, +// CHECK-SAME: %[[A3:.*]]: memref, +// CHECK-SAME: %[[A4:.*]]: !sparse_tensor.storage_specifier +// CHECK: return +func.func @sparse_bcsr_1(%arg0: tensor) { + return +} + +// CHECK-LABEL: func @sparse_bcsr_2( +// CHECK-SAME: %[[A1:.*0]]: memref<18x6x?xindex>, +// CHECK-SAME: %[[A2:.*1]]: memref<18x6x?xindex>, +// CHECK-SAME: %[[A3:.*]]: memref<18x6x?xf64>, +// CHECK-SAME: %[[A4:.*]]: !sparse_tensor.storage_specifier +// CHECK: return +func.func @sparse_bcsr_2(%arg0: tensor<18x6x4x2xf64, #BCSR>) { + return +} + // CHECK-LABEL: func @sparse_dcsr( // CHECK-SAME: %[[A0:.*0]]: memref, // CHECK-SAME: %[[A1:.*1]]: memref, From d2a9df2c8ffd21fd52fbd8199a191d10078f41af Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 27 Feb 2024 20:09:54 +0000 Subject: [PATCH 489/546] [TBAA] Handle bitfields when generating !tbaa.struct metadata. (#82922) At the moment, clang generates what I believe are incorrect !tbaa.struct fields for named bitfields. 
At the moment, the base type size is used for named bifields (e.g. sizeof(int)) instead of the bifield width per field. This results in overalpping fields in !tbaa.struct metadata. This causes incorrect results when extracting individual copied fields from !tbaa.struct as in added in dc85719d5. This patch fixes that by skipping by combining adjacent bitfields in fields with correct sizes. Fixes https://github.com/llvm/llvm-project/issues/82586 --- clang/lib/CodeGen/CodeGenModule.cpp | 4 +-- clang/lib/CodeGen/CodeGenTBAA.cpp | 44 +++++++++++++++++++++-------- clang/lib/CodeGen/CodeGenTBAA.h | 7 +++-- clang/test/CodeGen/tbaa-struct.cpp | 6 ++-- 4 files changed, 43 insertions(+), 18 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 1550b000a89a3..d16d12fac8b03 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -397,8 +397,8 @@ CodeGenModule::CodeGenModule(ASTContext &C, // Enable TBAA unless it's suppressed. ThreadSanitizer needs TBAA even at O0. if (LangOpts.Sanitize.has(SanitizerKind::Thread) || (!CodeGenOpts.RelaxedAliasing && CodeGenOpts.OptimizationLevel > 0)) - TBAA.reset(new CodeGenTBAA(Context, TheModule, CodeGenOpts, getLangOpts(), - getCXXABI().getMangleContext())); + TBAA.reset(new CodeGenTBAA(Context, getTypes(), TheModule, CodeGenOpts, + getLangOpts(), getCXXABI().getMangleContext())); // If debug info or coverage generation is enabled, create the CGDebugInfo // object. 
diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp index dc288bc3f6157..8a08161219397 100644 --- a/clang/lib/CodeGen/CodeGenTBAA.cpp +++ b/clang/lib/CodeGen/CodeGenTBAA.cpp @@ -15,6 +15,8 @@ //===----------------------------------------------------------------------===// #include "CodeGenTBAA.h" +#include "CGRecordLayout.h" +#include "CodeGenTypes.h" #include "clang/AST/ASTContext.h" #include "clang/AST/Attr.h" #include "clang/AST/Mangle.h" @@ -26,16 +28,16 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" using namespace clang; using namespace CodeGen; -CodeGenTBAA::CodeGenTBAA(ASTContext &Ctx, llvm::Module &M, - const CodeGenOptions &CGO, +CodeGenTBAA::CodeGenTBAA(ASTContext &Ctx, CodeGenTypes &CGTypes, + llvm::Module &M, const CodeGenOptions &CGO, const LangOptions &Features, MangleContext &MContext) - : Context(Ctx), Module(M), CodeGenOpts(CGO), - Features(Features), MContext(MContext), MDHelper(M.getContext()), - Root(nullptr), Char(nullptr) -{} + : Context(Ctx), CGTypes(CGTypes), Module(M), CodeGenOpts(CGO), + Features(Features), MContext(MContext), MDHelper(M.getContext()), + Root(nullptr), Char(nullptr) {} CodeGenTBAA::~CodeGenTBAA() { } @@ -294,14 +296,34 @@ CodeGenTBAA::CollectFields(uint64_t BaseOffset, return false; const ASTRecordLayout &Layout = Context.getASTRecordLayout(RD); + const CGRecordLayout &CGRL = CGTypes.getCGRecordLayout(RD); unsigned idx = 0; - for (RecordDecl::field_iterator i = RD->field_begin(), - e = RD->field_end(); i != e; ++i, ++idx) { - if ((*i)->isZeroSize(Context) || (*i)->isUnnamedBitfield()) + for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end(); + i != e; ++i, ++idx) { + if ((*i)->isZeroSize(Context)) continue; - uint64_t Offset = BaseOffset + - Layout.getFieldOffset(idx) / Context.getCharWidth(); + + uint64_t Offset = + BaseOffset + Layout.getFieldOffset(idx) / Context.getCharWidth(); + + // Create a 
single field for consecutive named bitfields using char as + // base type. + if ((*i)->isBitField()) { + const CGBitFieldInfo &Info = CGRL.getBitFieldInfo(*i); + if (Info.Offset != 0) + continue; + unsigned CurrentBitFieldSize = Info.StorageSize; + uint64_t Size = + llvm::divideCeil(CurrentBitFieldSize, Context.getCharWidth()); + llvm::MDNode *TBAAType = getChar(); + llvm::MDNode *TBAATag = + getAccessTagInfo(TBAAAccessInfo(TBAAType, Size)); + Fields.push_back( + llvm::MDBuilder::TBAAStructField(Offset, Size, TBAATag)); + continue; + } + QualType FieldQTy = i->getType(); if (!CollectFields(Offset, FieldQTy, Fields, MayAlias || TypeHasMayAlias(FieldQTy))) diff --git a/clang/lib/CodeGen/CodeGenTBAA.h b/clang/lib/CodeGen/CodeGenTBAA.h index a65963596fe9d..aa6da2731a416 100644 --- a/clang/lib/CodeGen/CodeGenTBAA.h +++ b/clang/lib/CodeGen/CodeGenTBAA.h @@ -29,6 +29,7 @@ namespace clang { class Type; namespace CodeGen { +class CodeGenTypes; // TBAAAccessKind - A kind of TBAA memory access descriptor. enum class TBAAAccessKind : unsigned { @@ -115,6 +116,7 @@ struct TBAAAccessInfo { /// while lowering AST types to LLVM types. 
class CodeGenTBAA { ASTContext &Context; + CodeGenTypes &CGTypes; llvm::Module &Module; const CodeGenOptions &CodeGenOpts; const LangOptions &Features; @@ -167,8 +169,9 @@ class CodeGenTBAA { llvm::MDNode *getBaseTypeInfoHelper(const Type *Ty); public: - CodeGenTBAA(ASTContext &Ctx, llvm::Module &M, const CodeGenOptions &CGO, - const LangOptions &Features, MangleContext &MContext); + CodeGenTBAA(ASTContext &Ctx, CodeGenTypes &CGTypes, llvm::Module &M, + const CodeGenOptions &CGO, const LangOptions &Features, + MangleContext &MContext); ~CodeGenTBAA(); /// getTypeInfo - Get metadata used to describe accesses to objects of the diff --git a/clang/test/CodeGen/tbaa-struct.cpp b/clang/test/CodeGen/tbaa-struct.cpp index 28c7d396121af..883c982be26c8 100644 --- a/clang/test/CodeGen/tbaa-struct.cpp +++ b/clang/test/CodeGen/tbaa-struct.cpp @@ -162,11 +162,11 @@ void copy10(NamedBitfields3 *a1, NamedBitfields3 *a2) { // CHECK-OLD: [[TS3]] = !{i64 0, i64 8, !{{.*}}, i64 0, i64 2, !{{.*}}, i64 4, i64 8, !{{.*}}} // CHECK-OLD: [[TS4]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]]} // CHECK-OLD: [[TS5]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 4, i64 1, [[TAG_CHAR]], i64 5, i64 1, [[TAG_CHAR]]} -// CHECK-OLD: [[TS6]] = !{i64 0, i64 4, [[TAG_INT]], i64 1, i64 4, [[TAG_INT]], i64 2, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE:!.+]]} +// CHECK-OLD: [[TS6]] = !{i64 0, i64 2, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE:!.+]]} // CHECK-OLD: [[TAG_DOUBLE]] = !{[[DOUBLE:!.+]], [[DOUBLE]], i64 0} // CHECK-OLD [[DOUBLE]] = !{!"double", [[CHAR]], i64 0} -// CHECK-OLD: [[TS7]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 3, i64 4, [[TAG_INT]], i64 3, i64 4, [[TAG_INT]], i64 4, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]], i64 16, i64 4, [[TAG_INT]]} -// CHECK-OLD: [[TS8]] = !{i64 1, i64 4, [[TAG_INT]], i64 2, i64 4, [[TAG_INT]], i64 8, i64 8, [[TAG_DOUBLE]]} +// CHECK-OLD: 
[[TS7]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 3, i64 1, [[TAG_CHAR]], i64 4, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]], i64 16, i64 1, [[TAG_CHAR]]} +// CHECK-OLD: [[TS8]] = !{i64 0, i64 4, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]]} // CHECK-NEW-DAG: [[TYPE_char:!.*]] = !{{{.*}}, i64 1, !"omnipotent char"} // CHECK-NEW-DAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0, i64 0} From 0e0bee26e7f33c065eebef9a674b2f19bb156414 Mon Sep 17 00:00:00 2001 From: Jonathan Peyton Date: Tue, 27 Feb 2024 14:15:48 -0600 Subject: [PATCH 490/546] [OpenMP] Fix distributed barrier hang for OMP_WAIT_POLICY=passive (#83058) The resume thread logic inside __kmp_free_team() is faulty. Only checking b_go for sleep status doesn't wake up distributed barrier. Change to generic check for th_sleep_loc and calling __kmp_null_resume_wrapper(). Fixes: #80664 --- openmp/runtime/src/kmp_runtime.cpp | 5 +-- .../runtime/test/barrier/llvm-issue-80664.c | 37 +++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 openmp/runtime/test/barrier/llvm-issue-80664.c diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp index fc5e8405a415e..7edb0b440acc7 100644 --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -5708,9 +5708,8 @@ void __kmp_free_team(kmp_root_t *root, } #endif // first check if thread is sleeping - kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); - if (fl.is_sleeping()) - fl.resume(__kmp_gtid_from_thread(th)); + if (th->th.th_sleep_loc) + __kmp_null_resume_wrapper(th); KMP_CPU_PAUSE(); } } diff --git a/openmp/runtime/test/barrier/llvm-issue-80664.c b/openmp/runtime/test/barrier/llvm-issue-80664.c new file mode 100644 index 0000000000000..79aa228afa6be --- /dev/null +++ b/openmp/runtime/test/barrier/llvm-issue-80664.c @@ -0,0 +1,37 @@ +// RUN: %libomp-compile +// RUN: env OMP_WAIT_POLICY=passive \ +// RUN: 
KMP_FORKJOIN_BARRIER_PATTERN='linear,linear' %libomp-run +// RUN: env OMP_WAIT_POLICY=passive \ +// RUN: KMP_FORKJOIN_BARRIER_PATTERN='tree,tree' %libomp-run +// RUN: env OMP_WAIT_POLICY=passive \ +// RUN: KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper' %libomp-run +// RUN: env OMP_WAIT_POLICY=passive \ +// RUN: KMP_FORKJOIN_BARRIER_PATTERN='dist,dist' %libomp-run +// +// LLVM ISSUE 80664: https://github.com/llvm/llvm-project/issues/80664 +// +// Distributed barrier + OMP_WAIT_POLICY=passive hangs in library termination +// Reason: the resume logic in __kmp_free_team() was faulty and, when checking +// for sleep status, didn't look at correct location for distributed barrier. + +#include +#include + +int a = 0; + +void test_omp_barrier() { +#pragma omp parallel + { +#pragma omp task + { +#pragma omp atomic + a++; + } + } +} + +int main() { + test_omp_barrier(); + printf("a = %d\n", a); + return EXIT_SUCCESS; +} From e5ed7b6e2fd368b722b6359556cd0125881e7638 Mon Sep 17 00:00:00 2001 From: rohit-rao Date: Tue, 27 Feb 2024 15:23:00 -0500 Subject: [PATCH 491/546] [clang] Extend define-target-os-macros to support XROS. (#82833) Updates the extension feature `define-target-os-macros` to support the recently-added XROS target (TARGET_OS_VISION). 
--- clang/include/clang/Basic/TargetOSMacros.def | 5 ++- clang/test/Driver/fdefine-target-os-macros.c | 46 ++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/TargetOSMacros.def b/clang/include/clang/Basic/TargetOSMacros.def index dfc2e033f6fd0..58dce330f9c8f 100644 --- a/clang/include/clang/Basic/TargetOSMacros.def +++ b/clang/include/clang/Basic/TargetOSMacros.def @@ -34,18 +34,19 @@ TARGET_OS(TARGET_OS_UNIX, Triple.isOSNetBSD() || TARGET_OS(TARGET_OS_MAC, Triple.isOSDarwin()) TARGET_OS(TARGET_OS_OSX, Triple.isMacOSX()) TARGET_OS(TARGET_OS_IPHONE, Triple.isiOS() || Triple.isTvOS() || - Triple.isWatchOS()) + Triple.isWatchOS() || Triple.isXROS()) // Triple::isiOS() also includes tvOS TARGET_OS(TARGET_OS_IOS, Triple.getOS() == llvm::Triple::IOS) TARGET_OS(TARGET_OS_TV, Triple.isTvOS()) TARGET_OS(TARGET_OS_WATCH, Triple.isWatchOS()) +TARGET_OS(TARGET_OS_VISION, Triple.isXROS()) TARGET_OS(TARGET_OS_DRIVERKIT, Triple.isDriverKit()) TARGET_OS(TARGET_OS_MACCATALYST, Triple.isMacCatalystEnvironment()) TARGET_OS(TARGET_OS_SIMULATOR, Triple.isSimulatorEnvironment()) // Deprecated Apple target conditionals. 
TARGET_OS(TARGET_OS_EMBEDDED, (Triple.isiOS() || Triple.isTvOS() \ - || Triple.isWatchOS()) \ + || Triple.isWatchOS() || Triple.isXROS()) \ && !Triple.isMacCatalystEnvironment() \ && !Triple.isSimulatorEnvironment()) TARGET_OS(TARGET_OS_NANO, Triple.isWatchOS()) diff --git a/clang/test/Driver/fdefine-target-os-macros.c b/clang/test/Driver/fdefine-target-os-macros.c index d7379dd3d5396..a4de51e8e7244 100644 --- a/clang/test/Driver/fdefine-target-os-macros.c +++ b/clang/test/Driver/fdefine-target-os-macros.c @@ -12,6 +12,7 @@ // RUN: -DIOS=0 \ // RUN: -DTV=0 \ // RUN: -DWATCH=0 \ +// RUN: -DVISION=0 \ // RUN: -DDRIVERKIT=0 \ // RUN: -DMACCATALYST=0 \ // RUN: -DEMBEDDED=0 \ @@ -27,6 +28,7 @@ // RUN: -DIOS=1 \ // RUN: -DTV=0 \ // RUN: -DWATCH=0 \ +// RUN: -DVISION=0 \ // RUN: -DDRIVERKIT=0 \ // RUN: -DMACCATALYST=0 \ // RUN: -DEMBEDDED=1 \ @@ -42,6 +44,7 @@ // RUN: -DIOS=1 \ // RUN: -DTV=0 \ // RUN: -DWATCH=0 \ +// RUN: -DVISION=0 \ // RUN: -DDRIVERKIT=0 \ // RUN: -DMACCATALYST=1 \ // RUN: -DEMBEDDED=0 \ @@ -57,6 +60,7 @@ // RUN: -DIOS=1 \ // RUN: -DTV=0 \ // RUN: -DWATCH=0 \ +// RUN: -DVISION=0 \ // RUN: -DDRIVERKIT=0 \ // RUN: -DMACCATALYST=0 \ // RUN: -DEMBEDDED=0 \ @@ -72,6 +76,7 @@ // RUN: -DIOS=0 \ // RUN: -DTV=1 \ // RUN: -DWATCH=0 \ +// RUN: -DVISION=0 \ // RUN: -DDRIVERKIT=0 \ // RUN: -DMACCATALYST=0 \ // RUN: -DEMBEDDED=1 \ @@ -87,6 +92,7 @@ // RUN: -DIOS=0 \ // RUN: -DTV=1 \ // RUN: -DWATCH=0 \ +// RUN: -DVISION=0 \ // RUN: -DDRIVERKIT=0 \ // RUN: -DMACCATALYST=0 \ // RUN: -DEMBEDDED=0 \ @@ -102,6 +108,7 @@ // RUN: -DIOS=0 \ // RUN: -DTV=0 \ // RUN: -DWATCH=1 \ +// RUN: -DVISION=0 \ // RUN: -DDRIVERKIT=0 \ // RUN: -DMACCATALYST=0 \ // RUN: -DEMBEDDED=1 \ @@ -117,6 +124,39 @@ // RUN: -DIOS=0 \ // RUN: -DTV=0 \ // RUN: -DWATCH=1 \ +// RUN: -DVISION=0 \ +// RUN: -DDRIVERKIT=0 \ +// RUN: -DMACCATALYST=0 \ +// RUN: -DEMBEDDED=0 \ +// RUN: -DSIMULATOR=1 \ +// RUN: -DWINDOWS=0 \ +// RUN: -DLINUX=0 \ +// RUN: -DUNIX=0 + +// RUN: %clang -dM -E 
--target=arm64-apple-xros %s 2>&1 \ +// RUN: | FileCheck %s -DMAC=1 \ +// RUN: -DOSX=0 \ +// RUN: -DIPHONE=1 \ +// RUN: -DIOS=0 \ +// RUN: -DTV=0 \ +// RUN: -DWATCH=0 \ +// RUN: -DVISION=1 \ +// RUN: -DDRIVERKIT=0 \ +// RUN: -DMACCATALYST=0 \ +// RUN: -DEMBEDDED=1 \ +// RUN: -DSIMULATOR=0 \ +// RUN: -DWINDOWS=0 \ +// RUN: -DLINUX=0 \ +// RUN: -DUNIX=0 + +// RUN: %clang -dM -E --target=arm64-apple-xros-simulator %s 2>&1 \ +// RUN: | FileCheck %s -DMAC=1 \ +// RUN: -DOSX=0 \ +// RUN: -DIPHONE=1 \ +// RUN: -DIOS=0 \ +// RUN: -DTV=0 \ +// RUN: -DWATCH=0 \ +// RUN: -DVISION=1 \ // RUN: -DDRIVERKIT=0 \ // RUN: -DMACCATALYST=0 \ // RUN: -DEMBEDDED=0 \ @@ -132,6 +172,7 @@ // RUN: -DIOS=0 \ // RUN: -DTV=0 \ // RUN: -DWATCH=0 \ +// RUN: -DVISION=0 \ // RUN: -DDRIVERKIT=1 \ // RUN: -DMACCATALYST=0 \ // RUN: -DEMBEDDED=0 \ @@ -148,6 +189,7 @@ // RUN: -DIOS=0 \ // RUN: -DTV=0 \ // RUN: -DWATCH=0 \ +// RUN: -DVISION=0 \ // RUN: -DDRIVERKIT=0 \ // RUN: -DMACCATALYST=0 \ // RUN: -DEMBEDDED=0 \ @@ -164,6 +206,7 @@ // RUN: -DIOS=0 \ // RUN: -DTV=0 \ // RUN: -DWATCH=0 \ +// RUN: -DVISION=0 \ // RUN: -DDRIVERKIT=0 \ // RUN: -DMACCATALYST=0 \ // RUN: -DEMBEDDED=0 \ @@ -180,6 +223,7 @@ // RUN: -DIOS=0 \ // RUN: -DTV=0 \ // RUN: -DWATCH=0 \ +// RUN: -DVISION=0 \ // RUN: -DDRIVERKIT=0 \ // RUN: -DMACCATALYST=0 \ // RUN: -DEMBEDDED=0 \ @@ -196,6 +240,7 @@ // RUN: -DIOS=0 \ // RUN: -DTV=0 \ // RUN: -DWATCH=0 \ +// RUN: -DVISION=0 \ // RUN: -DDRIVERKIT=0 \ // RUN: -DMACCATALYST=0 \ // RUN: -DEMBEDDED=0 \ @@ -226,6 +271,7 @@ // CHECK-DAG: #define TARGET_OS_IOS [[IOS]] // CHECK-DAG: #define TARGET_OS_TV [[TV]] // CHECK-DAG: #define TARGET_OS_WATCH [[WATCH]] +// CHECK-DAG: #define TARGET_OS_VISION [[VISION]] // CHECK-DAG: #define TARGET_OS_DRIVERKIT [[DRIVERKIT]] // CHECK-DAG: #define TARGET_OS_MACCATALYST [[MACCATALYST]] // CHECK-DAG: #define TARGET_OS_SIMULATOR [[SIMULATOR]] From 1d1186de34c55149be336068bf312e8f755dca37 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 27 Feb 2024 
12:28:25 -0800 Subject: [PATCH 492/546] [llvm-exegesis] Add loop-register snippet annotation (#82873) This patch adds a LLVM-EXEGESIS-LOOP-REGISTER snippet annotation which allows a user to specify the register to use for the loop counter in the loop repetition mode. This allows for executing snippets that don't work with the default value (currently R8 on X86). --- llvm/docs/CommandGuide/llvm-exegesis.rst | 8 ++++ .../llvm-exegesis/X86/latency/loop-register.s | 12 ++++++ .../tools/llvm-exegesis/lib/BenchmarkResult.h | 2 + llvm/tools/llvm-exegesis/lib/SnippetFile.cpp | 20 ++++++++++ .../llvm-exegesis/lib/SnippetRepetitor.cpp | 14 +++---- .../llvm-exegesis/lib/SnippetRepetitor.h | 3 +- llvm/tools/llvm-exegesis/lib/Target.h | 7 +++- llvm/tools/llvm-exegesis/lib/X86/Target.cpp | 18 +++++---- llvm/tools/llvm-exegesis/llvm-exegesis.cpp | 37 ++++++++++++------- .../llvm-exegesis/X86/SnippetFileTest.cpp | 19 ++++++++++ .../X86/SnippetRepetitorTest.cpp | 16 +++++--- 11 files changed, 117 insertions(+), 39 deletions(-) create mode 100644 llvm/test/tools/llvm-exegesis/X86/latency/loop-register.s diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst index 9e3c19078f1cc..fdf17c7fe4128 100644 --- a/llvm/docs/CommandGuide/llvm-exegesis.rst +++ b/llvm/docs/CommandGuide/llvm-exegesis.rst @@ -89,6 +89,14 @@ properly. annotation requires the subprocess execution mode. This is useful in cases where the memory accessed by the snippet depends on the location of the snippet, like RIP-relative addressing. +* `LLVM-EXEGESIS-LOOP-REGISTER ` - This annotation specifies + the loop register to use for keeping track of the current iteration when + using the loop repetition mode. :program:`llvm-exegesis` needs to keep track + of the current loop iteration within the loop repetition mode in a performant + manner (i.e., no memory accesses), and uses a register to do this. 
This register + has an architecture specific default (e.g., `R8` on X86), but this might conflict + with some snippets. This annotation allows changing the register to prevent + interference between the loop index register and the snippet. EXAMPLE 1: benchmarking instructions ------------------------------------ diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/loop-register.s b/llvm/test/tools/llvm-exegesis/X86/latency/loop-register.s new file mode 100644 index 0000000000000..81ca75251381a --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/X86/latency/loop-register.s @@ -0,0 +1,12 @@ +# REQUIRES: exegesis-can-measure-latency, x86_64-linux + +# Test that specifying the loop register to use works as expected. + +# RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mode=latency -snippets-file=%s | FileCheck %s + +# CHECK: measurements: + +# LLVM-EXEGESIS-DEFREG R11 ff +# LLVM-EXEGESIS-LOOP-REGISTER R12 + +addq $0xff, %r11 diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h index 0aecaaeea4b2e..4ae6bc2a54cd5 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h @@ -74,6 +74,8 @@ struct BenchmarkKey { // The address that the snippet should be loaded in at if the execution mode // being used supports it. intptr_t SnippetAddress = 0; + // The register that should be used to hold the loop counter. 
+ unsigned LoopRegister; }; struct BenchmarkMeasure { diff --git a/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp b/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp index 7258fcb4279c7..431d99c72b808 100644 --- a/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp +++ b/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp @@ -9,6 +9,7 @@ #include "SnippetFile.h" #include "BenchmarkRunner.h" #include "Error.h" +#include "Target.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstPrinter.h" #include "llvm/MC/MCObjectFileInfo.h" @@ -175,6 +176,20 @@ class BenchmarkCodeStreamer : public MCStreamer, public AsmCommentConsumer { return; } + if (CommentText.consume_front("LOOP-REGISTER")) { + // LLVM-EXEGESIS-LOOP-REGISTER + unsigned LoopRegister; + + if (!(LoopRegister = findRegisterByName(CommentText.trim()))) { + errs() << "unknown register '" << CommentText + << "' in 'LLVM-EXEGESIS-LOOP-REGISTER " << CommentText << "'\n"; + ++InvalidComments; + return; + } + + Result->Key.LoopRegister = LoopRegister; + return; + } } unsigned numInvalidComments() const { return InvalidComments; } @@ -221,6 +236,11 @@ Expected> readSnippets(const LLVMState &State, BenchmarkCode Result; + // Ensure that there is a default loop register value specified. 
+ Result.Key.LoopRegister = + State.getExegesisTarget().getDefaultLoopCounterRegister( + State.getTargetMachine().getTargetTriple()); + const TargetMachine &TM = State.getTargetMachine(); MCContext Context(TM.getTargetTriple(), TM.getMCAsmInfo(), TM.getMCRegisterInfo(), TM.getMCSubtargetInfo()); diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp index 561687a62319b..0bab30d158200 100644 --- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp +++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp @@ -48,10 +48,8 @@ class DuplicateSnippetRepetitor : public SnippetRepetitor { class LoopSnippetRepetitor : public SnippetRepetitor { public: - explicit LoopSnippetRepetitor(const LLVMState &State) - : SnippetRepetitor(State), - LoopCounter(State.getExegesisTarget().getLoopCounterRegister( - State.getTargetMachine().getTargetTriple())) {} + explicit LoopSnippetRepetitor(const LLVMState &State, unsigned LoopRegister) + : SnippetRepetitor(State), LoopCounter(LoopRegister) {} // Loop over the snippet ceil(MinInstructions / Instructions.Size()) times. FillFunction Repeat(ArrayRef Instructions, unsigned MinInstructions, @@ -113,8 +111,8 @@ class LoopSnippetRepetitor : public SnippetRepetitor { (void)_; Loop.addInstructions(Instructions); } - ET.decrementLoopCounterAndJump(*Loop.MBB, *Loop.MBB, - State.getInstrInfo()); + ET.decrementLoopCounterAndJump(*Loop.MBB, *Loop.MBB, State.getInstrInfo(), + LoopCounter); // Set up the exit basic block. 
Loop.MBB->addSuccessor(Exit.MBB, BranchProbability::getZero()); @@ -138,14 +136,14 @@ SnippetRepetitor::~SnippetRepetitor() {} std::unique_ptr SnippetRepetitor::Create(Benchmark::RepetitionModeE Mode, - const LLVMState &State) { + const LLVMState &State, unsigned LoopRegister) { switch (Mode) { case Benchmark::Duplicate: case Benchmark::MiddleHalfDuplicate: return std::make_unique(State); case Benchmark::Loop: case Benchmark::MiddleHalfLoop: - return std::make_unique(State); + return std::make_unique(State, LoopRegister); case Benchmark::AggregateMin: break; } diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h index 2b3c416c9029f..c62e80f161f12 100644 --- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h +++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h @@ -29,7 +29,8 @@ namespace exegesis { class SnippetRepetitor { public: static std::unique_ptr - Create(Benchmark::RepetitionModeE Mode, const LLVMState &State); + Create(Benchmark::RepetitionModeE Mode, const LLVMState &State, + unsigned LoopRegister); virtual ~SnippetRepetitor(); diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h index 7bbd946b03331..522c75d15703d 100644 --- a/llvm/tools/llvm-exegesis/lib/Target.h +++ b/llvm/tools/llvm-exegesis/lib/Target.h @@ -202,12 +202,15 @@ class ExegesisTarget { } // Returns a counter usable as a loop counter. 
- virtual unsigned getLoopCounterRegister(const Triple &) const { return 0; } + virtual unsigned getDefaultLoopCounterRegister(const Triple &) const { + return 0; + } // Adds the code to decrement the loop counter and virtual void decrementLoopCounterAndJump(MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB, - const MCInstrInfo &MII) const { + const MCInstrInfo &MII, + unsigned LoopRegister) const { llvm_unreachable("decrementLoopCounterAndBranch() requires " "getLoopCounterRegister() > 0"); } diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp index 6fc951a6e35d6..a41a995f5560a 100644 --- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp @@ -720,7 +720,7 @@ class ExegesisX86Target : public ExegesisTarget { unsigned getScratchMemoryRegister(const Triple &TT) const override; - unsigned getLoopCounterRegister(const Triple &) const override; + unsigned getDefaultLoopCounterRegister(const Triple &) const override; unsigned getMaxMemoryAccessSize() const override { return 64; } @@ -733,7 +733,8 @@ class ExegesisX86Target : public ExegesisTarget { void decrementLoopCounterAndJump(MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB, - const MCInstrInfo &MII) const override; + const MCInstrInfo &MII, + unsigned LoopRegister) const override; std::vector setRegTo(const MCSubtargetInfo &STI, unsigned Reg, const APInt &Value) const override; @@ -852,7 +853,7 @@ const unsigned ExegesisX86Target::kUnavailableRegistersSSE[12] = { // We're using one of R8-R15 because these registers are never hardcoded in // instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have less // conflicts. -constexpr const unsigned kLoopCounterReg = X86::R8; +constexpr const unsigned kDefaultLoopCounterReg = X86::R8; } // namespace @@ -870,11 +871,12 @@ unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const { return TT.isOSWindows() ? 
X86::RCX : X86::RDI; } -unsigned ExegesisX86Target::getLoopCounterRegister(const Triple &TT) const { +unsigned +ExegesisX86Target::getDefaultLoopCounterRegister(const Triple &TT) const { if (!TT.isArch64Bit()) { return 0; } - return kLoopCounterReg; + return kDefaultLoopCounterReg; } Error ExegesisX86Target::randomizeTargetMCOperand( @@ -912,10 +914,10 @@ void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT, void ExegesisX86Target::decrementLoopCounterAndJump( MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB, - const MCInstrInfo &MII) const { + const MCInstrInfo &MII, unsigned LoopRegister) const { BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8)) - .addDef(kLoopCounterReg) - .addUse(kLoopCounterReg) + .addDef(LoopRegister) + .addUse(LoopRegister) .addImm(-1); BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1)) .addMBB(&TargetMBB) diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp index 782d44422791c..1ae2565e894c6 100644 --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -497,22 +497,42 @@ void benchmarkMain() { } const auto Opcodes = getOpcodesOrDie(State); + std::vector Configurations; + + unsigned LoopRegister = + State.getExegesisTarget().getDefaultLoopCounterRegister( + State.getTargetMachine().getTargetTriple()); + + if (Opcodes.empty()) { + Configurations = ExitOnErr(readSnippets(State, SnippetsFile)); + for (const auto &Configuration : Configurations) { + if (ExecutionMode != BenchmarkRunner::ExecutionModeE::SubProcess && + (Configuration.Key.MemoryMappings.size() != 0 || + Configuration.Key.MemoryValues.size() != 0 || + Configuration.Key.SnippetAddress != 0)) + ExitWithError("Memory and snippet address annotations are only " + "supported in subprocess " + "execution mode"); + } + LoopRegister = Configurations[0].Key.LoopRegister; + } SmallVector, 2> Repetitors; if (RepetitionMode != Benchmark::RepetitionModeE::AggregateMin) - 
Repetitors.emplace_back(SnippetRepetitor::Create(RepetitionMode, State)); + Repetitors.emplace_back( + SnippetRepetitor::Create(RepetitionMode, State, LoopRegister)); else { for (Benchmark::RepetitionModeE RepMode : {Benchmark::RepetitionModeE::Duplicate, Benchmark::RepetitionModeE::Loop}) - Repetitors.emplace_back(SnippetRepetitor::Create(RepMode, State)); + Repetitors.emplace_back( + SnippetRepetitor::Create(RepMode, State, LoopRegister)); } BitVector AllReservedRegs; for (const std::unique_ptr &Repetitor : Repetitors) AllReservedRegs |= Repetitor->getReservedRegs(); - std::vector Configurations; if (!Opcodes.empty()) { for (const unsigned Opcode : Opcodes) { // Ignore instructions without a sched class if @@ -534,17 +554,6 @@ void benchmarkMain() { std::move(ConfigsForInstr->begin(), ConfigsForInstr->end(), std::back_inserter(Configurations)); } - } else { - Configurations = ExitOnErr(readSnippets(State, SnippetsFile)); - for (const auto &Configuration : Configurations) { - if (ExecutionMode != BenchmarkRunner::ExecutionModeE::SubProcess && - (Configuration.Key.MemoryMappings.size() != 0 || - Configuration.Key.MemoryValues.size() != 0 || - Configuration.Key.SnippetAddress != 0)) - ExitWithError("Memory and snippet address annotations are only " - "supported in subprocess " - "execution mode"); - } } if (MinInstructions == 0) { diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp index 505a030675f64..f1fa891171177 100644 --- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp +++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp @@ -219,6 +219,25 @@ TEST_F(X86SnippetFileTest, SnippetAddress) { EXPECT_EQ(Snippet.Key.SnippetAddress, 0x10000); } +TEST_F(X86SnippetFileTest, LoopRegister) { + auto Snippets = TestCommon(R"( + # LLVM-EXEGESIS-LOOP-REGISTER R11 + )"); + ASSERT_TRUE(static_cast(Snippets)); + EXPECT_THAT(*Snippets, SizeIs(1)); + const auto &Snippet = 
(*Snippets)[0]; + EXPECT_EQ(Snippet.Key.LoopRegister, X86::R11); +} + +TEST_F(X86SnippetFileTest, LoopRegisterInvalidRegister) { + auto Error = TestCommon(R"( + # LLVM-EXEGESIS-LOOP-REGISTER INVALID + )") + .takeError(); + EXPECT_TRUE(static_cast(Error)); + consumeError(std::move(Error)); +} + } // namespace } // namespace exegesis } // namespace llvm diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp index 25e8836087c15..b55ca5057ae01 100644 --- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp +++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp @@ -40,7 +40,10 @@ class X86SnippetRepetitorTest : public X86TestBase { void TestCommon(Benchmark::RepetitionModeE RepetitionMode, unsigned SnippetInstructions = 1) { - const auto Repetitor = SnippetRepetitor::Create(RepetitionMode, State); + const auto Repetitor = SnippetRepetitor::Create( + RepetitionMode, State, + State.getExegesisTarget().getDefaultLoopCounterRegister( + State.getTargetMachine().getTargetTriple())); const std::vector Instructions(SnippetInstructions, MCInstBuilder(X86::NOOP)); FunctionFiller Sink(*MF, {X86::EAX}); @@ -98,11 +101,12 @@ TEST_F(X86SnippetRepetitorTest, Loop) { HasOpcode(X86::NOOP), HasOpcode(X86::NOOP), HasOpcode(X86::NOOP), HasOpcode(X86::ADD64ri8), HasOpcode(X86::JCC_1))); - EXPECT_THAT(LoopBlock.liveins(), - UnorderedElementsAre( - LiveReg(X86::EAX), - LiveReg(State.getExegesisTarget().getLoopCounterRegister( - State.getTargetMachine().getTargetTriple())))); + EXPECT_THAT( + LoopBlock.liveins(), + UnorderedElementsAre( + LiveReg(X86::EAX), + LiveReg(State.getExegesisTarget().getDefaultLoopCounterRegister( + State.getTargetMachine().getTargetTriple())))); EXPECT_THAT(MF->getBlockNumbered(2)->instrs(), ElementsAre(HasOpcode(X86::RET64))); } From b0bae445176d30a3fa577d30c21f36dad61003b8 Mon Sep 17 00:00:00 2001 From: rohit-rao Date: Tue, 27 Feb 2024 15:29:34 -0500 
Subject: [PATCH 493/546] [lld] Adds support for xros. (#83031) --- lld/MachO/Driver.cpp | 19 +++++++++++++------ lld/MachO/Options.td | 2 +- lld/test/MachO/lc-build-version.s | 7 +++++++ lld/test/MachO/platform-version.s | 2 +- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index a57f60c5eed36..018ceec97f204 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -691,6 +691,8 @@ static PlatformVersion parsePlatformVersion(const Arg *arg) { .Cases("tvos-simulator", "8", PLATFORM_TVOSSIMULATOR) .Cases("watchos-simulator", "9", PLATFORM_WATCHOSSIMULATOR) .Cases("driverkit", "10", PLATFORM_DRIVERKIT) + .Cases("xros", "11", PLATFORM_XROS) + .Cases("xros-simulator", "12", PLATFORM_XROS_SIMULATOR) .Default(PLATFORM_UNKNOWN); if (platformVersion.platform == PLATFORM_UNKNOWN) error(Twine("malformed platform: ") + platformStr); @@ -985,6 +987,8 @@ PlatformType macho::removeSimulator(PlatformType platform) { return PLATFORM_TVOS; case PLATFORM_WATCHOSSIMULATOR: return PLATFORM_WATCHOS; + case PLATFORM_XROS_SIMULATOR: + return PLATFORM_XROS; default: return platform; } @@ -1001,15 +1005,17 @@ static bool shouldAdhocSignByDefault(Architecture arch, PlatformType platform) { return platform == PLATFORM_MACOS || platform == PLATFORM_IOSSIMULATOR || platform == PLATFORM_TVOSSIMULATOR || - platform == PLATFORM_WATCHOSSIMULATOR; + platform == PLATFORM_WATCHOSSIMULATOR || + platform == PLATFORM_XROS_SIMULATOR; } static bool dataConstDefault(const InputArgList &args) { - static const std::array, 5> minVersion = + static const std::array, 6> minVersion = {{{PLATFORM_MACOS, VersionTuple(10, 15)}, {PLATFORM_IOS, VersionTuple(13, 0)}, {PLATFORM_TVOS, VersionTuple(13, 0)}, {PLATFORM_WATCHOS, VersionTuple(6, 0)}, + {PLATFORM_XROS, VersionTuple(1, 0)}, {PLATFORM_BRIDGEOS, VersionTuple(4, 0)}}}; PlatformType platform = removeSimulator(config->platformInfo.target.Platform); auto it = llvm::find_if(minVersion, @@ -1045,11 
+1051,12 @@ static bool shouldEmitChainedFixups(const InputArgList &args) { bool isRequested = arg != nullptr; // Version numbers taken from the Xcode 13.3 release notes. - static const std::array, 4> minVersion = + static const std::array, 5> minVersion = {{{PLATFORM_MACOS, VersionTuple(11, 0)}, {PLATFORM_IOS, VersionTuple(13, 4)}, {PLATFORM_TVOS, VersionTuple(14, 0)}, - {PLATFORM_WATCHOS, VersionTuple(7, 0)}}}; + {PLATFORM_WATCHOS, VersionTuple(7, 0)}, + {PLATFORM_XROS, VersionTuple(1, 0)}}}; PlatformType platform = removeSimulator(config->platformInfo.target.Platform); auto it = llvm::find_if(minVersion, [&](const auto &p) { return p.first == platform; }); @@ -1688,8 +1695,8 @@ bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, if (args.getLastArg(OPT_reproducible)) config->zeroModTime = true; - std::array encryptablePlatforms{ - PLATFORM_IOS, PLATFORM_WATCHOS, PLATFORM_TVOS}; + std::array encryptablePlatforms{ + PLATFORM_IOS, PLATFORM_WATCHOS, PLATFORM_TVOS, PLATFORM_XROS}; config->emitEncryptionInfo = args.hasFlag(OPT_encryptable, OPT_no_encryption, is_contained(encryptablePlatforms, config->platform())); diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td index 01e73b789f9aa..a524e4a4c5084 100644 --- a/lld/MachO/Options.td +++ b/lld/MachO/Options.td @@ -377,7 +377,7 @@ def grp_version : OptionGroup<"version">, HelpText<"VERSION TARGETING">; def platform_version : MultiArg<["-"], "platform_version", 3>, MetaVarName<" ">, - HelpText<"Platform (e.g., macos, ios, tvos, watchos, bridgeos, mac-catalyst, ios-sim, tvos-sim, watchos-sim, driverkit) and version numbers">, + HelpText<"Platform (e.g., macos, ios, tvos, watchos, xros, bridgeos, mac-catalyst, ios-sim, tvos-sim, watchos-sim, xros-sim, driverkit) and version numbers">, Group; def sdk_version : Separate<["-"], "sdk_version">, HelpText<"This option is undocumented in ld64">, diff --git a/lld/test/MachO/lc-build-version.s b/lld/test/MachO/lc-build-version.s index 7b78f803428a7..1fd7078919b15 100644 
--- a/lld/test/MachO/lc-build-version.s +++ b/lld/test/MachO/lc-build-version.s @@ -64,6 +64,13 @@ # WATCHOS-4-0: cmd LC_VERSION_MIN_WATCHOS +# RUN: %no-arg-lld -arch x86_64 -platform_version xros 1.0 1.1 -o %t.xros-1-0 %t.o +# RUN: llvm-objdump --macho --all-headers %t.xros-1-0 | FileCheck %s --check-prefix=XROS-1-0 +# RUN: %no-arg-lld -arch x86_64 -platform_version xros-simulator 1.0 1.1 -o %t.xros-sim-1-0 %t.o +# RUN: llvm-objdump --macho --all-headers %t.xros-sim-1-0 | FileCheck %s --check-prefix=XROS-1-0 + +# XROS-1-0: cmd LC_BUILD_VERSION + .text .global _main _main: diff --git a/lld/test/MachO/platform-version.s b/lld/test/MachO/platform-version.s index 047aea02fcde3..57fbae62b2ffc 100644 --- a/lld/test/MachO/platform-version.s +++ b/lld/test/MachO/platform-version.s @@ -55,7 +55,7 @@ # RUN: -platform_version 0 1 5 \ # RUN: | FileCheck --check-prefix=FAIL-PLATFORM %s # RUN: not %no-arg-lld -arch x86_64 -o %t %t.o 2>&1 \ -# RUN: -platform_version 11 1 5 \ +# RUN: -platform_version 13 1 5 \ # RUN: | FileCheck --check-prefix=FAIL-PLATFORM %s # FAIL-PLATFORM: malformed platform: {{.*}} # FAIL-PLATFORM-NOT: malformed {{minimum|sdk}} version: {{.*}} From 3250330997cf214293a20a1d532b617d72bafb09 Mon Sep 17 00:00:00 2001 From: Wu Yingcong Date: Tue, 27 Feb 2024 12:52:04 -0800 Subject: [PATCH 494/546] [asan] Disable instrumentation for available_externally global with COFF (#81109) For COFF, available_externally global will be instrumented because of the lack of filtering, and will trigger the Verifier pass assertion and crash the compilation. This patch will filter out the available_externally global for COFF. For non-COFF, `!G->hasExactDefinition()` in line 1954 will filter out the available_externally globals. There is a related bug reported in https://bugs.llvm.org/show_bug.cgi?id=47950 / https://github.com/llvm/llvm-project/issues/47294. I tried the reproducer posted on the page and this will fix the problem. 
Reproducer: ``` #include void grouping_impl() { std::use_facet>(std::locale()); } // clang -fsanitize=address -D_DLL -std=c++14 -c format.cc ``` --- .../Transforms/Instrumentation/AddressSanitizer.cpp | 4 ++++ .../do-not-instrument-globals-windows.ll | 10 ++++++++++ 2 files changed, 14 insertions(+) create mode 100644 llvm/test/Instrumentation/AddressSanitizer/do-not-instrument-globals-windows.ll diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 8a2864a078731..5d5c4ea57ed56 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1957,6 +1957,10 @@ bool ModuleAddressSanitizer::shouldInstrumentGlobal(GlobalVariable *G) const { // On COFF, don't instrument non-ODR linkages. if (G->isInterposable()) return false; + // If the global has AvailableExternally linkage, then it is not in this + // module, which means it does not need to be instrumented. 
+ if (G->hasAvailableExternallyLinkage()) + return false; } // If a comdat is present, it must have a selection kind that implies ODR diff --git a/llvm/test/Instrumentation/AddressSanitizer/do-not-instrument-globals-windows.ll b/llvm/test/Instrumentation/AddressSanitizer/do-not-instrument-globals-windows.ll new file mode 100644 index 0000000000000..c143f69f126a8 --- /dev/null +++ b/llvm/test/Instrumentation/AddressSanitizer/do-not-instrument-globals-windows.ll @@ -0,0 +1,10 @@ +; This test checks that we are not instrumenting unnecessary globals +; RUN: opt < %s -passes=asan -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-windows-msvc" + +@v_available_externally = available_externally global i32 zeroinitializer +; CHECK-NOT: {{asan_gen.*v_available_externally}} + +; CHECK: @asan.module_ctor From 5e31e82698d9f1d3f1dd881c87b8c5399d790772 Mon Sep 17 00:00:00 2001 From: Alexander Richardson Date: Tue, 27 Feb 2024 13:00:07 -0800 Subject: [PATCH 495/546] [compiler-rt] Use locally configured llvm-lit for standalone builds (#83178) When building a standalone build with `-DLLVM_CMAKE_DIR=$HOME/output/llvm-install -DCOMPILER_RT_INCLUDE_TESTS=ON`, the current code will attempt to use `LLVM_DEFAULT_EXTERNAL_LIT` which is set to `$HOME/output/llvm-install/bin/llvm-lit` inside `LLVMConfig.cmake` even though it is not actually installed. If we are adding the llvm-lit subdirectory, we can use `get_llvm_lit_path()` immediately afterwards to set LLVM_EXTERNAL_LIT so that subsequent calls within `add_lit_testsuite()` use llvm-lit from the current build directory instead of the nonexistant one. 
--- compiler-rt/CMakeLists.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt index bbb4e8d7c333e..8a2b138d8d702 100644 --- a/compiler-rt/CMakeLists.txt +++ b/compiler-rt/CMakeLists.txt @@ -771,8 +771,6 @@ mark_as_advanced(COMPILER_RT_ENABLE_INTERNAL_SYMBOLIZER) add_subdirectory(lib) if(COMPILER_RT_INCLUDE_TESTS) - add_subdirectory(unittests) - add_subdirectory(test) # Don't build llvm-lit for runtimes-build, it will clean up map_config. if (COMPILER_RT_STANDALONE_BUILD AND NOT LLVM_RUNTIMES_BUILD) # If we have a valid source tree, generate llvm-lit into the bin directory. @@ -782,11 +780,17 @@ if(COMPILER_RT_INCLUDE_TESTS) # Needed for lit support in standalone builds. include(AddLLVM) add_subdirectory(${LLVM_MAIN_SRC_DIR}/utils/llvm-lit ${CMAKE_CURRENT_BINARY_DIR}/llvm-lit) + # Ensure that the testsuite uses the local lit rather than + # LLVM_INSTALL_DIR/bin/llvm-lit (which probably does not exist). + get_llvm_lit_path(_base_dir _file_name) + set(LLVM_EXTERNAL_LIT "${_base_dir}/${_file_name}" CACHE STRING "Command used to spawn lit" FORCE) elseif(NOT EXISTS ${LLVM_EXTERNAL_LIT}) message(WARNING "Could not find LLVM source directory and LLVM_EXTERNAL_LIT does not" "point to a valid file. You will not be able to run tests.") endif() endif() + add_subdirectory(unittests) + add_subdirectory(test) endif() add_subdirectory(tools) From c6fa71cdd0b85a8edb612e326ea275eb23aab032 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 27 Feb 2024 15:14:01 -0600 Subject: [PATCH 496/546] [libc][NFC] Add `-Wno-multi-gpu` everywhere for the GPU build (#83173) Summary: This warning is intended to indicate if `-march=native` returns different values in a single compilation sense. As it stands we don't care and it absolutely spams the test output if you run it on a machine with more than one GPU like any cluster machine. Disable these warnings everywhere we compile. 
--- libc/cmake/modules/LLVMLibCCompileOptionRules.cmake | 1 + libc/cmake/modules/LLVMLibCTestRules.cmake | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index c7ccd392354cb..72b04822d8b84 100644 --- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -87,6 +87,7 @@ function(_get_common_compile_options output_var flags) list(APPEND compile_options "-fvisibility=hidden") list(APPEND compile_options "-fconvergent-functions") list(APPEND compile_options "-flto") + list(APPEND compile_options "-Wno-multi-gpu") if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) list(APPEND compile_options "-Wno-unknown-cuda-version") diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 76ce6754bd733..836e15d34741b 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -463,7 +463,7 @@ function(add_integration_test test_name) if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) target_link_options(${fq_build_target_name} PRIVATE - ${LIBC_COMPILE_OPTIONS_DEFAULT} + ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}") @@ -471,7 +471,7 @@ function(add_integration_test test_name) # We need to use the internal object versions for NVPTX. 
set(internal_suffix ".__internal__") target_link_options(${fq_build_target_name} PRIVATE - ${LIBC_COMPILE_OPTIONS_DEFAULT} + ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu "-Wl,--suppress-stack-size-warning" -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static "--cuda-path=${LIBC_CUDA_ROOT}") @@ -647,14 +647,14 @@ function(add_libc_hermetic_test test_name) if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) target_link_options(${fq_build_target_name} PRIVATE ${LIBC_COMPILE_OPTIONS_DEFAULT} - -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto + -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -Wno-multi-gpu "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}") elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) # We need to use the internal object versions for NVPTX. set(internal_suffix ".__internal__") target_link_options(${fq_build_target_name} PRIVATE - ${LIBC_COMPILE_OPTIONS_DEFAULT} + ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu "-Wl,--suppress-stack-size-warning" -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static "--cuda-path=${LIBC_CUDA_ROOT}") From f7a99664681390f4bb0211a52f0d89c70aa96762 Mon Sep 17 00:00:00 2001 From: Janeczko Jakub Date: Tue, 27 Feb 2024 22:17:11 +0100 Subject: [PATCH 497/546] Fix typo in mlir::Value doxygen comment (#83150) Fix #82900 --- mlir/include/mlir/IR/Value.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/IR/Value.h b/mlir/include/mlir/IR/Value.h index fff3b87faff66..a74d0faa1dfc4 100644 --- a/mlir/include/mlir/IR/Value.h +++ b/mlir/include/mlir/IR/Value.h @@ -90,7 +90,7 @@ class alignas(8) ValueImpl : public IRObjectWithUseList { /// class has value-type semantics and is just a simple wrapper around a /// ValueImpl that is either owner by a block(in the case of a BlockArgument) or /// an Operation(in the case of an OpResult). 
-/// As most IR construct, this isn't const-correct, but we keep method +/// As most IR constructs, this isn't const-correct, but we keep method /// consistent and as such method that immediately modify this Value aren't /// marked `const` (include modifying the Value use-list). class Value { From d82e93e7f129d9e8b72570efdf4a15d6ec3d4336 Mon Sep 17 00:00:00 2001 From: Peiming Liu <36770114+PeimingLiu@users.noreply.github.com> Date: Tue, 27 Feb 2024 13:18:43 -0800 Subject: [PATCH 498/546] [mlir][sparse] add merger support on Batch LevelType. (#83186) --- .../mlir/Dialect/SparseTensor/IR/Enums.h | 18 +++++- .../mlir/Dialect/SparseTensor/Utils/Merger.h | 3 +- .../lib/Dialect/SparseTensor/Utils/Merger.cpp | 8 +-- .../Dialect/SparseTensor/MergerTest.cpp | 58 +++++++++++++------ 4 files changed, 59 insertions(+), 28 deletions(-) diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h index 9e79b6aca1c9b..5563cb907e935 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h @@ -333,16 +333,28 @@ struct LevelType { return lvlBits & static_cast(p); } + /// Check if the `LevelType` is considered to be sparse. + constexpr bool hasSparseSemantic() const { + return isa(); + } + + /// Check if the `LevelType` is considered to be dense-like. + constexpr bool hasDenseSemantic() const { + return isa(); + } + /// Check if the `LevelType` needs positions array. constexpr bool isWithPosLT() const { - return isa() || - isa(); + assert(!isa()); + return isa(); } /// Check if the `LevelType` needs coordinates array. constexpr bool isWithCrdLT() const { + assert(!isa()); // All sparse levels has coordinate array. 
- return !isa(); + return hasSparseSemantic(); } std::string toMLIRString() const { diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h index 490ef3071af1b..7f9820df984b2 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h @@ -509,8 +509,7 @@ class Merger { bool isSparseLvlWithNonTrivialIdxExp(TensorLoopId b) const { if (isLvlWithNonTrivialIdxExp(b)) { auto lt = getLoopDependentLevelType(b); - return isCompressedLT(lt) || isSingletonLT(lt) || - isLooseCompressedLT(lt) || isNOutOfMLT(lt); + return lt.hasSparseSemantic(); } return false; } diff --git a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp index 731cd79a1e3b4..72b722c69ae34 100644 --- a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp +++ b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp @@ -476,7 +476,7 @@ BitVector Merger::simplifyCond(LatSetId s0, LatPointId p0) { // Starts resetting from a dense level, so that the first bit (if kept) // is not undefined level-type. for (unsigned b = 0; b < be; b++) { - if (simple[b] && isDenseLT(getLvlType(TensorLoopId{b}))) { + if (simple[b] && getLvlType(TensorLoopId{b}).hasDenseSemantic()) { offset = be - b - 1; // relative to the end break; } @@ -489,8 +489,7 @@ BitVector Merger::simplifyCond(LatSetId s0, LatPointId p0) { // Slice on dense level has `locate` property as well, and can be optimized. 
if (simple[b] && !isSparseLvlWithNonTrivialIdxExp(b)) { const auto lt = getLvlType(b); - if (!isCompressedLT(lt) && !isSingletonLT(lt) && - !isLooseCompressedLT(lt) && !isNOutOfMLT(lt)) { + if (!lt.hasSparseSemantic()) { if (reset) simple.reset(b); reset = true; @@ -670,8 +669,7 @@ bool Merger::isSingleCondition(TensorId t, ExprId e) const { bool Merger::hasAnySparse(const BitVector &bits) const { for (TensorLoopId b : bits.set_bits()) { const auto lt = getLvlType(b); - if (isCompressedLT(lt) || isSingletonLT(lt) || isLooseCompressedLT(lt) || - isNOutOfMLT(lt)) + if (lt.hasSparseSemantic()) return true; } return hasSparseIdxReduction(bits); diff --git a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp index 62a19c084cac0..943e7d5c120b8 100644 --- a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp +++ b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp @@ -120,7 +120,8 @@ static Match synZeroMatch() { return Match(); } FOREVERY_BINOP(IMPL_BINOP_PATTERN) #undef IMPL_BINOP_PATTERN -class MergerTestBase : public ::testing::Test { +// Parameterize LevelFormat to test both Dense and Batch LevelFormat. +class MergerTestBase : public ::testing::TestWithParam { protected: MergerTestBase(unsigned numTensors, unsigned numLoops) : merger(numTensors, numLoops, /*maxRank=*/numLoops) { @@ -317,10 +318,14 @@ class MergerTest3T1L : public MergerTestBase { // Tensor 1: sparse input vector. merger.setLevelAndType(tid(1), lid(0), 0, LevelFormat::Compressed); // Tensor 2: dense output vector. - merger.setLevelAndType(tid(2), lid(0), 0, LevelFormat::Dense); + merger.setLevelAndType(tid(2), lid(0), 0, GetParam()); } }; +INSTANTIATE_TEST_SUITE_P(Test3T1L, MergerTest3T1L, + ::testing::Values(LevelFormat::Dense, + LevelFormat::Batch)); + /// Four tensors (three inputs, one output); and a single loop. 
class MergerTest4T1L : public MergerTestBase { protected: @@ -333,10 +338,14 @@ class MergerTest4T1L : public MergerTestBase { // Tensor 2: sparse input vector merger.setLevelAndType(tid(2), lid(0), 0, LevelFormat::Compressed); // Tensor 3: dense output vector - merger.setLevelAndType(tid(3), lid(0), 0, LevelFormat::Dense); + merger.setLevelAndType(tid(3), lid(0), 0, GetParam()); } }; +INSTANTIATE_TEST_SUITE_P(Test4T1L, MergerTest4T1L, + ::testing::Values(LevelFormat::Dense, + LevelFormat::Batch)); + /// /// Tests with both sparse and dense input. /// @@ -349,12 +358,16 @@ class MergerTest3T1LD : public MergerTestBase { // Tensor 0: sparse input vector. merger.setLevelAndType(tid(0), lid(0), 0, LevelFormat::Compressed); // Tensor 1: dense input vector. - merger.setLevelAndType(tid(1), lid(0), 0, LevelFormat::Dense); + merger.setLevelAndType(tid(1), lid(0), 0, GetParam()); // Tensor 2: dense output vector. - merger.setLevelAndType(tid(2), lid(0), 0, LevelFormat::Dense); + merger.setLevelAndType(tid(2), lid(0), 0, GetParam()); } }; +INSTANTIATE_TEST_SUITE_P(Test3T1LD, MergerTest3T1LD, + ::testing::Values(LevelFormat::Dense, + LevelFormat::Batch)); + /// /// Tests with both undef and dense input. /// @@ -367,14 +380,18 @@ class MergerTest4T1LU : public MergerTestBase { // Tensor 0: undef input vector. merger.setLevelAndType(tid(0), lid(0), 0, LevelFormat::Undef); // Tensor 1: dense input vector. - merger.setLevelAndType(tid(1), lid(0), 0, LevelFormat::Dense); + merger.setLevelAndType(tid(1), lid(0), 0, GetParam()); // Tensor 2: undef input vector. merger.setLevelAndType(tid(2), lid(0), 0, LevelFormat::Undef); // Tensor 3: dense output vector. - merger.setLevelAndType(tid(3), lid(0), 0, LevelFormat::Dense); + merger.setLevelAndType(tid(3), lid(0), 0, GetParam()); } }; +INSTANTIATE_TEST_SUITE_P(Test4T1LU, MergerTest4T1LU, + ::testing::Values(LevelFormat::Dense, + LevelFormat::Batch)); + /// /// Tests with operation on sparse output. 
/// @@ -395,6 +412,11 @@ class MergerTest3T1LSo : public MergerTestBase { } }; +// This testsuite does not use any dense-like format, just one of {Dense, Batch} +// is enough. +INSTANTIATE_TEST_SUITE_P(Test3T1LSo, MergerTest3T1LSo, + ::testing::Values(LevelFormat::Dense)); + } // namespace /// Vector multiplication (conjunction) of 3 vectors, i.e.; @@ -409,7 +431,7 @@ class MergerTest3T1LSo : public MergerTestBase { /// lat( i_01_D / (tensor_0 * tensor_1 * tensor2) ) /// } #define IMPL_MERGER_TEST_CONJ_CONJ_UNDEF(CONJ1, CONJ2) \ - TEST_F(MergerTest4T1LU, vector_##CONJ1##_##CONJ2) { \ + TEST_P(MergerTest4T1LU, vector_##CONJ1##_##CONJ2) { \ const auto em = CONJ1##Expr(tensor(0), tensor(1)); \ const auto e = CONJ2##Expr(em, tensor(2)); \ const auto l0 = lid(0); \ @@ -443,7 +465,7 @@ FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ_UNDEF) /// lat( i_03_U / (tensor_0 * tensor_1 * output_tensor2) ) /// } #define IMPL_MERGER_TEST_CONJ_CONJ_SPARSE_OUT(CONJ1, CONJ2) \ - TEST_F(MergerTest3T1LSo, vector_##CONJ1##_##CONJ2) { \ + TEST_P(MergerTest3T1LSo, vector_##CONJ1##_##CONJ2) { \ const auto em = CONJ1##Expr(tensor(0), tensor(1)); \ const auto e = CONJ2##Expr(em, tensor(2)); \ const auto l0 = lid(0); \ @@ -482,7 +504,7 @@ FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ_SPARSE_OUT) /// lat( i_01 / tensor_1 ) /// } #define IMPL_MERGER_TEST_DISJ(OP, UNUSED) \ - TEST_F(MergerTest3T1L, vector_##OP) { \ + TEST_P(MergerTest3T1L, vector_##OP) { \ const auto e = OP##Expr(tensor(0), tensor(1)); \ const auto l0 = lid(0); \ const auto t0 = tid(0); \ @@ -514,7 +536,7 @@ FOREVERY_COMMON_DISJ_BINOP(IMPL_MERGER_TEST_DISJ) /// lat( i_00 i_01 / (tensor_0 * tensor_1) ) /// } #define IMPL_MERGER_TEST_CONJ(OP, UNUSED) \ - TEST_F(MergerTest3T1L, vector_##OP) { \ + TEST_P(MergerTest3T1L, vector_##OP) { \ const auto e = OP##Expr(tensor(0), tensor(1)); \ const auto l0 = lid(0); \ const auto t0 = tid(0); \ @@ -544,7 +566,7 @@ 
FOREVERY_COMMON_CONJ_BINOP(IMPL_MERGER_TEST_CONJ) /// lat( i_02 / tensor_2 ) /// } #define IMPL_MERGER_TEST_CONJ_DISJ(CONJ, DISJ) \ - TEST_F(MergerTest4T1L, vector_##CONJ##_##DISJ) { \ + TEST_P(MergerTest4T1L, vector_##CONJ##_##DISJ) { \ const auto em = CONJ##Expr(tensor(0), tensor(1)); \ const auto e = DISJ##Expr(em, tensor(2)); \ const auto l0 = lid(0); \ @@ -587,7 +609,7 @@ FOREVERY_PAIR_OF_COMMON_CONJ_DISJ_BINOP(IMPL_MERGER_TEST_CONJ_DISJ) /// lat( i_00 / tensor_0 ) /// } #define IMPL_MERGER_TEST_DISJ_DISJ(DISJ1, DISJ2) \ - TEST_F(MergerTest4T1L, Vector_##DISJ1##_##DISJ2) { \ + TEST_P(MergerTest4T1L, Vector_##DISJ1##_##DISJ2) { \ const auto em = DISJ1##Expr(tensor(0), tensor(1)); \ const auto e = DISJ2##Expr(em, tensor(2)); \ const auto l0 = lid(0); \ @@ -636,7 +658,7 @@ FOREVERY_PAIR_OF_COMMON_DISJ_DISJ_BINOP(IMPL_MERGER_TEST_DISJ_DISJ) /// lat( i_00 i_01 i_02 / tensor_0 * tensor_1 * tensor_2 ) /// } #define IMPL_MERGER_TEST_CONJ_CONJ(CONJ1, CONJ2) \ - TEST_F(MergerTest4T1L, vector_##CONJ1##_##CONJ2) { \ + TEST_P(MergerTest4T1L, vector_##CONJ1##_##CONJ2) { \ const auto em = CONJ1##Expr(tensor(0), tensor(1)); \ const auto e = CONJ2##Expr(em, tensor(2)); \ const auto l0 = lid(0); \ @@ -675,7 +697,7 @@ FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ) /// lat( i_00 / sparse_tensor_0 ) should be opted out as it only has dense diff /// with lat( i_00 i_01 / (sparse_tensor_0 + dense_tensor_1) ). #define IMPL_MERGER_TEST_OPTIMIZED_DISJ(OP, UNUSED) \ - TEST_F(MergerTest3T1LD, vector_opted_##OP) { \ + TEST_P(MergerTest3T1LD, vector_opted_##OP) { \ const auto e = OP##Expr(tensor(0), tensor(1)); \ const auto l0 = lid(0); \ const auto t0 = tid(0); \ @@ -711,7 +733,7 @@ FOREVERY_COMMON_DISJ_BINOP(IMPL_MERGER_TEST_OPTIMIZED_DISJ) /// } /// since i_01 is a dense dimension. 
#define IMPL_MERGER_TEST_OPTIMIZED_CONJ(OP, UNUSED) \ - TEST_F(MergerTest3T1LD, vector_opted_##OP) { \ + TEST_P(MergerTest3T1LD, vector_opted_##OP) { \ const auto e = OP##Expr(tensor(0), tensor(1)); \ const auto l0 = lid(0); \ const auto t0 = tid(0); \ @@ -746,7 +768,7 @@ FOREVERY_COMMON_CONJ_BINOP(IMPL_MERGER_TEST_OPTIMIZED_CONJ) /// lat( i_00 / tensor_0 cmp 0 ) /// lat( i_01 / 0 cmp tensor_1 ) /// } -TEST_F(MergerTest3T1L, vector_cmp) { +TEST_P(MergerTest3T1L, vector_cmp) { const auto e = cmpiExpr(tensor(0), tensor(1)); const auto l0 = lid(0); const auto t0 = tid(0); @@ -784,7 +806,7 @@ TEST_F(MergerTest3T1L, vector_cmp) { /// /// lat( i_00 / sparse_tensor_0 ) should be opted out as it only has dense diff /// with lat( i_00 i_01 / (sparse_tensor_0 cmp dense_tensor_1) ). -TEST_F(MergerTest3T1LD, vector_cmp) { +TEST_P(MergerTest3T1LD, vector_cmp) { const auto e = cmpiExpr(tensor(0), tensor(1)); const auto l0 = lid(0); const auto t0 = tid(0); From 06bcd9da1670b1d62e08b9fdd58b3a64368da87b Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 27 Feb 2024 13:45:37 -0800 Subject: [PATCH 499/546] [libc][stdbit] implement stdc_has_single_bit (C23) (#83168) --- libc/config/linux/x86_64/entrypoints.txt | 5 +++++ libc/docs/stdbit.rst | 12 +++++----- libc/include/llvm-libc-macros/stdbit-macros.h | 22 +++++++++++++++++++ libc/spec/spec.td | 1 + libc/spec/stdc.td | 10 +++++++-- libc/src/stdbit/CMakeLists.txt | 1 + libc/src/stdbit/stdc_has_single_bit_uc.cpp | 20 +++++++++++++++++ libc/src/stdbit/stdc_has_single_bit_uc.h | 18 +++++++++++++++ libc/src/stdbit/stdc_has_single_bit_ui.cpp | 20 +++++++++++++++++ libc/src/stdbit/stdc_has_single_bit_ui.h | 18 +++++++++++++++ libc/src/stdbit/stdc_has_single_bit_ul.cpp | 20 +++++++++++++++++ libc/src/stdbit/stdc_has_single_bit_ul.h | 18 +++++++++++++++ libc/src/stdbit/stdc_has_single_bit_ull.cpp | 20 +++++++++++++++++ libc/src/stdbit/stdc_has_single_bit_ull.h | 18 +++++++++++++++ libc/src/stdbit/stdc_has_single_bit_us.cpp | 20 
+++++++++++++++++ libc/src/stdbit/stdc_has_single_bit_us.h | 18 +++++++++++++++ libc/test/include/stdbit_test.cpp | 13 +++++++++++ libc/test/src/stdbit/CMakeLists.txt | 1 + .../stdbit/stdc_has_single_bit_uc_test.cpp | 20 +++++++++++++++++ .../stdbit/stdc_has_single_bit_ui_test.cpp | 20 +++++++++++++++++ .../stdbit/stdc_has_single_bit_ul_test.cpp | 20 +++++++++++++++++ .../stdbit/stdc_has_single_bit_ull_test.cpp | 20 +++++++++++++++++ .../stdbit/stdc_has_single_bit_us_test.cpp | 20 +++++++++++++++++ 23 files changed, 347 insertions(+), 8 deletions(-) create mode 100644 libc/src/stdbit/stdc_has_single_bit_uc.cpp create mode 100644 libc/src/stdbit/stdc_has_single_bit_uc.h create mode 100644 libc/src/stdbit/stdc_has_single_bit_ui.cpp create mode 100644 libc/src/stdbit/stdc_has_single_bit_ui.h create mode 100644 libc/src/stdbit/stdc_has_single_bit_ul.cpp create mode 100644 libc/src/stdbit/stdc_has_single_bit_ul.h create mode 100644 libc/src/stdbit/stdc_has_single_bit_ull.cpp create mode 100644 libc/src/stdbit/stdc_has_single_bit_ull.h create mode 100644 libc/src/stdbit/stdc_has_single_bit_us.cpp create mode 100644 libc/src/stdbit/stdc_has_single_bit_us.h create mode 100644 libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp create mode 100644 libc/test/src/stdbit/stdc_has_single_bit_ui_test.cpp create mode 100644 libc/test/src/stdbit/stdc_has_single_bit_ul_test.cpp create mode 100644 libc/test/src/stdbit/stdc_has_single_bit_ull_test.cpp create mode 100644 libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index c2300a2aa681a..27c9a42934c24 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -142,6 +142,11 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdbit.stdc_count_ones_ui libc.src.stdbit.stdc_count_ones_ul libc.src.stdbit.stdc_count_ones_ull + libc.src.stdbit.stdc_has_single_bit_uc + libc.src.stdbit.stdc_has_single_bit_us 
+ libc.src.stdbit.stdc_has_single_bit_ui + libc.src.stdbit.stdc_has_single_bit_ul + libc.src.stdbit.stdc_has_single_bit_ull # stdlib.h entrypoints libc.src.stdlib.abs diff --git a/libc/docs/stdbit.rst b/libc/docs/stdbit.rst index 0308caeb92932..b579e9dbbc2f5 100644 --- a/libc/docs/stdbit.rst +++ b/libc/docs/stdbit.rst @@ -81,11 +81,11 @@ stdc_count_ones_us |check| stdc_count_ones_ui |check| stdc_count_ones_ul |check| stdc_count_ones_ull |check| -stdc_has_single_bit_uc -stdc_has_single_bit_us -stdc_has_single_bit_ui -stdc_has_single_bit_ul -stdc_has_single_bit_ull +stdc_has_single_bit_uc |check| +stdc_has_single_bit_us |check| +stdc_has_single_bit_ui |check| +stdc_has_single_bit_ul |check| +stdc_has_single_bit_ull |check| stdc_bit_width_uc stdc_bit_width_us stdc_bit_width_ui @@ -124,7 +124,7 @@ stdc_first_trailing_zero |check| stdc_first_trailing_one |check| stdc_count_zeros |check| stdc_count_ones |check| -stdc_has_single_bit +stdc_has_single_bit |check| stdc_bit_width stdc_bit_floor stdc_bit_ceil diff --git a/libc/include/llvm-libc-macros/stdbit-macros.h b/libc/include/llvm-libc-macros/stdbit-macros.h index 5ee152e105f77..e3a36d10ed92a 100644 --- a/libc/include/llvm-libc-macros/stdbit-macros.h +++ b/libc/include/llvm-libc-macros/stdbit-macros.h @@ -157,6 +157,21 @@ inline unsigned stdc_count_ones(unsigned long x) { inline unsigned stdc_count_ones(unsigned long long x) { return stdc_count_ones_ull(x); } +inline bool stdc_has_single_bit(unsigned char x) { + return stdc_has_single_bit_uc(x); +} +inline bool stdc_has_single_bit(unsigned short x) { + return stdc_has_single_bit_us(x); +} +inline bool stdc_has_single_bit(unsigned x) { + return stdc_has_single_bit_ui(x); +} +inline bool stdc_has_single_bit(unsigned long x) { + return stdc_has_single_bit_ul(x); +} +inline bool stdc_has_single_bit(unsigned long long x) { + return stdc_has_single_bit_ull(x); +} #else #define stdc_leading_zeros(x) \ _Generic((x), \ @@ -228,6 +243,13 @@ inline unsigned stdc_count_ones(unsigned 
long long x) { unsigned: stdc_count_ones_ui, \ unsigned long: stdc_count_ones_ul, \ unsigned long long: stdc_count_ones_ull)(x) +#define stdc_has_single_bit(x) \ + _Generic((x), \ + unsigned char: stdc_has_single_bit_uc, \ + unsigned short: stdc_has_single_bit_us, \ + unsigned: stdc_has_single_bit_ui, \ + unsigned long: stdc_has_single_bit_ul, \ + unsigned long long: stdc_has_single_bit_ull)(x) #endif // __cplusplus #endif // __LLVM_LIBC_MACROS_STDBIT_MACROS_H diff --git a/libc/spec/spec.td b/libc/spec/spec.td index 90c076580be12..998f37fb26dee 100644 --- a/libc/spec/spec.td +++ b/libc/spec/spec.td @@ -51,6 +51,7 @@ def LongDoubleType : NamedType<"long double">; def CharType : NamedType<"char">; def UnsignedCharType : NamedType<"unsigned char">; def UnsignedShortType : NamedType<"unsigned short">; +def BoolType : NamedType<"bool">; def Float128Type : NamedType<"float128">; diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 8a1a235e4eecd..5b97255b89974 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -799,7 +799,8 @@ def StdC : StandardSpec<"stdc"> { Macro<"stdc_first_trailing_zero">, Macro<"stdc_first_trailing_one">, Macro<"stdc_count_zeros">, - Macro<"stdc_count_ones"> + Macro<"stdc_count_ones">, + Macro<"stdc_has_single_bit"> ], // Macros [], // Types [], // Enumerations @@ -848,7 +849,12 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"stdc_count_ones_us", RetValSpec, [ArgSpec]>, FunctionSpec<"stdc_count_ones_ui", RetValSpec, [ArgSpec]>, FunctionSpec<"stdc_count_ones_ul", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_count_ones_ull", RetValSpec, [ArgSpec]> + FunctionSpec<"stdc_count_ones_ull", RetValSpec, [ArgSpec]>, + FunctionSpec<"stdc_has_single_bit_uc", RetValSpec, [ArgSpec]>, + FunctionSpec<"stdc_has_single_bit_us", RetValSpec, [ArgSpec]>, + FunctionSpec<"stdc_has_single_bit_ui", RetValSpec, [ArgSpec]>, + FunctionSpec<"stdc_has_single_bit_ul", RetValSpec, [ArgSpec]>, + FunctionSpec<"stdc_has_single_bit_ull", RetValSpec, [ArgSpec]> ] // 
Functions >; diff --git a/libc/src/stdbit/CMakeLists.txt b/libc/src/stdbit/CMakeLists.txt index 5fb77d21e57a1..8bc7dd7852bbc 100644 --- a/libc/src/stdbit/CMakeLists.txt +++ b/libc/src/stdbit/CMakeLists.txt @@ -9,6 +9,7 @@ set(prefixes first_trailing_one count_zeros count_ones + has_single_bit ) set(suffixes c s i l ll) foreach(prefix IN LISTS prefixes) diff --git a/libc/src/stdbit/stdc_has_single_bit_uc.cpp b/libc/src/stdbit/stdc_has_single_bit_uc.cpp new file mode 100644 index 0000000000000..e5acdc2a71b4b --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_uc.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_has_single_bit_uc --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_has_single_bit_uc.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_uc, (unsigned char value)) { + return cpp::has_single_bit(value); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_has_single_bit_uc.h b/libc/src/stdbit/stdc_has_single_bit_uc.h new file mode 100644 index 0000000000000..028d4ee710505 --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_uc.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_has_single_bit_uc --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UC_H +#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UC_H + +namespace LIBC_NAMESPACE { + +bool stdc_has_single_bit_uc(unsigned char value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UC_H diff --git a/libc/src/stdbit/stdc_has_single_bit_ui.cpp b/libc/src/stdbit/stdc_has_single_bit_ui.cpp new file mode 100644 index 0000000000000..37578882324aa --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_ui.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_has_single_bit_ui --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_has_single_bit_ui.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_ui, (unsigned value)) { + return cpp::has_single_bit(value); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_has_single_bit_ui.h b/libc/src/stdbit/stdc_has_single_bit_ui.h new file mode 100644 index 0000000000000..1e8cd9afaee88 --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_ui.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_has_single_bit_ui --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UI_H +#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UI_H + +namespace LIBC_NAMESPACE { + +bool stdc_has_single_bit_ui(unsigned value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UI_H diff --git a/libc/src/stdbit/stdc_has_single_bit_ul.cpp b/libc/src/stdbit/stdc_has_single_bit_ul.cpp new file mode 100644 index 0000000000000..85133ab81cc60 --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_ul.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_has_single_bit_ul --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_has_single_bit_ul.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_ul, (unsigned long value)) { + return cpp::has_single_bit(value); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_has_single_bit_ul.h b/libc/src/stdbit/stdc_has_single_bit_ul.h new file mode 100644 index 0000000000000..9b924fca9f065 --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_ul.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_has_single_bit_ul --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UL_H +#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UL_H + +namespace LIBC_NAMESPACE { + +bool stdc_has_single_bit_ul(unsigned long value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UL_H diff --git a/libc/src/stdbit/stdc_has_single_bit_ull.cpp b/libc/src/stdbit/stdc_has_single_bit_ull.cpp new file mode 100644 index 0000000000000..4491cf2b98b6d --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_ull.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_has_single_bit_ull -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_has_single_bit_ull.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_ull, (unsigned long long value)) { + return cpp::has_single_bit(value); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_has_single_bit_ull.h b/libc/src/stdbit/stdc_has_single_bit_ull.h new file mode 100644 index 0000000000000..d4802bc287274 --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_ull.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_has_single_bit_ull -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_ULL_H +#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_ULL_H + +namespace LIBC_NAMESPACE { + +bool stdc_has_single_bit_ull(unsigned long long value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_ULL_H diff --git a/libc/src/stdbit/stdc_has_single_bit_us.cpp b/libc/src/stdbit/stdc_has_single_bit_us.cpp new file mode 100644 index 0000000000000..7a42ae553aa2e --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_us.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_has_single_bit_us --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_has_single_bit_us.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_us, (unsigned short value)) { + return cpp::has_single_bit(value); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_has_single_bit_us.h b/libc/src/stdbit/stdc_has_single_bit_us.h new file mode 100644 index 0000000000000..201ff4954c3b7 --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_us.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_has_single_bit_us --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_US_H +#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_US_H + +namespace LIBC_NAMESPACE { + +bool stdc_has_single_bit_us(unsigned short value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_US_H diff --git a/libc/test/include/stdbit_test.cpp b/libc/test/include/stdbit_test.cpp index 46019075a7c10..acb79ca0f3ff1 100644 --- a/libc/test/include/stdbit_test.cpp +++ b/libc/test/include/stdbit_test.cpp @@ -81,6 +81,11 @@ unsigned stdc_count_ones_us(unsigned short) noexcept { return 0x3BU; } unsigned stdc_count_ones_ui(unsigned) noexcept { return 0x3CU; } unsigned stdc_count_ones_ul(unsigned long) noexcept { return 0x3DU; } unsigned stdc_count_ones_ull(unsigned long long) noexcept { return 0x3FU; } +bool stdc_has_single_bit_uc(unsigned char) noexcept { return false; } +bool stdc_has_single_bit_us(unsigned short) noexcept { return false; } +bool stdc_has_single_bit_ui(unsigned) noexcept { return false; } +bool stdc_has_single_bit_ul(unsigned long) noexcept { return false; } +bool stdc_has_single_bit_ull(unsigned long long) noexcept { return false; } } #include "include/llvm-libc-macros/stdbit-macros.h" @@ -164,3 +169,11 @@ TEST(LlvmLibcStdbitTest, TypeGenericMacroCountOnes) { EXPECT_EQ(stdc_count_ones(0UL), 0x3DU); EXPECT_EQ(stdc_count_ones(0ULL), 0x3FU); } + +TEST(LlvmLibcStdbitTest, TypeGenericMacroHasSingleBit) { + EXPECT_EQ(stdc_has_single_bit(static_cast(1U)), false); + EXPECT_EQ(stdc_has_single_bit(static_cast(1U)), false); + EXPECT_EQ(stdc_has_single_bit(1U), false); + EXPECT_EQ(stdc_has_single_bit(1UL), false); + EXPECT_EQ(stdc_has_single_bit(1ULL), false); +} diff --git a/libc/test/src/stdbit/CMakeLists.txt b/libc/test/src/stdbit/CMakeLists.txt index 659e575fedea2..a886ee4a35325 100644 --- a/libc/test/src/stdbit/CMakeLists.txt 
+++ b/libc/test/src/stdbit/CMakeLists.txt @@ -11,6 +11,7 @@ set(prefixes first_trailing_one count_zeros count_ones + has_single_bit ) set(suffixes c s i l ll) foreach(prefix IN LISTS prefixes) diff --git a/libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp new file mode 100644 index 0000000000000..6212b1ec765a5 --- /dev/null +++ b/libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp @@ -0,0 +1,20 @@ +//===-- Unittests for stdc_has_single_bit_uc ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/limits.h" +#include "src/stdbit/stdc_has_single_bit_uc.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcStdcCountOnesUcTest, Zero) { + EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_uc(0U), false); +} + +TEST(LlvmLibcStdcCountOnesUcTest, OneHot) { + for (unsigned i = 0U; i != UCHAR_WIDTH; ++i) + EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_uc(1U << i), true); +} diff --git a/libc/test/src/stdbit/stdc_has_single_bit_ui_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_ui_test.cpp new file mode 100644 index 0000000000000..2e00507aa0258 --- /dev/null +++ b/libc/test/src/stdbit/stdc_has_single_bit_ui_test.cpp @@ -0,0 +1,20 @@ +//===-- Unittests for stdc_has_single_bit_ui ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/limits.h" +#include "src/stdbit/stdc_has_single_bit_ui.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcStdcCountOnesUiTest, Zero) { + EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ui(0U), false); +} + +TEST(LlvmLibcStdcCountOnesUiTest, OneHot) { + for (unsigned i = 0U; i != UINT_WIDTH; ++i) + EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ui(1U << i), true); +} diff --git a/libc/test/src/stdbit/stdc_has_single_bit_ul_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_ul_test.cpp new file mode 100644 index 0000000000000..8c0178998bbec --- /dev/null +++ b/libc/test/src/stdbit/stdc_has_single_bit_ul_test.cpp @@ -0,0 +1,20 @@ +//===-- Unittests for stdc_has_single_bit_ul ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/limits.h" +#include "src/stdbit/stdc_has_single_bit_ul.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcStdcCountOnesUlTest, Zero) { + EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ul(0U), false); +} + +TEST(LlvmLibcStdcCountOnesUlTest, OneHot) { + for (unsigned i = 0U; i != ULONG_WIDTH; ++i) + EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ul(1UL << i), true); +} diff --git a/libc/test/src/stdbit/stdc_has_single_bit_ull_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_ull_test.cpp new file mode 100644 index 0000000000000..1d9f976b6d633 --- /dev/null +++ b/libc/test/src/stdbit/stdc_has_single_bit_ull_test.cpp @@ -0,0 +1,20 @@ +//===-- Unittests for stdc_has_single_bit_ull -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/limits.h" +#include "src/stdbit/stdc_has_single_bit_ull.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcStdcCountOnesUllTest, Zero) { + EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ull(0U), false); +} + +TEST(LlvmLibcStdcCountOnesUllTest, OneHot) { + for (unsigned i = 0U; i != ULLONG_WIDTH; ++i) + EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ull(1ULL << i), true); +} diff --git a/libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp new file mode 100644 index 0000000000000..52c4de8810445 --- /dev/null +++ b/libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp @@ -0,0 +1,20 @@ +//===-- Unittests for stdc_has_single_bit_us ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/limits.h" +#include "src/stdbit/stdc_has_single_bit_us.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcStdcCountOnesUsTest, Zero) { + EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_us(0U), false); +} + +TEST(LlvmLibcStdcCountOnesUsTest, OneHot) { + for (unsigned i = 0U; i != USHRT_WIDTH; ++i) + EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_us(1U << i), true); +} From 70a7b1e8df7222557cadec4e6d007850ce64f8ed Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Tue, 27 Feb 2024 14:00:01 -0800 Subject: [PATCH 500/546] Remove test since no test on --debug output. 
(#83189) --- .../LLVMIR/erase-dangling-constants.mlir | 73 ------------------- 1 file changed, 73 deletions(-) delete mode 100644 mlir/test/Target/LLVMIR/erase-dangling-constants.mlir diff --git a/mlir/test/Target/LLVMIR/erase-dangling-constants.mlir b/mlir/test/Target/LLVMIR/erase-dangling-constants.mlir deleted file mode 100644 index dbb6755956003..0000000000000 --- a/mlir/test/Target/LLVMIR/erase-dangling-constants.mlir +++ /dev/null @@ -1,73 +0,0 @@ -// REQUIRES: asserts -// RUN: mlir-translate -mlir-to-llvmir %s -debug-only=llvm-dialect-to-llvm-ir 2>&1 | FileCheck %s - -// CHECK: Convert initializer for dup_const -// CHECK: 6 new constants hit -// CHECK: 3 dangling constants erased -// CHECK: Convert initializer for unique_const -// CHECK: 6 new constants hit -// CHECK: 5 dangling constants erased - - -// CHECK:@dup_const = global { [2 x double], [2 x double], [2 x double] } { [2 x double] [double 3.612250e-02, double 5.119230e-02], [2 x double] [double 3.612250e-02, double 5.119230e-02], [2 x double] [double 3.612250e-02, double 5.119230e-02] } - -llvm.mlir.global @dup_const() : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> { - %c0 = llvm.mlir.constant(3.612250e-02 : f64) : f64 - %c1 = llvm.mlir.constant(5.119230e-02 : f64) : f64 - - %empty0 = llvm.mlir.undef : !llvm.array<2 x f64> - %a00 = llvm.insertvalue %c0, %empty0[0] : !llvm.array<2 x f64> - - %empty1 = llvm.mlir.undef : !llvm.array<2 x f64> - %a10 = llvm.insertvalue %c0, %empty1[0] : !llvm.array<2 x f64> - - %empty2 = llvm.mlir.undef : !llvm.array<2 x f64> - %a20 = llvm.insertvalue %c0, %empty2[0] : !llvm.array<2 x f64> - -// NOTE: a00, a10, a20 are all same ConstantAggregate which not used at this point. -// should not delete it before all of the uses of the ConstantAggregate finished. 
- - %a01 = llvm.insertvalue %c1, %a00[1] : !llvm.array<2 x f64> - %a11 = llvm.insertvalue %c1, %a10[1] : !llvm.array<2 x f64> - %a21 = llvm.insertvalue %c1, %a20[1] : !llvm.array<2 x f64> - %empty_r = llvm.mlir.undef : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - %r0 = llvm.insertvalue %a01, %empty_r[0] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - %r1 = llvm.insertvalue %a11, %r0[1] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - %r2 = llvm.insertvalue %a21, %r1[2] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - - llvm.return %r2 : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - } - -// CHECK:@unique_const = global { [2 x double], [2 x double], [2 x double] } { [2 x double] [double 3.612250e-02, double 5.119230e-02], [2 x double] [double 3.312250e-02, double 5.219230e-02], [2 x double] [double 3.412250e-02, double 5.419230e-02] } - -llvm.mlir.global @unique_const() : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> { - %c0 = llvm.mlir.constant(3.612250e-02 : f64) : f64 - %c1 = llvm.mlir.constant(5.119230e-02 : f64) : f64 - - %c2 = llvm.mlir.constant(3.312250e-02 : f64) : f64 - %c3 = llvm.mlir.constant(5.219230e-02 : f64) : f64 - - %c4 = llvm.mlir.constant(3.412250e-02 : f64) : f64 - %c5 = llvm.mlir.constant(5.419230e-02 : f64) : f64 - - %2 = llvm.mlir.undef : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - - %3 = llvm.mlir.undef : !llvm.array<2 x f64> - - %4 = llvm.insertvalue %c0, %3[0] : !llvm.array<2 x f64> - %5 = llvm.insertvalue %c1, %4[1] : !llvm.array<2 x f64> - - %6 = llvm.insertvalue %5, %2[0] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - - %7 = llvm.insertvalue %c2, %3[0] : !llvm.array<2 x f64> - %8 = llvm.insertvalue %c3, %7[1] : !llvm.array<2 x f64> - - %9 = llvm.insertvalue %8, %6[1] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - - %10 = llvm.insertvalue %c4, %3[0] : !llvm.array<2 x 
f64> - %11 = llvm.insertvalue %c5, %10[1] : !llvm.array<2 x f64> - - %12 = llvm.insertvalue %11, %9[2] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - - llvm.return %12 : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> -} From d4cdb516eee49ecaf36380af4d8f923cc475e1d7 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 27 Feb 2024 14:00:43 -0800 Subject: [PATCH 501/546] [WebAssembly] Add RefTypeMem2Local pass (#81965) This adds `WebAssemblyRefTypeMem2Local` pass, which changes the address spaces of reference type `alloca`s to `addrspace(1)`. This in turn changes the address spaces of all `load` and `store` instructions that use the `alloca`s. `addrspace(1)` is `WASM_ADDRESS_SPACE_VAR`, and loads and stores to this address space become `local.get`s and `local.set`s, thanks to the Wasm local IR support added in https://github.com/llvm/llvm-project/commit/82f92e35c6464e23859c29422956caaceb623967. In a follow-up PR, I am planning to replace the usage of mem2reg pass with this to solve the reference type `alloca` problems described in #81575. 
--- llvm/lib/Target/WebAssembly/CMakeLists.txt | 1 + llvm/lib/Target/WebAssembly/WebAssembly.h | 2 + .../WebAssemblyRefTypeMem2Local.cpp | 91 +++++++++++++++++++ .../WebAssembly/WebAssemblyTargetMachine.cpp | 1 + .../CodeGen/WebAssembly/ref-type-mem2local.ll | 57 ++++++++++++ 5 files changed, 152 insertions(+) create mode 100644 llvm/lib/Target/WebAssembly/WebAssemblyRefTypeMem2Local.cpp create mode 100644 llvm/test/CodeGen/WebAssembly/ref-type-mem2local.ll diff --git a/llvm/lib/Target/WebAssembly/CMakeLists.txt b/llvm/lib/Target/WebAssembly/CMakeLists.txt index bb2ccea5c1459..f430be2653b4e 100644 --- a/llvm/lib/Target/WebAssembly/CMakeLists.txt +++ b/llvm/lib/Target/WebAssembly/CMakeLists.txt @@ -43,6 +43,7 @@ add_llvm_target(WebAssemblyCodeGen WebAssemblyOptimizeLiveIntervals.cpp WebAssemblyOptimizeReturned.cpp WebAssemblyPeephole.cpp + WebAssemblyRefTypeMem2Local.cpp WebAssemblyRegisterInfo.cpp WebAssemblyRegColoring.cpp WebAssemblyRegNumbering.cpp diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.h b/llvm/lib/Target/WebAssembly/WebAssembly.h index 91765ad117bdb..1c40addb6d6f7 100644 --- a/llvm/lib/Target/WebAssembly/WebAssembly.h +++ b/llvm/lib/Target/WebAssembly/WebAssembly.h @@ -30,6 +30,7 @@ ModulePass *createWebAssemblyAddMissingPrototypes(); ModulePass *createWebAssemblyFixFunctionBitcasts(); FunctionPass *createWebAssemblyOptimizeReturned(); FunctionPass *createWebAssemblyLowerRefTypesIntPtrConv(); +FunctionPass *createWebAssemblyRefTypeMem2Local(); // ISel and immediate followup passes. FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM, @@ -59,6 +60,7 @@ ModulePass *createWebAssemblyMCLowerPrePass(); // PassRegistry initialization declarations. 
void initializeFixFunctionBitcastsPass(PassRegistry &); void initializeOptimizeReturnedPass(PassRegistry &); +void initializeWebAssemblyRefTypeMem2LocalPass(PassRegistry &); void initializeWebAssemblyAddMissingPrototypesPass(PassRegistry &); void initializeWebAssemblyArgumentMovePass(PassRegistry &); void initializeWebAssemblyCFGSortPass(PassRegistry &); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRefTypeMem2Local.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRefTypeMem2Local.cpp new file mode 100644 index 0000000000000..d3c60ee289dfd --- /dev/null +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRefTypeMem2Local.cpp @@ -0,0 +1,91 @@ +//=== WebAssemblyRefTypeMem2Local.cpp - WebAssembly RefType Mem2Local -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Assign reference type allocas to local addrspace (addrspace(1)) so that +/// their loads and stores can be lowered to local.gets/local.sets. 
+/// +//===----------------------------------------------------------------------===// + +#include "Utils/WasmAddressSpaces.h" +#include "Utils/WebAssemblyTypeUtilities.h" +#include "WebAssembly.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-ref-type-mem2local" + +namespace { +class WebAssemblyRefTypeMem2Local final + : public FunctionPass, + public InstVisitor { + StringRef getPassName() const override { + return "WebAssembly Reference Types Memory to Local"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + } + + bool runOnFunction(Function &F) override; + bool Changed = false; + +public: + static char ID; + WebAssemblyRefTypeMem2Local() : FunctionPass(ID) {} + + void visitAllocaInst(AllocaInst &AI); +}; +} // End anonymous namespace + +char WebAssemblyRefTypeMem2Local::ID = 0; +INITIALIZE_PASS(WebAssemblyRefTypeMem2Local, DEBUG_TYPE, + "Assign reference type allocas to local address space", true, + false) + +FunctionPass *llvm::createWebAssemblyRefTypeMem2Local() { + return new WebAssemblyRefTypeMem2Local(); +} + +void WebAssemblyRefTypeMem2Local::visitAllocaInst(AllocaInst &AI) { + if (WebAssembly::isWebAssemblyReferenceType(AI.getAllocatedType())) { + Changed = true; + IRBuilder<> IRB(AI.getContext()); + IRB.SetInsertPoint(&AI); + auto *NewAI = IRB.CreateAlloca(AI.getAllocatedType(), + WebAssembly::WASM_ADDRESS_SPACE_VAR, nullptr, + AI.getName() + ".var"); + + // The below is basically equivalent to AI.replaceAllUsesWith(NewAI), but we + // cannot use it because it requires the old and new types be the same, + // which is not true here because the address spaces are different. 
+ if (AI.hasValueHandle()) + ValueHandleBase::ValueIsRAUWd(&AI, NewAI); + if (AI.isUsedByMetadata()) + ValueAsMetadata::handleRAUW(&AI, NewAI); + while (!AI.materialized_use_empty()) { + Use &U = *AI.materialized_use_begin(); + U.set(NewAI); + } + + AI.eraseFromParent(); + } +} + +bool WebAssemblyRefTypeMem2Local::runOnFunction(Function &F) { + LLVM_DEBUG(dbgs() << "********** WebAssembly RefType Mem2Local **********\n" + "********** Function: " + << F.getName() << '\n'); + + visit(F); + return Changed; +} diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index b2f7ee970a732..d088c7d925ddf 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -77,6 +77,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() { initializeLowerGlobalDtorsLegacyPassPass(PR); initializeFixFunctionBitcastsPass(PR); initializeOptimizeReturnedPass(PR); + initializeWebAssemblyRefTypeMem2LocalPass(PR); initializeWebAssemblyArgumentMovePass(PR); initializeWebAssemblySetP2AlignOperandsPass(PR); initializeWebAssemblyReplacePhysRegsPass(PR); diff --git a/llvm/test/CodeGen/WebAssembly/ref-type-mem2local.ll b/llvm/test/CodeGen/WebAssembly/ref-type-mem2local.ll new file mode 100644 index 0000000000000..a38243ca218cc --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/ref-type-mem2local.ll @@ -0,0 +1,57 @@ +; RUN: opt < %s -wasm-ref-type-mem2local -S | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +%externref = type ptr addrspace(10) +%funcref = type ptr addrspace(20) + +declare %externref @get_externref() +declare %funcref @get_funcref() +declare i32 @get_i32() +declare void @take_externref(%externref) +declare void @take_funcref(%funcref) +declare void @take_i32(i32) + +; Reference type allocas should be moved to addrspace(1) +; CHECK-LABEL: @test_ref_type_mem2local +define void @test_ref_type_mem2local() 
{ +entry: + %alloc.externref = alloca %externref, align 1 + %eref = call %externref @get_externref() + store %externref %eref, ptr %alloc.externref, align 1 + %eref.loaded = load %externref, ptr %alloc.externref, align 1 + call void @take_externref(%externref %eref.loaded) + ; CHECK: %alloc.externref.var = alloca ptr addrspace(10), align 1, addrspace(1) + ; CHECK-NEXT: %eref = call ptr addrspace(10) @get_externref() + ; CHECK-NEXT: store ptr addrspace(10) %eref, ptr addrspace(1) %alloc.externref.var, align 1 + ; CHECK-NEXT: %eref.loaded = load ptr addrspace(10), ptr addrspace(1) %alloc.externref.var, align 1 + ; CHECK-NEXT: call void @take_externref(ptr addrspace(10) %eref.loaded) + + %alloc.funcref = alloca %funcref, align 1 + %fref = call %funcref @get_funcref() + store %funcref %fref, ptr %alloc.funcref, align 1 + %fref.loaded = load %funcref, ptr %alloc.funcref, align 1 + call void @take_funcref(%funcref %fref.loaded) + ; CHECK-NEXT: %alloc.funcref.var = alloca ptr addrspace(20), align 1, addrspace(1) + ; CHECK-NEXT: %fref = call ptr addrspace(20) @get_funcref() + ; CHECK-NEXT: store ptr addrspace(20) %fref, ptr addrspace(1) %alloc.funcref.var, align 1 + ; CHECK-NEXT: %fref.loaded = load ptr addrspace(20), ptr addrspace(1) %alloc.funcref.var, align 1 + ; CHECK-NEXT: call void @take_funcref(ptr addrspace(20) %fref.loaded) + + ret void +} + +; POD type allocas should stay the same +; CHECK-LABEL: @test_pod_type +define void @test_pod_type() { +entry: + %alloc.i32 = alloca i32 + %i32 = call i32 @get_i32() + store i32 %i32, ptr %alloc.i32 + %i32.loaded = load i32, ptr %alloc.i32 + call void @take_i32(i32 %i32.loaded) + ; CHECK: %alloc.i32 = alloca i32, align 4{{$}} + ; CHECK-NOT: addrspace(1) + + ret void +} From 5964f4bcf012deb8b8dadcc403644c754a6b15e0 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 27 Feb 2024 22:01:27 +0000 Subject: [PATCH 502/546] [gn build] Port d4cdb516eee4 --- llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn | 1 + 1 
file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn index 949b3b2147405..a8d6290f1b990 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn @@ -61,6 +61,7 @@ static_library("LLVMWebAssemblyCodeGen") { "WebAssemblyOptimizeLiveIntervals.cpp", "WebAssemblyOptimizeReturned.cpp", "WebAssemblyPeephole.cpp", + "WebAssemblyRefTypeMem2Local.cpp", "WebAssemblyRegColoring.cpp", "WebAssemblyRegNumbering.cpp", "WebAssemblyRegStackify.cpp", From 3761ad01e125e3b38ed2d6f40b3cbcbac13611a5 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 27 Feb 2024 17:31:02 -0500 Subject: [PATCH 503/546] [libc++] Remove _LIBCPP_ATOMIC_ONLY_USE_BUILTINS (#82000) As discussed in #76647, _LIBCPP_ATOMIC_ONLY_USE_BUILTINS is a questionable configuration option. It makes our implementation of std::atomic even more complicated than it already is for a limited benefit. Indeed, the original goal of that setting was to decouple libc++ from libraries like compiler-rt and libatomic in Freestanding mode. We didn't have a clear understanding of goals and non-goals of Freestanding back then, but nowadays we do have a better understanding that removing all dependencies of libc++ in Freestanding is a non-goal. We should still be able to depend on builtins like those defined in compiler-rt for implementing our atomic operations in Freestanding. Freestanding means that there is no underlying operating system, not that there is no toolchain available. This patch removes the configuration option. This should have a very limited fallout since that configuration was only enabled with -ffreestanding, and libc++ basically doesn't work out of the box on Freestanding platforms today. 
The benefits are a slightly simpler implementation of std::atomic, getting rid of one of the ABI-incompatible representations of std::atomic, and clearing the way for proper Freestanding support to eventually land in the library. Fixes #81286 --- libcxx/docs/ReleaseNotes/19.rst | 5 + libcxx/include/__atomic/aliases.h | 2 +- libcxx/include/__atomic/cxx_atomic_impl.h | 291 +--------------------- libcxx/include/__config | 3 - 4 files changed, 7 insertions(+), 294 deletions(-) diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index 6c8f8d17af9b1..78c6bb87a5a40 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -73,6 +73,11 @@ Deprecations and Removals - The ``_LIBCPP_INLINE_VISIBILITY`` and ``_VSTD`` macros have been removed in LLVM 19. +- The ``_LIBCPP_ATOMIC_ONLY_USE_BUILTINS`` configuration option has been removed in LLVM 19. This should not affect + many users, except perhaps users using the library with ``-ffreestanding`` with a toolchain where compiler-rt or + libatomic is not available. If you are one such user, please reach out to the libc++ developers so we can collaborate + on a path for supporting atomics properly on freestanding platforms. 
+ Upcoming Deprecations and Removals ---------------------------------- diff --git a/libcxx/include/__atomic/aliases.h b/libcxx/include/__atomic/aliases.h index 0fa289de54b0f..db34f5ec02d74 100644 --- a/libcxx/include/__atomic/aliases.h +++ b/libcxx/include/__atomic/aliases.h @@ -92,7 +92,7 @@ using __largest_lock_free_type = short; # elif ATOMIC_CHAR_LOCK_FREE == 2 using __largest_lock_free_type = char; # else -# define _LIBCPP_NO_LOCK_FREE_TYPES // There are no lockfree types (this can happen in freestanding) +# define _LIBCPP_NO_LOCK_FREE_TYPES // There are no lockfree types (this can happen on unusual platforms) # endif # ifndef _LIBCPP_NO_LOCK_FREE_TYPES diff --git a/libcxx/include/__atomic/cxx_atomic_impl.h b/libcxx/include/__atomic/cxx_atomic_impl.h index 1a0b808a0cb1c..b900cc135f78f 100644 --- a/libcxx/include/__atomic/cxx_atomic_impl.h +++ b/libcxx/include/__atomic/cxx_atomic_impl.h @@ -9,16 +9,13 @@ #ifndef _LIBCPP___ATOMIC_CXX_ATOMIC_IMPL_H #define _LIBCPP___ATOMIC_CXX_ATOMIC_IMPL_H -#include <__atomic/is_always_lock_free.h> #include <__atomic/memory_order.h> #include <__config> #include <__memory/addressof.h> -#include <__type_traits/conditional.h> #include <__type_traits/is_assignable.h> #include <__type_traits/is_trivially_copyable.h> #include <__type_traits/remove_const.h> #include -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -26,7 +23,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if defined(_LIBCPP_HAS_GCC_ATOMIC_IMP) || defined(_LIBCPP_ATOMIC_ONLY_USE_BUILTINS) +#if defined(_LIBCPP_HAS_GCC_ATOMIC_IMP) // [atomics.types.generic]p1 guarantees _Tp is trivially copyable. 
Because // the default operator= in an object is not volatile, a byte-by-byte copy @@ -44,10 +41,6 @@ _LIBCPP_HIDE_FROM_ABI void __cxx_atomic_assign_volatile(_Tp volatile& __a_value, *__to++ = *__from++; } -#endif - -#if defined(_LIBCPP_HAS_GCC_ATOMIC_IMP) - template struct __cxx_atomic_base_impl { _LIBCPP_HIDE_FROM_ABI @@ -529,289 +522,7 @@ __cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, memory_o #endif // _LIBCPP_HAS_GCC_ATOMIC_IMP, _LIBCPP_HAS_C_ATOMIC_IMP -#ifdef _LIBCPP_ATOMIC_ONLY_USE_BUILTINS - -template -struct __cxx_atomic_lock_impl { - _LIBCPP_HIDE_FROM_ABI __cxx_atomic_lock_impl() _NOEXCEPT : __a_value(), __a_lock(0) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __cxx_atomic_lock_impl(_Tp value) _NOEXCEPT - : __a_value(value), - __a_lock(0) {} - - _Tp __a_value; - mutable __cxx_atomic_base_impl<_LIBCPP_ATOMIC_FLAG_TYPE> __a_lock; - - _LIBCPP_HIDE_FROM_ABI void __lock() const volatile { - while (1 == __cxx_atomic_exchange(&__a_lock, _LIBCPP_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) - /*spin*/; - } - _LIBCPP_HIDE_FROM_ABI void __lock() const { - while (1 == __cxx_atomic_exchange(&__a_lock, _LIBCPP_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) - /*spin*/; - } - _LIBCPP_HIDE_FROM_ABI void __unlock() const volatile { - __cxx_atomic_store(&__a_lock, _LIBCPP_ATOMIC_FLAG_TYPE(false), memory_order_release); - } - _LIBCPP_HIDE_FROM_ABI void __unlock() const { - __cxx_atomic_store(&__a_lock, _LIBCPP_ATOMIC_FLAG_TYPE(false), memory_order_release); - } - _LIBCPP_HIDE_FROM_ABI _Tp __read() const volatile { - __lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a_value); - __unlock(); - return __old; - } - _LIBCPP_HIDE_FROM_ABI _Tp __read() const { - __lock(); - _Tp __old = __a_value; - __unlock(); - return __old; - } - _LIBCPP_HIDE_FROM_ABI void __read_inplace(_Tp* __dst) const volatile { - __lock(); - __cxx_atomic_assign_volatile(*__dst, __a_value); - __unlock(); - } - _LIBCPP_HIDE_FROM_ABI void __read_inplace(_Tp* 
__dst) const { - __lock(); - *__dst = __a_value; - __unlock(); - } -}; - -template -_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_init(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __val) { - __cxx_atomic_assign_volatile(__a->__a_value, __val); -} -template -_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_init(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __val) { - __a->__a_value = __val; -} - -template -_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_store(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __val, memory_order) { - __a->__lock(); - __cxx_atomic_assign_volatile(__a->__a_value, __val); - __a->__unlock(); -} -template -_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_store(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __val, memory_order) { - __a->__lock(); - __a->__a_value = __val; - __a->__unlock(); -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_load(const volatile __cxx_atomic_lock_impl<_Tp>* __a, memory_order) { - return __a->__read(); -} -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_load(const __cxx_atomic_lock_impl<_Tp>* __a, memory_order) { - return __a->__read(); -} - -template -_LIBCPP_HIDE_FROM_ABI void -__cxx_atomic_load(const volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __dst, memory_order) { - __a->__read_inplace(__dst); -} -template -_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_load(const __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __dst, memory_order) { - __a->__read_inplace(__dst); -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_exchange(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __value, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, __value); - __a->__unlock(); - return __old; -} -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_exchange(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __value, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value = __value; - __a->__unlock(); - return __old; -} - -template -_LIBCPP_HIDE_FROM_ABI bool 
__cxx_atomic_compare_exchange_strong( - volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) { - _Tp __temp; - __a->__lock(); - __cxx_atomic_assign_volatile(__temp, __a->__a_value); - bool __ret = (std::memcmp(&__temp, __expected, sizeof(_Tp)) == 0); - if (__ret) - __cxx_atomic_assign_volatile(__a->__a_value, __value); - else - __cxx_atomic_assign_volatile(*__expected, __a->__a_value); - __a->__unlock(); - return __ret; -} -template -_LIBCPP_HIDE_FROM_ABI bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) { - __a->__lock(); - bool __ret = (std::memcmp(&__a->__a_value, __expected, sizeof(_Tp)) == 0); - if (__ret) - std::memcpy(&__a->__a_value, &__value, sizeof(_Tp)); - else - std::memcpy(__expected, &__a->__a_value, sizeof(_Tp)); - __a->__unlock(); - return __ret; -} - -template -_LIBCPP_HIDE_FROM_ABI bool __cxx_atomic_compare_exchange_weak( - volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) { - _Tp __temp; - __a->__lock(); - __cxx_atomic_assign_volatile(__temp, __a->__a_value); - bool __ret = (std::memcmp(&__temp, __expected, sizeof(_Tp)) == 0); - if (__ret) - __cxx_atomic_assign_volatile(__a->__a_value, __value); - else - __cxx_atomic_assign_volatile(*__expected, __a->__a_value); - __a->__unlock(); - return __ret; -} -template -_LIBCPP_HIDE_FROM_ABI bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) { - __a->__lock(); - bool __ret = (std::memcmp(&__a->__a_value, __expected, sizeof(_Tp)) == 0); - if (__ret) - std::memcpy(&__a->__a_value, &__value, sizeof(_Tp)); - else - std::memcpy(__expected, &__a->__a_value, sizeof(_Tp)); - __a->__unlock(); - return __ret; -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Td __delta, memory_order) { - 
__a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old + __delta)); - __a->__unlock(); - return __old; -} -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp>* __a, _Td __delta, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value += __delta; - __a->__unlock(); - return __old; -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp* -__cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp*>* __a, ptrdiff_t __delta, memory_order) { - __a->__lock(); - _Tp* __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, __old + __delta); - __a->__unlock(); - return __old; -} -template -_LIBCPP_HIDE_FROM_ABI _Tp* __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp*>* __a, ptrdiff_t __delta, memory_order) { - __a->__lock(); - _Tp* __old = __a->__a_value; - __a->__a_value += __delta; - __a->__unlock(); - return __old; -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_sub(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Td __delta, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old - __delta)); - __a->__unlock(); - return __old; -} -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_sub(__cxx_atomic_lock_impl<_Tp>* __a, _Td __delta, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value -= __delta; - __a->__unlock(); - return __old; -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp -__cxx_atomic_fetch_and(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old & __pattern)); - __a->__unlock(); - return __old; -} -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_and(__cxx_atomic_lock_impl<_Tp>* __a, _Tp 
__pattern, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value &= __pattern; - __a->__unlock(); - return __old; -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp -__cxx_atomic_fetch_or(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old | __pattern)); - __a->__unlock(); - return __old; -} -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_or(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value |= __pattern; - __a->__unlock(); - return __old; -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp -__cxx_atomic_fetch_xor(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old ^ __pattern)); - __a->__unlock(); - return __old; -} -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_xor(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value ^= __pattern; - __a->__unlock(); - return __old; -} - -template ::__value, - __cxx_atomic_base_impl<_Tp>, - __cxx_atomic_lock_impl<_Tp> >::type> -#else template > -#endif //_LIBCPP_ATOMIC_ONLY_USE_BUILTINS struct __cxx_atomic_impl : public _Base { static_assert(is_trivially_copyable<_Tp>::value, "std::atomic requires that 'T' be a trivially copyable type"); diff --git a/libcxx/include/__config b/libcxx/include/__config index 0797880cb2f5d..942bbe7cbb93c 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1200,9 +1200,6 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c # ifndef _LIBCPP_ATOMIC_FLAG_TYPE # define _LIBCPP_ATOMIC_FLAG_TYPE bool # endif -# ifdef _LIBCPP_FREESTANDING -# define 
_LIBCPP_ATOMIC_ONLY_USE_BUILTINS -# endif # endif # if defined(__FreeBSD__) && defined(__clang__) && __has_attribute(__no_thread_safety_analysis__) From 78647116d85b30c9b8b31a4690758c33f50c0550 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Tue, 27 Feb 2024 14:34:07 -0800 Subject: [PATCH 504/546] [nfc][compiler-rt]Remove round-up in __llvm_profile_get_num_data (#83194) - Update instrprof-basic.c as a regression test. --- compiler-rt/lib/profile/InstrProfilingBuffer.c | 11 ++--------- compiler-rt/test/profile/instrprof-basic.c | 12 ++++++++++++ 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c index 7c5c26f4d113b..1c451d7ec7563 100644 --- a/compiler-rt/lib/profile/InstrProfilingBuffer.c +++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c @@ -61,19 +61,12 @@ uint64_t __llvm_profile_get_size_for_buffer(void) { NamesBegin, NamesEnd, VTableBegin, VTableEnd, VNamesBegin, VNamesEnd); } +// NOTE: Caller should guarantee that `Begin` and `End` specifies a half-open +// interval [Begin, End). Namely, `End` is one-byte past the end of the array. COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_num_data(const __llvm_profile_data *Begin, const __llvm_profile_data *End) { intptr_t BeginI = (intptr_t)Begin, EndI = (intptr_t)End; - // `sizeof(__llvm_profile_data) - 1` is required in the numerator when - // [Begin, End] represents an inclusive range. - // For ELF, [Begin, End) represents the address of linker-inserted - // symbols `__start__` and `__stop_`. - // Thereby, `End` is one byte past the inclusive range, and - // `sizeof(__llvm_profile_data) - 1` is not necessary in the numerator to get - // the correct number of profile data. - // FIXME: Consider removing `sizeof(__llvm_profile_data) - 1` if this is true - // across platforms. 
return ((EndI + sizeof(__llvm_profile_data) - 1) - BeginI) / sizeof(__llvm_profile_data); } diff --git a/compiler-rt/test/profile/instrprof-basic.c b/compiler-rt/test/profile/instrprof-basic.c index de66e1b274680..702f521ba4ed8 100644 --- a/compiler-rt/test/profile/instrprof-basic.c +++ b/compiler-rt/test/profile/instrprof-basic.c @@ -1,6 +1,7 @@ // RUN: %clang_profgen -o %t -O3 %s // RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t // RUN: llvm-profdata merge -o %t.profdata %t.profraw +// RUN: llvm-profdata show --all-functions %t.profdata | FileCheck %s --check-prefix=PROFCNT // RUN: %clang_profuse=%t.profdata -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=ORIG // // RUN: rm -fr %t.dir1 @@ -8,6 +9,7 @@ // RUN: env LLVM_PROFILE_FILE=%t.dir1/profraw_e_%1m %run %t // RUN: env LLVM_PROFILE_FILE=%t.dir1/profraw_e_%1m %run %t // RUN: llvm-profdata merge -o %t.em.profdata %t.dir1 +// RUN: llvm-profdata show --all-functions %t.em.profdata | FileCheck %s --check-prefix=PROFCNT // RUN: %clang_profuse=%t.em.profdata -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=MERGE // // RUN: rm -fr %t.dir2 @@ -16,6 +18,7 @@ // RUN: %run %t.merge // RUN: %run %t.merge // RUN: llvm-profdata merge -o %t.m.profdata %t.dir2/ +// RUN: llvm-profdata show --all-functions %t.m.profdata | FileCheck %s --check-prefix=PROFCNT // RUN: %clang_profuse=%t.m.profdata -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=MERGE // // Test that merging is enabled by default with -fprofile-generate= @@ -27,6 +30,7 @@ // RUN: %run %t.merge3 // RUN: %run %t.merge3 // RUN: llvm-profdata merge -o %t.m3.profdata %t.dir3/ +// RUN: llvm-profdata show --all-functions %t.m3.profdata | FileCheck %s --check-prefix=PROFCNT // RUN: %clang_profuse=%t.m3.profdata -O0 -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=PGOMERGE // // Test that merging is enabled by default with -fprofile-generate @@ -40,6 +44,7 @@ // RUN: %run 
%t.dir4/merge4 // RUN: rm -f %t.dir4/merge4* // RUN: llvm-profdata merge -o %t.m4.profdata ./ +// RUN: llvm-profdata show --all-functions %t.m4.profdata | FileCheck %s --check-prefix=PROFCNT // RUN: %clang_profuse=%t.m4.profdata -O0 -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=PGOMERGE /// Test that the merge pool size can be larger than 10. @@ -49,6 +54,13 @@ // RUN: not ls %t.dir5/e_%20m.profraw // RUN: ls %t.dir5/e_*.profraw | count 1 +// Test that all three functions have counters in the profile. +// PROFCNT-DAG: begin +// PROFCNT-DAG: end +// PROFCNT-DAG: main +// PROFCNT: Functions shown: 3 +// PROFCNT: Total functions: 3 + int begin(int i) { // COMMON: br i1 %{{.*}}, label %{{.*}}, label %{{.*}}, !prof ![[PD1:[0-9]+]] if (i) From 001e18c816736602e3ad1c5dc6259143455610ea Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Tue, 27 Feb 2024 22:45:46 +0000 Subject: [PATCH 505/546] [NFC] clang-format SROA.cpp --- llvm/lib/Transforms/Scalar/SROA.cpp | 98 ++++++++++++++--------------- 1 file changed, 48 insertions(+), 50 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 6c8785d52c4ea..fad70e8bf2861 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -293,7 +293,7 @@ calculateFragment(DILocalVariable *Variable, if (!CurrentFragment) { if (auto Size = Variable->getSizeInBits()) { // Treat the current fragment as covering the whole variable. - CurrentFragment = DIExpression::FragmentInfo(*Size, 0); + CurrentFragment = DIExpression::FragmentInfo(*Size, 0); if (Target == CurrentFragment) return UseNoFrag; } @@ -1213,8 +1213,9 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { if (!IsOffsetKnown) return PI.setAborted(&II); - insertUse(II, Offset, Length ? Length->getLimitedValue() - : AllocSize - Offset.getLimitedValue(), + insertUse(II, Offset, + Length ? 
Length->getLimitedValue() + : AllocSize - Offset.getLimitedValue(), (bool)Length); } @@ -1669,7 +1670,7 @@ static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) { } // Inject loads into all of the pred blocks. - DenseMap InjectedLoads; + DenseMap InjectedLoads; for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { BasicBlock *Pred = PN.getIncomingBlock(Idx); Value *InVal = PN.getIncomingValue(Idx); @@ -1678,7 +1679,7 @@ static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) { // basic block, as long as the value is the same. So if we already injected // a load in the predecessor, then we should reuse the same load for all // duplicated entries. - if (Value* V = InjectedLoads.lookup(Pred)) { + if (Value *V = InjectedLoads.lookup(Pred)) { NewPN->addIncoming(V, Pred); continue; } @@ -2077,8 +2078,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, if (BeginIndex * ElementSize != BeginOffset || BeginIndex >= cast(Ty)->getNumElements()) return false; - uint64_t EndOffset = - std::min(S.endOffset(), P.endOffset()) - P.beginOffset(); + uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset(); uint64_t EndIndex = EndOffset / ElementSize; if (EndIndex * ElementSize != EndOffset || EndIndex > cast(Ty)->getNumElements()) @@ -2754,8 +2754,8 @@ class AllocaSliceRewriter : public InstVisitor { Instruction *OldUserI = cast(OldUse->getUser()); IRB.SetInsertPoint(OldUserI); IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc()); - IRB.getInserter().SetNamePrefix( - Twine(NewAI.getName()) + "." + Twine(BeginOffset) + "."); + IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." 
+ + Twine(BeginOffset) + "."); CanSROA &= visit(cast(OldUse->getUser())); if (VecTy || IntTy) @@ -2808,7 +2808,7 @@ class AllocaSliceRewriter : public InstVisitor { #else Twine() #endif - ); + ); } /// Compute suitable alignment to access this slice of the *new* @@ -3189,8 +3189,7 @@ class AllocaSliceRewriter : public InstVisitor { const bool CanContinue = [&]() { if (VecTy || IntTy) return true; - if (BeginOffset > NewAllocaBeginOffset || - EndOffset < NewAllocaEndOffset) + if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) return false; // Length must be in range for FixedVectorType. auto *C = cast(II.getLength()); @@ -3984,9 +3983,9 @@ class AggLoadStoreRewriter : public InstVisitor { if (!Sel) return false; - LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):" - << "\n original: " << *Sel - << "\n " << GEPI); + LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n"; + dbgs() << " original: " << *Sel << "\n"; + dbgs() << " " << GEPI << "\n";); auto GetNewOps = [&](Value *SelOp) { SmallVector NewOps; @@ -4023,9 +4022,9 @@ class AggLoadStoreRewriter : public InstVisitor { Visited.insert(NSelI); enqueueUsers(*NSelI); - LLVM_DEBUG(dbgs() << "\n to: " << *NTrue - << "\n " << *NFalse - << "\n " << *NSel << '\n'); + LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n"; + dbgs() << " " << *NFalse << "\n"; + dbgs() << " " << *NSel << "\n";); return true; } @@ -4037,18 +4036,17 @@ class AggLoadStoreRewriter : public InstVisitor { PHINode *PHI = cast(GEPI.getPointerOperand()); if (GEPI.getParent() != PHI->getParent() || - llvm::any_of(PHI->incoming_values(), [](Value *In) - { Instruction *I = dyn_cast(In); - return !I || isa(I) || isa(I) || - succ_empty(I->getParent()) || - !I->getParent()->isLegalToHoistInto(); - })) + llvm::any_of(PHI->incoming_values(), [](Value *In) { + Instruction *I = dyn_cast(In); + return !I || isa(I) || isa(I) || + succ_empty(I->getParent()) || + !I->getParent()->isLegalToHoistInto(); + })) return false; - 
LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):" - << "\n original: " << *PHI - << "\n " << GEPI - << "\n to: "); + LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n"; + dbgs() << " original: " << *PHI << "\n"; + dbgs() << " " << GEPI << "\n";); SmallVector Index(GEPI.indices()); bool IsInBounds = GEPI.isInBounds(); @@ -4078,8 +4076,10 @@ class AggLoadStoreRewriter : public InstVisitor { Visited.insert(NewPN); enqueueUsers(*NewPN); - LLVM_DEBUG(for (Value *In : NewPN->incoming_values()) - dbgs() << "\n " << *In; + LLVM_DEBUG(dbgs() << " to: "; + for (Value *In + : NewPN->incoming_values()) dbgs() + << "\n " << *In; dbgs() << "\n " << *NewPN << '\n'); return true; @@ -4089,8 +4089,7 @@ class AggLoadStoreRewriter : public InstVisitor { if (foldGEPSelect(GEPI)) return true; - if (isa(GEPI.getPointerOperand()) && - foldGEPPhi(GEPI)) + if (isa(GEPI.getPointerOperand()) && foldGEPPhi(GEPI)) return true; enqueueUsers(GEPI); @@ -4162,17 +4161,17 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, return nullptr; if (isa(Ty) || isa(Ty)) { - Type *ElementTy; - uint64_t TyNumElements; - if (auto *AT = dyn_cast(Ty)) { - ElementTy = AT->getElementType(); - TyNumElements = AT->getNumElements(); - } else { - // FIXME: This isn't right for vectors with non-byte-sized or - // non-power-of-two sized elements. - auto *VT = cast(Ty); - ElementTy = VT->getElementType(); - TyNumElements = VT->getNumElements(); + Type *ElementTy; + uint64_t TyNumElements; + if (auto *AT = dyn_cast(Ty)) { + ElementTy = AT->getElementType(); + TyNumElements = AT->getNumElements(); + } else { + // FIXME: This isn't right for vectors with non-byte-sized or + // non-power-of-two sized elements. 
+ auto *VT = cast(Ty); + ElementTy = VT->getElementType(); + TyNumElements = VT->getNumElements(); } uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue(); uint64_t NumSkippedElements = Offset / ElementSize; @@ -4853,9 +4852,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, ++NumNewAllocas; } - LLVM_DEBUG(dbgs() << "Rewriting alloca partition " - << "[" << P.beginOffset() << "," << P.endOffset() - << ") to: " << *NewAI << "\n"); + LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset() + << "," << P.endOffset() << ") to: " << *NewAI << "\n"); // Track the high watermark on the worklist as it is only relevant for // promoted allocas. We will reset it to this point if the alloca is not in @@ -5040,8 +5038,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { IsSorted = false; } } - } - else { + } else { // We only allow whole-alloca splittable loads and stores // for a large alloca to avoid creating too large BitVector. for (Slice &S : AS) { @@ -5069,7 +5066,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { uint64_t Offset; uint64_t Size; Fragment(AllocaInst *AI, uint64_t O, uint64_t S) - : Alloca(AI), Offset(O), Size(S) {} + : Alloca(AI), Offset(O), Size(S) {} }; SmallVector Fragments; @@ -5083,7 +5080,8 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedValue(); // Don't include any padding. uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte); - Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size)); + Fragments.push_back( + Fragment(NewAI, P.beginOffset() * SizeOfByte, Size)); } } ++NumPartitions; From a3748d60ff18f612cd26a0b4ca7f05f2fbef264d Mon Sep 17 00:00:00 2001 From: Iman Hosseini Date: Tue, 27 Feb 2024 22:55:45 +0000 Subject: [PATCH 506/546] [MLIR] Update GPU.md: add gpu kernel outlining to doc example. (#83141) gpu-kernel-outlining is needed for this example to work. 
--- mlir/docs/Dialects/GPU.md | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/docs/Dialects/GPU.md b/mlir/docs/Dialects/GPU.md index 85255fdc5e643..8a3acc33600a4 100644 --- a/mlir/docs/Dialects/GPU.md +++ b/mlir/docs/Dialects/GPU.md @@ -50,6 +50,7 @@ An example of how the compilation workflow look is: ``` mlir-opt example.mlir \ --pass-pipeline="builtin.module( \ + gpu-kernel-outlining, \ # Outline gpu.launch body to a kernel. nvvm-attach-target{chip=sm_90 O=3}, \ # Attach an NVVM target to a gpu.module op. gpu.module(convert-gpu-to-nvvm), \ # Convert GPU to NVVM. gpu-to-llvm, \ # Convert GPU to LLVM. From 9d0acb872a5063f570366cd0e94b069d286cc71f Mon Sep 17 00:00:00 2001 From: Diego Caballero Date: Tue, 27 Feb 2024 15:27:31 -0800 Subject: [PATCH 507/546] [mlir][Vector] Add support for trunci to narrow type emulation (#82565) This PR add support for `arith.trunci` to vector narrow type emulation for iX -> i4 truncations, for X >= 8. For now, the pattern only works for 1D vectors and is based on `vector.shuffle` ops. We would need `vector.deinterleave` to add n-D vector support. --- .../Transforms/VectorEmulateNarrowType.cpp | 126 +++++++++++++++++- .../Vector/vector-rewrite-narrow-types.mlir | 42 ++++++ 2 files changed, 163 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp index fc11ae63e718a..dc6f126aae4c8 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp @@ -729,8 +729,8 @@ static LogicalResult commonConversionPrecondition(PatternRewriter &rewriter, // TODO: consider relaxing this restriction in the future if we find ways // to really work with subbyte elements across the MLIR/LLVM boundary. 
- unsigned resultBitwidth = preconditionType.getElementTypeBitWidth(); - if (resultBitwidth % 8 != 0) + unsigned bitwidth = preconditionType.getElementTypeBitWidth(); + if (bitwidth % 8 != 0) return rewriter.notifyMatchFailure(op, "bitwidth is not k * 8"); return success(); @@ -768,6 +768,10 @@ static LogicalResult alignedConversionPrecondition(PatternRewriter &rewriter, (dstElemBitwidth % srcElemBitwidth) != 0) return rewriter.notifyMatchFailure(op, "Not a supported aligned case"); + if ((srcType.getShape().back() % 2) != 0) + return rewriter.notifyMatchFailure( + op, "Not an even number of i4 elements in trailing dim"); + return success(); } @@ -876,6 +880,58 @@ static Value rewriteI4ToI8SignedExt(PatternRewriter &rewriter, Location loc, return rewriter.create(loc, low, high); } +/// Rewrite the i8 -> i4 truncation into a sequence of shuffles and bitwise ops +/// that take advantage of high-level information to avoid leaving LLVM to +/// scramble with peephole optimizations. +static Value rewriteI8ToI4Trunc(PatternRewriter &rewriter, Location loc, + Value srcValue) { + VectorType srcVecType = cast(srcValue.getType()); + assert(srcVecType.getElementType().isSignlessInteger(8) && + "Expected i8 type"); + + // 1. De-interleave low and high i8 elements. + int64_t vecDimSize = srcVecType.getShape().back(); + SmallVector deinterleaveLowMaskValues; + SmallVector deinterleaveHighMaskValues; + assert((vecDimSize % 2) == 0 && "Odd number of i4 elements"); + deinterleaveLowMaskValues.reserve(vecDimSize / 2); + deinterleaveHighMaskValues.reserve(vecDimSize / 2); + for (int i = 0, end = vecDimSize; i < end; i += 2) { + deinterleaveLowMaskValues.push_back(i); + deinterleaveHighMaskValues.push_back(i + 1); + } + + auto lowShuffleOp = rewriter.create( + loc, srcValue, srcValue, + rewriter.getI64ArrayAttr(deinterleaveLowMaskValues)); + auto highShuffleOp = rewriter.create( + loc, srcValue, srcValue, + rewriter.getI64ArrayAttr(deinterleaveHighMaskValues)); + + // 2. 
Zero out the upper side of each low i8 element. + constexpr int8_t i8LowBitMask = 0x0F; + Value zeroOutMask = rewriter.create( + loc, + DenseElementsAttr::get(lowShuffleOp.getResultVectorType(), i8LowBitMask)); + Value zeroOutLow = + rewriter.create(loc, lowShuffleOp, zeroOutMask); + + // 3. Move high i4 values to upper side of the byte. + constexpr int8_t bitsToShift = 4; + VectorType deinterI8VecType = highShuffleOp.getResultVectorType(); + auto shiftValues = rewriter.create( + loc, DenseElementsAttr::get(deinterI8VecType, bitsToShift)); + Value shlHigh = + rewriter.create(loc, highShuffleOp, shiftValues); + + // 4. Merge high and low i4 values. + auto mergedHiLowOp = rewriter.create(loc, zeroOutLow, shlHigh); + + // 5. Generate a bitcast vector -> vector<2Xxi4>. + auto i4VecType = srcVecType.cloneWith(std::nullopt, rewriter.getI4Type()); + return rewriter.create(loc, i4VecType, mergedHiLowOp); +} + namespace { /// Rewrite bitcast(trunci) to a sequence of shuffles and bitwise ops that take /// advantage of high-level information to avoid leaving LLVM to scramble with @@ -1019,7 +1075,7 @@ struct RewriteAlignedSubByteIntSignedExt : OpRewritePattern { LogicalResult matchAndRewrite(ConversionOpType conversionOp, PatternRewriter &rewriter) const override { - // Set up the BitCastRewriter and verify the preconditions. + // Verify the preconditions. Value srcValue = conversionOp.getIn(); auto srcVecType = dyn_cast(srcValue.getType()); auto dstVecType = dyn_cast(conversionOp.getType()); @@ -1043,6 +1099,65 @@ struct RewriteAlignedSubByteIntSignedExt : OpRewritePattern { } }; +/// Rewrite the i8 -> i4 part of any truncation into a sequence of shuffles and +/// bitwise ops that take advantage of high-level information to avoid leaving +/// LLVM to scramble with peephole optimizations. 
+/// +/// For example: +/// arith.trunci %in : vector<8xi32> to vector<8xi4> +/// is rewriten as +/// +/// %cst = arith.constant dense<15> : vector<4xi8> +/// %cst_0 = arith.constant dense<4> : vector<4xi8> +/// %0 = arith.trunci %in : vector<8xi32> to vector<8xi8> +/// %1 = vector.shuffle %0, %0 [0, 2, 4, 6] : vector<8xi8>, vector<8xi8> +/// %2 = vector.shuffle %0, %0 [1, 3, 5, 7] : vector<8xi8>, vector<8xi8> +/// %3 = arith.andi %1, %cst : vector<4xi8> +/// %4 = arith.shli %2, %cst_0 : vector<4xi8> +/// %5 = arith.ori %3, %4 : vector<4xi8> +/// %6 = vector.bitcast %5 : vector<4xi8> to vector<8xi4> +/// +struct RewriteAlignedSubByteIntTrunc : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::TruncIOp truncOp, + PatternRewriter &rewriter) const override { + // Verify the preconditions. + Value srcValue = truncOp.getIn(); + auto srcVecType = dyn_cast(srcValue.getType()); + auto dstVecType = dyn_cast(truncOp.getType()); + if (!srcVecType || !dstVecType) + return failure(); + + // Only single dim vectors are supported until we have + // `vector.deinterleave`. + if (srcVecType.getRank() != 1) + return failure(); + + if (failed(commonConversionPrecondition(rewriter, srcVecType, truncOp))) + return failure(); + + // Check general alignment preconditions. We invert the src/dst type order + // to reuse the existing precondition logic. + if (failed(alignedConversionPrecondition(rewriter, dstVecType, srcVecType, + truncOp))) + return failure(); + + // Create a new iX -> i8 truncation op. + Location loc = truncOp.getLoc(); + auto i8VecType = srcVecType.cloneWith(std::nullopt, rewriter.getI8Type()); + Value i8TruncVal = + rewriter.create(loc, i8VecType, srcValue); + + // Rewrite the i8 -> i4 truncation part. + Value subByteTrunc = rewriteI8ToI4Trunc(rewriter, loc, i8TruncVal); + + // Finalize the rewrite. 
+ rewriter.replaceOp(truncOp, subByteTrunc); + return success(); + } +}; + /// Rewrite a sub-byte vector transpose into a sequence of instructions that /// perform the transpose on wider (byte) element types. /// For example: @@ -1115,8 +1230,9 @@ void vector::populateVectorNarrowTypeRewritePatterns( // Patterns for aligned cases. We set higher priority as they are expected to // generate better performance for aligned cases. patterns.add, - RewriteAlignedSubByteIntSignedExt>( - patterns.getContext(), benefit.getBenefit() + 1); + RewriteAlignedSubByteIntSignedExt, + RewriteAlignedSubByteIntTrunc>(patterns.getContext(), + benefit.getBenefit() + 1); } void vector::populateVectorTransposeNarrowTypeRewritePatterns( diff --git a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir index 94e78ce40a3c1..8f0148119806c 100644 --- a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir +++ b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir @@ -262,6 +262,48 @@ func.func @aligned_sitofp_2d(%a: vector<8x32xi4>) -> vector<8x32xf32> { return %0 : vector<8x32xf32> } +// CHECK-LABEL: func.func @aligned_trunci( +func.func @aligned_trunci(%a: vector<8xi32>) -> vector<8xi4> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi32>) -> vector<8xi4> { +// CHECK-DAG: %[[LOW_MASK:.*]] = arith.constant dense<15> : vector<4xi8> +// CHECK-DAG: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> +// CHECK: %[[I8:.*]] = arith.trunci %[[IN]] : vector<8xi32> to vector<8xi8> +// CHECK: %[[LOW:.*]] = vector.shuffle %[[I8]], %[[I8]] [0, 2, 4, 6] : vector<8xi8>, vector<8xi8> +// CHECK: %[[HIGH:.*]] = vector.shuffle %[[I8]], %[[I8]] [1, 3, 5, 7] : vector<8xi8>, vector<8xi8> +// CHECK: %[[ZEROED_LOW:.*]] = arith.andi %[[LOW]], %[[LOW_MASK]] : vector<4xi8> +// CHECK: %[[SHL_HIGH:.*]] = arith.shli %[[HIGH]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[MERGED:.*]] = arith.ori %[[ZEROED_LOW]], %[[SHL_HIGH]] : vector<4xi8> +// CHECK: 
%[[I4:.*]] = vector.bitcast %[[MERGED]] : vector<4xi8> to vector<8xi4> + %0 = arith.trunci %a : vector<8xi32> to vector<8xi4> + return %0 : vector<8xi4> +} + +// CHECK-LABEL: func.func @aligned_trunci_base_case( +func.func @aligned_trunci_base_case(%a: vector<8xi8>) -> vector<8xi4> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi8>) -> vector<8xi4> { +// CHECK-DAG: %[[LOW_MASK:.*]] = arith.constant dense<15> : vector<4xi8> +// CHECK-DAG: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> +// CHECK: %[[LOW:.*]] = vector.shuffle %[[IN]], %[[IN]] [0, 2, 4, 6] : vector<8xi8>, vector<8xi8> +// CHECK: %[[HIGH:.*]] = vector.shuffle %[[IN]], %[[IN]] [1, 3, 5, 7] : vector<8xi8>, vector<8xi8> +// CHECK: %[[ZEROED_LOW:.*]] = arith.andi %[[LOW]], %[[LOW_MASK]] : vector<4xi8> +// CHECK: %[[SHL_HIGH:.*]] = arith.shli %[[HIGH]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[MERGED:.*]] = arith.ori %[[ZEROED_LOW]], %[[SHL_HIGH]] : vector<4xi8> +// CHECK: %[[I4:.*]] = vector.bitcast %[[MERGED]] : vector<4xi8> to vector<8xi4> + %0 = arith.trunci %a : vector<8xi8> to vector<8xi4> + return %0 : vector<8xi4> +} + +// CHECK-LABEL: func.func @aligned_trunci_2d( +func.func @aligned_trunci_2d(%a: vector<8x32xi32>) -> vector<8x32xi4> { +// CHECK-NOT: vector.shuffle +// CHECK-NOT: vector.andi +// CHECK-NOT: vector.shli +// CHECK-NOT: vector.ori +// CHECK: arith.trunci + %0 = arith.trunci %a : vector<8x32xi32> to vector<8x32xi4> + return %0 : vector<8x32xi4> +} + // CHECK-LABEL: func.func @i4_transpose( func.func @i4_transpose(%a: vector<8x16xi4>) -> vector<16x8xi4> { // CHECK-SAME: %[[IN:.*]]: vector<8x16xi4>) -> vector<16x8xi4> { From 0e84e2748b40eb757a5c52a983c87dd4f25a1587 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 27 Feb 2024 15:38:31 -0800 Subject: [PATCH 508/546] [BOLT] Move test under X86 target. NFCI (#83202) instrument-wrong-target.s test requires X86 host. Move it under runtime/X86. 
--- bolt/test/runtime/{ => X86}/instrument-wrong-target.s | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename bolt/test/runtime/{ => X86}/instrument-wrong-target.s (88%) diff --git a/bolt/test/runtime/instrument-wrong-target.s b/bolt/test/runtime/X86/instrument-wrong-target.s similarity index 88% rename from bolt/test/runtime/instrument-wrong-target.s rename to bolt/test/runtime/X86/instrument-wrong-target.s index b25c924ffbcc0..343d93a89ed13 100644 --- a/bolt/test/runtime/instrument-wrong-target.s +++ b/bolt/test/runtime/X86/instrument-wrong-target.s @@ -1,8 +1,8 @@ # Test that BOLT errs when trying to instrument a binary with a different # architecture than the one BOLT is built for. -# REQUIRES: x86_64-linux,bolt-runtime -# REQUIRES: target-x86_64 && aarch64-registered-target +# REQUIRES: system-linux,bolt-runtime +# REQUIRES: aarch64-registered-target # RUN: llvm-mc -triple aarch64 -filetype=obj %s -o %t.o # RUN: ld.lld -q -pie -o %t.exe %t.o From 04e8653f189bf3d65680c7fb3b3033ad82903ee9 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 27 Feb 2024 17:45:15 -0600 Subject: [PATCH 509/546] [libc] Add "include/" to the LLVM include directories (#83199) Summary: Recent changes added an include path in the float128 type that used the internal `libc` path to find the macro. This doesn't work once it's installed because we need to search from the root of the install dir. This patch adds "include/" to the include path so that our inclusion of installed headers always match the internal use. 
--- libc/benchmarks/automemcpy/lib/CMakeLists.txt | 3 ++- libc/cmake/modules/LLVMLibCObjectRules.cmake | 4 ++++ libc/cmake/modules/LLVMLibCTestRules.cmake | 4 ++++ libc/cmake/modules/compiler_features/check_fixed_point.cpp | 2 +- libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp | 2 +- libc/include/llvm-libc-types/float128.h | 2 +- libc/src/__support/CPP/limits.h | 2 +- libc/src/__support/CPP/type_traits/is_fixed_point.h | 2 +- libc/src/__support/HashTable/table.h | 2 +- libc/src/__support/fixed_point/fx_bits.h | 2 +- libc/src/__support/fixed_point/fx_rep.h | 2 +- libc/src/__support/fixed_point/sqrt.h | 2 +- libc/src/__support/macros/properties/float.h | 4 ++-- libc/src/stdfix/abshk.h | 2 +- libc/src/stdfix/abshr.h | 2 +- libc/src/stdfix/absk.h | 2 +- libc/src/stdfix/abslk.h | 2 +- libc/src/stdfix/abslr.h | 2 +- libc/src/stdfix/absr.h | 2 +- libc/src/stdfix/roundhk.h | 2 +- libc/src/stdfix/roundhr.h | 2 +- libc/src/stdfix/roundk.h | 2 +- libc/src/stdfix/roundlk.h | 2 +- libc/src/stdfix/roundlr.h | 2 +- libc/src/stdfix/roundr.h | 2 +- libc/src/stdfix/rounduhk.h | 2 +- libc/src/stdfix/rounduhr.h | 2 +- libc/src/stdfix/rounduk.h | 2 +- libc/src/stdfix/roundulk.h | 2 +- libc/src/stdfix/roundulr.h | 2 +- libc/src/stdfix/roundur.h | 2 +- libc/src/stdfix/sqrtuhk.h | 2 +- libc/src/stdfix/sqrtuhr.h | 2 +- libc/src/stdfix/sqrtuk.h | 2 +- libc/src/stdfix/sqrtulr.h | 2 +- libc/src/stdfix/sqrtur.h | 2 +- libc/src/stdio/printf_core/fixed_converter.h | 2 +- libc/src/stdio/printf_core/parser.h | 2 +- libc/src/sys/epoll/epoll_pwait.h | 4 ++-- libc/src/sys/epoll/epoll_pwait2.h | 6 +++--- libc/src/sys/epoll/epoll_wait.h | 2 +- libc/src/sys/epoll/linux/epoll_pwait.cpp | 4 ++-- libc/src/sys/epoll/linux/epoll_pwait2.cpp | 6 +++--- libc/src/sys/epoll/linux/epoll_wait.cpp | 4 ++-- libc/test/UnitTest/CMakeLists.txt | 3 ++- libc/test/UnitTest/LibcTest.cpp | 2 +- libc/test/include/stdbit_test.cpp | 2 +- libc/test/include/stdckdint_test.cpp | 2 +- libc/test/integration/startup/CMakeLists.txt | 1 
+ libc/test/integration/startup/gpu/rpc_interface_test.cpp | 2 +- libc/test/integration/startup/gpu/rpc_stream_test.cpp | 2 +- libc/test/integration/startup/gpu/rpc_test.cpp | 2 +- libc/test/src/__support/fixed_point/fx_bits_test.cpp | 2 +- libc/test/src/math/differential_testing/CMakeLists.txt | 1 + libc/utils/LibcTableGenUtil/CMakeLists.txt | 2 +- libc/utils/gpu/loader/Loader.h | 2 +- 56 files changed, 72 insertions(+), 60 deletions(-) diff --git a/libc/benchmarks/automemcpy/lib/CMakeLists.txt b/libc/benchmarks/automemcpy/lib/CMakeLists.txt index 0c7d399d4023b..bb6a5631f2c3f 100644 --- a/libc/benchmarks/automemcpy/lib/CMakeLists.txt +++ b/libc/benchmarks/automemcpy/lib/CMakeLists.txt @@ -18,7 +18,8 @@ add_custom_command( add_library(automemcpy_implementations "${Implementations}") target_link_libraries(automemcpy_implementations PUBLIC LLVMSupport libc-memory-benchmark) -target_include_directories(automemcpy_implementations PRIVATE ${LIBC_SOURCE_DIR} ${LIBC_AUTOMEMCPY_INCLUDE_DIR}) +target_include_directories(automemcpy_implementations PRIVATE + ${LIBC_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include ${LIBC_AUTOMEMCPY_INCLUDE_DIR}) target_compile_options(automemcpy_implementations PRIVATE ${LIBC_COMPILE_OPTIONS_NATIVE} "SHELL:-mllvm -combiner-global-alias-analysis" -fno-builtin) llvm_update_compile_flags(automemcpy_implementations) diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake index 0649e9f7a7670..5469799f02398 100644 --- a/libc/cmake/modules/LLVMLibCObjectRules.cmake +++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake @@ -59,6 +59,7 @@ function(create_object_library fq_target_name) ) target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include) target_compile_options(${fq_target_name} PRIVATE ${compile_options}) # The NVPTX target is installed 
as LLVM-IR but the internal testing toolchain @@ -73,6 +74,7 @@ function(create_object_library fq_target_name) ) target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include) target_compile_options(${internal_target_name} PRIVATE ${compile_options} -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE}) endif() @@ -279,6 +281,7 @@ function(create_entrypoint_object fq_target_name) target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options}) target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include) add_dependencies(${internal_target_name} ${full_deps_list}) target_link_libraries(${internal_target_name} ${full_deps_list}) @@ -300,6 +303,7 @@ function(create_entrypoint_object fq_target_name) target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLIBC_COPT_PUBLIC_PACKAGING) target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include) add_dependencies(${fq_target_name} ${full_deps_list}) target_link_libraries(${fq_target_name} ${full_deps_list}) diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 836e15d34741b..5981d427b71f8 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -184,6 +184,7 @@ function(create_libc_unittest fq_target_name) ) target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) 
target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include) target_compile_options(${fq_build_target_name} PRIVATE ${compile_options}) if(NOT LIBC_UNITTEST_CXX_STANDARD) @@ -317,6 +318,7 @@ function(add_libc_fuzzer target_name) ) target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include) target_link_libraries(${fq_target_name} PRIVATE ${link_object_files} @@ -457,6 +459,7 @@ function(add_integration_test test_name) PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include) _get_hermetic_test_compile_options(compile_options "${INTEGRATION_TEST_COMPILE_OPTIONS}") target_compile_options(${fq_build_target_name} PRIVATE ${compile_options}) @@ -632,6 +635,7 @@ function(add_libc_hermetic_test test_name) _get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}") target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include) _get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}") target_compile_options(${fq_build_target_name} PRIVATE ${compile_options}) diff --git a/libc/cmake/modules/compiler_features/check_fixed_point.cpp b/libc/cmake/modules/compiler_features/check_fixed_point.cpp index a5192697d43f7..9199340fe652e 100644 --- a/libc/cmake/modules/compiler_features/check_fixed_point.cpp 
+++ b/libc/cmake/modules/compiler_features/check_fixed_point.cpp @@ -1,4 +1,4 @@ -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #ifndef LIBC_COMPILER_HAS_FIXED_POINT #error unsupported diff --git a/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp b/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp index b4a8621891203..c385c3a8f3e44 100644 --- a/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp +++ b/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp @@ -11,7 +11,7 @@ //===----------------------------------------------------------------------===// #include "src/stdio/snprintf.h" -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #include "src/__support/fixed_point/fx_bits.h" #include "src/__support/fixed_point/fx_rep.h" diff --git a/libc/include/llvm-libc-types/float128.h b/libc/include/llvm-libc-types/float128.h index 61a094fdb96b1..1907a5e3ece72 100644 --- a/libc/include/llvm-libc-types/float128.h +++ b/libc/include/llvm-libc-types/float128.h @@ -9,7 +9,7 @@ #ifndef __LLVM_LIBC_TYPES_FLOAT128_H__ #define __LLVM_LIBC_TYPES_FLOAT128_H__ -#include // LDBL_MANT_DIG +#include "llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG // Currently, C23 `_Float128` type is only defined as a built-in type in GCC 7 // or later, and only for C. 
For C++, or for clang, `__float128` is defined diff --git a/libc/src/__support/CPP/limits.h b/libc/src/__support/CPP/limits.h index 1ffde5f9556f8..6440e8cb358fa 100644 --- a/libc/src/__support/CPP/limits.h +++ b/libc/src/__support/CPP/limits.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_CPP_LIMITS_H #define LLVM_LIBC_SRC___SUPPORT_CPP_LIMITS_H -#include "include/llvm-libc-macros/limits-macros.h" // CHAR_BIT +#include "llvm-libc-macros/limits-macros.h" // CHAR_BIT #include "src/__support/CPP/type_traits/is_integral.h" #include "src/__support/CPP/type_traits/is_signed.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE diff --git a/libc/src/__support/CPP/type_traits/is_fixed_point.h b/libc/src/__support/CPP/type_traits/is_fixed_point.h index 317ba39748b7d..e139e6477e2e7 100644 --- a/libc/src/__support/CPP/type_traits/is_fixed_point.h +++ b/libc/src/__support/CPP/type_traits/is_fixed_point.h @@ -12,7 +12,7 @@ #include "src/__support/CPP/type_traits/remove_cv.h" #include "src/__support/macros/attributes.h" -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE::cpp { diff --git a/libc/src/__support/HashTable/table.h b/libc/src/__support/HashTable/table.h index 5b4697e5245b6..e2a26d0e2b5f3 100644 --- a/libc/src/__support/HashTable/table.h +++ b/libc/src/__support/HashTable/table.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_HASHTABLE_table_H #define LLVM_LIBC_SRC___SUPPORT_HASHTABLE_table_H -#include "include/llvm-libc-types/ENTRY.h" +#include "llvm-libc-types/ENTRY.h" #include "src/__support/CPP/bit.h" // bit_ceil #include "src/__support/CPP/new.h" #include "src/__support/HashTable/bitmask.h" diff --git a/libc/src/__support/fixed_point/fx_bits.h b/libc/src/__support/fixed_point/fx_bits.h index fcd47cd72cbb3..0c8d03beb84ae 100644 --- a/libc/src/__support/fixed_point/fx_bits.h +++ b/libc/src/__support/fixed_point/fx_bits.h @@ -9,7 +9,7 @@ #ifndef 
LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_FXBITS_H #define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_FXBITS_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/bit.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE diff --git a/libc/src/__support/fixed_point/fx_rep.h b/libc/src/__support/fixed_point/fx_rep.h index fcd7554c4d855..7d18f14a8c483 100644 --- a/libc/src/__support/fixed_point/fx_rep.h +++ b/libc/src/__support/fixed_point/fx_rep.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_FXREP_H #define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_FXREP_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE, LIBC_INLINE_VAR diff --git a/libc/src/__support/fixed_point/sqrt.h b/libc/src/__support/fixed_point/sqrt.h index d8df294b18a1a..236ebb2857030 100644 --- a/libc/src/__support/fixed_point/sqrt.h +++ b/libc/src/__support/fixed_point/sqrt.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H #define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/bit.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE diff --git a/libc/src/__support/macros/properties/float.h b/libc/src/__support/macros/properties/float.h index 08a1ab726cbde..510f392374935 100644 --- a/libc/src/__support/macros/properties/float.h +++ b/libc/src/__support/macros/properties/float.h @@ -11,8 +11,8 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_FLOAT_H #define LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_FLOAT_H -#include "include/llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG -#include "include/llvm-libc-types/float128.h" // float128 +#include 
"llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG +#include "llvm-libc-types/float128.h" // float128 #include "src/__support/macros/properties/architectures.h" #include "src/__support/macros/properties/compiler.h" #include "src/__support/macros/properties/cpu_features.h" diff --git a/libc/src/stdfix/abshk.h b/libc/src/stdfix/abshk.h index 13c9300caab88..80dc73053dfb4 100644 --- a/libc/src/stdfix/abshk.h +++ b/libc/src/stdfix/abshk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ABSHK_H #define LLVM_LIBC_SRC_STDFIX_ABSHK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/abshr.h b/libc/src/stdfix/abshr.h index 5acd0cfc4a60d..035f9a6de222e 100644 --- a/libc/src/stdfix/abshr.h +++ b/libc/src/stdfix/abshr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ABSHR_H #define LLVM_LIBC_SRC_STDFIX_ABSHR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/absk.h b/libc/src/stdfix/absk.h index 73dfcac0ac8e7..426415de28e6a 100644 --- a/libc/src/stdfix/absk.h +++ b/libc/src/stdfix/absk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ABSK_H #define LLVM_LIBC_SRC_STDFIX_ABSK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/abslk.h b/libc/src/stdfix/abslk.h index 7de116fa22793..21e33f856bfc6 100644 --- a/libc/src/stdfix/abslk.h +++ b/libc/src/stdfix/abslk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ABSLK_H #define LLVM_LIBC_SRC_STDFIX_ABSLK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/abslr.h b/libc/src/stdfix/abslr.h index bf5b585bbbb66..ebca35e58aa51 100644 --- a/libc/src/stdfix/abslr.h +++ b/libc/src/stdfix/abslr.h @@ -9,7 +9,7 @@ #ifndef 
LLVM_LIBC_SRC_STDFIX_ABSLR_H #define LLVM_LIBC_SRC_STDFIX_ABSLR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/absr.h b/libc/src/stdfix/absr.h index b5ead7ce14e2a..2744fcb5a7ecc 100644 --- a/libc/src/stdfix/absr.h +++ b/libc/src/stdfix/absr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ABSR_H #define LLVM_LIBC_SRC_STDFIX_ABSR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundhk.h b/libc/src/stdfix/roundhk.h index 9a5c874cc030d..06de5cc05cdbe 100644 --- a/libc/src/stdfix/roundhk.h +++ b/libc/src/stdfix/roundhk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDHK_H #define LLVM_LIBC_SRC_STDFIX_ROUNDHK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundhr.h b/libc/src/stdfix/roundhr.h index ba5a67945d6c3..6729bf5b13997 100644 --- a/libc/src/stdfix/roundhr.h +++ b/libc/src/stdfix/roundhr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDHR_H #define LLVM_LIBC_SRC_STDFIX_ROUNDHR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundk.h b/libc/src/stdfix/roundk.h index e9fa6d8f9c3b8..02fb9a8c9b1a8 100644 --- a/libc/src/stdfix/roundk.h +++ b/libc/src/stdfix/roundk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDK_H #define LLVM_LIBC_SRC_STDFIX_ROUNDK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundlk.h b/libc/src/stdfix/roundlk.h index 5fa0e90e855a6..28be9c0054949 100644 --- a/libc/src/stdfix/roundlk.h +++ b/libc/src/stdfix/roundlk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDLK_H #define 
LLVM_LIBC_SRC_STDFIX_ROUNDLK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundlr.h b/libc/src/stdfix/roundlr.h index c015292e8f3f2..be97a35a64204 100644 --- a/libc/src/stdfix/roundlr.h +++ b/libc/src/stdfix/roundlr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDLR_H #define LLVM_LIBC_SRC_STDFIX_ROUNDLR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundr.h b/libc/src/stdfix/roundr.h index b5b1375c882e0..15523f8b6c9a3 100644 --- a/libc/src/stdfix/roundr.h +++ b/libc/src/stdfix/roundr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDR_H #define LLVM_LIBC_SRC_STDFIX_ROUNDR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/rounduhk.h b/libc/src/stdfix/rounduhk.h index 85ebf2903ec7e..d1c4a4416d763 100644 --- a/libc/src/stdfix/rounduhk.h +++ b/libc/src/stdfix/rounduhk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUHK_H #define LLVM_LIBC_SRC_STDFIX_ROUNDUHK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/rounduhr.h b/libc/src/stdfix/rounduhr.h index 1be0aab1f5a79..6cecb733dd3b1 100644 --- a/libc/src/stdfix/rounduhr.h +++ b/libc/src/stdfix/rounduhr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUHR_H #define LLVM_LIBC_SRC_STDFIX_ROUNDUHR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/rounduk.h b/libc/src/stdfix/rounduk.h index 8dae89586c490..4511d69525c5d 100644 --- a/libc/src/stdfix/rounduk.h +++ b/libc/src/stdfix/rounduk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUK_H #define LLVM_LIBC_SRC_STDFIX_ROUNDUK_H 
-#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundulk.h b/libc/src/stdfix/roundulk.h index 81dfd1dceb600..8bd90beeb830c 100644 --- a/libc/src/stdfix/roundulk.h +++ b/libc/src/stdfix/roundulk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDULK_H #define LLVM_LIBC_SRC_STDFIX_ROUNDULK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundulr.h b/libc/src/stdfix/roundulr.h index 002fc94907c61..65e5c27b1c853 100644 --- a/libc/src/stdfix/roundulr.h +++ b/libc/src/stdfix/roundulr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDULR_H #define LLVM_LIBC_SRC_STDFIX_ROUNDULR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundur.h b/libc/src/stdfix/roundur.h index 72de44b1e0c4e..110e578da7931 100644 --- a/libc/src/stdfix/roundur.h +++ b/libc/src/stdfix/roundur.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUR_H #define LLVM_LIBC_SRC_STDFIX_ROUNDUR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/sqrtuhk.h b/libc/src/stdfix/sqrtuhk.h index 80000a0079696..b57340003fa03 100644 --- a/libc/src/stdfix/sqrtuhk.h +++ b/libc/src/stdfix/sqrtuhk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_SQRTUHK_H #define LLVM_LIBC_SRC_STDFIX_SQRTUHK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/sqrtuhr.h b/libc/src/stdfix/sqrtuhr.h index fd95f0924e8d4..6b629a29de3c8 100644 --- a/libc/src/stdfix/sqrtuhr.h +++ b/libc/src/stdfix/sqrtuhr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_SQRTUHR_H #define LLVM_LIBC_SRC_STDFIX_SQRTUHR_H -#include 
"include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/sqrtuk.h b/libc/src/stdfix/sqrtuk.h index 04d0adadde9ad..6bd7a2608716c 100644 --- a/libc/src/stdfix/sqrtuk.h +++ b/libc/src/stdfix/sqrtuk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_SQRTUK_H #define LLVM_LIBC_SRC_STDFIX_SQRTUK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/sqrtulr.h b/libc/src/stdfix/sqrtulr.h index 284adaaf35bf5..d1982a6b1c051 100644 --- a/libc/src/stdfix/sqrtulr.h +++ b/libc/src/stdfix/sqrtulr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_SQRTULR_H #define LLVM_LIBC_SRC_STDFIX_SQRTULR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/sqrtur.h b/libc/src/stdfix/sqrtur.h index df9dfe5a0bf39..13f7d1e5e466e 100644 --- a/libc/src/stdfix/sqrtur.h +++ b/libc/src/stdfix/sqrtur.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_SQRTUR_H #define LLVM_LIBC_SRC_STDFIX_SQRTUR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdio/printf_core/fixed_converter.h b/libc/src/stdio/printf_core/fixed_converter.h index de69c603be6b6..c89971e20686e 100644 --- a/libc/src/stdio/printf_core/fixed_converter.h +++ b/libc/src/stdio/printf_core/fixed_converter.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FIXED_CONVERTER_H #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FIXED_CONVERTER_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/string_view.h" #include "src/__support/fixed_point/fx_bits.h" #include "src/__support/fixed_point/fx_rep.h" diff --git a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h index 
0876116a0bac8..13fdbf243a22e 100644 --- a/libc/src/stdio/printf_core/parser.h +++ b/libc/src/stdio/printf_core/parser.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PARSER_H #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PARSER_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/optional.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/str_to_integer.h" diff --git a/libc/src/sys/epoll/epoll_pwait.h b/libc/src/sys/epoll/epoll_pwait.h index 9dcb55533009f..16105850d6942 100644 --- a/libc/src/sys/epoll/epoll_pwait.h +++ b/libc/src/sys/epoll/epoll_pwait.h @@ -10,8 +10,8 @@ #define LLVM_LIBC_SRC_SYS_EPOLL_EPOLL_PWAIT_H // TODO: Use this include once the include headers are also using quotes. -// #include "include/llvm-libc-types/sigset_t.h" -// #include "include/llvm-libc-types/struct_epoll_event.h" +// #include "llvm-libc-types/sigset_t.h" +// #include "llvm-libc-types/struct_epoll_event.h" #include diff --git a/libc/src/sys/epoll/epoll_pwait2.h b/libc/src/sys/epoll/epoll_pwait2.h index 622ede6a0f9f9..f7b28d4fbc51d 100644 --- a/libc/src/sys/epoll/epoll_pwait2.h +++ b/libc/src/sys/epoll/epoll_pwait2.h @@ -10,9 +10,9 @@ #define LLVM_LIBC_SRC_SYS_EPOLL_EPOLL_PWAIT2_H // TODO: Use this include once the include headers are also using quotes. 
-// #include "include/llvm-libc-types/sigset_t.h" -// #include "include/llvm-libc-types/struct_epoll_event.h" -// #include "include/llvm-libc-types/struct_timespec.h" +// #include "llvm-libc-types/sigset_t.h" +// #include "llvm-libc-types/struct_epoll_event.h" +// #include "llvm-libc-types/struct_timespec.h" #include diff --git a/libc/src/sys/epoll/epoll_wait.h b/libc/src/sys/epoll/epoll_wait.h index d51c9100846ce..0dc487bba5bdf 100644 --- a/libc/src/sys/epoll/epoll_wait.h +++ b/libc/src/sys/epoll/epoll_wait.h @@ -10,7 +10,7 @@ #define LLVM_LIBC_SRC_SYS_EPOLL_EPOLL_WAIT_H // TODO: Use this include once the include headers are also using quotes. -// #include "include/llvm-libc-types/struct_epoll_event.h" +// #include "llvm-libc-types/struct_epoll_event.h" #include diff --git a/libc/src/sys/epoll/linux/epoll_pwait.cpp b/libc/src/sys/epoll/linux/epoll_pwait.cpp index ee1b4e66e9844..e0c13a7a79602 100644 --- a/libc/src/sys/epoll/linux/epoll_pwait.cpp +++ b/libc/src/sys/epoll/linux/epoll_pwait.cpp @@ -15,8 +15,8 @@ #include // For syscall numbers. // TODO: Use this include once the include headers are also using quotes. -// #include "include/llvm-libc-types/sigset_t.h" -// #include "include/llvm-libc-types/struct_epoll_event.h" +// #include "llvm-libc-types/sigset_t.h" +// #include "llvm-libc-types/struct_epoll_event.h" #include diff --git a/libc/src/sys/epoll/linux/epoll_pwait2.cpp b/libc/src/sys/epoll/linux/epoll_pwait2.cpp index 671dede2a1058..a44b0c2a9f70f 100644 --- a/libc/src/sys/epoll/linux/epoll_pwait2.cpp +++ b/libc/src/sys/epoll/linux/epoll_pwait2.cpp @@ -15,9 +15,9 @@ #include // For syscall numbers. // TODO: Use this include once the include headers are also using quotes. 
-// #include "include/llvm-libc-types/sigset_t.h" -// #include "include/llvm-libc-types/struct_epoll_event.h" -// #include "include/llvm-libc-types/struct_timespec.h" +// #include "llvm-libc-types/sigset_t.h" +// #include "llvm-libc-types/struct_epoll_event.h" +// #include "llvm-libc-types/struct_timespec.h" #include diff --git a/libc/src/sys/epoll/linux/epoll_wait.cpp b/libc/src/sys/epoll/linux/epoll_wait.cpp index 0c43edf764545..b643e2dd720cb 100644 --- a/libc/src/sys/epoll/linux/epoll_wait.cpp +++ b/libc/src/sys/epoll/linux/epoll_wait.cpp @@ -14,8 +14,8 @@ #include // For syscall numbers. // TODO: Use this include once the include headers are also using quotes. -// #include "include/llvm-libc-types/sigset_t.h" -// #include "include/llvm-libc-types/struct_epoll_event.h" +// #include "llvm-libc-types/sigset_t.h" +// #include "llvm-libc-types/struct_epoll_event.h" #include diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt index 4668f0061975f..466494f038f4e 100644 --- a/libc/test/UnitTest/CMakeLists.txt +++ b/libc/test/UnitTest/CMakeLists.txt @@ -26,7 +26,8 @@ function(add_unittest_framework_library name) ${TEST_LIB_SRCS} ${TEST_LIB_HDRS} ) - target_include_directories(${lib} PUBLIC ${LIBC_SOURCE_DIR}) + target_include_directories(${lib} PUBLIC + ${LIBC_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include) list(APPEND compile_options -fno-exceptions -fno-rtti) if(TARGET libc.src.time.clock) target_compile_definitions(${lib} PRIVATE TARGET_SUPPORTS_CLOCK) diff --git a/libc/test/UnitTest/LibcTest.cpp b/libc/test/UnitTest/LibcTest.cpp index 7b0e4fca83683..babd44f9b2063 100644 --- a/libc/test/UnitTest/LibcTest.cpp +++ b/libc/test/UnitTest/LibcTest.cpp @@ -8,7 +8,7 @@ #include "LibcTest.h" -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/string.h" #include "src/__support/CPP/string_view.h" #include "src/__support/UInt128.h" diff --git a/libc/test/include/stdbit_test.cpp 
b/libc/test/include/stdbit_test.cpp index acb79ca0f3ff1..84a4cde18b9f4 100644 --- a/libc/test/include/stdbit_test.cpp +++ b/libc/test/include/stdbit_test.cpp @@ -88,7 +88,7 @@ bool stdc_has_single_bit_ul(unsigned long) noexcept { return false; } bool stdc_has_single_bit_ull(unsigned long long) noexcept { return false; } } -#include "include/llvm-libc-macros/stdbit-macros.h" +#include "llvm-libc-macros/stdbit-macros.h" TEST(LlvmLibcStdbitTest, TypeGenericMacroLeadingZeros) { EXPECT_EQ(stdc_leading_zeros(static_cast(0U)), 0xAAU); diff --git a/libc/test/include/stdckdint_test.cpp b/libc/test/include/stdckdint_test.cpp index 1180a6de9efe2..5ac8c95f4ef26 100644 --- a/libc/test/include/stdckdint_test.cpp +++ b/libc/test/include/stdckdint_test.cpp @@ -8,7 +8,7 @@ #include "test/UnitTest/Test.h" -#include "include/llvm-libc-macros/stdckdint-macros.h" +#include "llvm-libc-macros/stdckdint-macros.h" TEST(LlvmLibcStdCkdIntTest, Add) { int result; diff --git a/libc/test/integration/startup/CMakeLists.txt b/libc/test/integration/startup/CMakeLists.txt index fb5d6bc787cc2..08c0d978602b8 100644 --- a/libc/test/integration/startup/CMakeLists.txt +++ b/libc/test/integration/startup/CMakeLists.txt @@ -31,6 +31,7 @@ function(add_startup_test target_name) ${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR} + ${LIBC_SOURCE_DIR}/include ${LIBC_BUILD_DIR} ${LIBC_BUILD_DIR}/include ) diff --git a/libc/test/integration/startup/gpu/rpc_interface_test.cpp b/libc/test/integration/startup/gpu/rpc_interface_test.cpp index 674e2cc1ed749..7bbd7085fc2f4 100644 --- a/libc/test/integration/startup/gpu/rpc_interface_test.cpp +++ b/libc/test/integration/startup/gpu/rpc_interface_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "include/llvm-libc-types/test_rpc_opcodes_t.h" +#include "llvm-libc-types/test_rpc_opcodes_t.h" #include "src/__support/GPU/utils.h" #include "src/__support/RPC/rpc_client.h" #include "test/IntegrationTest/test.h" 
diff --git a/libc/test/integration/startup/gpu/rpc_stream_test.cpp b/libc/test/integration/startup/gpu/rpc_stream_test.cpp index 09a4ae67256e3..9401f822904d0 100644 --- a/libc/test/integration/startup/gpu/rpc_stream_test.cpp +++ b/libc/test/integration/startup/gpu/rpc_stream_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "include/llvm-libc-types/test_rpc_opcodes_t.h" +#include "llvm-libc-types/test_rpc_opcodes_t.h" #include "src/__support/GPU/utils.h" #include "src/__support/RPC/rpc_client.h" #include "src/__support/integer_to_string.h" diff --git a/libc/test/integration/startup/gpu/rpc_test.cpp b/libc/test/integration/startup/gpu/rpc_test.cpp index 4032d890c53ec..bb36b6cedb63c 100644 --- a/libc/test/integration/startup/gpu/rpc_test.cpp +++ b/libc/test/integration/startup/gpu/rpc_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "include/llvm-libc-types/test_rpc_opcodes_t.h" +#include "llvm-libc-types/test_rpc_opcodes_t.h" #include "src/__support/GPU/utils.h" #include "src/__support/RPC/rpc_client.h" #include "test/IntegrationTest/test.h" diff --git a/libc/test/src/__support/fixed_point/fx_bits_test.cpp b/libc/test/src/__support/fixed_point/fx_bits_test.cpp index 58627816eb8d9..5670687273d5b 100644 --- a/libc/test/src/__support/fixed_point/fx_bits_test.cpp +++ b/libc/test/src/__support/fixed_point/fx_bits_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #include "src/__support/fixed_point/fx_bits.h" #include "src/__support/integer_literals.h" diff --git a/libc/test/src/math/differential_testing/CMakeLists.txt b/libc/test/src/math/differential_testing/CMakeLists.txt index 878f81f1d573c..36bfdca1a442d 100644 --- a/libc/test/src/math/differential_testing/CMakeLists.txt +++ 
b/libc/test/src/math/differential_testing/CMakeLists.txt @@ -47,6 +47,7 @@ function(add_diff_binary target_name) ${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR} + ${LIBC_SOURCE_DIR}/include ) if(DIFF_COMPILE_OPTIONS) target_compile_options( diff --git a/libc/utils/LibcTableGenUtil/CMakeLists.txt b/libc/utils/LibcTableGenUtil/CMakeLists.txt index dca6a7bb83065..60208ed790d57 100644 --- a/libc/utils/LibcTableGenUtil/CMakeLists.txt +++ b/libc/utils/LibcTableGenUtil/CMakeLists.txt @@ -5,5 +5,5 @@ add_llvm_library( DISABLE_LLVM_LINK_LLVM_DYLIB LINK_COMPONENTS Support TableGen ) -target_include_directories(LibcTableGenUtil PUBLIC ${LIBC_SOURCE_DIR}) +target_include_directories(LibcTableGenUtil PUBLIC ${LIBC_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include) target_include_directories(LibcTableGenUtil PRIVATE ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}) diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h index e2aabb08c11da..d74d65e899382 100644 --- a/libc/utils/gpu/loader/Loader.h +++ b/libc/utils/gpu/loader/Loader.h @@ -11,7 +11,7 @@ #include "utils/gpu/server/llvmlibc_rpc_server.h" -#include "include/llvm-libc-types/test_rpc_opcodes_t.h" +#include "llvm-libc-types/test_rpc_opcodes_t.h" #include #include From d699d9d609a24d80809df15efe47ac539da90e93 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Tue, 27 Feb 2024 15:59:25 -0800 Subject: [PATCH 510/546] [flang][runtime] Support SUM/PRODUCT/DOT_PRODUCT reductions for REAL(16). (#83169) The reductions implementations rely on trivial operations that are supported by the build compiler runtime, so they can be enabled whenever the build compiler provides 128-bit float support. std::conj used by DOT_PRODUCT is a template implementation in most environments, so it should not introduce a dependency on any 128-bit float support library. I am not goind to test it in all the build environments before merging. If it fails for someone, I will deal with it. 
--- flang/include/flang/Common/float128.h | 16 ++++++++ flang/include/flang/Runtime/reduction.h | 12 ++++-- flang/runtime/Float128Math/cabs.cpp | 6 +-- flang/runtime/Float128Math/math-entries.h | 9 ----- flang/runtime/complex-reduction.c | 47 ++++++++++++++++++----- flang/runtime/complex-reduction.h | 13 +++++-- flang/runtime/product.cpp | 6 ++- flang/runtime/sum.cpp | 3 +- 8 files changed, 81 insertions(+), 31 deletions(-) diff --git a/flang/include/flang/Common/float128.h b/flang/include/flang/Common/float128.h index 3443aa06437b0..6aa98df5529df 100644 --- a/flang/include/flang/Common/float128.h +++ b/flang/include/flang/Common/float128.h @@ -49,4 +49,20 @@ #endif /* (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)) && \ !defined(_LIBCPP_VERSION) && !defined(__CUDA_ARCH__) */ +/* Define pure C CFloat128Type and CFloat128ComplexType. */ +#if LDBL_MANT_DIG == 113 +typedef long double CFloat128Type; +typedef long double _Complex CFloat128ComplexType; +#elif HAS_FLOAT128 +typedef __float128 CFloat128Type; +/* + * Use mode() attribute supported by GCC and Clang. + * Adjust it for other compilers as needed. 
+ */ +#if !defined(_ARCH_PPC) || defined(__LONG_DOUBLE_IEEE128__) +typedef _Complex float __attribute__((mode(TC))) CFloat128ComplexType; +#else +typedef _Complex float __attribute__((mode(KC))) CFloat128ComplexType; +#endif +#endif #endif /* FORTRAN_COMMON_FLOAT128_H_ */ diff --git a/flang/include/flang/Runtime/reduction.h b/flang/include/flang/Runtime/reduction.h index b91fec0cd26b5..6d62f4016937e 100644 --- a/flang/include/flang/Runtime/reduction.h +++ b/flang/include/flang/Runtime/reduction.h @@ -92,9 +92,11 @@ void RTDECL(CppSumComplex8)(std::complex &, const Descriptor &, void RTDECL(CppSumComplex10)(std::complex &, const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppSumComplex16)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +void RTDECL(CppSumComplex16)(std::complex &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); +#endif void RTDECL(SumDim)(Descriptor &result, const Descriptor &array, int dim, const char *source, int line, const Descriptor *mask = nullptr); @@ -145,12 +147,16 @@ void RTDECL(CppProductComplex4)(std::complex &, const Descriptor &, void RTDECL(CppProductComplex8)(std::complex &, const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); +#if LDBL_MANT_DIG == 64 void RTDECL(CppProductComplex10)(std::complex &, const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppProductComplex16)(std::complex &, +#endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +void RTDECL(CppProductComplex16)(std::complex &, const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); +#endif void RTDECL(ProductDim)(Descriptor &result, const Descriptor &array, int dim, const char *source, int line, const Descriptor *mask = nullptr); diff --git 
a/flang/runtime/Float128Math/cabs.cpp b/flang/runtime/Float128Math/cabs.cpp index 63f2bdf8e177a..2867c8a4578a8 100644 --- a/flang/runtime/Float128Math/cabs.cpp +++ b/flang/runtime/Float128Math/cabs.cpp @@ -12,10 +12,8 @@ namespace Fortran::runtime { extern "C" { #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -// FIXME: the argument should be CppTypeFor, -// and it should be translated into the underlying library's -// corresponding complex128 type. -CppTypeFor RTDEF(CAbsF128)(ComplexF128 x) { +// NOTE: Flang calls the runtime APIs using C _Complex ABI +CppTypeFor RTDEF(CAbsF128)(CFloat128ComplexType x) { return CAbs::invoke(x); } #endif diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h index fe1525468edca..141648d2fb2c5 100644 --- a/flang/runtime/Float128Math/math-entries.h +++ b/flang/runtime/Float128Math/math-entries.h @@ -91,15 +91,6 @@ DEFINE_FALLBACK(Y0) DEFINE_FALLBACK(Y1) DEFINE_FALLBACK(Yn) -// Define ComplexF128 type that is compatible with -// the type of results/arguments of libquadmath. -// TODO: this may need more work for other libraries/compilers. -#if !defined(_ARCH_PPC) || defined(__LONG_DOUBLE_IEEE128__) -typedef _Complex float __attribute__((mode(TC))) ComplexF128; -#else -typedef _Complex float __attribute__((mode(KC))) ComplexF128; -#endif - #if HAS_LIBM // Define wrapper callers for libm. #include diff --git a/flang/runtime/complex-reduction.c b/flang/runtime/complex-reduction.c index d77e1c0a55006..06e4f15c7fa9b 100644 --- a/flang/runtime/complex-reduction.c +++ b/flang/runtime/complex-reduction.c @@ -19,6 +19,11 @@ struct CppComplexDouble { struct CppComplexLongDouble { long double r, i; }; +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +struct CppComplexFloat128 { + CFloat128Type r, i; +}; +#endif /* Not all environments define CMPLXF, CMPLX, CMPLXL. 
*/ @@ -70,6 +75,27 @@ static long_double_Complex_t CMPLXL(long double r, long double i) { #endif #endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +/* + * GCC 7.4.0 (currently minimum GCC version for llvm builds) + * supports __builtin_complex. For Clang, require >=12.0. + * Otherwise, rely on the memory layout compatibility. + */ +#if (defined(__clang_major__) && (__clang_major__ >= 12)) || defined(__GNUC__) +#define CMPLXF128 __builtin_complex +#else +static CFloat128ComplexType CMPLXF128(CFloat128Type r, CFloat128Type i) { + union { + struct CppComplexFloat128 x; + CFloat128ComplexType result; + } u; + u.x.r = r; + u.x.i = i; + return u.result; +} +#endif +#endif + /* RTNAME(SumComplex4) calls RTNAME(CppSumComplex4) with the same arguments * and converts the members of its C++ complex result to C _Complex. */ @@ -93,9 +119,10 @@ ADAPT_REDUCTION(SumComplex8, double_Complex_t, CppComplexDouble, CMPLX, #if LDBL_MANT_DIG == 64 ADAPT_REDUCTION(SumComplex10, long_double_Complex_t, CppComplexLongDouble, CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES) -#elif LDBL_MANT_DIG == 113 -ADAPT_REDUCTION(SumComplex16, long_double_Complex_t, CppComplexLongDouble, - CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES) +#endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +ADAPT_REDUCTION(SumComplex16, CFloat128ComplexType, CppComplexFloat128, + CMPLXF128, REDUCTION_ARGS, REDUCTION_ARG_NAMES) #endif /* PRODUCT() */ @@ -106,9 +133,10 @@ ADAPT_REDUCTION(ProductComplex8, double_Complex_t, CppComplexDouble, CMPLX, #if LDBL_MANT_DIG == 64 ADAPT_REDUCTION(ProductComplex10, long_double_Complex_t, CppComplexLongDouble, CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES) -#elif LDBL_MANT_DIG == 113 -ADAPT_REDUCTION(ProductComplex16, long_double_Complex_t, CppComplexLongDouble, - CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES) +#endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +ADAPT_REDUCTION(ProductComplex16, CFloat128ComplexType, CppComplexFloat128, + CMPLXF128, REDUCTION_ARGS, REDUCTION_ARG_NAMES) #endif /* 
DOT_PRODUCT() */ @@ -119,7 +147,8 @@ ADAPT_REDUCTION(DotProductComplex8, double_Complex_t, CppComplexDouble, CMPLX, #if LDBL_MANT_DIG == 64 ADAPT_REDUCTION(DotProductComplex10, long_double_Complex_t, CppComplexLongDouble, CMPLXL, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES) -#elif LDBL_MANT_DIG == 113 -ADAPT_REDUCTION(DotProductComplex16, long_double_Complex_t, - CppComplexLongDouble, CMPLXL, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES) +#endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +ADAPT_REDUCTION(DotProductComplex16, CFloat128ComplexType, CppComplexFloat128, + CMPLXF128, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES) #endif diff --git a/flang/runtime/complex-reduction.h b/flang/runtime/complex-reduction.h index 5c4f1f5126e39..1d37b235d5194 100644 --- a/flang/runtime/complex-reduction.h +++ b/flang/runtime/complex-reduction.h @@ -15,6 +15,7 @@ #ifndef FORTRAN_RUNTIME_COMPLEX_REDUCTION_H_ #define FORTRAN_RUNTIME_COMPLEX_REDUCTION_H_ +#include "flang/Common/float128.h" #include "flang/Runtime/entry-names.h" #include @@ -40,14 +41,18 @@ float_Complex_t RTNAME(SumComplex3)(REDUCTION_ARGS); float_Complex_t RTNAME(SumComplex4)(REDUCTION_ARGS); double_Complex_t RTNAME(SumComplex8)(REDUCTION_ARGS); long_double_Complex_t RTNAME(SumComplex10)(REDUCTION_ARGS); -long_double_Complex_t RTNAME(SumComplex16)(REDUCTION_ARGS); +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CFloat128ComplexType RTNAME(SumComplex16)(REDUCTION_ARGS); +#endif float_Complex_t RTNAME(ProductComplex2)(REDUCTION_ARGS); float_Complex_t RTNAME(ProductComplex3)(REDUCTION_ARGS); float_Complex_t RTNAME(ProductComplex4)(REDUCTION_ARGS); double_Complex_t RTNAME(ProductComplex8)(REDUCTION_ARGS); long_double_Complex_t RTNAME(ProductComplex10)(REDUCTION_ARGS); -long_double_Complex_t RTNAME(ProductComplex16)(REDUCTION_ARGS); +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CFloat128ComplexType RTNAME(ProductComplex16)(REDUCTION_ARGS); +#endif #define DOT_PRODUCT_ARGS \ const struct CppDescriptor *x, const struct CppDescriptor *y, \ @@ 
-60,6 +65,8 @@ float_Complex_t RTNAME(DotProductComplex3)(DOT_PRODUCT_ARGS); float_Complex_t RTNAME(DotProductComplex4)(DOT_PRODUCT_ARGS); double_Complex_t RTNAME(DotProductComplex8)(DOT_PRODUCT_ARGS); long_double_Complex_t RTNAME(DotProductComplex10)(DOT_PRODUCT_ARGS); -long_double_Complex_t RTNAME(DotProductComplex16)(DOT_PRODUCT_ARGS); +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CFloat128ComplexType RTNAME(DotProductComplex16)(DOT_PRODUCT_ARGS); +#endif #endif // FORTRAN_RUNTIME_COMPLEX_REDUCTION_H_ diff --git a/flang/runtime/product.cpp b/flang/runtime/product.cpp index a516bc51a959b..4c3b8c33a12e0 100644 --- a/flang/runtime/product.cpp +++ b/flang/runtime/product.cpp @@ -123,7 +123,8 @@ CppTypeFor RTDEF(ProductReal10)(const Descriptor &x, NonComplexProductAccumulator>{x}, "PRODUCT"); } -#elif LDBL_MANT_DIG == 113 +#endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppTypeFor RTDEF(ProductReal16)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { return GetTotalReduction(x, source, line, dim, mask, @@ -154,7 +155,8 @@ void RTDEF(CppProductComplex10)(CppTypeFor &result, mask, ComplexProductAccumulator>{x}, "PRODUCT"); } -#elif LDBL_MANT_DIG == 113 +#endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 void RTDEF(CppProductComplex16)(CppTypeFor &result, const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { diff --git a/flang/runtime/sum.cpp b/flang/runtime/sum.cpp index 048399737c850..d2495e3e956fe 100644 --- a/flang/runtime/sum.cpp +++ b/flang/runtime/sum.cpp @@ -175,7 +175,8 @@ void RTDEF(CppSumComplex10)(CppTypeFor &result, result = GetTotalReduction( x, source, line, dim, mask, ComplexSumAccumulator{x}, "SUM"); } -#elif LDBL_MANT_DIG == 113 +#endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 void RTDEF(CppSumComplex16)(CppTypeFor &result, const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { From 1ea43e2a77667eab3d8702732afe62cf9f6347a9 Mon Sep 17 00:00:00 2001 From: 
Jinsong Ji Date: Tue, 27 Feb 2024 16:29:57 -0800 Subject: [PATCH 511/546] [SYCL-WEB] Fix test failures in address-space-conversions (#12843) Per @Naghasan The view in upstream is different as there is no device filtering yet (IIRC). sycl_device isn't there yet regardless (hence why it needs to be re-added) and in intel/llvm, nothing is emitted until you reach calls made by sycl_device function, causing the "reordering". 1. Add __attribute__((sycl_device)) to so that the IR won't be empty. 2. Replace CHECK with CHECK-DAG because of the function order 3. Replace dso_local to {{*}} because of linkage 4. Replace declaration of tmpl to empty define tmpl..{} to avoid compilation error. --- .../CodeGenSYCL/address-space-conversions.cpp | 170 +++++++++--------- .../amd-address-space-conversions.cpp | 160 ++++++++--------- .../cuda-address-space-conversions.cpp | 154 ++++++++-------- 3 files changed, 242 insertions(+), 242 deletions(-) diff --git a/clang/test/CodeGenSYCL/address-space-conversions.cpp b/clang/test/CodeGenSYCL/address-space-conversions.cpp index ee3183b74e038..5466c1cb4c8ea 100644 --- a/clang/test/CodeGenSYCL/address-space-conversions.cpp +++ b/clang/test/CodeGenSYCL/address-space-conversions.cpp @@ -1,143 +1,143 @@ // RUN: %clang_cc1 -triple spir64 -fsycl-is-device -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s void bar(int &Data) {} -// CHECK: define{{.*}} spir_func void @[[RAW_REF:[a-zA-Z0-9_]+]](ptr addrspace(4) noundef align 4 dereferenceable(4) % +// CHECK-DAG: define{{.*}} spir_func void @[[RAW_REF:[a-zA-Z0-9_]+]](ptr addrspace(4) noundef align 4 dereferenceable(4) % void bar2(int &Data) {} -// CHECK: define{{.*}} spir_func void @[[RAW_REF2:[a-zA-Z0-9_]+]](ptr addrspace(4) noundef align 4 dereferenceable(4) % +// CHECK-DAG: define{{.*}} spir_func void @[[RAW_REF2:[a-zA-Z0-9_]+]](ptr addrspace(4) noundef align 4 dereferenceable(4) % void bar(__attribute__((opencl_local)) int &Data) {} -// CHECK: define{{.*}} spir_func void 
[[LOC_REF:@[a-zA-Z0-9_]+]](ptr addrspace(3) noundef align 4 dereferenceable(4) % +// CHECK-DAG: define{{.*}} spir_func void [[LOC_REF:@[a-zA-Z0-9_]+]](ptr addrspace(3) noundef align 4 dereferenceable(4) % void foo(int *Data) {} -// CHECK: define{{.*}} spir_func void @[[RAW_PTR:[a-zA-Z0-9_]+]](ptr addrspace(4) noundef % +// CHECK-DAG: define{{.*}} spir_func void @[[RAW_PTR:[a-zA-Z0-9_]+]](ptr addrspace(4) noundef % void foo2(int *Data) {} -// CHECK: define{{.*}} spir_func void @[[RAW_PTR2:[a-zA-Z0-9_]+]](ptr addrspace(4) noundef % +// CHECK-DAG: define{{.*}} spir_func void @[[RAW_PTR2:[a-zA-Z0-9_]+]](ptr addrspace(4) noundef % void foo(__attribute__((opencl_local)) int *Data) {} -// CHECK: define{{.*}} spir_func void [[LOC_PTR:@[a-zA-Z0-9_]+]](ptr addrspace(3) noundef % +// CHECK-DAG: define{{.*}} spir_func void [[LOC_PTR:@[a-zA-Z0-9_]+]](ptr addrspace(3) noundef % template void tmpl(T t) {} // See Check Lines below. -void usages() { +__attribute__((sycl_device)) void usages() { int *NoAS; - // CHECK: [[NoAS:%[a-zA-Z0-9]+]] = alloca ptr addrspace(4) + // CHECK-DAG: [[NoAS:%[a-zA-Z0-9]+]] = alloca ptr addrspace(4) __attribute__((opencl_global)) int *GLOB; - // CHECK: [[GLOB:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1) + // CHECK-DAG: [[GLOB:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1) __attribute__((opencl_local)) int *LOC; - // CHECK: [[LOC:%[a-zA-Z0-9]+]] = alloca ptr addrspace(3) + // CHECK-DAG: [[LOC:%[a-zA-Z0-9]+]] = alloca ptr addrspace(3) __attribute__((opencl_private)) int *PRIV; - // CHECK: [[PRIV:%[a-zA-Z0-9]+]] = alloca ptr + // CHECK-DAG: [[PRIV:%[a-zA-Z0-9]+]] = alloca ptr __attribute__((opencl_global_device)) int *GLOBDEVICE; - // CHECK: [[GLOB_DEVICE:%[a-zA-Z0-9]+]] = alloca ptr addrspace(5) + // CHECK-DAG: [[GLOB_DEVICE:%[a-zA-Z0-9]+]] = alloca ptr addrspace(5) __attribute__((opencl_global_host)) int *GLOBHOST; - // CHECK: [[GLOB_HOST:%[a-zA-Z0-9]+]] = alloca ptr addrspace(6) + // CHECK-DAG: [[GLOB_HOST:%[a-zA-Z0-9]+]] = alloca ptr addrspace(6) - // CHECK: 
[[NoAS]].ascast = addrspacecast ptr [[NoAS]] to ptr addrspace(4) - // CHECK: [[GLOB]].ascast = addrspacecast ptr [[GLOB]] to ptr addrspace(4) - // CHECK: [[LOC]].ascast = addrspacecast ptr [[LOC]] to ptr addrspace(4) - // CHECK: [[PRIV]].ascast = addrspacecast ptr [[PRIV]] to ptr addrspace(4) + // CHECK-DAG: [[NoAS]].ascast = addrspacecast ptr [[NoAS]] to ptr addrspace(4) + // CHECK-DAG: [[GLOB]].ascast = addrspacecast ptr [[GLOB]] to ptr addrspace(4) + // CHECK-DAG: [[LOC]].ascast = addrspacecast ptr [[LOC]] to ptr addrspace(4) + // CHECK-DAG: [[PRIV]].ascast = addrspacecast ptr [[PRIV]] to ptr addrspace(4) LOC = nullptr; - // CHECK: store ptr addrspace(3) null, ptr addrspace(4) [[LOC]].ascast, align 8 + // CHECK-DAG: store ptr addrspace(3) null, ptr addrspace(4) [[LOC]].ascast, align 8 GLOB = nullptr; - // CHECK: store ptr addrspace(1) null, ptr addrspace(4) [[GLOB]].ascast, align 8 + // CHECK-DAG: store ptr addrspace(1) null, ptr addrspace(4) [[GLOB]].ascast, align 8 // Explicit conversions // From named address spaces to default address space - // CHECK: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast - // CHECK: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr addrspace(4) - // CHECK: store ptr addrspace(4) [[GLOB_CAST]], ptr addrspace(4) [[NoAS]].ascast + // CHECK-DAG: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast + // CHECK-DAG: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr addrspace(4) + // CHECK-DAG: store ptr addrspace(4) [[GLOB_CAST]], ptr addrspace(4) [[NoAS]].ascast NoAS = (int *)GLOB; - // CHECK: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast - // CHECK: [[LOC_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD]] to ptr addrspace(4) - // CHECK: store ptr addrspace(4) [[LOC_CAST]], ptr addrspace(4) [[NoAS]].ascast + // CHECK-DAG: [[LOC_LOAD:%[a-zA-Z0-9]+]] = 
load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast + // CHECK-DAG: [[LOC_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD]] to ptr addrspace(4) + // CHECK-DAG: store ptr addrspace(4) [[LOC_CAST]], ptr addrspace(4) [[NoAS]].ascast NoAS = (int *)LOC; - // CHECK: [[PRIV_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr addrspace(4) [[PRIV]].ascast - // CHECK: [[PRIV_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr [[PRIV_LOAD]] to ptr addrspace(4) - // CHECK: store ptr addrspace(4) [[PRIV_CAST]], ptr addrspace(4) [[NoAS]].ascast + // CHECK-DAG: [[PRIV_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr addrspace(4) [[PRIV]].ascast + // CHECK-DAG: [[PRIV_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr [[PRIV_LOAD]] to ptr addrspace(4) + // CHECK-DAG: store ptr addrspace(4) [[PRIV_CAST]], ptr addrspace(4) [[NoAS]].ascast NoAS = (int *)PRIV; // From default address space to named address space - // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast - // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(4) [[NoAS_LOAD]] to ptr addrspace(1) - // CHECK: store ptr addrspace(1) [[NoAS_CAST]], ptr addrspace(4) [[GLOB]].ascast + // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast + // CHECK-DAG: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(4) [[NoAS_LOAD]] to ptr addrspace(1) + // CHECK-DAG: store ptr addrspace(1) [[NoAS_CAST]], ptr addrspace(4) [[GLOB]].ascast GLOB = (__attribute__((opencl_global)) int *)NoAS; - // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast - // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(4) [[NoAS_LOAD]] to ptr addrspace(3) - // CHECK: store ptr addrspace(3) [[NoAS_CAST]], ptr addrspace(4) [[LOC]].ascast + // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast + // CHECK-DAG: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(4) [[NoAS_LOAD]] to 
ptr addrspace(3) + // CHECK-DAG: store ptr addrspace(3) [[NoAS_CAST]], ptr addrspace(4) [[LOC]].ascast LOC = (__attribute__((opencl_local)) int *)NoAS; - // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast - // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(4) [[NoAS_LOAD]] to ptr - // CHECK: store ptr [[NoAS_CAST]], ptr addrspace(4) [[PRIV]].ascast + // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast + // CHECK-DAG: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(4) [[NoAS_LOAD]] to ptr + // CHECK-DAG: store ptr [[NoAS_CAST]], ptr addrspace(4) [[PRIV]].ascast PRIV = (__attribute__((opencl_private)) int *)NoAS; // From opencl_global_[host/device] address spaces to opencl_global - // CHECK: [[GLOBDEVICE_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(5), ptr addrspace(4) [[GLOB_DEVICE]].ascast - // CHECK: [[GLOBDEVICE_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(5) [[GLOBDEVICE_LOAD]] to ptr addrspace(1) - // CHECK: store ptr addrspace(1) [[GLOBDEVICE_CAST]], ptr addrspace(4) [[GLOB]].ascast + // CHECK-DAG: [[GLOBDEVICE_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(5), ptr addrspace(4) [[GLOB_DEVICE]].ascast + // CHECK-DAG: [[GLOBDEVICE_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(5) [[GLOBDEVICE_LOAD]] to ptr addrspace(1) + // CHECK-DAG: store ptr addrspace(1) [[GLOBDEVICE_CAST]], ptr addrspace(4) [[GLOB]].ascast GLOB = (__attribute__((opencl_global)) int *)GLOBDEVICE; - // CHECK: [[GLOBHOST_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(6), ptr addrspace(4) [[GLOB_HOST]].ascast - // CHECK: [[GLOBHOST_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(6) [[GLOBHOST_LOAD]] to ptr addrspace(1) - // CHECK: store ptr addrspace(1) [[GLOBHOST_CAST]], ptr addrspace(4) [[GLOB]].ascast + // CHECK-DAG: [[GLOBHOST_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(6), ptr addrspace(4) [[GLOB_HOST]].ascast + // CHECK-DAG: [[GLOBHOST_CAST:%[a-zA-Z0-9]+]] = addrspacecast 
ptr addrspace(6) [[GLOBHOST_LOAD]] to ptr addrspace(1) + // CHECK-DAG: store ptr addrspace(1) [[GLOBHOST_CAST]], ptr addrspace(4) [[GLOB]].ascast GLOB = (__attribute__((opencl_global)) int *)GLOBHOST; bar(*GLOB); - // CHECK: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast - // CHECK: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr addrspace(4) - // CHECK: call spir_func void @[[RAW_REF]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[GLOB_CAST]]) + // CHECK-DAG: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast + // CHECK-DAG: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr addrspace(4) + // CHECK-DAG: call spir_func void @[[RAW_REF]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[GLOB_CAST]]) bar2(*GLOB); - // CHECK: [[GLOB_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast - // CHECK: [[GLOB_CAST2:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD2]] to ptr addrspace(4) - // CHECK: call spir_func void @[[RAW_REF2]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[GLOB_CAST2]]) + // CHECK-DAG: [[GLOB_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast + // CHECK-DAG: [[GLOB_CAST2:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD2]] to ptr addrspace(4) + // CHECK-DAG: call spir_func void @[[RAW_REF2]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[GLOB_CAST2]]) bar(*LOC); - // CHECK: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast - // CHECK: call spir_func void [[LOC_REF]](ptr addrspace(3) noundef align 4 dereferenceable(4) [[LOC_LOAD]]) + // CHECK-DAG: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast + // CHECK-DAG: call spir_func void [[LOC_REF]](ptr addrspace(3) noundef align 4 dereferenceable(4) [[LOC_LOAD]]) bar2(*LOC); - // CHECK: 
[[LOC_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast - // CHECK: [[LOC_CAST2:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD2]] to ptr addrspace(4) - // CHECK: call spir_func void @[[RAW_REF2]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[LOC_CAST2]]) + // CHECK-DAG: [[LOC_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast + // CHECK-DAG: [[LOC_CAST2:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD2]] to ptr addrspace(4) + // CHECK-DAG: call spir_func void @[[RAW_REF2]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[LOC_CAST2]]) bar(*NoAS); - // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast - // CHECK: call spir_func void @[[RAW_REF]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[NoAS_LOAD]]) + // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast + // CHECK-DAG: call spir_func void @[[RAW_REF]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[NoAS_LOAD]]) bar2(*NoAS); - // CHECK: [[NoAS_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast - // CHECK: call spir_func void @[[RAW_REF2]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[NoAS_LOAD2]]) + // CHECK-DAG: [[NoAS_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast + // CHECK-DAG: call spir_func void @[[RAW_REF2]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[NoAS_LOAD2]]) foo(GLOB); - // CHECK: [[GLOB_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast - // CHECK: [[GLOB_CAST3:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD3]] to ptr addrspace(4) - // CHECK: call spir_func void @[[RAW_PTR]](ptr addrspace(4) noundef [[GLOB_CAST3]]) + // CHECK-DAG: [[GLOB_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast + // CHECK-DAG: [[GLOB_CAST3:%[a-zA-Z0-9]+]] = addrspacecast ptr 
addrspace(1) [[GLOB_LOAD3]] to ptr addrspace(4) + // CHECK-DAG: call spir_func void @[[RAW_PTR]](ptr addrspace(4) noundef [[GLOB_CAST3]]) foo2(GLOB); - // CHECK: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast - // CHECK: [[GLOB_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD4]] to ptr addrspace(4) - // CHECK: call spir_func void @[[RAW_PTR2]](ptr addrspace(4) noundef [[GLOB_CAST4]]) + // CHECK-DAG: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast + // CHECK-DAG: [[GLOB_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD4]] to ptr addrspace(4) + // CHECK-DAG: call spir_func void @[[RAW_PTR2]](ptr addrspace(4) noundef [[GLOB_CAST4]]) foo(LOC); - // CHECK: [[LOC_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast - // CHECK: call spir_func void [[LOC_PTR]](ptr addrspace(3) noundef [[LOC_LOAD3]]) + // CHECK-DAG: [[LOC_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast + // CHECK-DAG: call spir_func void [[LOC_PTR]](ptr addrspace(3) noundef [[LOC_LOAD3]]) foo2(LOC); - // CHECK: [[LOC_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast - // CHECK: [[LOC_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD4]] to ptr addrspace(4) - // CHECK: call spir_func void @[[RAW_PTR2]](ptr addrspace(4) noundef [[LOC_CAST4]]) + // CHECK-DAG: [[LOC_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast + // CHECK-DAG: [[LOC_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD4]] to ptr addrspace(4) + // CHECK-DAG: call spir_func void @[[RAW_PTR2]](ptr addrspace(4) noundef [[LOC_CAST4]]) foo(NoAS); - // CHECK: [[NoAS_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast - // CHECK: call spir_func void @[[RAW_PTR]](ptr addrspace(4) noundef [[NoAS_LOAD3]]) + // CHECK-DAG: [[NoAS_LOAD3:%[a-zA-Z0-9]+]] = load ptr 
addrspace(4), ptr addrspace(4) [[NoAS]].ascast + // CHECK-DAG: call spir_func void @[[RAW_PTR]](ptr addrspace(4) noundef [[NoAS_LOAD3]]) foo2(NoAS); - // CHECK: [[NoAS_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast - // CHECK: call spir_func void @[[RAW_PTR2]](ptr addrspace(4) noundef [[NoAS_LOAD4]]) + // CHECK-DAG: [[NoAS_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast + // CHECK-DAG: call spir_func void @[[RAW_PTR2]](ptr addrspace(4) noundef [[NoAS_LOAD4]]) // Ensure that we still get 3 different template instantiations. tmpl(GLOB); - // CHECK: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast - // CHECK: call spir_func void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef [[GLOB_LOAD4]]) + // CHECK-DAG: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast + // CHECK-DAG: call spir_func void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef [[GLOB_LOAD4]]) tmpl(LOC); - // CHECK: [[LOC_LOAD5:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast - // CHECK: call spir_func void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef [[LOC_LOAD5]]) + // CHECK-DAG: [[LOC_LOAD5:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast + // CHECK-DAG: call spir_func void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef [[LOC_LOAD5]]) tmpl(PRIV); - // CHECK: [[PRIV_LOAD5:%[a-zA-Z0-9]+]] = load ptr, ptr addrspace(4) [[PRIV]].ascast - // CHECK: call spir_func void @_Z4tmplIPU3AS0iEvT_(ptr noundef [[PRIV_LOAD5]]) + // CHECK-DAG: [[PRIV_LOAD5:%[a-zA-Z0-9]+]] = load ptr, ptr addrspace(4) [[PRIV]].ascast + // CHECK-DAG: call spir_func void @_Z4tmplIPU3AS0iEvT_(ptr noundef [[PRIV_LOAD5]]) tmpl(NoAS); - // CHECK: [[NoAS_LOAD5:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast - // CHECK: call spir_func void @_Z4tmplIPiEvT_(ptr addrspace(4) noundef [[NoAS_LOAD5]]) + // CHECK-DAG: [[NoAS_LOAD5:%[a-zA-Z0-9]+]] = 
load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast + // CHECK-DAG: call spir_func void @_Z4tmplIPiEvT_(ptr addrspace(4) noundef [[NoAS_LOAD5]]) } -// CHECK: define linkonce_odr spir_func void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef % -// CHECK: define linkonce_odr spir_func void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef % -// CHECK: define linkonce_odr spir_func void @_Z4tmplIPU3AS0iEvT_(ptr noundef % -// CHECK: define linkonce_odr spir_func void @_Z4tmplIPiEvT_(ptr addrspace(4) noundef % +// CHECK-DAG: define linkonce_odr spir_func void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef % +// CHECK-DAG: define linkonce_odr spir_func void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef % +// CHECK-DAG: define linkonce_odr spir_func void @_Z4tmplIPU3AS0iEvT_(ptr noundef % +// CHECK-DAG: define linkonce_odr spir_func void @_Z4tmplIPiEvT_(ptr addrspace(4) noundef % diff --git a/clang/test/CodeGenSYCL/amd-address-space-conversions.cpp b/clang/test/CodeGenSYCL/amd-address-space-conversions.cpp index d316f22096d3d..32dfb3f34ab6e 100644 --- a/clang/test/CodeGenSYCL/amd-address-space-conversions.cpp +++ b/clang/test/CodeGenSYCL/amd-address-space-conversions.cpp @@ -1,128 +1,128 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fsycl-is-device -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s void bar(int &Data) {} -// CHECK: define dso_local void @[[RAW_REF:[a-zA-Z0-9_]+]](ptr noundef nonnull align 4 dereferenceable(4) % +// CHECK-DAG: define {{.*}} void @[[RAW_REF:[a-zA-Z0-9_]+]](ptr noundef nonnull align 4 dereferenceable(4) % void bar2(int &Data) {} -// CHECK: define dso_local void @[[RAW_REF2:[a-zA-Z0-9_]+]](ptr noundef nonnull align 4 dereferenceable(4) % +// CHECK-DAG: define {{.*}} void @[[RAW_REF2:[a-zA-Z0-9_]+]](ptr noundef nonnull align 4 dereferenceable(4) % void bar(__attribute__((opencl_local)) int &Data) {} -// CHECK: define dso_local void @[[LOCAL_REF:[a-zA-Z0-9_]+]](ptr addrspace(3) noundef align 4 dereferenceable(4) % +// CHECK-DAG: define 
{{.*}} void @[[LOCAL_REF:[a-zA-Z0-9_]+]](ptr addrspace(3) noundef align 4 dereferenceable(4) % void foo(int *Data) {} -// CHECK: define dso_local void @[[RAW_PTR:[a-zA-Z0-9_]+]](ptr noundef % +// CHECK-DAG: define {{.*}} void @[[RAW_PTR:[a-zA-Z0-9_]+]](ptr noundef % void foo2(int *Data) {} -// CHECK: define dso_local void @[[RAW_PTR2:[a-zA-Z0-9_]+]](ptr noundef % +// CHECK-DAG: define {{.*}} void @[[RAW_PTR2:[a-zA-Z0-9_]+]](ptr noundef % void foo(__attribute__((opencl_local)) int *Data) {} -// CHECK: define dso_local void @[[LOC_PTR:[a-zA-Z0-9_]+]](ptr addrspace(3) noundef % +// CHECK-DAG: define {{.*}} void @[[LOC_PTR:[a-zA-Z0-9_]+]](ptr addrspace(3) noundef % template -void tmpl(T t); +void tmpl(T t) {} // See Check Lines below. -void usages() { +__attribute__((sycl_device)) void usages() { int *NoAS; - // CHECK: [[NoAS:%[a-zA-Z0-9]+]] = alloca ptr, align 8, addrspace(5) + // CHECK-DAG: [[NoAS:%[a-zA-Z0-9]+]] = alloca ptr, align 8, addrspace(5) __attribute__((opencl_global)) int *GLOB; - // CHECK: [[GLOB:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8, addrspace(5) + // CHECK-DAG: [[GLOB:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8, addrspace(5) __attribute__((opencl_local)) int *LOC; - // CHECK: [[LOC:%[a-zA-Z0-9]+]] = alloca ptr addrspace(3), align 4, addrspace(5) + // CHECK-DAG: [[LOC:%[a-zA-Z0-9]+]] = alloca ptr addrspace(3), align 4, addrspace(5) __attribute__((opencl_private)) int *PRIV; - // CHECK: [[PRIV:%[a-zA-Z0-9]+]] = alloca ptr addrspace(5), align 4, addrspace(5) + // CHECK-DAG: [[PRIV:%[a-zA-Z0-9]+]] = alloca ptr addrspace(5), align 4, addrspace(5) __attribute__((opencl_global_device)) int *GLOBDEVICE; - // CHECK: [[GLOB_DEVICE:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8, addrspace(5) + // CHECK-DAG: [[GLOB_DEVICE:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8, addrspace(5) __attribute__((opencl_global_host)) int *GLOBHOST; - // CHECK: [[GLOB_HOST:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8, addrspace(5) + // CHECK-DAG: 
[[GLOB_HOST:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8, addrspace(5) LOC = nullptr; - // CHECK: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr [[LOC]].ascast, align 4 + // CHECK-DAG: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr [[LOC]].ascast, align 4 GLOB = nullptr; - // CHECK: store ptr addrspace(1) null, ptr [[GLOB]].ascast, align 8 + // CHECK-DAG: store ptr addrspace(1) null, ptr [[GLOB]].ascast, align 8 NoAS = (int *)GLOB; - // CHECK: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 - // CHECK: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr - // CHECK: store ptr [[GLOB_CAST]], ptr [[NoAS]].ascast, align 8 + // CHECK-DAG: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 + // CHECK-DAG: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr + // CHECK-DAG: store ptr [[GLOB_CAST]], ptr [[NoAS]].ascast, align 8 NoAS = (int *)LOC; - // CHECK: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 - // CHECK: [[LOC_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD]] to ptr - // CHECK: store ptr [[LOC_CAST]], ptr [[NoAS]].ascast, align 8 + // CHECK-DAG: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 + // CHECK-DAG: [[LOC_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD]] to ptr + // CHECK-DAG: store ptr [[LOC_CAST]], ptr [[NoAS]].ascast, align 8 NoAS = (int *)PRIV; - // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(5), ptr [[PRIV]].ascast, align 4 - // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(5) [[NoAS_LOAD]] to ptr - // CHECK: store ptr %5, ptr [[NoAS]].ascast, align 8 + // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(5), ptr [[PRIV]].ascast, align 4 + // CHECK-DAG: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(5) [[NoAS_LOAD]] to 
ptr + // CHECK-DAG: store ptr %5, ptr [[NoAS]].ascast, align 8 GLOB = (__attribute__((opencl_global)) int *)NoAS; - // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 - // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr %6 to ptr addrspace(1) - // CHECK: store ptr addrspace(1) %7, ptr [[GLOB]].ascast, align 8 + // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 + // CHECK-DAG: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr %6 to ptr addrspace(1) + // CHECK-DAG: store ptr addrspace(1) %7, ptr [[GLOB]].ascast, align 8 LOC = (__attribute__((opencl_local)) int *)NoAS; - // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 - // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr [[NoAS_LOAD]] to ptr addrspace(3) - // CHECK: store ptr addrspace(3) %9, ptr [[LOC]].ascast, align 4 + // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 + // CHECK-DAG: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr [[NoAS_LOAD]] to ptr addrspace(3) + // CHECK-DAG: store ptr addrspace(3) %9, ptr [[LOC]].ascast, align 4 PRIV = (__attribute__((opencl_private)) int *)NoAS; - // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 - // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr [[NoAS_LOAD]] to ptr addrspace(5) - // CHECK: store ptr addrspace(5) [[NoAS_CAST]], ptr [[PRIV]].ascast, align 4 + // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 + // CHECK-DAG: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr [[NoAS_LOAD]] to ptr addrspace(5) + // CHECK-DAG: store ptr addrspace(5) [[NoAS_CAST]], ptr [[PRIV]].ascast, align 4 GLOB = (__attribute__((opencl_global)) int *)GLOBDEVICE; - // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]]DEVICE.ascast, align 8 - // CHECK: store ptr addrspace(1) [[NoAS_LOAD]], ptr [[GLOB]].ascast, align 8 + // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr 
addrspace(1), ptr [[GLOB]]DEVICE.ascast, align 8 + // CHECK-DAG: store ptr addrspace(1) [[NoAS_LOAD]], ptr [[GLOB]].ascast, align 8 GLOB = (__attribute__((opencl_global)) int *)GLOBHOST; - // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]]HOST.ascast, align 8 - // CHECK: tore ptr addrspace(1) [[NoAS_LOAD]], ptr [[GLOB]].ascast, align 8 + // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]]HOST.ascast, align 8 + // CHECK-DAG: tore ptr addrspace(1) [[NoAS_LOAD]], ptr [[GLOB]].ascast, align 8 bar(*GLOB); - // CHECK: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 - // CHECK: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr - // CHECK: call void @[[RAW_REF]](ptr noundef nonnull align 4 dereferenceable(4) [[GLOB_CAST]]) + // CHECK-DAG: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 + // CHECK-DAG: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr + // CHECK-DAG: call void @[[RAW_REF]](ptr noundef nonnull align 4 dereferenceable(4) [[GLOB_CAST]]) bar2(*GLOB); - // CHECK: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 - // CHECK: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr - // CHECK: call void @[[RAW_REF2]](ptr noundef nonnull align 4 dereferenceable(4) [[GLOB_CAST]]) + // CHECK-DAG: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 + // CHECK-DAG: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr + // CHECK-DAG: call void @[[RAW_REF2]](ptr noundef nonnull align 4 dereferenceable(4) [[GLOB_CAST]]) bar(*LOC); - // CHECK: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 - // CHECK: call void @_Z3barRU3AS3i(ptr addrspace(3) noundef align 4 dereferenceable(4) [[LOC_LOAD]]) + // CHECK-DAG: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr 
addrspace(3), ptr [[LOC]].ascast, align 4 + // CHECK-DAG: call void @_Z3barRU3AS3i(ptr addrspace(3) noundef align 4 dereferenceable(4) [[LOC_LOAD]]) bar2(*LOC); - // CHECK: [[LOC_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 - // CHECK: [[LOC_CAST2:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD2]] to ptr - // CHECK: call void @_Z4bar2Ri(ptr noundef nonnull align 4 dereferenceable(4) [[LOC_CAST2]]) + // CHECK-DAG: [[LOC_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 + // CHECK-DAG: [[LOC_CAST2:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD2]] to ptr + // CHECK-DAG: call void @_Z4bar2Ri(ptr noundef nonnull align 4 dereferenceable(4) [[LOC_CAST2]]) bar(*NoAS); - // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 - // CHECK: call void @_Z3barRi(ptr noundef nonnull align 4 dereferenceable(4) [[NoAS_LOAD]]) + // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 + // CHECK-DAG: call void @_Z3barRi(ptr noundef nonnull align 4 dereferenceable(4) [[NoAS_LOAD]]) bar2(*NoAS); - // CHECK: [[NoAS_LOAD2:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 - // CHECK: call void @_Z4bar2Ri(ptr noundef nonnull align 4 dereferenceable(4) [[NoAS_LOAD2]]) + // CHECK-DAG: [[NoAS_LOAD2:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 + // CHECK-DAG: call void @_Z4bar2Ri(ptr noundef nonnull align 4 dereferenceable(4) [[NoAS_LOAD2]]) foo(GLOB); - // CHECK: [[GLOB_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 - // CHECK: [[GLOB_CAST3:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD3]] to ptr - // CHECK: call void @[[RAW_PTR]](ptr noundef [[GLOB_CAST3]]) + // CHECK-DAG: [[GLOB_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 + // CHECK-DAG: [[GLOB_CAST3:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD3]] to ptr + // CHECK-DAG: call void @[[RAW_PTR]](ptr noundef 
[[GLOB_CAST3]]) foo2(GLOB); - // CHECK: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 - // CHECK: [[GLOB_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD4]] to ptr - // CHECK: call void @[[RAW_PTR2]](ptr noundef [[GLOB_CAST4]]) + // CHECK-DAG: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 + // CHECK-DAG: [[GLOB_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD4]] to ptr + // CHECK-DAG: call void @[[RAW_PTR2]](ptr noundef [[GLOB_CAST4]]) foo(LOC); - // CHECK: [[LOC_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 - // CHECK: call void @[[LOC_PTR]](ptr addrspace(3) noundef [[LOC_LOAD3]]) + // CHECK-DAG: [[LOC_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 + // CHECK-DAG: call void @[[LOC_PTR]](ptr addrspace(3) noundef [[LOC_LOAD3]]) foo2(LOC); - // CHECK: [[LOC_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 - // CHECK: [[LOC_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD4]] to ptr - // CHECK: call void @[[RAW_PTR2]](ptr noundef [[LOC_CAST4]]) + // CHECK-DAG: [[LOC_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 + // CHECK-DAG: [[LOC_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD4]] to ptr + // CHECK-DAG: call void @[[RAW_PTR2]](ptr noundef [[LOC_CAST4]]) foo(NoAS); - // CHECK: [[NoAS_LOAD3:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 - // CHECK: call void @[[RAW_PTR]](ptr noundef [[NoAS_LOAD3]]) + // CHECK-DAG: [[NoAS_LOAD3:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 + // CHECK-DAG: call void @[[RAW_PTR]](ptr noundef [[NoAS_LOAD3]]) foo2(NoAS); - // CHECK: [[NoAS_LOAD4:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 - // CHECK: call void @[[RAW_PTR2]](ptr noundef [[NoAS_LOAD4]]) + // CHECK-DAG: [[NoAS_LOAD4:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 + // CHECK-DAG: 
call void @[[RAW_PTR2]](ptr noundef [[NoAS_LOAD4]]) // Ensure that we still get 3 different template instantiations. tmpl(GLOB); - // CHECK: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 - // CHECK: call void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef [[GLOB_LOAD4]]) + // CHECK-DAG: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 + // CHECK-DAG: call void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef [[GLOB_LOAD4]]) tmpl(LOC); - // CHECK: [[LOC_LOAD5:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 - // CHECK: call void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef [[LOC_LOAD5]]) + // CHECK-DAG: [[LOC_LOAD5:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 + // CHECK-DAG: call void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef [[LOC_LOAD5]]) tmpl(PRIV); - // CHECK: [[PRIV_LOAD5:%[a-zA-Z0-9]+]] = load ptr addrspace(5), ptr [[PRIV]].ascast, align 4 - // CHECK: call void @_Z4tmplIPU3AS5iEvT_(ptr addrspace(5) noundef [[PRIV_LOAD5]]) + // CHECK-DAG: [[PRIV_LOAD5:%[a-zA-Z0-9]+]] = load ptr addrspace(5), ptr [[PRIV]].ascast, align 4 + // CHECK-DAG: call void @_Z4tmplIPU3AS5iEvT_(ptr addrspace(5) noundef [[PRIV_LOAD5]]) tmpl(NoAS); - // CHECK: [[NoAS_LOAD5:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 - // CHECK: call void @_Z4tmplIPiEvT_(ptr noundef [[NoAS_LOAD5]]) + // CHECK-DAG: [[NoAS_LOAD5:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 + // CHECK-DAG: call void @_Z4tmplIPiEvT_(ptr noundef [[NoAS_LOAD5]]) } -// CHECK: declare void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef) -// CHECK: declare void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef) -// CHECK: declare void @_Z4tmplIPU3AS5iEvT_(ptr addrspace(5) noundef) -// CHECK: declare void @_Z4tmplIPiEvT_(ptr noundef) +// CHECK-DAG: define linkonce_odr void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef % +// CHECK-DAG: define linkonce_odr void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) 
noundef % +// CHECK-DAG: define linkonce_odr void @_Z4tmplIPU3AS5iEvT_(ptr addrspace(5) noundef % +// CHECK-DAG: define linkonce_odr void @_Z4tmplIPiEvT_(ptr noundef % diff --git a/clang/test/CodeGenSYCL/cuda-address-space-conversions.cpp b/clang/test/CodeGenSYCL/cuda-address-space-conversions.cpp index 1875029de0856..d8dfd07e6889d 100644 --- a/clang/test/CodeGenSYCL/cuda-address-space-conversions.cpp +++ b/clang/test/CodeGenSYCL/cuda-address-space-conversions.cpp @@ -1,122 +1,122 @@ // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsycl-is-device -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s void bar(int &Data) {} -// CHECK: define dso_local void @[[RAW_REF:[a-zA-Z0-9_]+]](ptr noundef nonnull align 4 dereferenceable(4) % +// CHECK-DAG: define {{.*}} void @[[RAW_REF:[a-zA-Z0-9_]+]](ptr noundef nonnull align 4 dereferenceable(4) % void bar2(int &Data) {} -// CHECK: define dso_local void @[[RAW_REF2:[a-zA-Z0-9_]+]](ptr noundef nonnull align 4 dereferenceable(4) % +// CHECK-DAG: define {{.*}} void @[[RAW_REF2:[a-zA-Z0-9_]+]](ptr noundef nonnull align 4 dereferenceable(4) % void bar(__attribute__((opencl_local)) int &Data) {} -// CHECK: define dso_local void @[[LOCAL_REF:[a-zA-Z0-9_]+]](ptr addrspace(3) noundef align 4 dereferenceable(4) % +// CHECK-DAG: define {{.*}} void @[[LOCAL_REF:[a-zA-Z0-9_]+]](ptr addrspace(3) noundef align 4 dereferenceable(4) % void foo(int *Data) {} -// CHECK: define dso_local void @[[RAW_PTR:[a-zA-Z0-9_]+]](ptr noundef % +// CHECK-DAG: define {{.*}} void @[[RAW_PTR:[a-zA-Z0-9_]+]](ptr noundef % void foo2(int *Data) {} -// CHECK: define dso_local void @[[RAW_PTR2:[a-zA-Z0-9_]+]](ptr noundef % +// CHECK-DAG: define {{.*}} void @[[RAW_PTR2:[a-zA-Z0-9_]+]](ptr noundef % void foo(__attribute__((opencl_local)) int *Data) {} -// CHECK: define dso_local void @[[LOC_PTR:[a-zA-Z0-9_]+]](ptr addrspace(3) noundef % +// CHECK-DAG: define {{.*}} void @[[LOC_PTR:[a-zA-Z0-9_]+]](ptr addrspace(3) noundef % template -void tmpl(T t); +void tmpl(T t){} 
// See Check Lines below. -void usages() { +__attribute__((sycl_device)) void usages() { int *NoAS; - // CHECK: [[NoAS:%[a-zA-Z0-9]+]] = alloca ptr, align 8 + // CHECK-DAG: [[NoAS:%[a-zA-Z0-9]+]] = alloca ptr, align 8 __attribute__((opencl_global)) int *GLOB; - // CHECK: [[GLOB:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8 + // CHECK-DAG: [[GLOB:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8 __attribute__((opencl_local)) int *LOC; - // CHECK: [[LOC:%[a-zA-Z0-9]+]] = alloca ptr addrspace(3), align 8 + // CHECK-DAG: [[LOC:%[a-zA-Z0-9]+]] = alloca ptr addrspace(3), align 8 __attribute__((opencl_private)) int *PRIV; - // CHECK: [[PRIV:%[a-zA-Z0-9]+]] = alloca ptr, align 8 + // CHECK-DAG: [[PRIV:%[a-zA-Z0-9]+]] = alloca ptr, align 8 __attribute__((opencl_global_device)) int *GLOBDEVICE; - // CHECK: [[GLOB_DEVICE:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8 + // CHECK-DAG: [[GLOB_DEVICE:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8 __attribute__((opencl_global_host)) int *GLOBHOST; - // CHECK: [[GLOB_HOST:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8 + // CHECK-DAG: [[GLOB_HOST:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8 LOC = nullptr; - // CHECK: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr [[LOC]], align 8 + // CHECK-DAG: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr [[LOC]], align 8 GLOB = nullptr; - // CHECK: store ptr addrspace(1) null, ptr [[GLOB]], align 8 + // CHECK-DAG: store ptr addrspace(1) null, ptr [[GLOB]], align 8 NoAS = (int *)GLOB; - // CHECK: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 - // CHECK: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr - // CHECK: store ptr [[GLOB_CAST]], ptr [[NoAS]], align 8 + // CHECK-DAG: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 + // CHECK-DAG: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr + // CHECK-DAG: 
store ptr [[GLOB_CAST]], ptr [[NoAS]], align 8 NoAS = (int *)LOC; - // CHECK: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 - // CHECK: [[LOC_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD]] to ptr - // CHECK: store ptr [[LOC_CAST]], ptr [[NoAS]], align 8 + // CHECK-DAG: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 + // CHECK-DAG: [[LOC_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD]] to ptr + // CHECK-DAG: store ptr [[LOC_CAST]], ptr [[NoAS]], align 8 NoAS = (int *)PRIV; - // CHECK: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[PRIV]], align 8 - // CHECK: store ptr [[LOC_LOAD]], ptr [[NoAS]], align 8 + // CHECK-DAG: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[PRIV]], align 8 + // CHECK-DAG: store ptr [[LOC_LOAD]], ptr [[NoAS]], align 8 GLOB = (__attribute__((opencl_global)) int *)NoAS; - // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 - // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr [[NoAS_LOAD]] to ptr addrspace(1) - // CHECK: store ptr addrspace(1) [[NoAS_CAST]], ptr [[GLOB]], align 8 + // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 + // CHECK-DAG: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr [[NoAS_LOAD]] to ptr addrspace(1) + // CHECK-DAG: store ptr addrspace(1) [[NoAS_CAST]], ptr [[GLOB]], align 8 LOC = (__attribute__((opencl_local)) int *)NoAS; - // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 - // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr [[NoAS_LOAD]] to ptr addrspace(3) - // CHECK: store ptr addrspace(3) [[NoAS_CAST]], ptr [[LOC]], align 8 + // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 + // CHECK-DAG: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr [[NoAS_LOAD]] to ptr addrspace(3) + // CHECK-DAG: store ptr addrspace(3) [[NoAS_CAST]], ptr [[LOC]], align 8 PRIV = (__attribute__((opencl_private)) int *)NoAS; - // CHECK: 
[[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 - // CHECK: store ptr [[NoAS_LOAD]], ptr [[PRIV]], align 8 + // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 + // CHECK-DAG: store ptr [[NoAS_LOAD]], ptr [[PRIV]], align 8 GLOB = (__attribute__((opencl_global)) int *)GLOBDEVICE; - // CHECK: [[GLOBDEVICE_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB_DEVICE]], align 8 - // CHECK: store ptr addrspace(1) [[GLOBDEVICE_LOAD]], ptr %GLOB, align 8 + // CHECK-DAG: [[GLOBDEVICE_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB_DEVICE]], align 8 + // CHECK-DAG: store ptr addrspace(1) [[GLOBDEVICE_LOAD]], ptr %GLOB, align 8 GLOB = (__attribute__((opencl_global)) int *)GLOBHOST; - // CHECK: [[GLOB_HOST_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB_HOST]], align 8 - // CHECK: store ptr addrspace(1) [[GLOB_HOST_LOAD]], ptr [[GLOB]], align 8 + // CHECK-DAG: [[GLOB_HOST_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB_HOST]], align 8 + // CHECK-DAG: store ptr addrspace(1) [[GLOB_HOST_LOAD]], ptr [[GLOB]], align 8 bar(*GLOB); - // CHECK: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 - // CHECK: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr - // CHECK: call void @[[RAW_REF]](ptr noundef nonnull align 4 dereferenceable(4) [[GLOB_CAST]]) + // CHECK-DAG: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 + // CHECK-DAG: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr + // CHECK-DAG: call void @[[RAW_REF]](ptr noundef nonnull align 4 dereferenceable(4) [[GLOB_CAST]]) bar2(*GLOB); - // CHECK: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 - // CHECK: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr - // CHECK: call void @[[RAW_REF2]](ptr noundef nonnull align 4 dereferenceable(4) [[GLOB_CAST]]) + // CHECK-DAG: 
[[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 + // CHECK-DAG: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr + // CHECK-DAG: call void @[[RAW_REF2]](ptr noundef nonnull align 4 dereferenceable(4) [[GLOB_CAST]]) bar(*LOC); - // CHECK: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 - // CHECK: call void @[[LOCAL_REF]](ptr addrspace(3) noundef align 4 dereferenceable(4) [[LOC_LOAD]]) + // CHECK-DAG: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 + // CHECK-DAG: call void @[[LOCAL_REF]](ptr addrspace(3) noundef align 4 dereferenceable(4) [[LOC_LOAD]]) bar2(*LOC); - // CHECK: [[LOC_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 - // CHECK: [[LOC_CAST2:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD2]] to ptr - // CHECK: call void @[[RAW_REF2]](ptr noundef nonnull align 4 dereferenceable(4) [[LOC_CAST2]]) + // CHECK-DAG: [[LOC_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 + // CHECK-DAG: [[LOC_CAST2:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD2]] to ptr + // CHECK-DAG: call void @[[RAW_REF2]](ptr noundef nonnull align 4 dereferenceable(4) [[LOC_CAST2]]) bar(*NoAS); - // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 - // CHECK: call void @[[RAW_REF]](ptr noundef nonnull align 4 dereferenceable(4) [[NoAS_LOAD]]) + // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 + // CHECK-DAG: call void @[[RAW_REF]](ptr noundef nonnull align 4 dereferenceable(4) [[NoAS_LOAD]]) bar2(*NoAS); - // CHECK: [[NoAS_LOAD2:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 - // CHECK: call void @[[RAW_REF2]](ptr noundef nonnull align 4 dereferenceable(4) [[NoAS_LOAD2]]) + // CHECK-DAG: [[NoAS_LOAD2:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 + // CHECK-DAG: call void @[[RAW_REF2]](ptr noundef nonnull align 4 dereferenceable(4) [[NoAS_LOAD2]]) foo(GLOB); - // 
CHECK: [[GLOB_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 - // CHECK: [[GLOB_CAST3:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD3]] to ptr - // CHECK: call void @[[RAW_PTR]](ptr noundef [[GLOB_CAST3]]) + // CHECK-DAG: [[GLOB_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 + // CHECK-DAG: [[GLOB_CAST3:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD3]] to ptr + // CHECK-DAG: call void @[[RAW_PTR]](ptr noundef [[GLOB_CAST3]]) foo2(GLOB); - // CHECK: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 - // CHECK: [[GLOB_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD4]] to ptr - // CHECK: call void @[[RAW_PTR2]](ptr noundef [[GLOB_CAST4]]) + // CHECK-DAG: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 + // CHECK-DAG: [[GLOB_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD4]] to ptr + // CHECK-DAG: call void @[[RAW_PTR2]](ptr noundef [[GLOB_CAST4]]) foo(LOC); - // CHECK: [[LOC_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 - // CHECK: call void @[[LOC_PTR]](ptr addrspace(3) noundef [[LOC_LOAD3]]) + // CHECK-DAG: [[LOC_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 + // CHECK-DAG: call void @[[LOC_PTR]](ptr addrspace(3) noundef [[LOC_LOAD3]]) foo2(LOC); - // CHECK: [[LOC_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 - // CHECK: [[LOC_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD4]] to ptr - // CHECK: call void @[[RAW_PTR2]](ptr noundef [[LOC_CAST4]]) + // CHECK-DAG: [[LOC_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 + // CHECK-DAG: [[LOC_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD4]] to ptr + // CHECK-DAG: call void @[[RAW_PTR2]](ptr noundef [[LOC_CAST4]]) foo(NoAS); - // CHECK: [[NoAS_LOAD3:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 - // CHECK: call void @[[RAW_PTR]](ptr 
noundef [[NoAS_LOAD3]]) + // CHECK-DAG: [[NoAS_LOAD3:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 + // CHECK-DAG: call void @[[RAW_PTR]](ptr noundef [[NoAS_LOAD3]]) foo2(NoAS); - // CHECK: [[NoAS_LOAD4:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 - // CHECK: call void @[[RAW_PTR2]](ptr noundef [[NoAS_LOAD4]]) + // CHECK-DAG: [[NoAS_LOAD4:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 + // CHECK-DAG: call void @[[RAW_PTR2]](ptr noundef [[NoAS_LOAD4]]) tmpl(GLOB); - // CHECK: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 - // CHECK: call void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef [[GLOB_LOAD4]]) + // CHECK-DAG: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 + // CHECK-DAG: call void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef [[GLOB_LOAD4]]) tmpl(LOC); - // CHECK: [[LOC_LOAD5:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 - // CHECK: call void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef [[LOC_LOAD5]]) + // CHECK-DAG: [[LOC_LOAD5:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 + // CHECK-DAG: call void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef [[LOC_LOAD5]]) tmpl(PRIV); - // CHECK: [[PRIV_LOAD5:%[a-zA-Z0-9]+]] = load ptr, ptr [[PRIV]], align 8 - // CHECK: call void @_Z4tmplIPiEvT_(ptr noundef [[PRIV_LOAD5]]) + // CHECK-DAG: [[PRIV_LOAD5:%[a-zA-Z0-9]+]] = load ptr, ptr [[PRIV]], align 8 + // CHECK-DAG: call void @_Z4tmplIPiEvT_(ptr noundef [[PRIV_LOAD5]]) tmpl(NoAS); -// CHECK: %33 = load ptr, ptr %NoAS, align 8 -// CHECK: call void @_Z4tmplIPiEvT_(ptr noundef %33) +// CHECK-DAG: %33 = load ptr, ptr %NoAS, align 8 +// CHECK-DAG: call void @_Z4tmplIPiEvT_(ptr noundef %33) } -// CHECK: declare void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef) -// CHECK: declare void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef) -// CHECK: declare void @_Z4tmplIPiEvT_(ptr noundef) +// CHECK-DAG: void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef +// CHECK-DAG: void 
@_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef +// CHECK-DAG: void @_Z4tmplIPiEvT_(ptr noundef From 062cfada643c1aa48a1bb81894e2920d390fe8cf Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 27 Feb 2024 16:32:53 -0800 Subject: [PATCH 512/546] [builtins] Disable COMPILER_RT_CRT_USE_EH_FRAME_REGISTRY by default (#83201) Most of GCC's Linux targets have a link spec `%{!static|static-pie:--eh-frame-hdr}` that doesn't pass --eh-frame-hdr for `-static` links. `-static` links are supposed to utilize `__register_frame_info` (called by `crtbeginT.o`, not by crtbegin.o or crtbeginS.o) as a replacement. compiler-rt crtbegin (not used with GCC) has some ehframe code, which is not utilized because Clang driver unconditionally passes --eh-frame-hdr for Linux targets, even for -static. In addition, LLVM libunwind implements `__register_frame_info` as an empty stub. Furthermore, in a non-static link, the `__register_frame_info` references can cause an undesired weak dynamic symbol. For now, just disable the config by default. 
--- compiler-rt/lib/builtins/CMakeLists.txt | 2 +- compiler-rt/test/builtins/Unit/ctor_dtor.c | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 28ded8766f253..83f7697a4a2b4 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -916,7 +916,7 @@ cmake_dependent_option(COMPILER_RT_BUILD_CRT "Build crtbegin.o/crtend.o" ON "COM if (COMPILER_RT_BUILD_CRT) add_compiler_rt_component(crt) - option(COMPILER_RT_CRT_USE_EH_FRAME_REGISTRY "Use eh_frame in crtbegin.o/crtend.o" ON) + option(COMPILER_RT_CRT_USE_EH_FRAME_REGISTRY "Use eh_frame in crtbegin.o/crtend.o" OFF) include(CheckSectionExists) check_section_exists(".init_array" COMPILER_RT_HAS_INITFINI_ARRAY diff --git a/compiler-rt/test/builtins/Unit/ctor_dtor.c b/compiler-rt/test/builtins/Unit/ctor_dtor.c index 47560722a9f75..3d5f895a0a1cd 100644 --- a/compiler-rt/test/builtins/Unit/ctor_dtor.c +++ b/compiler-rt/test/builtins/Unit/ctor_dtor.c @@ -9,23 +9,13 @@ // Ensure the various startup functions are called in the proper order. -// CHECK: __register_frame_info() /// ctor() is here if ld.so/libc supports DT_INIT/DT_FINI // CHECK: main() /// dtor() is here if ld.so/libc supports DT_INIT/DT_FINI -// CHECK: __deregister_frame_info() struct object; static int counter; -void __register_frame_info(const void *fi, struct object *obj) { - printf("__register_frame_info()\n"); -} - -void __deregister_frame_info(const void *fi) { - printf("__deregister_frame_info()\n"); -} - void __attribute__((constructor)) ctor() { printf("ctor()\n"); ++counter; From 8506a63bf7cbe593c0707f995fbd0b8f820d0d62 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 28 Feb 2024 01:02:39 +0000 Subject: [PATCH 513/546] Revert "[WebAssembly] Disable multivalue emission temporarily (#82714)" This reverts commit 6e6bf9f81756ba6655b4eea8dc45469a47f89b39. 
It turned out the multivalue feature had active outside users and it could cause some disruptions to them, so I'd like to investigate more about the workarounds before doing this. --- .../WebAssembly/WebAssemblyISelLowering.cpp | 7 ++--- .../WebAssemblyMachineFunctionInfo.cpp | 5 +--- .../WebAssemblyRuntimeLibcallSignatures.cpp | 26 +++++++++---------- .../WebAssembly/WebAssemblyTargetMachine.cpp | 9 ------- .../lower-em-ehsjlj-multi-return.ll | 4 +-- .../multivalue-dont-move-def-past-use.mir | 2 +- .../WebAssembly/multivalue-stackify.ll | 2 +- llvm/test/CodeGen/WebAssembly/multivalue.ll | 10 +++---- .../CodeGen/WebAssembly/multivalue_libcall.ll | 2 +- 9 files changed, 24 insertions(+), 43 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 36f067956e63a..7c47790d1e351 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -43,8 +43,6 @@ using namespace llvm; #define DEBUG_TYPE "wasm-lower" -extern cl::opt WasmEmitMultiValue; - WebAssemblyTargetLowering::WebAssemblyTargetLowering( const TargetMachine &TM, const WebAssemblySubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -1290,7 +1288,7 @@ bool WebAssemblyTargetLowering::CanLowerReturn( const SmallVectorImpl &Outs, LLVMContext & /*Context*/) const { // WebAssembly can only handle returning tuples with multivalue enabled - return (Subtarget->hasMultivalue() && WasmEmitMultiValue) || Outs.size() <= 1; + return Subtarget->hasMultivalue() || Outs.size() <= 1; } SDValue WebAssemblyTargetLowering::LowerReturn( @@ -1298,8 +1296,7 @@ SDValue WebAssemblyTargetLowering::LowerReturn( const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { - assert(((Subtarget->hasMultivalue() && WasmEmitMultiValue) || - Outs.size() <= 1) && + assert((Subtarget->hasMultivalue() || Outs.size() <= 1) && "MVP 
WebAssembly can only return up to one value"); if (!callingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp index b969b8370a3e5..1e959111a4dbc 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp @@ -22,8 +22,6 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; -extern cl::opt WasmEmitMultiValue; - WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() = default; // anchor. MachineFunctionInfo *WebAssemblyFunctionInfo::clone( @@ -73,8 +71,7 @@ void llvm::computeSignatureVTs(const FunctionType *Ty, MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits()); if (Results.size() > 1 && - (!TM.getSubtarget(ContextFunc).hasMultivalue() || - !WasmEmitMultiValue)) { + !TM.getSubtarget(ContextFunc).hasMultivalue()) { // WebAssembly can't lower returns of multiple values without demoting to // sret unless multivalue is enabled (see // WebAssemblyTargetLowering::CanLowerReturn). 
So replace multiple return diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 2a84c90c89602..3e2e029695ab6 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -24,8 +24,6 @@ using namespace llvm; -extern cl::opt WasmEmitMultiValue; - namespace { enum RuntimeLibcallSignature { @@ -696,7 +694,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(PtrTy); break; case i64_i64_func_f32: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -705,7 +703,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::F32); break; case i64_i64_func_f64: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -714,7 +712,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::F64); break; case i16_i16_func_i16_i16: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I32); Rets.push_back(wasm::ValType::I32); } else { @@ -724,7 +722,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I32); break; case i32_i32_func_i32_i32: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I32); Rets.push_back(wasm::ValType::I32); } else { @@ -734,7 +732,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I32); break; case i64_i64_func_i64_i64: - if 
(Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -744,7 +742,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i64_i64: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -756,7 +754,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i64_i64_iPTR: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -769,7 +767,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(PtrTy); break; case i64_i64_i64_i64_func_i64_i64_i64_i64: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); @@ -783,7 +781,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i32: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -853,7 +851,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i64_i64_i64_i64: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -867,7 +865,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget 
&Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i32: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -876,7 +874,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I32); break; case i64_i64_func_i64: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index d088c7d925ddf..4d4cae110148d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -54,15 +54,6 @@ static cl::opt WasmDisableFixIrreducibleControlFlowPass( " irreducible control flow optimization pass"), cl::init(false)); -// A temporary option to control emission of multivalue until multivalue -// implementation is stable enough. We currently don't emit multivalue by -// default even if the feature section allows it. -// TODO Stabilize multivalue and delete this option -cl::opt - WasmEmitMultiValue("wasm-emit-multivalue", cl::Hidden, - cl::desc("WebAssembly: Emit multivalue in the backend"), - cl::init(false)); - extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() { // Register the target. 
RegisterTargetMachine X( diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll index daf46c6eef025..4f33439db770d 100644 --- a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll +++ b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll @@ -1,5 +1,5 @@ -; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -mattr=+multivalue -wasm-emit-multivalue 2>&1 | FileCheck %s --check-prefix=EH -; RUN: not --crash llc < %s -enable-emscripten-sjlj -mattr=+multivalue 2>&1 -wasm-emit-multivalue | FileCheck %s --check-prefix=SJLJ +; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -mattr=+multivalue 2>&1 | FileCheck %s --check-prefix=EH +; RUN: not --crash llc < %s -enable-emscripten-sjlj -mattr=+multivalue 2>&1 | FileCheck %s --check-prefix=SJLJ ; Currently multivalue returning functions are not supported in Emscripten EH / ; SjLj. Make sure they error out. diff --git a/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir b/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir index 4fadbd5f07e6d..4b4661b144667 100644 --- a/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir +++ b/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -wasm-emit-multivalue -run-pass=wasm-reg-stackify -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -run-pass=wasm-reg-stackify -verify-machineinstrs %s -o - | FileCheck %s --- | target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" diff --git a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll index f4f93ac2f30ce..52a8c686824d3 100644 --- 
a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll +++ b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Test functions have been generated by multivalue-stackify.py. -; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue -wasm-emit-multivalue | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue | FileCheck %s ; Test that the multivalue stackification works diff --git a/llvm/test/CodeGen/WebAssembly/multivalue.ll b/llvm/test/CodeGen/WebAssembly/multivalue.ll index 846691e5ff0cd..675009c8f3e54 100644 --- a/llvm/test/CodeGen/WebAssembly/multivalue.ll +++ b/llvm/test/CodeGen/WebAssembly/multivalue.ll @@ -1,8 +1,7 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | FileCheck %s -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+reference-types,+multivalue,+tail-call -wasm-emit-multivalue | FileCheck --check-prefix REF %s -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | FileCheck %s --check-prefix REGS -; RUN: llc < %s --filetype=obj -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | obj2yaml | FileCheck %s --check-prefix OBJ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s --check-prefix NO-MULTIVALUE +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+reference-types,+multivalue,+tail-call | FileCheck --check-prefix REF %s +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers 
-mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s --check-prefix REGS +; RUN: llc < %s --filetype=obj -mcpu=mvp -mattr=+multivalue,+tail-call | obj2yaml | FileCheck %s --check-prefix OBJ ; Test that the multivalue calls, returns, function types, and block ; types work as expected. @@ -20,7 +19,6 @@ declare void @use_i64(i64) ; CHECK-NEXT: i32.const 42{{$}} ; CHECK-NEXT: i64.const 42{{$}} ; CHECK-NEXT: end_function{{$}} -; NO-MULTIVALUE-NOT: .functype pair_const () -> (i32, i64) define %pair @pair_const() { ret %pair { i32 42, i64 42 } } diff --git a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll index 7bf37b59353ad..47c5ae7b457dd 100644 --- a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll +++ b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc < %s -verify-machineinstrs -mcpu=mvp -mattr=+multivalue -wasm-emit-multivalue | FileCheck %s --check-prefix=MULTIVALUE +; RUN: llc < %s -verify-machineinstrs -mcpu=mvp -mattr=+multivalue | FileCheck %s --check-prefix=MULTIVALUE ; RUN: llc < %s -verify-machineinstrs -mcpu=mvp | FileCheck %s --check-prefix=NO_MULTIVALUE ; Test libcall signatures when multivalue is enabled and disabled From f20ea05f3b2218a7103612e5e367398b0b27bb27 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Tue, 27 Feb 2024 17:02:27 -0800 Subject: [PATCH 514/546] [flang][runtime] Fixed aarach buildbots after #83169. 
--- flang/include/flang/Common/float128.h | 2 ++ flang/runtime/complex-reduction.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/flang/include/flang/Common/float128.h b/flang/include/flang/Common/float128.h index 6aa98df5529df..61b3a77b867a9 100644 --- a/flang/include/flang/Common/float128.h +++ b/flang/include/flang/Common/float128.h @@ -20,6 +20,8 @@ #ifndef FORTRAN_COMMON_FLOAT128_H_ #define FORTRAN_COMMON_FLOAT128_H_ +#include + #ifdef __cplusplus /* * libc++ does not fully support __float128 right now, e.g. diff --git a/flang/runtime/complex-reduction.c b/flang/runtime/complex-reduction.c index 06e4f15c7fa9b..72c31ce08b875 100644 --- a/flang/runtime/complex-reduction.c +++ b/flang/runtime/complex-reduction.c @@ -76,6 +76,7 @@ static long_double_Complex_t CMPLXL(long double r, long double i) { #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +#ifndef CMPLXF128 /* * GCC 7.4.0 (currently minimum GCC version for llvm builds) * supports __builtin_complex. For Clang, require >=12.0. @@ -95,6 +96,7 @@ static CFloat128ComplexType CMPLXF128(CFloat128Type r, CFloat128Type i) { } #endif #endif +#endif /* RTNAME(SumComplex4) calls RTNAME(CppSumComplex4) with the same arguments * and converts the members of its C++ complex result to C _Complex. From bcbce807d76a30388b366d14051c5f80e9724dab Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Tue, 27 Feb 2024 20:19:07 -0500 Subject: [PATCH 515/546] Revert "[HIP] fix host min/max in header (#82956)" This reverts commit 55783bd0f9cfc30aa93c718919dab5419d86a2c6. Due to regressions in hipCUB. hipCUB/hipcub/include/hipcub/backend/rocprim/device/device_spmv.hpp:142:33: error: call to 'min' is ambiguous https://github.com/ROCm/hipCUB/blob/develop/hipcub/include/hipcub/backend/rocprim/device/device_spmv.hpp#L142 The ambuguity seems due to missing min(int, unsigned int). Previously, there is only min(int, int). After the change, there are min(int, int) and min(unsigned int, unsigned int), therefore there is ambiguity. 
--- clang/lib/Headers/__clang_hip_math.h | 72 +++------------------------- 1 file changed, 6 insertions(+), 66 deletions(-) diff --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h index 34d1de0a06005..11e1e7d032586 100644 --- a/clang/lib/Headers/__clang_hip_math.h +++ b/clang/lib/Headers/__clang_hip_math.h @@ -1306,75 +1306,15 @@ float min(float __x, float __y) { return __builtin_fminf(__x, __y); } __DEVICE__ double min(double __x, double __y) { return __builtin_fmin(__x, __y); } -// Define host min/max functions. -#if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) && \ - !defined(__HIP_NO_HOST_MIN_MAX_IN_GLOBAL_NAMESPACE__) - -#pragma push_macro("DEFINE_MIN_MAX_FUNCTIONS") -#pragma push_macro("DEFINE_MIN_MAX_FUNCTIONS") -#define DEFINE_MIN_MAX_FUNCTIONS(ret_type, type1, type2) \ - inline ret_type min(const type1 __a, const type2 __b) { \ - return (__a < __b) ? __a : __b; \ - } \ - inline ret_type max(const type1 __a, const type2 __b) { \ - return (__a > __b) ? __a : __b; \ - } - -// Define min and max functions for same type comparisons -DEFINE_MIN_MAX_FUNCTIONS(int, int, int) -DEFINE_MIN_MAX_FUNCTIONS(unsigned int, unsigned int, unsigned int) -DEFINE_MIN_MAX_FUNCTIONS(long, long, long) -DEFINE_MIN_MAX_FUNCTIONS(unsigned long, unsigned long, unsigned long) -DEFINE_MIN_MAX_FUNCTIONS(long long, long long, long long) -DEFINE_MIN_MAX_FUNCTIONS(unsigned long long, unsigned long long, - unsigned long long) - -// The host min/max functions below accept mixed signed/unsigned integer -// parameters and perform unsigned comparisons, which may produce unexpected -// results if a signed integer was passed unintentionally. To avoid this -// happening silently, these overloaded functions are not defined by default. -// However, for compatibility with CUDA, they will be defined if users define -// __HIP_DEFINE_MIXED_HOST_MIN_MAX__. 
-#ifdef __HIP_DEFINE_MIXED_HOST_MIN_MAX__ -DEFINE_MIN_MAX_FUNCTIONS(unsigned int, int, unsigned int) -DEFINE_MIN_MAX_FUNCTIONS(unsigned int, unsigned int, int) -DEFINE_MIN_MAX_FUNCTIONS(unsigned long, long, unsigned long) -DEFINE_MIN_MAX_FUNCTIONS(unsigned long, unsigned long, long) -DEFINE_MIN_MAX_FUNCTIONS(unsigned long long, long long, unsigned long long) -DEFINE_MIN_MAX_FUNCTIONS(unsigned long long, unsigned long long, long long) -#endif // ifdef __HIP_DEFINE_MIXED_HOST_MIN_MAX__ - -// Floating-point comparisons using built-in functions -inline float min(float const __a, float const __b) { - return __builtin_fminf(__a, __b); -} -inline double min(double const __a, double const __b) { - return __builtin_fmin(__a, __b); -} -inline double min(float const __a, double const __b) { - return __builtin_fmin(__a, __b); -} -inline double min(double const __a, float const __b) { - return __builtin_fmin(__a, __b); +#if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) +__host__ inline static int min(int __arg1, int __arg2) { + return __arg1 < __arg2 ? __arg1 : __arg2; } -inline float max(float const __a, float const __b) { - return __builtin_fmaxf(__a, __b); -} -inline double max(double const __a, double const __b) { - return __builtin_fmax(__a, __b); -} -inline double max(float const __a, double const __b) { - return __builtin_fmax(__a, __b); +__host__ inline static int max(int __arg1, int __arg2) { + return __arg1 > __arg2 ? 
__arg1 : __arg2; } -inline double max(double const __a, float const __b) { - return __builtin_fmax(__a, __b); -} - -#pragma pop_macro("DEFINE_MIN_MAX_FUNCTIONS") - -#endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) && - // !defined(__HIP_NO_HOST_MIN_MAX_IN_GLOBAL_NAMESPACE__) +#endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) #endif #pragma pop_macro("__DEVICE__") From dba2dd2c487d7bed7ad3b76a67fdfce464f0edbf Mon Sep 17 00:00:00 2001 From: Alexander Yermolovich <43973793+ayermolo@users.noreply.github.com> Date: Tue, 27 Feb 2024 17:26:06 -0800 Subject: [PATCH 516/546] Revert "[CLANG][DWARF] Do not emit -ggnu-pubnames for split dwarf version 5." (#83214) Reverts llvm/llvm-project#82840 --- clang/lib/Driver/ToolChains/Clang.cpp | 7 +++---- clang/test/Driver/split-debug.c | 5 ----- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index dbfc729bba24c..6e1b7e8657d0d 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4479,10 +4479,9 @@ renderDebugOptions(const ToolChain &TC, const Driver &D, const llvm::Triple &T, options::OPT_gpubnames, options::OPT_gno_pubnames); if (DwarfFission != DwarfFissionKind::None || (PubnamesArg && checkDebugInfoOption(PubnamesArg, Args, D, TC))) - if (DebuggerTuning != llvm::DebuggerKind::LLDB && - (!PubnamesArg || - (!PubnamesArg->getOption().matches(options::OPT_gno_gnu_pubnames) && - !PubnamesArg->getOption().matches(options::OPT_gno_pubnames)))) + if (!PubnamesArg || + (!PubnamesArg->getOption().matches(options::OPT_gno_gnu_pubnames) && + !PubnamesArg->getOption().matches(options::OPT_gno_pubnames))) CmdArgs.push_back(PubnamesArg && PubnamesArg->getOption().matches( options::OPT_gpubnames) ? 
"-gpubnames" diff --git a/clang/test/Driver/split-debug.c b/clang/test/Driver/split-debug.c index a2a3dc0235450..968f33b4cc035 100644 --- a/clang/test/Driver/split-debug.c +++ b/clang/test/Driver/split-debug.c @@ -124,8 +124,3 @@ // G1_NOSPLIT: "-debug-info-kind=line-tables-only" // G1_NOSPLIT-NOT: "-split-dwarf-file" // G1_NOSPLIT-NOT: "-split-dwarf-output" - -/// Do not generate -ggnu-pubnames for -glldb -// RUN: %clang -### -c -target x86_64 -gsplit-dwarf -g -glldb %s 2>&1 | FileCheck %s --check-prefixes=GLLDBSPLIT - -// GLLDBSPLIT-NOT: "-ggnu-pubnames" From e9e7aeadaf0ce9d66ff352856fd2d1005b0f7d74 Mon Sep 17 00:00:00 2001 From: Daniel Hoekwater Date: Wed, 28 Feb 2024 01:34:48 +0000 Subject: [PATCH 517/546] [Driver] Allow -fbasic-block-address-map for AArch64 ELF (#82662) Emitting the basic block address map with `-fbasic-block-sections=labels` is allowed for AArch64 ELF since 7eaf94fefa1250fc8a46982cea8ce99abacae11f. Allow doing so with `-fbasic-block-address-map`. --- clang/lib/Driver/ToolChains/Clang.cpp | 2 +- clang/test/Driver/basic-block-address-map.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 6e1b7e8657d0d..66c3a237c1211 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5958,7 +5958,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, if (Arg *A = Args.getLastArg(options::OPT_fbasic_block_address_map, options::OPT_fno_basic_block_address_map)) { - if (Triple.isX86() && Triple.isOSBinFormatELF()) { + if ((Triple.isX86() || Triple.isAArch64()) && Triple.isOSBinFormatELF()) { if (A->getOption().matches(options::OPT_fbasic_block_address_map)) A->render(Args, CmdArgs); } else { diff --git a/clang/test/Driver/basic-block-address-map.c b/clang/test/Driver/basic-block-address-map.c index 022f972b412d6..12393e8ebfd54 100644 --- a/clang/test/Driver/basic-block-address-map.c +++ 
b/clang/test/Driver/basic-block-address-map.c @@ -1,8 +1,9 @@ -// RUN: %clang -### -target x86_64 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-PRESENT %s +// RUN: %clang -### --target=x86_64 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-PRESENT %s +// RUN: %clang -### --target=aarch64 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-PRESENT %s // CHECK-PRESENT: -fbasic-block-address-map -// RUN: %clang -### -target x86_64 -fno-basic-block-address-map %s -S 2>&1 | FileCheck %s --check-prefix=CHECK-ABSENT +// RUN: %clang -### --target=x86_64 -fno-basic-block-address-map %s -S 2>&1 | FileCheck %s --check-prefix=CHECK-ABSENT // CHECK-ABSENT-NOT: -fbasic-block-address-map -// RUN: not %clang -c -target x86_64-apple-darwin10 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s +// RUN: not %clang -c --target=x86_64-apple-darwin10 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s // CHECK-TRIPLE: error: unsupported option '-fbasic-block-address-map' for target From 50136ca11f62050b34876a920fcd87d2aefccfdb Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Tue, 27 Feb 2024 18:01:37 -0800 Subject: [PATCH 518/546] [DirectX][NFC] Rename ShaderFlag to SHADER_FEATURE_FLAG. (#82700) This is preparation for add ShaderFlag in DXIL. 
For #57925 --- llvm/include/llvm/BinaryFormat/DXContainer.h | 2 +- .../BinaryFormat/DXContainerConstants.def | 74 +++++++++---------- llvm/include/llvm/Object/DXContainer.h | 8 +- .../include/llvm/ObjectYAML/DXContainerYAML.h | 14 ++-- llvm/lib/Object/DXContainer.cpp | 8 +- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 14 ++-- llvm/lib/Target/DirectX/DXILShaderFlags.cpp | 2 +- llvm/lib/Target/DirectX/DXILShaderFlags.h | 6 +- llvm/tools/obj2yaml/dxcontainer2yaml.cpp | 4 +- 9 files changed, 67 insertions(+), 65 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index c3dcd568216b7..a28e19edb4c6a 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -141,7 +141,7 @@ enum class PartType { #include "DXContainerConstants.def" }; -#define SHADER_FLAG(Num, Val, Str) Val = 1ull << Num, +#define SHADER_FEATURE_FLAG(Num, Val, Str) Val = 1ull << Num, enum class FeatureFlags : uint64_t { #include "DXContainerConstants.def" }; diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 87dd0a5cb6ba7..80ed86bc3a499 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -11,43 +11,43 @@ CONTAINER_PART(PSG1) #undef CONTAINER_PART #endif -#ifdef SHADER_FLAG - -SHADER_FLAG(0, Doubles, "Double-precision floating point") -SHADER_FLAG(1, ComputeShadersPlusRawAndStructuredBuffers, "Raw and Structured buffers") -SHADER_FLAG(2, UAVsAtEveryStage, "UAVs at every shader stage") -SHADER_FLAG(3, Max64UAVs, "64 UAV slots") -SHADER_FLAG(4, MinimumPrecision, "Minimum-precision data types") -SHADER_FLAG(5, DX11_1_DoubleExtensions, "Double-precision extensions for 11.1") -SHADER_FLAG(6, DX11_1_ShaderExtensions, "Shader extensions for 11.1") -SHADER_FLAG(7, LEVEL9ComparisonFiltering, "Comparison filtering for feature level 9") 
-SHADER_FLAG(8, TiledResources, "Tiled resources") -SHADER_FLAG(9, StencilRef, "PS Output Stencil Ref") -SHADER_FLAG(10, InnerCoverage, "PS Inner Coverage") -SHADER_FLAG(11, TypedUAVLoadAdditionalFormats, "Typed UAV Load Additional Formats") -SHADER_FLAG(12, ROVs, "Raster Ordered UAVs") -SHADER_FLAG(13, ViewportAndRTArrayIndexFromAnyShaderFeedingRasterizer, "SV_RenderTargetArrayIndex or SV_ViewportArrayIndex from any shader feeding rasterizer") -SHADER_FLAG(14, WaveOps, "Wave level operations") -SHADER_FLAG(15, Int64Ops, "64-Bit integer") -SHADER_FLAG(16, ViewID, "View Instancing") -SHADER_FLAG(17, Barycentrics, "Barycentrics") -SHADER_FLAG(18, NativeLowPrecision, "Use native low precision") -SHADER_FLAG(19, ShadingRate, "Shading Rate") -SHADER_FLAG(20, Raytracing_Tier_1_1, "Raytracing tier 1.1 features") -SHADER_FLAG(21, SamplerFeedback, "Sampler feedback") -SHADER_FLAG(22, AtomicInt64OnTypedResource, "64-bit Atomics on Typed Resources") -SHADER_FLAG(23, AtomicInt64OnGroupShared, "64-bit Atomics on Group Shared") -SHADER_FLAG(24, DerivativesInMeshAndAmpShaders, "Derivatives in mesh and amplification shaders") -SHADER_FLAG(25, ResourceDescriptorHeapIndexing, "Resource descriptor heap indexing") -SHADER_FLAG(26, SamplerDescriptorHeapIndexing, "Sampler descriptor heap indexing") -SHADER_FLAG(27, RESERVED, "") -SHADER_FLAG(28, AtomicInt64OnHeapResource, "64-bit Atomics on Heap Resources") -SHADER_FLAG(29, AdvancedTextureOps, "Advanced Texture Ops") -SHADER_FLAG(30, WriteableMSAATextures, "Writeable MSAA Textures") - -SHADER_FLAG(31, NextUnusedBit, "Next reserved shader flag bit (not a flag)") - -#undef SHADER_FLAG +#ifdef SHADER_FEATURE_FLAG + +SHADER_FEATURE_FLAG(0, Doubles, "Double-precision floating point") +SHADER_FEATURE_FLAG(1, ComputeShadersPlusRawAndStructuredBuffers, "Raw and Structured buffers") +SHADER_FEATURE_FLAG(2, UAVsAtEveryStage, "UAVs at every shader stage") +SHADER_FEATURE_FLAG(3, Max64UAVs, "64 UAV slots") +SHADER_FEATURE_FLAG(4, MinimumPrecision, 
"Minimum-precision data types") +SHADER_FEATURE_FLAG(5, DX11_1_DoubleExtensions, "Double-precision extensions for 11.1") +SHADER_FEATURE_FLAG(6, DX11_1_ShaderExtensions, "Shader extensions for 11.1") +SHADER_FEATURE_FLAG(7, LEVEL9ComparisonFiltering, "Comparison filtering for feature level 9") +SHADER_FEATURE_FLAG(8, TiledResources, "Tiled resources") +SHADER_FEATURE_FLAG(9, StencilRef, "PS Output Stencil Ref") +SHADER_FEATURE_FLAG(10, InnerCoverage, "PS Inner Coverage") +SHADER_FEATURE_FLAG(11, TypedUAVLoadAdditionalFormats, "Typed UAV Load Additional Formats") +SHADER_FEATURE_FLAG(12, ROVs, "Raster Ordered UAVs") +SHADER_FEATURE_FLAG(13, ViewportAndRTArrayIndexFromAnyShaderFeedingRasterizer, "SV_RenderTargetArrayIndex or SV_ViewportArrayIndex from any shader feeding rasterizer") +SHADER_FEATURE_FLAG(14, WaveOps, "Wave level operations") +SHADER_FEATURE_FLAG(15, Int64Ops, "64-Bit integer") +SHADER_FEATURE_FLAG(16, ViewID, "View Instancing") +SHADER_FEATURE_FLAG(17, Barycentrics, "Barycentrics") +SHADER_FEATURE_FLAG(18, NativeLowPrecision, "Use native low precision") +SHADER_FEATURE_FLAG(19, ShadingRate, "Shading Rate") +SHADER_FEATURE_FLAG(20, Raytracing_Tier_1_1, "Raytracing tier 1.1 features") +SHADER_FEATURE_FLAG(21, SamplerFeedback, "Sampler feedback") +SHADER_FEATURE_FLAG(22, AtomicInt64OnTypedResource, "64-bit Atomics on Typed Resources") +SHADER_FEATURE_FLAG(23, AtomicInt64OnGroupShared, "64-bit Atomics on Group Shared") +SHADER_FEATURE_FLAG(24, DerivativesInMeshAndAmpShaders, "Derivatives in mesh and amplification shaders") +SHADER_FEATURE_FLAG(25, ResourceDescriptorHeapIndexing, "Resource descriptor heap indexing") +SHADER_FEATURE_FLAG(26, SamplerDescriptorHeapIndexing, "Sampler descriptor heap indexing") +SHADER_FEATURE_FLAG(27, RESERVED, "") +SHADER_FEATURE_FLAG(28, AtomicInt64OnHeapResource, "64-bit Atomics on Heap Resources") +SHADER_FEATURE_FLAG(29, AdvancedTextureOps, "Advanced Texture Ops") +SHADER_FEATURE_FLAG(30, WriteableMSAATextures, "Writeable 
MSAA Textures") + +SHADER_FEATURE_FLAG(31, NextUnusedBit, "Next reserved shader flag bit (not a flag)") + +#undef SHADER_FEATURE_FLAG #endif #ifdef SEMANTIC_KIND diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index a7f18c7996980..b6e3d321da246 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -276,7 +276,7 @@ class DXContainer { dxbc::Header Header; SmallVector PartOffsets; std::optional DXIL; - std::optional ShaderFlags; + std::optional ShaderFeatureFlags; std::optional Hash; std::optional PSVInfo; DirectX::Signature InputSignature; @@ -286,7 +286,7 @@ class DXContainer { Error parseHeader(); Error parsePartOffsets(); Error parseDXILHeader(StringRef Part); - Error parseShaderFlags(StringRef Part); + Error parseShaderFeatureFlags(StringRef Part); Error parseHash(StringRef Part); Error parsePSVInfo(StringRef Part); Error parseSignature(StringRef Part, DirectX::Signature &Array); @@ -368,7 +368,9 @@ class DXContainer { const std::optional &getDXIL() const { return DXIL; } - std::optional getShaderFlags() const { return ShaderFlags; } + std::optional getShaderFeatureFlags() const { + return ShaderFeatureFlags; + } std::optional getShaderHash() const { return Hash; } diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 66a6ac70bbea1..497f82bbd0f32 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -56,10 +56,10 @@ struct DXILProgram { std::optional> DXIL; }; -#define SHADER_FLAG(Num, Val, Str) bool Val = false; -struct ShaderFlags { - ShaderFlags() = default; - ShaderFlags(uint64_t FlagData); +#define SHADER_FEATURE_FLAG(Num, Val, Str) bool Val = false; +struct ShaderFeatureFlags { + ShaderFeatureFlags() = default; + ShaderFeatureFlags(uint64_t FlagData); uint64_t getEncodedFlags(); #include "llvm/BinaryFormat/DXContainerConstants.def" }; @@ -151,7 
+151,7 @@ struct Part { std::string Name; uint32_t Size; std::optional Program; - std::optional Flags; + std::optional Flags; std::optional Hash; std::optional Info; std::optional Signature; @@ -195,8 +195,8 @@ template <> struct MappingTraits { static void mapping(IO &IO, DXContainerYAML::DXILProgram &Program); }; -template <> struct MappingTraits { - static void mapping(IO &IO, DXContainerYAML::ShaderFlags &Flags); +template <> struct MappingTraits { + static void mapping(IO &IO, DXContainerYAML::ShaderFeatureFlags &Flags); }; template <> struct MappingTraits { diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 0401c20b98ec8..935749afe3385 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -72,13 +72,13 @@ Error DXContainer::parseDXILHeader(StringRef Part) { return Error::success(); } -Error DXContainer::parseShaderFlags(StringRef Part) { - if (ShaderFlags) +Error DXContainer::parseShaderFeatureFlags(StringRef Part) { + if (ShaderFeatureFlags) return parseFailed("More than one SFI0 part is present in the file"); uint64_t FlagValue = 0; if (Error Err = readInteger(Part, Part.begin(), FlagValue)) return Err; - ShaderFlags = FlagValue; + ShaderFeatureFlags = FlagValue; return Error::success(); } @@ -168,7 +168,7 @@ Error DXContainer::parsePartOffsets() { return Err; break; case dxbc::PartType::SFI0: - if (Error Err = parseShaderFlags(PartData)) + if (Error Err = parseShaderFeatureFlags(PartData)) return Err; break; case dxbc::PartType::HASH: diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 1f03f2c7d3996..7dc9822bdd221 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -23,15 +23,15 @@ namespace llvm { static_assert((uint64_t)dxbc::FeatureFlags::NextUnusedBit <= 1ull << 63, "Shader flag bits exceed enum size."); -DXContainerYAML::ShaderFlags::ShaderFlags(uint64_t FlagData) { -#define 
SHADER_FLAG(Num, Val, Str) \ +DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) { +#define SHADER_FEATURE_FLAG(Num, Val, Str) \ Val = (FlagData & (uint64_t)dxbc::FeatureFlags::Val) > 0; #include "llvm/BinaryFormat/DXContainerConstants.def" } -uint64_t DXContainerYAML::ShaderFlags::getEncodedFlags() { +uint64_t DXContainerYAML::ShaderFeatureFlags::getEncodedFlags() { uint64_t Flag = 0; -#define SHADER_FLAG(Num, Val, Str) \ +#define SHADER_FEATURE_FLAG(Num, Val, Str) \ if (Val) \ Flag |= (uint64_t)dxbc::FeatureFlags::Val; #include "llvm/BinaryFormat/DXContainerConstants.def" @@ -103,9 +103,9 @@ void MappingTraits::mapping( IO.mapOptional("DXIL", Program.DXIL); } -void MappingTraits::mapping( - IO &IO, DXContainerYAML::ShaderFlags &Flags) { -#define SHADER_FLAG(Num, Val, Str) IO.mapRequired(#Val, Flags.Val); +void MappingTraits::mapping( + IO &IO, DXContainerYAML::ShaderFeatureFlags &Flags) { +#define SHADER_FEATURE_FLAG(Num, Val, Str) IO.mapRequired(#Val, Flags.Val); #include "llvm/BinaryFormat/DXContainerConstants.def" } diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp index bbb5643566021..66a9dc46bcbfb 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp @@ -51,7 +51,7 @@ void ComputedShaderFlags::print(raw_ostream &OS) const { if (FlagVal == 0) return; OS << "; Note: shader requires additional functionality:\n"; -#define SHADER_FLAG(bit, FlagName, Str) \ +#define SHADER_FEATURE_FLAG(bit, FlagName, Str) \ if (FlagName) \ OS << "; " Str "\n"; #include "llvm/BinaryFormat/DXContainerConstants.def" diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.h b/llvm/lib/Target/DirectX/DXILShaderFlags.h index 4f51873a2d0b3..574a7b090f528 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.h +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.h @@ -29,17 +29,17 @@ class GlobalVariable; namespace dxil { struct ComputedShaderFlags { -#define 
SHADER_FLAG(bit, FlagName, Str) bool FlagName : 1; +#define SHADER_FEATURE_FLAG(bit, FlagName, Str) bool FlagName : 1; #include "llvm/BinaryFormat/DXContainerConstants.def" -#define SHADER_FLAG(bit, FlagName, Str) FlagName = false; +#define SHADER_FEATURE_FLAG(bit, FlagName, Str) FlagName = false; ComputedShaderFlags() { #include "llvm/BinaryFormat/DXContainerConstants.def" } operator uint64_t() const { uint64_t FlagValue = 0; -#define SHADER_FLAG(bit, FlagName, Str) \ +#define SHADER_FEATURE_FLAG(bit, FlagName, Str) \ FlagValue |= \ FlagName ? static_cast(dxbc::FeatureFlags::FlagName) : 0ull; #include "llvm/BinaryFormat/DXContainerConstants.def" diff --git a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp index b58d7cd952aff..69d9b9a2f784f 100644 --- a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp +++ b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp @@ -71,10 +71,10 @@ dumpDXContainer(MemoryBufferRef Source) { break; } case dxbc::PartType::SFI0: { - std::optional Flags = Container.getShaderFlags(); + std::optional Flags = Container.getShaderFeatureFlags(); // Omit the flags in the YAML if they are missing or zero. if (Flags && *Flags > 0) - NewPart.Flags = DXContainerYAML::ShaderFlags(*Flags); + NewPart.Flags = DXContainerYAML::ShaderFeatureFlags(*Flags); break; } case dxbc::PartType::HASH: { From cf1c97b2d29c51d6c2e79454f6ec3d1f8f98e672 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 27 Feb 2024 18:04:59 -0800 Subject: [PATCH 519/546] [AMDGPU] Do not attempt to fallback to default mutations (#83208) IGLP itself will be in SavedMutations via mutations added during Scheduler creation, thus falling back results in reapplying IGLP. In PostRA scheduling, if we have multiple regions with IGLP instructions, then we may have infinite loop. Disable the feature for now. 
--- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 21 ++++--------------- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 5 ++--- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8 +++---- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 10 ++++----- llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll | 15 +++++++++++++ 5 files changed, 28 insertions(+), 31 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index e3f7248507953..57769fe998d1f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -2337,8 +2337,6 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation { ScheduleDAGMI *DAG; - std::vector> *SavedMutations; - // Organize lists of SchedGroups by their SyncID. SchedGroups / // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added // between then. @@ -2381,10 +2379,7 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation { AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial; IGroupLPDAGMutation() = default; - IGroupLPDAGMutation( - AMDGPU::SchedulingPhase Phase, - std::vector> *SavedMutations) - : SavedMutations(SavedMutations), Phase(Phase) {} + IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {} }; unsigned SchedGroup::NumSchedGroups = 0; @@ -2602,13 +2597,6 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { PS.solve(); return; } - - if (!SavedMutations) - return; - - // We did not apply a mutation, fall back to SavedMutations - for (auto &m : *SavedMutations) - m->apply(DAG); } void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { @@ -2707,10 +2695,9 @@ namespace llvm { /// same scheduling region (e.g. pre and post-RA scheduling / multiple /// scheduling "phases"), we can reenter this mutation framework more than once /// for a given region. 
-std::unique_ptr createIGroupLPDAGMutation( - AMDGPU::SchedulingPhase Phase, - std::vector> *SavedMutations) { - return std::make_unique(Phase, SavedMutations); +std::unique_ptr +createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) { + return std::make_unique(Phase); } } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h index 46ef4d702d002..aff7096f26d67 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -20,9 +20,8 @@ namespace AMDGPU { enum class SchedulingPhase { Initial, PreRAReentry, PostRA }; } // namespace AMDGPU -std::unique_ptr createIGroupLPDAGMutation( - AMDGPU::SchedulingPhase Phase, - std::vector> *SavedMutations); +std::unique_ptr +createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase); } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 0d830df1f1f1d..76e843455bab7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -461,8 +461,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation( - createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr)); + DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; @@ -472,8 +471,7 @@ static ScheduleDAGInstrs * createGCNMaxILPMachineScheduler(MachineSchedContext *C) { ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(C, std::make_unique(C)); - DAG->addMutation( - createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr)); + 
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); return DAG; } @@ -937,7 +935,7 @@ class GCNPassConfig final : public AMDGPUPassConfig { DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); DAG->addMutation( - createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA, nullptr)); + createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA)); if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) DAG->addMutation(createVOPDPairingMutation()); return DAG; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index f993ec8409c99..9f419a7fbf683 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -713,8 +713,8 @@ bool UnclusteredHighRPStage::initGCNSchedStage() { return false; SavedMutations.swap(DAG.Mutations); - DAG.addMutation(createIGroupLPDAGMutation( - AMDGPU::SchedulingPhase::PreRAReentry, nullptr)); + DAG.addMutation( + createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry)); InitialOccupancy = DAG.MinOccupancy; // Aggressivly try to reduce register pressure in the unclustered high RP @@ -858,8 +858,7 @@ bool GCNSchedStage::initGCNRegion() { StageID == GCNSchedStageID::ILPInitialSchedule; DAG.addMutation(createIGroupLPDAGMutation( IsInitialStage ? 
AMDGPU::SchedulingPhase::Initial - : AMDGPU::SchedulingPhase::PreRAReentry, - &SavedMutations)); + : AMDGPU::SchedulingPhase::PreRAReentry)); } return true; @@ -1573,8 +1572,7 @@ void GCNPostScheduleDAGMILive::schedule() { if (HasIGLPInstrs) { SavedMutations.clear(); SavedMutations.swap(Mutations); - addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA, - &SavedMutations)); + addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA)); } ScheduleDAGMI::schedule(); diff --git a/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll b/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll new file mode 100644 index 0000000000000..1113acb3c0305 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll @@ -0,0 +1,15 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 < %s | FileCheck %s + +; Test should not result in build failure +; CHECK-LABEL: shouldNotReApply + +define amdgpu_kernel void @shouldNotReApply() { +entry: + tail call void @llvm.amdgcn.sched.barrier(i32 0) + store <4 x i32> zeroinitializer, ptr addrspace(3) null, align 2147483648 + tail call void @llvm.amdgcn.sched.group.barrier(i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 0) + store i32 0, ptr addrspace(5) null, align 2147483648 + tail call void @llvm.amdgcn.sched.group.barrier(i32 0, i32 0, i32 0) + ret void +} From 91d23370cd4608f84f5209e445579a3b24ae3545 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 28 Feb 2024 10:26:54 +0800 Subject: [PATCH 520/546] [RISCV] Use a tail agnostic vslideup if possible for scalable insert_subvector (#83146) If we know that an insert_subvector inserting a fixed subvector will overwrite the entire tail of the vector, we use a tail agnostic vslideup. This was added in https://reviews.llvm.org/D147347, but we can do the same thing for scalable vectors too. The `Policy` variable is defined in a slightly weird place but this is to mirror the fixed length subvector code path as closely as possible. 
I think we may be able to deduplicate them in future. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 9 ++++++++- llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll | 2 +- llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll | 14 +++++++------- llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll | 8 ++++---- llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll | 6 +++--- llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll | 2 +- .../CodeGen/RISCV/rvv/vector-interleave-store.ll | 2 +- llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll | 12 ++++++------ llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll | 4 ++-- .../CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll | 4 ++-- 10 files changed, 35 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e34750d057301..e95e21bda687e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -9728,8 +9728,15 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, auto [Mask, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget); + ElementCount EndIndex = + ElementCount::getScalable(RemIdx) + SubVecVT.getVectorElementCount(); VL = computeVLMax(SubVecVT, DL, DAG); + // Use tail agnostic policy if we're inserting over Vec's tail. + unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; + if (EndIndex == VecVT.getVectorElementCount()) + Policy = RISCVII::TAIL_AGNOSTIC; + // If we're inserting into the lowest elements, use a tail undisturbed // vmv.v.v. if (RemIdx == 0) { @@ -9743,7 +9750,7 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, VL = DAG.getNode(ISD::ADD, DL, XLenVT, SlideupAmt, VL); SubVec = getVSlideup(DAG, Subtarget, DL, InterSubVT, AlignedExtract, SubVec, - SlideupAmt, Mask, VL); + SlideupAmt, Mask, VL, Policy); } // If required, insert this subvector back into the correct vector register. 
diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll index 0f3f57a0dec59..d377082761736 100644 --- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll @@ -76,7 +76,7 @@ define @insert_nxv1i8_nxv4i8_3( %vec, @llvm.vector.insert.nxv1i8.nxv4i8( %vec, %subvec, i64 3) diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll index 243dc19a25588..1ef63ffa9ee0c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll @@ -2235,9 +2235,9 @@ define @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv32f64( %va, @fcmp_oeq_vf_nx16f64( %va) { ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: srli a0, a0, 3 ; RV32-NEXT: add a1, a0, a0 -; RV32-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma ; RV32-NEXT: vslideup.vx v0, v24, a0 ; RV32-NEXT: ret ; @@ -3400,7 +3400,7 @@ define @fcmp_oeq_vf_nx16f64( %va) { ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: srli a0, a0, 3 ; RV64-NEXT: add a1, a0, a0 -; RV64-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma ; RV64-NEXT: vslideup.vx v0, v24, a0 ; RV64-NEXT: ret ; @@ -3413,7 +3413,7 @@ define @fcmp_oeq_vf_nx16f64( %va) { ; ZVFHMIN32-NEXT: csrr a0, vlenb ; ZVFHMIN32-NEXT: srli a0, a0, 3 ; ZVFHMIN32-NEXT: add a1, a0, a0 -; ZVFHMIN32-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; ZVFHMIN32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma ; ZVFHMIN32-NEXT: vslideup.vx v0, v24, a0 ; ZVFHMIN32-NEXT: ret ; @@ -3426,7 +3426,7 @@ define @fcmp_oeq_vf_nx16f64( %va) { ; ZVFHMIN64-NEXT: csrr a0, vlenb ; ZVFHMIN64-NEXT: srli a0, a0, 3 ; ZVFHMIN64-NEXT: add a1, a0, a0 -; ZVFHMIN64-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; ZVFHMIN64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma ; ZVFHMIN64-NEXT: vslideup.vx v0, v24, a0 ; ZVFHMIN64-NEXT: ret %vc = fcmp oeq %va, 
zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll index 007afe12b8e43..a23b7c7b6ae9e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll @@ -2424,7 +2424,7 @@ define @icmp_eq_vv_nxv32i32( %va, @icmp_eq_vx_nxv32i32( %va, i32 %b, ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t ; CHECK-NEXT: add a0, a2, a2 -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v16, v25, a2 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -2492,7 +2492,7 @@ define @icmp_eq_vx_swap_nxv32i32( %va, i32 ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t ; CHECK-NEXT: add a0, a2, a2 -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v16, v25, a2 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll index a2ac684604b94..5f35a4e50a952 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll @@ -3235,7 +3235,7 @@ define @icmp_eq_vi_nx16i64( %va) { ; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; CHECK-NEXT: vmseq.vi v24, v16, 0 ; CHECK-NEXT: vmseq.vi v0, v8, 0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma ; CHECK-NEXT: vslideup.vx v0, v24, a0 ; CHECK-NEXT: ret %vc = icmp eq %va, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index c23c10205e6e3..4c64b1677b362 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -22,7 +22,7 @@ define void @vector_interleave_store_nxv32i1_nxv16i1( %a, @vector_interleave_nxv32i1_nxv16i1( 
; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v0, v8, a0 ; CHECK-NEXT: ret ; @@ -44,7 +44,7 @@ define @vector_interleave_nxv32i1_nxv16i1( ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: srli a0, a0, 2 ; ZVBB-NEXT: add a1, a0, a0 -; ZVBB-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; ZVBB-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; ZVBB-NEXT: vslideup.vx v0, v8, a0 ; ZVBB-NEXT: ret %res = call @llvm.experimental.vector.interleave2.nxv32i1( %a, %b) @@ -376,9 +376,9 @@ define @vector_interleave_nxv4f16_nxv2f16( @vector_interleave_nxv4f16_nxv2f16( @llvm.experimental.vector.interleave2.nxv4f16( %a, %b) ret %res diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll index 8e983f63428a6..b888fde7d0683 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll @@ -937,7 +937,7 @@ define @vfptosi_nxv32f16_nxv32i1( %va) { ; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v8, v24 ; ZVFHMIN-NEXT: vand.vi v8, v8, 1 ; ZVFHMIN-NEXT: vmsne.vi v0, v8, 0 -; ZVFHMIN-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; ZVFHMIN-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslideup.vx v0, v16, a0 ; ZVFHMIN-NEXT: ret %evec = fptosi %va to @@ -967,7 +967,7 @@ define @vfptoui_nxv32f16_nxv32i1( %va) { ; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v8, v24 ; ZVFHMIN-NEXT: vand.vi v8, v8, 1 ; ZVFHMIN-NEXT: vmsne.vi v0, v8, 0 -; ZVFHMIN-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; ZVFHMIN-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslideup.vx v0, v16, a0 ; ZVFHMIN-NEXT: ret %evec = fptoui %va to diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll index 2546ec95a0079..515d77109af9f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll @@ -894,7 +894,7 @@ define half @vreduce_ord_fadd_nxv3f16( %v, half %s) { ; CHECK-NEXT: lui a2, 1048568 ; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a2 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v9, a1 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0 @@ -982,7 +982,7 @@ define half @vreduce_fadd_nxv3f16( %v, half %s) { ; CHECK-NEXT: lui a2, 1048568 ; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a2 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v9, a1 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0 From 1f2a1a72ae6615ce80fcd6f1185d1cde607377d2 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Tue, 27 Feb 2024 18:33:43 -0800 Subject: [PATCH 521/546] [flang][runtime] Fixed flang+Werror buildbots after #83169. --- flang/include/flang/Common/float128.h | 5 +++++ flang/runtime/Float128Math/cabs.cpp | 6 ++++-- flang/runtime/Float128Math/math-entries.h | 2 -- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/flang/include/flang/Common/float128.h b/flang/include/flang/Common/float128.h index 61b3a77b867a9..2e76bc0a162e6 100644 --- a/flang/include/flang/Common/float128.h +++ b/flang/include/flang/Common/float128.h @@ -54,9 +54,13 @@ /* Define pure C CFloat128Type and CFloat128ComplexType. */ #if LDBL_MANT_DIG == 113 typedef long double CFloat128Type; +#ifndef __cplusplus typedef long double _Complex CFloat128ComplexType; +#endif #elif HAS_FLOAT128 typedef __float128 CFloat128Type; + +#ifndef __cplusplus /* * Use mode() attribute supported by GCC and Clang. * Adjust it for other compilers as needed. 
@@ -66,5 +70,6 @@ typedef _Complex float __attribute__((mode(TC))) CFloat128ComplexType; #else typedef _Complex float __attribute__((mode(KC))) CFloat128ComplexType; #endif +#endif // __cplusplus #endif #endif /* FORTRAN_COMMON_FLOAT128_H_ */ diff --git a/flang/runtime/Float128Math/cabs.cpp b/flang/runtime/Float128Math/cabs.cpp index 2867c8a4578a8..827b197a6a81a 100644 --- a/flang/runtime/Float128Math/cabs.cpp +++ b/flang/runtime/Float128Math/cabs.cpp @@ -10,13 +10,15 @@ namespace Fortran::runtime { extern "C" { - +#if 0 +// FIXME: temporarily disabled. Need to add pure C entry point +// using C _Complex ABI. #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 // NOTE: Flang calls the runtime APIs using C _Complex ABI CppTypeFor RTDEF(CAbsF128)(CFloat128ComplexType x) { return CAbs::invoke(x); } #endif - +#endif } // extern "C" } // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h index 141648d2fb2c5..83298674c4971 100644 --- a/flang/runtime/Float128Math/math-entries.h +++ b/flang/runtime/Float128Math/math-entries.h @@ -61,7 +61,6 @@ DEFINE_FALLBACK(Asinh) DEFINE_FALLBACK(Atan) DEFINE_FALLBACK(Atan2) DEFINE_FALLBACK(Atanh) -DEFINE_FALLBACK(CAbs) DEFINE_FALLBACK(Ceil) DEFINE_FALLBACK(Cos) DEFINE_FALLBACK(Cosh) @@ -163,7 +162,6 @@ DEFINE_SIMPLE_ALIAS(Asinh, asinhq) DEFINE_SIMPLE_ALIAS(Atan, atanq) DEFINE_SIMPLE_ALIAS(Atan2, atan2q) DEFINE_SIMPLE_ALIAS(Atanh, atanhq) -DEFINE_SIMPLE_ALIAS(CAbs, cabsq) DEFINE_SIMPLE_ALIAS(Ceil, ceilq) DEFINE_SIMPLE_ALIAS(Cos, cosq) DEFINE_SIMPLE_ALIAS(Cosh, coshq) From 7c206c7812408f152baffa3c73f765b7d9ffdf18 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 27 Feb 2024 18:44:28 -0800 Subject: [PATCH 522/546] [BOLT] Refactor interface for instruction labels. NFCI (#83209) To avoid accidentally setting the label twice for the same instruction, which can lead to a "lost" label, introduce getOrSetInstLabel() function. 
Rename existing functions to getInstLabel()/setInstLabel() to make it explicit that they operate on instruction labels. Add an assertion in setInstLabel() that the instruction did not have a prior label set. --- bolt/include/bolt/Core/MCPlusBuilder.h | 9 +++++++-- bolt/lib/Core/BinaryContext.cpp | 2 +- bolt/lib/Core/BinaryEmitter.cpp | 2 +- bolt/lib/Core/BinaryFunction.cpp | 2 +- bolt/lib/Core/Exceptions.cpp | 5 ++--- bolt/lib/Core/MCPlusBuilder.cpp | 19 ++++++++++++++++--- bolt/lib/Rewrite/LinuxKernelRewriter.cpp | 14 ++++---------- 7 files changed, 32 insertions(+), 21 deletions(-) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index eeb7609ff6b5f..6bb76d1b917db 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -1183,11 +1183,16 @@ class MCPlusBuilder { bool clearOffset(MCInst &Inst) const; /// Return the label of \p Inst, if available. - MCSymbol *getLabel(const MCInst &Inst) const; + MCSymbol *getInstLabel(const MCInst &Inst) const; + + /// Set the label of \p Inst or return the existing label for the instruction. + /// This label will be emitted right before \p Inst is emitted to MCStreamer. + MCSymbol *getOrCreateInstLabel(MCInst &Inst, const Twine &Name, + MCContext *Ctx) const; /// Set the label of \p Inst. This label will be emitted right before \p Inst /// is emitted to MCStreamer. - bool setLabel(MCInst &Inst, MCSymbol *Label) const; + void setInstLabel(MCInst &Inst, MCSymbol *Label) const; /// Get instruction size specified via annotation. 
std::optional getSize(const MCInst &Inst) const; diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index d544ece13a832..b29ebbbfa18c4 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -1967,7 +1967,7 @@ void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction, OS << " # Offset: " << *Offset; if (std::optional Size = MIB->getSize(Instruction)) OS << " # Size: " << *Size; - if (MCSymbol *Label = MIB->getLabel(Instruction)) + if (MCSymbol *Label = MIB->getInstLabel(Instruction)) OS << " # Label: " << *Label; MIB->printAnnotations(Instruction, OS); diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index d4b668c1d7e7b..97d19b75200f5 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -489,7 +489,7 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF, if (!EmitCodeOnly) { // A symbol to be emitted before the instruction to mark its location. - MCSymbol *InstrLabel = BC.MIB->getLabel(Instr); + MCSymbol *InstrLabel = BC.MIB->getInstLabel(Instr); if (opts::UpdateDebugSections && BF.getDWARFUnit()) { LastLocSeen = emitLineInfo(BF, Instr.getLoc(), LastLocSeen, diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 54f2f9d972a46..00df42c11e223 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -1424,7 +1424,7 @@ Error BinaryFunction::disassemble() { InstrMapType::iterator II = Instructions.find(Offset); assert(II != Instructions.end() && "reference to non-existing instruction"); - BC.MIB->setLabel(II->second, Label); + BC.MIB->setInstLabel(II->second, Label); } // Reset symbolizer for the disassembler. 
diff --git a/bolt/lib/Core/Exceptions.cpp b/bolt/lib/Core/Exceptions.cpp index 54618aeb95ccc..82bddf76d5b8c 100644 --- a/bolt/lib/Core/Exceptions.cpp +++ b/bolt/lib/Core/Exceptions.cpp @@ -408,12 +408,11 @@ void BinaryFunction::updateEHRanges() { // Same symbol is used for the beginning and the end of the range. MCSymbol *EHSymbol; - if (MCSymbol *InstrLabel = BC.MIB->getLabel(Instr)) { + if (MCSymbol *InstrLabel = BC.MIB->getInstLabel(Instr)) { EHSymbol = InstrLabel; } else { std::unique_lock Lock(BC.CtxMutex); - EHSymbol = BC.Ctx->createNamedTempSymbol("EH"); - BC.MIB->setLabel(Instr, EHSymbol); + EHSymbol = BC.MIB->getOrCreateInstLabel(Instr, "EH", BC.Ctx.get()); } // At this point we could be in one of the following states: diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp index 44e5f88d8950f..bd9bd0c45922a 100644 --- a/bolt/lib/Core/MCPlusBuilder.cpp +++ b/bolt/lib/Core/MCPlusBuilder.cpp @@ -12,6 +12,7 @@ #include "bolt/Core/MCPlusBuilder.h" #include "bolt/Core/MCPlus.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrDesc.h" @@ -266,17 +267,29 @@ bool MCPlusBuilder::clearOffset(MCInst &Inst) const { return true; } -MCSymbol *MCPlusBuilder::getLabel(const MCInst &Inst) const { +MCSymbol *MCPlusBuilder::getInstLabel(const MCInst &Inst) const { if (std::optional Label = getAnnotationOpValue(Inst, MCAnnotation::kLabel)) return reinterpret_cast(*Label); return nullptr; } -bool MCPlusBuilder::setLabel(MCInst &Inst, MCSymbol *Label) const { +MCSymbol *MCPlusBuilder::getOrCreateInstLabel(MCInst &Inst, const Twine &Name, + MCContext *Ctx) const { + MCSymbol *Label = getInstLabel(Inst); + if (Label) + return Label; + + Label = Ctx->createNamedTempSymbol(Name); + setAnnotationOpValue(Inst, MCAnnotation::kLabel, + reinterpret_cast(Label)); + return Label; +} + +void MCPlusBuilder::setInstLabel(MCInst &Inst, MCSymbol *Label) const { + assert(!getInstLabel(Inst) && 
"Instruction already has assigned label."); setAnnotationOpValue(Inst, MCAnnotation::kLabel, reinterpret_cast(Label)); - return true; } std::optional MCPlusBuilder::getSize(const MCInst &Inst) const { diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp index 6377c1197253c..0d7dc1070ce75 100644 --- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp +++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp @@ -770,11 +770,8 @@ Error LinuxKernelRewriter::rewriteORCTables() { continue; // Issue label for the instruction. - MCSymbol *Label = BC.MIB->getLabel(Inst); - if (!Label) { - Label = BC.Ctx->createTempSymbol("__ORC_"); - BC.MIB->setLabel(Inst, Label); - } + MCSymbol *Label = + BC.MIB->getOrCreateInstLabel(Inst, "__ORC_", BC.Ctx.get()); if (Error E = emitORCEntry(0, *ErrorOrState, Label)) return E; @@ -908,11 +905,8 @@ Error LinuxKernelRewriter::readStaticCalls() { BC.MIB->addAnnotation(*Inst, "StaticCall", EntryID); - MCSymbol *Label = BC.MIB->getLabel(*Inst); - if (!Label) { - Label = BC.Ctx->createTempSymbol("__SC_"); - BC.MIB->setLabel(*Inst, Label); - } + MCSymbol *Label = + BC.MIB->getOrCreateInstLabel(*Inst, "__SC_", BC.Ctx.get()); StaticCallEntries.push_back({EntryID, BF, Label}); } From 9d56be010cf30054313f5b3fea82331491c58cdb Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Wed, 28 Feb 2024 05:00:07 +0100 Subject: [PATCH 523/546] [MLIR][OpenMP] Support basic materialization for `omp.private` ops (#81715) Adds basic support for materializing delayed privatization. So far, the restrictions on the implementation are: - Only `private` clauses are supported (`firstprivate` support will be added in a later PR). 
--- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 5 +- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 182 +++++++++++++++--- mlir/test/Target/LLVMIR/openmp-private.mlir | 142 ++++++++++++++ 3 files changed, 297 insertions(+), 32 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/openmp-private.mlir diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index c2b471ab96183..8a6980e2c6a2d 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1957,7 +1957,10 @@ LogicalResult PrivateClauseOp::verify() { Type symType = getType(); auto verifyTerminator = [&](Operation *terminator) -> LogicalResult { - if (!terminator->hasSuccessors() && !llvm::isa(terminator)) + if (!terminator->getBlock()->getSuccessors().empty()) + return success(); + + if (!llvm::isa(terminator)) return mlir::emitError(terminator->getLoc()) << "expected exit block terminator to be an `omp.yield` op."; diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 6e53d801a0d2f..8c20689c4a39d 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -396,9 +396,9 @@ collectReductionDecls(T loop, /// Translates the blocks contained in the given region and appends them to at /// the current insertion point of `builder`. The operations of the entry block -/// are appended to the current insertion block, which is not expected to have a -/// terminator. If set, `continuationBlockArgs` is populated with translated -/// values that correspond to the values omp.yield'ed from the region. +/// are appended to the current insertion block. If set, `continuationBlockArgs` +/// is populated with translated values that correspond to the values +/// omp.yield'ed from the region. 
static LogicalResult inlineConvertOmpRegions( Region ®ion, StringRef blockName, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation, @@ -409,7 +409,14 @@ static LogicalResult inlineConvertOmpRegions( // Special case for single-block regions that don't create additional blocks: // insert operations without creating additional blocks. if (llvm::hasSingleElement(region)) { + llvm::Instruction *potentialTerminator = + builder.GetInsertBlock()->empty() ? nullptr + : &builder.GetInsertBlock()->back(); + + if (potentialTerminator && potentialTerminator->isTerminator()) + potentialTerminator->removeFromParent(); moduleTranslation.mapBlock(®ion.front(), builder.GetInsertBlock()); + if (failed(moduleTranslation.convertBlock( region.front(), /*ignoreArguments=*/true, builder))) return failure(); @@ -423,6 +430,10 @@ static LogicalResult inlineConvertOmpRegions( // Drop the mapping that is no longer necessary so that the same region can // be processed multiple times. moduleTranslation.forgetMapping(region); + + if (potentialTerminator && potentialTerminator->isTerminator()) + potentialTerminator->insertAfter(&builder.GetInsertBlock()->back()); + return success(); } @@ -1000,11 +1011,50 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder, return success(); } +/// A RAII class that on construction replaces the region arguments of the +/// parallel op (which correspond to private variables) with the actual private +/// variables they correspond to. This prepares the parallel op so that it +/// matches what is expected by the OMPIRBuilder. +/// +/// On destruction, it restores the original state of the operation so that on +/// the MLIR side, the op is not affected by conversion to LLVM IR. 
+class OmpParallelOpConversionManager { +public: + OmpParallelOpConversionManager(omp::ParallelOp opInst) + : region(opInst.getRegion()), privateVars(opInst.getPrivateVars()), + privateArgBeginIdx(opInst.getNumReductionVars()), + privateArgEndIdx(privateArgBeginIdx + privateVars.size()) { + auto privateVarsIt = privateVars.begin(); + + for (size_t argIdx = privateArgBeginIdx; argIdx < privateArgEndIdx; + ++argIdx, ++privateVarsIt) + mlir::replaceAllUsesInRegionWith(region.getArgument(argIdx), + *privateVarsIt, region); + } + + ~OmpParallelOpConversionManager() { + auto privateVarsIt = privateVars.begin(); + + for (size_t argIdx = privateArgBeginIdx; argIdx < privateArgEndIdx; + ++argIdx, ++privateVarsIt) + mlir::replaceAllUsesInRegionWith(*privateVarsIt, + region.getArgument(argIdx), region); + } + +private: + Region ®ion; + OperandRange privateVars; + unsigned privateArgBeginIdx; + unsigned privateArgEndIdx; +}; + /// Converts the OpenMP parallel operation to LLVM IR. static LogicalResult convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; + OmpParallelOpConversionManager raii(opInst); + // TODO: support error propagation in OpenMPIRBuilder and use it instead of // relying on captured variables. LogicalResult bodyGenStatus = success(); @@ -1086,12 +1136,81 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, // TODO: Perform appropriate actions according to the data-sharing // attribute (shared, private, firstprivate, ...) of variables. - // Currently defaults to shared. + // Currently shared and private are supported. auto privCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP, llvm::Value &, llvm::Value &vPtr, llvm::Value *&replacementValue) -> InsertPointTy { replacementValue = &vPtr; + // If this is a private value, this lambda will return the corresponding + // mlir value and its `PrivateClauseOp`. 
Otherwise, empty values are + // returned. + auto [privVar, privatizerClone] = + [&]() -> std::pair { + if (!opInst.getPrivateVars().empty()) { + auto privVars = opInst.getPrivateVars(); + auto privatizers = opInst.getPrivatizers(); + + for (auto [privVar, privatizerAttr] : + llvm::zip_equal(privVars, *privatizers)) { + // Find the MLIR private variable corresponding to the LLVM value + // being privatized. + llvm::Value *llvmPrivVar = moduleTranslation.lookupValue(privVar); + if (llvmPrivVar != &vPtr) + continue; + + SymbolRefAttr privSym = llvm::cast(privatizerAttr); + omp::PrivateClauseOp privatizer = + SymbolTable::lookupNearestSymbolFrom( + opInst, privSym); + + // Clone the privatizer in case it is used by more than one parallel + // region. The privatizer is processed in-place (see below) before it + // gets inlined in the parallel region and therefore processing the + // original op is dangerous. + return {privVar, privatizer.clone()}; + } + } + + return {mlir::Value(), omp::PrivateClauseOp()}; + }(); + + if (privVar) { + if (privatizerClone.getDataSharingType() == + omp::DataSharingClauseType::FirstPrivate) { + privatizerClone.emitOpError( + "TODO: delayed privatization is not " + "supported for `firstprivate` clauses yet."); + bodyGenStatus = failure(); + return codeGenIP; + } + + Region &allocRegion = privatizerClone.getAllocRegion(); + + // Replace the privatizer block argument with mlir value being privatized. + // This way, the body of the privatizer will be changed from using the + // region/block argument to the value being privatized. 
+ auto allocRegionArg = allocRegion.getArgument(0); + replaceAllUsesInRegionWith(allocRegionArg, privVar, allocRegion); + + auto oldIP = builder.saveIP(); + builder.restoreIP(allocaIP); + + SmallVector yieldedValues; + if (failed(inlineConvertOmpRegions(allocRegion, "omp.privatizer", builder, + moduleTranslation, &yieldedValues))) { + opInst.emitError("failed to inline `alloc` region of an `omp.private` " + "op in the parallel region"); + bodyGenStatus = failure(); + } else { + assert(yieldedValues.size() == 1); + replacementValue = yieldedValues.front(); + } + + privatizerClone.erase(); + builder.restoreIP(oldIP); + } + return codeGenIP; }; @@ -1635,7 +1754,7 @@ getRefPtrIfDeclareTarget(mlir::Value value, // A small helper structure to contain data gathered // for map lowering and coalese it into one area and // avoiding extra computations such as searches in the -// llvm module for lowered mapped varibles or checking +// llvm module for lowered mapped variables or checking // if something is declare target (and retrieving the // value) more than neccessary. 
struct MapInfoData : llvm::OpenMPIRBuilder::MapInfosTy { @@ -2854,26 +2973,26 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::amendOperation( moduleTranslation); return failure(); }) - .Case( - "omp.requires", - [&](Attribute attr) { - if (auto requiresAttr = attr.dyn_cast()) { - using Requires = omp::ClauseRequires; - Requires flags = requiresAttr.getValue(); - llvm::OpenMPIRBuilderConfig &config = - moduleTranslation.getOpenMPBuilder()->Config; - config.setHasRequiresReverseOffload( - bitEnumContainsAll(flags, Requires::reverse_offload)); - config.setHasRequiresUnifiedAddress( - bitEnumContainsAll(flags, Requires::unified_address)); - config.setHasRequiresUnifiedSharedMemory( - bitEnumContainsAll(flags, Requires::unified_shared_memory)); - config.setHasRequiresDynamicAllocators( - bitEnumContainsAll(flags, Requires::dynamic_allocators)); - return success(); - } - return failure(); - }) + .Case("omp.requires", + [&](Attribute attr) { + if (auto requiresAttr = + attr.dyn_cast()) { + using Requires = omp::ClauseRequires; + Requires flags = requiresAttr.getValue(); + llvm::OpenMPIRBuilderConfig &config = + moduleTranslation.getOpenMPBuilder()->Config; + config.setHasRequiresReverseOffload( + bitEnumContainsAll(flags, Requires::reverse_offload)); + config.setHasRequiresUnifiedAddress( + bitEnumContainsAll(flags, Requires::unified_address)); + config.setHasRequiresUnifiedSharedMemory( + bitEnumContainsAll(flags, Requires::unified_shared_memory)); + config.setHasRequiresDynamicAllocators( + bitEnumContainsAll(flags, Requires::dynamic_allocators)); + return success(); + } + return failure(); + }) .Default([](Attribute) { // Fall through for omp attributes that do not require lowering. 
return success(); @@ -2988,12 +3107,13 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation( .Case([&](omp::TargetOp) { return convertOmpTarget(*op, builder, moduleTranslation); }) - .Case([&](auto op) { - // No-op, should be handled by relevant owning operations e.g. - // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. and then - // discarded - return success(); - }) + .Case( + [&](auto op) { + // No-op, should be handled by relevant owning operations e.g. + // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. and then + // discarded + return success(); + }) .Default([&](Operation *inst) { return inst->emitError("unsupported OpenMP operation: ") << inst->getName(); diff --git a/mlir/test/Target/LLVMIR/openmp-private.mlir b/mlir/test/Target/LLVMIR/openmp-private.mlir new file mode 100644 index 0000000000000..58bda87c3b7be --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-private.mlir @@ -0,0 +1,142 @@ +// Test code-gen for `omp.parallel` ops with delayed privatizers (i.e. using +// `omp.private` ops). + +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +llvm.func @parallel_op_1_private(%arg0: !llvm.ptr) { + omp.parallel private(@x.privatizer %arg0 -> %arg2 : !llvm.ptr) { + %0 = llvm.load %arg2 : !llvm.ptr -> f32 + omp.terminator + } + llvm.return +} + +// CHECK-LABEL: @parallel_op_1_private +// CHECK-SAME: (ptr %[[ORIG:.*]]) { +// CHECK: %[[OMP_PAR_ARG:.*]] = alloca { ptr }, align 8 +// CHECK: %[[ORIG_GEP:.*]] = getelementptr { ptr }, ptr %[[OMP_PAR_ARG]], i32 0, i32 0 +// CHECK: store ptr %[[ORIG]], ptr %[[ORIG_GEP]], align 8 +// CHECK: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @1, i32 1, ptr @parallel_op_1_private..omp_par, ptr %[[OMP_PAR_ARG]]) +// CHECK: } + +// CHECK-LABEL: void @parallel_op_1_private..omp_par +// CHECK-SAME: (ptr noalias %{{.*}}, ptr noalias %{{.*}}, ptr %[[ARG:.*]]) +// CHECK: %[[ORIG_PTR_PTR:.*]] = getelementptr { ptr }, ptr %[[ARG]], i32 0, i32 0 +// CHECK: %[[ORIG_PTR:.*]] = load ptr, ptr %[[ORIG_PTR_PTR]], align 8 + +// Check that the privatizer alloc region was inlined properly. +// CHECK: %[[PRIV_ALLOC:.*]] = alloca float, align 4 +// CHECK: %[[ORIG_VAL:.*]] = load float, ptr %[[ORIG_PTR]], align 4 +// CHECK: store float %[[ORIG_VAL]], ptr %[[PRIV_ALLOC]], align 4 +// CHECK-NEXT: br + +// Check that the privatized value is used (rather than the original one). +// CHECK: load float, ptr %[[PRIV_ALLOC]], align 4 +// CHECK: } + +llvm.func @parallel_op_2_privates(%arg0: !llvm.ptr, %arg1: !llvm.ptr) { + omp.parallel private(@x.privatizer %arg0 -> %arg2 : !llvm.ptr, @y.privatizer %arg1 -> %arg3 : !llvm.ptr) { + %0 = llvm.load %arg2 : !llvm.ptr -> f32 + %1 = llvm.load %arg3 : !llvm.ptr -> i32 + omp.terminator + } + llvm.return +} + +// CHECK-LABEL: @parallel_op_2_privates +// CHECK-SAME: (ptr %[[ORIG1:.*]], ptr %[[ORIG2:.*]]) { +// CHECK: %[[OMP_PAR_ARG:.*]] = alloca { ptr, ptr }, align 8 +// CHECK: %[[ORIG1_GEP:.*]] = getelementptr { ptr, ptr }, ptr %[[OMP_PAR_ARG]], i32 0, i32 0 +// CHECK: store ptr %[[ORIG1]], ptr %[[ORIG1_GEP]], align 8 +// CHECK: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @1, i32 1, ptr @parallel_op_2_privates..omp_par, ptr %[[OMP_PAR_ARG]]) +// CHECK: } + +// CHECK-LABEL: void @parallel_op_2_privates..omp_par +// CHECK-SAME: (ptr noalias %{{.*}}, ptr noalias %{{.*}}, ptr %[[ARG:.*]]) +// CHECK: %[[ORIG1_PTR_PTR:.*]] = getelementptr { ptr, ptr }, ptr %[[ARG]], i32 0, i32 0 +// CHECK: %[[ORIG1_PTR:.*]] = load ptr, ptr %[[ORIG1_PTR_PTR]], align 8 +// CHECK: %[[ORIG2_PTR_PTR:.*]] = getelementptr { ptr, ptr }, ptr %[[ARG]], i32 0, i32 1 +// CHECK: %[[ORIG2_PTR:.*]] = load ptr, ptr %[[ORIG2_PTR_PTR]], align 8 + +// Check that the privatizer alloc region was inlined properly. +// CHECK: %[[PRIV1_ALLOC:.*]] = alloca float, align 4 +// CHECK: %[[ORIG1_VAL:.*]] = load float, ptr %[[ORIG1_PTR]], align 4 +// CHECK: store float %[[ORIG1_VAL]], ptr %[[PRIV1_ALLOC]], align 4 +// CHECK: %[[PRIV2_ALLOC:.*]] = alloca i32, align 4 +// CHECK: %[[ORIG2_VAL:.*]] = load i32, ptr %[[ORIG2_PTR]], align 4 +// CHECK: store i32 %[[ORIG2_VAL]], ptr %[[PRIV2_ALLOC]], align 4 +// CHECK-NEXT: br + +// Check that the privatized value is used (rather than the original one). 
+// CHECK: load float, ptr %[[PRIV1_ALLOC]], align 4 +// CHECK: load i32, ptr %[[PRIV2_ALLOC]], align 4 +// CHECK: } + +omp.private {type = private} @x.privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %c1 = llvm.mlir.constant(1 : i32) : i32 + %0 = llvm.alloca %c1 x f32 : (i32) -> !llvm.ptr + %1 = llvm.load %arg0 : !llvm.ptr -> f32 + llvm.store %1, %0 : f32, !llvm.ptr + omp.yield(%0 : !llvm.ptr) +} + +omp.private {type = private} @y.privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %c1 = llvm.mlir.constant(1 : i32) : i32 + %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr + %1 = llvm.load %arg0 : !llvm.ptr -> i32 + llvm.store %1, %0 : i32, !llvm.ptr + omp.yield(%0 : !llvm.ptr) +} + +// ----- + +llvm.func @parallel_op_private_multi_block(%arg0: !llvm.ptr) { + omp.parallel private(@multi_block.privatizer %arg0 -> %arg2 : !llvm.ptr) { + %0 = llvm.load %arg2 : !llvm.ptr -> f32 + omp.terminator + } + llvm.return +} + +// CHECK-LABEL: define internal void @parallel_op_private_multi_block..omp_par +// CHECK: omp.par.entry: +// CHECK: %[[ORIG_PTR_PTR:.*]] = getelementptr { ptr }, ptr %{{.*}}, i32 0, i32 0 +// CHECK: %[[ORIG_PTR:.*]] = load ptr, ptr %[[ORIG_PTR_PTR]], align 8 +// CHECK: br label %[[PRIV_BB1:.*]] + +// Check contents of the first block in the `alloc` region. +// CHECK: [[PRIV_BB1]]: +// CHECK-NEXT: %[[PRIV_ALLOC:.*]] = alloca float, align 4 +// CHECK-NEXT: br label %[[PRIV_BB2:.*]] + +// Check contents of the second block in the `alloc` region. +// CHECK: [[PRIV_BB2]]: +// CHECK-NEXT: %[[ORIG_PTR2:.*]] = phi ptr [ %[[ORIG_PTR]], %[[PRIV_BB1]] ] +// CHECK-NEXT: %[[PRIV_ALLOC2:.*]] = phi ptr [ %[[PRIV_ALLOC]], %[[PRIV_BB1]] ] +// CHECK-NEXT: %[[ORIG_VAL:.*]] = load float, ptr %[[ORIG_PTR2]], align 4 +// CHECK-NEXT: store float %[[ORIG_VAL]], ptr %[[PRIV_ALLOC2]], align 4 +// CHECK-NEXT: br label %[[PRIV_CONT:.*]] + +// Check that the privatizer's continuation block yileds the private clone's +// address. 
+// CHECK: [[PRIV_CONT]]: +// CHECK-NEXT: %[[PRIV_ALLOC3:.*]] = phi ptr [ %[[PRIV_ALLOC2]], %[[PRIV_BB2]] ] +// CHECK-NEXT: br label %[[PAR_REG:.*]] + +// Check that the body of the parallel region loads from the private clone. +// CHECK: [[PAR_REG]]: +// CHECK: %{{.*}} = load float, ptr %[[PRIV_ALLOC3]], align 4 + +omp.private {type = private} @multi_block.privatizer : !llvm.ptr alloc { +^bb0(%arg0: !llvm.ptr): + %c1 = llvm.mlir.constant(1 : i32) : i32 + %0 = llvm.alloca %c1 x f32 : (i32) -> !llvm.ptr + llvm.br ^bb1(%arg0, %0 : !llvm.ptr, !llvm.ptr) + +^bb1(%arg1: !llvm.ptr, %arg2: !llvm.ptr): + %1 = llvm.load %arg1 : !llvm.ptr -> f32 + llvm.store %1, %arg2 : f32, !llvm.ptr + omp.yield(%arg2 : !llvm.ptr) +} From 87c0260f45e5a02cb07722d089dae3f0f84c7b3d Mon Sep 17 00:00:00 2001 From: erman-gurses <99776114+erman-gurses@users.noreply.github.com> Date: Tue, 27 Feb 2024 23:28:12 -0500 Subject: [PATCH 524/546] [AMDGPU] Add parameterization for optimized shared memory variables (#82508) - This PR adds parameterization for shared memory variables that are used for optimization: `sharedMemoryLineSizeBytes` and `defaultVectorSizeBits.` - The default values are set to 128 for both variables since it gives zero bank conflicts. 
--- .../AMDGPU/TransformOps/AMDGPUTransformOps.td | 6 +- .../mlir/Dialect/AMDGPU/Transforms/Passes.td | 9 ++- .../Dialect/AMDGPU/Transforms/Transforms.h | 10 +++- .../TransformOps/AMDGPUTransformOps.cpp | 3 +- .../Transforms/OptimizeSharedMemory.cpp | 57 ++++++++++--------- .../AMDGPU/optimize_shmem_reads_writes.mlir | 50 +++++++--------- ...transform_optimize_shmem_reads_writes.mlir | 46 +++++++-------- 7 files changed, 97 insertions(+), 84 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td index 23873d86b495c..0eb6705060863 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td +++ b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td @@ -13,8 +13,8 @@ include "mlir/Dialect/Transform/IR/TransformAttrs.td" include "mlir/Dialect/Transform/IR/TransformDialect.td" include "mlir/Dialect/Transform/IR/TransformInterfaces.td" include "mlir/Dialect/Transform/IR/TransformTypes.td" -include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/Interfaces/SideEffectInterfaces.td" //===----------------------------------------------------------------------===// // ApplyOptimizeSharedMemoryReadsAndWritesOp //===----------------------------------------------------------------------===// @@ -28,7 +28,9 @@ def ApplyOptimizeSharedMemoryReadsAndWritesOp : reads/writes with the goal of avoiding bank conflicts. 
}]; - let arguments = (ins TransformHandleTypeInterface:$target); + let arguments = (ins TransformHandleTypeInterface:$target, + DefaultValuedOptionalAttr:$sharedMemoryLineSizeBytes, + DefaultValuedOptionalAttr:$defaultVectorSizeBits); let results = (outs); let assemblyFormat = "$target attr-dict `:` functional-type(operands, results)"; diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td index c8059e6d316e8..67f951fd19d17 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td @@ -37,10 +37,17 @@ def OptimizeSharedMemory : Pass<"amdgpu-optimize-shared-memory"> { attempts to optimize reads/writes from a memref representing GPU shared memory in order to avoid bank conflicts. }]; - let dependentDialects = [ "memref::MemRefDialect", "vector::VectorDialect" ]; + let options = [ + Option<"sharedMemoryLineSizeBytes", "shared-memory-line-size-bytes", "int64_t", + /*default=*/"128", + "Shared memory line size in bytes">, + Option<"defaultVectorSizeBits", "default-vector-size-bits", "int64_t", + /*default=*/"128", + "Default vector size in bits">, + ]; } #endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_ diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h index 79f9ab71a2b43..843cea2c503b9 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h @@ -45,11 +45,15 @@ namespace amdgpu { /// function that depends on the row Index. The permutation function is chosen /// to ensure that sequential distributed+vectorized reads/writes down a single /// dimension of the memref have minimal conflicts. 
-LogicalResult optimizeSharedMemoryReadsAndWrites(Operation *parentOp, - Value memrefValue); +LogicalResult +optimizeSharedMemoryReadsAndWrites(Operation *parentOp, Value memrefValue, + int64_t sharedMemoryLineSizeBytes, + int64_t defaultVectorSizeBits); std::optional -optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp); +optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp, + int64_t sharedMemoryLineSizeBytes, + int64_t defaultVectorSizeBits); } // namespace amdgpu } // namespace mlir diff --git a/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp b/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp index ff29f9f693853..b7e17a9289738 100644 --- a/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp +++ b/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp @@ -27,7 +27,8 @@ DiagnosedSilenceableFailure ApplyOptimizeSharedMemoryReadsAndWritesOp::applyToOne( TransformRewriter &rewriter, FuncOp funcOp, ApplyToEachResultList &results, TransformState &state) { - optimizeSharedMemoryReadsAndWritesOp(funcOp); + optimizeSharedMemoryReadsAndWritesOp(funcOp, getSharedMemoryLineSizeBytes(), + getDefaultVectorSizeBits()); return DiagnosedSilenceableFailure::success(); } diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp index 6bd03ed833898..32fab265e03cc 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp +++ b/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp @@ -35,13 +35,6 @@ namespace amdgpu { using namespace mlir; using namespace mlir::amdgpu; -/// The size of a shared memory line according to AMD documentation. -/// https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/instinct-mi200-cdna2-instruction-set-architecture.pdf -constexpr int64_t kSharedMemoryLineSizeBytes = 64; -/// We optimize for 64bit accesses, but this can be made an argument in the -/// future. 
-constexpr int64_t kDefaultVectorSizeBits = 64; - /// Uses `srcIndexValue` to permute `tgtIndexValue` via /// `result = xor(floordiv(srcIdxVal,permuteEveryN), /// floordiv(tgtIdxVal,vectorSize))) @@ -49,7 +42,9 @@ constexpr int64_t kDefaultVectorSizeBits = 64; /// This is done using an optimized sequence of `arith` operations. static Value permuteVectorOffset(OpBuilder &b, Location loc, ArrayRef indices, MemRefType memrefTy, - int64_t srcDim, int64_t tgtDim) { + int64_t srcDim, int64_t tgtDim, + int64_t sharedMemoryLineSizeBytes, + int64_t defaultVectorSizeBits) { // Adjust the src index to change how often the permutation changes // if necessary. Value src = indices[srcDim]; @@ -57,9 +52,9 @@ static Value permuteVectorOffset(OpBuilder &b, Location loc, // We only want to permute every N iterations of the target dim where N is // ceil(sharedMemoryLineSizeBytes / dimSizeBytes(tgtDim)). const int64_t permuteEveryN = std::max( - 1, kSharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) * - memrefTy.getElementTypeBitWidth()) / - 8)); + 1, sharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) * + memrefTy.getElementTypeBitWidth()) / + 8)); // clang-format off // Index bit representation (b0 = least significant bit) for dim(1) @@ -71,7 +66,7 @@ static Value permuteVectorOffset(OpBuilder &b, Location loc, // bits[N:M] = vector index // clang-format on int64_t n = - llvm::Log2_64(kDefaultVectorSizeBits / memrefTy.getElementTypeBitWidth()); + llvm::Log2_64(defaultVectorSizeBits / memrefTy.getElementTypeBitWidth()); int64_t m = llvm::Log2_64(memrefTy.getDimSize(tgtDim)); // Capture bits[0:(M-N)] of src by first creating a (M-N) mask. 
@@ -105,9 +100,11 @@ static Value permuteVectorOffset(OpBuilder &b, Location loc, static void transformIndices(OpBuilder &builder, Location loc, SmallVector &indices, MemRefType memrefTy, int64_t srcDim, - int64_t tgtDim) { + int64_t tgtDim, int64_t sharedMemoryLineSizeBytes, + int64_t defaultVectorSizeBits) { indices[tgtDim] = - permuteVectorOffset(builder, loc, indices, memrefTy, srcDim, tgtDim); + permuteVectorOffset(builder, loc, indices, memrefTy, srcDim, tgtDim, + sharedMemoryLineSizeBytes, defaultVectorSizeBits); } // Return all operations within `parentOp` that read from or write to @@ -149,8 +146,9 @@ getShmReadAndWriteOps(Operation *parentOp, Value shmMemRef, return success(); } -LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp, - Value memrefValue) { +LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites( + Operation *parentOp, Value memrefValue, int64_t sharedMemoryLineSizeBytes, + int64_t defaultVectorSizeBits) { auto memRefType = dyn_cast(memrefValue.getType()); if (!memRefType || !amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(memRefType)) @@ -167,10 +165,10 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp, // If dim[rank-1] is small enough to fit 8 rows in a 128B line. 
const int64_t rowSize = memRefType.getDimSize(memRefType.getRank() - 1); const int64_t rowsPerLine = - (8 * kSharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) / + (8 * sharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) / rowSize; const int64_t threadGroupSize = - 1LL << (7 - llvm::Log2_64(kDefaultVectorSizeBits / 8)); + 1LL << (7 - llvm::Log2_64(defaultVectorSizeBits / 8)); if (rowsPerLine >= threadGroupSize) return failure(); @@ -198,7 +196,8 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp, auto indices = amdgpu::getIndices(shmWriteOp); SmallVector transformedIndices(indices->begin(), indices->end()); transformIndices(builder, shmWriteOp->getLoc(), transformedIndices, - memRefType, srcDim, tgtDim); + memRefType, srcDim, tgtDim, sharedMemoryLineSizeBytes, + defaultVectorSizeBits); amdgpu::setIndices(shmWriteOp, transformedIndices); } @@ -210,7 +209,8 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp, auto indices = amdgpu::getIndices(shmReadOp); SmallVector transformedIndices(indices->begin(), indices->end()); transformIndices(builder, shmReadOp->getLoc(), transformedIndices, - memRefType, srcDim, tgtDim); + memRefType, srcDim, tgtDim, sharedMemoryLineSizeBytes, + defaultVectorSizeBits); amdgpu::setIndices(shmReadOp, transformedIndices); } @@ -218,7 +218,9 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp, } std::optional -amdgpu::optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp) { +amdgpu::optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp, + int64_t sharedMemoryLineSizeBytes, + int64_t defaultVectorSizeBits) { SmallVector shmAllocOps; funcOp.walk([&](memref::AllocOp allocOp) { if (!amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(allocOp.getType())) @@ -226,8 +228,9 @@ amdgpu::optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp) { shmAllocOps.push_back(allocOp); }); for (auto allocOp : shmAllocOps) { - if 
(failed(amdgpu::optimizeSharedMemoryReadsAndWrites(funcOp, - allocOp.getMemref()))) + if (failed(amdgpu::optimizeSharedMemoryReadsAndWrites( + funcOp, allocOp.getMemref(), sharedMemoryLineSizeBytes, + defaultVectorSizeBits))) return failure(); } return success(); @@ -237,7 +240,8 @@ struct OptimizeSharedMemoryPass : public amdgpu::impl::OptimizeSharedMemoryBase { public: OptimizeSharedMemoryPass() = default; - + OptimizeSharedMemoryPass(const OptimizeSharedMemoryOptions &options) + : OptimizeSharedMemoryBase(options) {} void runOnOperation() override { Operation *op = getOperation(); SmallVector shmAllocOps; @@ -248,8 +252,9 @@ struct OptimizeSharedMemoryPass shmAllocOps.push_back(allocOp); }); for (auto allocOp : shmAllocOps) { - if (failed(optimizeSharedMemoryReadsAndWrites(getOperation(), - allocOp.getMemref()))) + if (failed(optimizeSharedMemoryReadsAndWrites(op, allocOp.getMemref(), + sharedMemoryLineSizeBytes, + defaultVectorSizeBits))) return; } } diff --git a/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir b/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir index a1de1ff87c229..983eee732e2af 100644 --- a/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir +++ b/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir @@ -1,13 +1,13 @@ -// RUN: mlir-opt %s --pass-pipeline='builtin.module(func.func(amdgpu-optimize-shared-memory))' | FileCheck %s +// RUN: mlir-opt %s --pass-pipeline='builtin.module(func.func(amdgpu-optimize-shared-memory))' | FileCheck %s // CHECK: @optimize_shmem([[arg0:%.+]]: memref<{{.*}}>, [[readRow:%.+]]: index, [[readCol:%.+]]: index, [[writeRow:%.+]]: index, [[writeCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index, [[fragColPerm:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index) - func.func @optimize_shmem(%arg0: memref<4096x4096xf16>, + func.func @optimize_shmem(%arg0: memref<4096x4096xf16>, %readRow: index, %readCol: index, %writeRow: index, %writeCol: index, - %fragRow: index, %fragCol: 
index, + %fragRow: index, %fragCol: index, %fragColPerm: index, %stRow: index, %stCol: index) { - // CHECK: %[[cst:.+]] = arith.constant 0.000000e+00 : f16 + // CHECK: %[[cst:.+]] = arith.constant 0.000000e+00 : f16 %cst = arith.constant 0.000000e+00 : f16 // CHECK: [[shmA:%.+]] = memref.alloc @@ -15,42 +15,36 @@ %shmA = memref.alloc() {alignment = 64 : i64} : memref<128x32xf16, 3> %shmB = memref.alloc() {alignment = 64 : i64} : memref<256x32xf16, 3> - // CHECK: %[[D0:.+]] = vector.transfer_read [[arg0:%.+]][[[readRow:%.+]], [[readCol:%.+]]], [[cst:.+]] {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> %0 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] - // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] - // CHECK: vector.transfer_write %[[D0:.+]], [[shmB]][[[writeRow:%.+]], [[writeCol:%.+]]] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3> + // CHECK: [[c6:%.+]] = arith.constant 6 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] vector.transfer_write %0, %shmB[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3> gpu.barrier gpu.barrier - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[c6:%.+]] = arith.constant 6 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]] + // CHECK: 
[[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] - // CHECK: vector.load [[shmB:%.+]][[[fragRow:%.+]], [[fragColPerm]]] : memref<256x32xf16, 3>, vector<8xf16> %1 = vector.load %shmB[%fragRow, %fragColPerm] : memref<256x32xf16, 3>, vector<8xf16> - // CHECK: %[[D2:.+]] = vector.transfer_read [[arg0:%.+]][[[readRow:%.+]], [[readCol:%.+]]], [[cst:.+]] {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> %2 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] - // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] - // CHECK: vector.transfer_write %[[D2:.+]], [[shmA:%.+]][[[writeRow:%.+]], [[writeCol:%.+]]] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3> + // CHECK: [[c6:%.+]] = arith.constant 6 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] vector.transfer_write %2, %shmA[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3> gpu.barrier gpu.barrier - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[c6:%.+]] = arith.constant 6 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], 
[[c2]] // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] - // CHECK: vector.load [[shmA:%.+]][[[fragRow:%.+]], [[fragColPerm]]] : memref<128x32xf16, 3>, vector<8xf16> %3 = vector.load %shmA[%fragRow, %fragColPerm] : memref<128x32xf16, 3>, vector<8xf16> return } diff --git a/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir b/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir index 143e7c2d27095..b1bb91ffc2972 100644 --- a/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir +++ b/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir @@ -1,10 +1,10 @@ -// RUN: mlir-opt %s -transform-interpreter | FileCheck %s +// RUN: mlir-opt %s -transform-interpreter | FileCheck %s // CHECK: @optimize_shmem([[arg0:%.+]]: memref<{{.*}}>, [[readRow:%.+]]: index, [[readCol:%.+]]: index, [[writeRow:%.+]]: index, [[writeCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index, [[fragColPerm:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index) - func.func @optimize_shmem(%arg0: memref<4096x4096xf16>, + func.func @optimize_shmem(%arg0: memref<4096x4096xf16>, %readRow: index, %readCol: index, %writeRow: index, %writeCol: index, - %fragRow: index, %fragCol: index, + %fragRow: index, %fragCol: index, %fragColPerm: index, %stRow: index, %stCol: index) { %cst = arith.constant 0.000000e+00 : f16 @@ -13,33 +13,33 @@ %shmB = memref.alloc() {alignment = 64 : i64} : memref<256x32xf16, 3> %0 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] - // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] + // CHECK: [[c6:%.+]] = arith.constant 6 : index + // CHECK: [[srcBits:%.+]] = arith.andi 
[[stRow:%.+]], [[c6]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] vector.transfer_write %0, %shmB[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3> gpu.barrier gpu.barrier - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] - // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] + // CHECK: [[c6:%.+]] = arith.constant 6 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] %1 = vector.load %shmB[%fragRow, %fragColPerm] : memref<256x32xf16, 3>, vector<8xf16> %2 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] - // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] + // CHECK: [[c6:%.+]] = arith.constant 6 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] vector.transfer_write %2, %shmA[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3> gpu.barrier gpu.barrier - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]] - 
// CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[c6:%.+]] = arith.constant 6 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] %3 = vector.load %shmA[%fragRow, %fragColPerm] : memref<128x32xf16, 3>, vector<8xf16> return @@ -48,7 +48,7 @@ module attributes { transform.with_named_sequence } { transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op - transform.amdgpu.optimize_shared_memory_reads_and_writes %0 : (!transform.any_op) -> () + transform.amdgpu.optimize_shared_memory_reads_and_writes %0 {sharedMemoryLineSizeBytes = 128, defaultVectorSizeBits = 128}: (!transform.any_op) -> () transform.yield } // @__transform_main } // module From c2b952926fe8707527cf1b8bab211dc4c7ab9aee Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Wed, 28 Feb 2024 00:11:28 -0500 Subject: [PATCH 525/546] [mlir][vector] Fix n-d transfer write distribution (#83215) Currently n-d transfer write distribution can be inconsistent with distribution of reductions if a value has multiple users, one of which is a transfer_write with a non-standard distribution map, and the other of which is a vector.reduction. We may want to consider removing the distribution map functionality in the future for this reason. 
--- .../Vector/Transforms/VectorDistribute.cpp | 44 ++++++++++++++++--- .../Vector/vector-warp-distribute.mlir | 25 +++++++++++ .../Dialect/Vector/TestVectorTransforms.cpp | 6 +-- 3 files changed, 65 insertions(+), 10 deletions(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp index 620ceee48b196..b3ab4a916121e 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp @@ -443,15 +443,24 @@ static vector::TransferWriteOp cloneWriteOp(RewriterBase &rewriter, /// d1) and return vector<16x2x64> static VectorType getDistributedType(VectorType originalType, AffineMap map, int64_t warpSize) { - if (map.getNumResults() != 1) - return VectorType(); SmallVector targetShape(originalType.getShape().begin(), originalType.getShape().end()); for (unsigned i = 0, e = map.getNumResults(); i < e; i++) { unsigned position = map.getDimPosition(i); - if (targetShape[position] % warpSize != 0) - return VectorType(); + if (targetShape[position] % warpSize != 0) { + if (warpSize % targetShape[position] != 0) { + return VectorType(); + } + warpSize /= targetShape[position]; + targetShape[position] = 1; + continue; + } targetShape[position] = targetShape[position] / warpSize; + warpSize = 1; + break; + } + if (warpSize != 1) { + return VectorType(); } VectorType targetType = VectorType::get(targetShape, originalType.getElementType()); @@ -526,7 +535,30 @@ struct WarpOpTransferWrite : public OpRewritePattern { // 4. Reindex the write using the distribution map. auto newWarpOp = newWriteOp.getVector().getDefiningOp(); + + // Delinearize the lane id based on the way threads are divided across the + // vector. To get the number of threads per vector dimension, divide the + // sequential size by the distributed size along each dim. 
rewriter.setInsertionPoint(newWriteOp); + SmallVector delinearizedIdSizes; + for (auto [seqSize, distSize] : + llvm::zip_equal(writtenVectorType.getShape(), targetType.getShape())) { + assert(seqSize % distSize == 0 && "Invalid distributed vector shape"); + delinearizedIdSizes.push_back(rewriter.getIndexAttr(seqSize / distSize)); + } + SmallVector delinearized; + if (map.getNumResults() > 1) { + delinearized = rewriter + .create( + newWarpOp.getLoc(), newWarpOp.getLaneid(), + delinearizedIdSizes) + .getResults(); + } else { + // If there is only one map result, we can elide the delinearization + // op and use the lane id directly. + delinearized.append(targetType.getRank(), newWarpOp.getLaneid()); + } + AffineMap indexMap = map.compose(newWriteOp.getPermutationMap()); Location loc = newWriteOp.getLoc(); SmallVector indices(newWriteOp.getIndices().begin(), @@ -539,11 +571,11 @@ struct WarpOpTransferWrite : public OpRewritePattern { continue; unsigned indexPos = indexExpr.getPosition(); unsigned vectorPos = cast(std::get<1>(it)).getPosition(); + Value laneId = delinearized[vectorPos]; auto scale = rewriter.getAffineConstantExpr(targetType.getDimSize(vectorPos)); indices[indexPos] = affine::makeComposedAffineApply( - rewriter, loc, d0 + scale * d1, - {indices[indexPos], newWarpOp.getLaneid()}); + rewriter, loc, d0 + scale * d1, {indices[indexPos], laneId}); } newWriteOp.getIndicesMutable().assign(indices); diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir index 9072603734879..bf90c4a6ebb3c 100644 --- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir +++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir @@ -1559,3 +1559,28 @@ func.func @warp_propagate_multi_dim_create_mask(%laneid: index, %m0: index, %m1: // CHECK-PROP: %[[DISTM0:.+]] = affine.apply #[[$SUBM0]]()[%[[M0]], %[[LANEID]]] // CHECK-PROP: %[[DISTM1:.+]] = affine.apply #[[$SUBM1]]()[%[[M1]], %[[LANEID]]] // CHECK-PROP: 
vector.create_mask %[[DISTM0]], %[[DISTM1]], %[[M2]] : vector<1x2x4xi1> + +// ----- + +func.func @warp_propagate_nd_write(%laneid: index, %dest: memref<4x1024xf32>) { + %c0 = arith.constant 0 : index + vector.warp_execute_on_lane_0(%laneid)[32] -> () { + %0 = "some_def"() : () -> (vector<4x1024xf32>) + vector.transfer_write %0, %dest[%c0, %c0] : vector<4x1024xf32>, memref<4x1024xf32> + vector.yield + } + return +} + +// CHECK-DIST-AND-PROP: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 128)> + +// CHECK-DIST-AND-PROP-LABEL: func.func @warp_propagate_nd_write( +// CHECK-DIST-AND-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x128xf32>) { +// CHECK-DIST-AND-PROP: %[[V0:.*]] = "some_def" +// CHECK-DIST-AND-PROP: vector.yield %[[V0]] +// CHECK-DIST-AND-PROP-SAME: vector<4x1024xf32> +// CHECK-DIST-AND-PROP: } + +// CHECK-DIST-AND-PROP: %[[IDS:.+]]:2 = affine.delinearize_index %{{.*}} into (%c4, %c8) : index, index +// CHECK-DIST-AND-PROP: %[[INNER_ID:.+]] = affine.apply #map()[%[[IDS]]#1] +// CHECK-DIST-AND-PROP: vector.transfer_write %[[W]], %{{.*}}[%[[IDS]]#0, %[[INNER_ID]]] {{.*}} : vector<1x128xf32> diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp index 178a58e796b24..915f713f7047b 100644 --- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp +++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp @@ -630,15 +630,13 @@ struct TestVectorDistribution }); MLIRContext *ctx = &getContext(); auto distributionFn = [](Value val) { - // Create a map (d0, d1) -> (d1) to distribute along the inner - // dimension. Once we support n-d distribution we can add more - // complex cases. + // Create an identity dim map of the same rank as the vector. VectorType vecType = dyn_cast(val.getType()); int64_t vecRank = vecType ? 
vecType.getRank() : 0; OpBuilder builder(val.getContext()); if (vecRank == 0) return AffineMap::get(val.getContext()); - return AffineMap::get(vecRank, 0, builder.getAffineDimExpr(vecRank - 1)); + return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext()); }; auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx, int64_t warpSz) { From 267beb10f2812107734a1cd2172b46e928af76b7 Mon Sep 17 00:00:00 2001 From: mlevesquedion Date: Tue, 27 Feb 2024 21:31:15 -0800 Subject: [PATCH 526/546] [MLIR] Fix a few links to passes in the documentation (#83221) I double checked the links by building [the website](https://github.com/llvm/mlir-www): ``` $ mlir-www-helper.sh --install-docs ../llvm-project website $ cd website && hugo serve ``` --- mlir/docs/PassManagement.md | 3 +-- mlir/docs/PatternRewriter.md | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mlir/docs/PassManagement.md b/mlir/docs/PassManagement.md index ff86bbfef7b0b..c9d705f0506a3 100644 --- a/mlir/docs/PassManagement.md +++ b/mlir/docs/PassManagement.md @@ -56,8 +56,7 @@ By default, an operation pass is `op-agnostic`, meaning that it operates on the operation type of the pass manager that it is added to. This means a pass may operate on many different types of operations. Agnostic passes should be written such that they do not make assumptions on the operation they run on. Examples of this type of pass are -[Canonicalization](Pass.md/-canonicalize-canonicalize-operations) -[Common Sub-Expression Elimination](Passes.md/#-cse-eliminate-common-sub-expressions). +[Canonicalization](Passes.md/#-canonicalize) and [Common Sub-Expression Elimination](Passes.md/#-cse). 
To create an agnostic operation pass, a derived class must adhere to the following: diff --git a/mlir/docs/PatternRewriter.md b/mlir/docs/PatternRewriter.md index 011cd14175634..0ba76199874cc 100644 --- a/mlir/docs/PatternRewriter.md +++ b/mlir/docs/PatternRewriter.md @@ -366,7 +366,7 @@ Note: This driver listens for IR changes via the callbacks provided by rewriter and do not bypass the rewriter API by modifying ops directly. Note: This driver is the one used by the [canonicalization](Canonicalization.md) -[pass](Passes.md/#-canonicalize-canonicalize-operations) in MLIR. +[pass](Passes.md/#-canonicalize) in MLIR. ### Debugging From 4d04a40adb68f284350831911a658715134c66d8 Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Tue, 27 Feb 2024 22:25:55 -0800 Subject: [PATCH 527/546] [alpha.webkit.UncountedCallArgsChecker] Allow a variable declaration in a trivial function. (#82291) --- .../Checkers/WebKit/PtrTypesSemantics.cpp | 8 ++++++-- .../Checkers/WebKit/uncounted-obj-arg.cpp | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp index defd83ec8e179..01b191ab0eeaf 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp @@ -310,8 +310,12 @@ class TrivialFunctionAnalysisVisitor return true; if (isa(decl)) return true; - if (auto *VD = dyn_cast(decl)) - return VD->hasConstantInitialization() && VD->getEvaluatedValue(); + if (auto *VD = dyn_cast(decl)) { + if (VD->hasConstantInitialization() && VD->getEvaluatedValue()) + return true; + auto *Init = VD->getInit(); + return !Init || Visit(Init); + } } return false; } diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp index ac16a31293f3d..80a9a263dab14 100644 --- 
a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp @@ -199,6 +199,8 @@ class RefCounted { bool trivial23() const { return OptionSet::fromRaw(v).contains(Flags::Flag1); } int trivial24() const { ASSERT(v); return v; } unsigned trivial25() const { return __c11_atomic_load((volatile _Atomic(unsigned) *)&v, __ATOMIC_RELAXED); } + bool trivial26() { bool hasValue = v; return !hasValue; } + bool trivial27(int v) { bool value; value = v ? 1 : 0; return value; } static RefCounted& singleton() { static RefCounted s_RefCounted; @@ -262,6 +264,15 @@ class RefCounted { return __c11_atomic_load((volatile _Atomic(unsigned) *)another(), __ATOMIC_RELAXED); } + void nonTrivial11() { + Number num(0.3); + } + + bool nonTrivial12() { + bool val = otherFunction(); + return val; + } + unsigned v { 0 }; Number* number { nullptr }; Enum enumValue { Enum::Value1 }; @@ -309,6 +320,8 @@ class UnrelatedClass { getFieldTrivial().trivial23(); // no-warning getFieldTrivial().trivial24(); // no-warning getFieldTrivial().trivial25(); // no-warning + getFieldTrivial().trivial26(); // no-warning + getFieldTrivial().trivial27(5); // no-warning RefCounted::singleton().trivial18(); // no-warning RefCounted::singleton().someFunction(); // no-warning @@ -334,6 +347,10 @@ class UnrelatedClass { // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} getFieldTrivial().nonTrivial10(); // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} + getFieldTrivial().nonTrivial11(); + // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} + getFieldTrivial().nonTrivial12(); + // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}} } }; From 8eea478f57e79b6fad065d023355907bc2098206 Mon Sep 17 00:00:00 2001 From: Ryan Prichard Date: Tue, 27 Feb 2024 22:34:07 -0800 Subject: [PATCH 528/546] [ItaniumDemangle] reject A-F in FP 
literals (#83061) Sync this change to the copy of ItaniumDemangle.h in "llvm": https://github.com/llvm/llvm-project/pull/82864 The Itanium C++ ABI specifies that FP literals are encoded using a lowercase hexadecimal string. Previously, libc++abi allowed uppercase A-F characters but decoded them by subtracting 'a' from them, producing negative digit values. It is especially confusing to accept an 'E' digit because 'E' marks the end of the FP literal. --- llvm/include/llvm/Demangle/ItaniumDemangle.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h index 04bc58d8f63e4..d33af157543fe 100644 --- a/llvm/include/llvm/Demangle/ItaniumDemangle.h +++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h @@ -5540,7 +5540,7 @@ Node *AbstractManglingParser::parseFloatingLiteral() { return nullptr; std::string_view Data(First, N); for (char C : Data) - if (!std::isxdigit(C)) + if (!(C >= '0' && C <= '9') && !(C >= 'a' && C <= 'f')) return nullptr; First += N; if (!consumeIf('E')) From 28c29fbec3057692a7985819d799a9e5d47eb2d1 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 28 Feb 2024 14:27:54 +0800 Subject: [PATCH 529/546] [RISCV] Add exact VLEN RUNs for insert_subvector and concat_vector tests. NFC Also update the RUNs in the extract_subvector tests to be consistent. Using the term VLS/VLA here as it's more succinct than KNOWNVLEN/UNKNOWNVLEN. 
--- .../rvv/fixed-vectors-extract-subvector.ll | 860 +++++++++--------- .../rvv/fixed-vectors-insert-subvector.ll | 559 +++++++++--- .../RISCV/rvv/fixed-vectors-shuffle-concat.ll | 265 ++++-- 3 files changed, 1012 insertions(+), 672 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll index c49b1a7ad1861..b9c611bf3e54a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V -; RUN: llc -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-max=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-KNOWNVLEN128 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA +; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA + +; RUN: llc -mtriple=riscv32 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA +; RUN: llc -mtriple=riscv64 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA + +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s define void @extract_v2i8_v4i8_0(ptr %x, ptr %y) { ; CHECK-LABEL: extract_v2i8_v4i8_0: @@ -63,22 +69,22 @@ define void @extract_v2i8_v8i8_6(ptr %x, ptr %y) { } define void @extract_v1i32_v8i32_4(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v1i32_v8i32_4: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: 
vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vle32.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 4 -; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v1i32_v8i32_4: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v8, (a0) +; VLA-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 4 +; VLA-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v1i32_v8i32_4: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v9, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v1i32_v8i32_4: +; VLS: # %bb.0: +; VLS-NEXT: vl2re32.v v8, (a0) +; VLS-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v9, (a1) +; VLS-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <1 x i32> @llvm.vector.extract.v1i32.v8i32(<8 x i32> %a, i64 4) store <1 x i32> %c, ptr %y @@ -86,24 +92,24 @@ define void @extract_v1i32_v8i32_4(ptr %x, ptr %y) { } define void @extract_v1i32_v8i32_5(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v1i32_v8i32_5: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vle32.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 5 -; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v1i32_v8i32_5: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v8, (a0) +; VLA-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 5 +; VLA-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v1i32_v8i32_5: 
-; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v9, 1 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v1i32_v8i32_5: +; VLS: # %bb.0: +; VLS-NEXT: vl2re32.v v8, (a0) +; VLS-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; VLS-NEXT: vslidedown.vi v8, v9, 1 +; VLS-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v8, (a1) +; VLS-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <1 x i32> @llvm.vector.extract.v1i32.v8i32(<8 x i32> %a, i64 5) store <1 x i32> %c, ptr %y @@ -111,20 +117,20 @@ define void @extract_v1i32_v8i32_5(ptr %x, ptr %y) { } define void @extract_v2i32_v8i32_0(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i32_v8i32_0: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vle32.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i32_v8i32_0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v8, (a0) +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_0: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i32_v8i32_0: +; VLS: # %bb.0: +; VLS-NEXT: vl2re32.v v8, (a0) +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v8, (a1) +; VLS-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 0) store <2 x i32> %c, ptr %y @@ -132,24 +138,24 @@ define void @extract_v2i32_v8i32_0(ptr %x, ptr %y) { } 
define void @extract_v2i32_v8i32_2(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i32_v8i32_2: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vle32.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i32_v8i32_2: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v8, (a0) +; VLA-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 2 +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_2: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i32_v8i32_2: +; VLS: # %bb.0: +; VLS-NEXT: vl2re32.v v8, (a0) +; VLS-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; VLS-NEXT: vslidedown.vi v8, v8, 2 +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v8, (a1) +; VLS-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 2) store <2 x i32> %c, ptr %y @@ -157,22 +163,22 @@ define void @extract_v2i32_v8i32_2(ptr %x, ptr %y) { } define void @extract_v2i32_v8i32_4(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i32_v8i32_4: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vle32.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 4 -; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: 
extract_v2i32_v8i32_4: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v8, (a0) +; VLA-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 4 +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_4: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v9, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i32_v8i32_4: +; VLS: # %bb.0: +; VLS-NEXT: vl2re32.v v8, (a0) +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v9, (a1) +; VLS-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 4) store <2 x i32> %c, ptr %y @@ -180,24 +186,24 @@ define void @extract_v2i32_v8i32_4(ptr %x, ptr %y) { } define void @extract_v2i32_v8i32_6(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i32_v8i32_6: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vle32.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 6 -; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i32_v8i32_6: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v8, (a0) +; VLA-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 6 +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_6: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v9, 2 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; 
CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i32_v8i32_6: +; VLS: # %bb.0: +; VLS-NEXT: vl2re32.v v8, (a0) +; VLS-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; VLS-NEXT: vslidedown.vi v8, v9, 2 +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v8, (a1) +; VLS-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 6) store <2 x i32> %c, ptr %y @@ -230,59 +236,59 @@ define void @extract_v2i32_nxv16i32_2( %x, ptr %y) { } define void @extract_v2i32_nxv16i32_4( %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i32_nxv16i32_4: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 4 -; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a0) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i32_nxv16i32_4: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 4 +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a0) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_nxv16i32_4: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v9, (a0) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i32_nxv16i32_4: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v9, (a0) +; VLS-NEXT: ret %c = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32( %x, i64 4) store <2 x i32> %c, ptr %y ret void } define void @extract_v2i32_nxv16i32_6( %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i32_nxv16i32_6: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 6 -; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a0) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i32_nxv16i32_6: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, m2, 
ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 6 +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a0) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_nxv16i32_6: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v9, 2 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i32_nxv16i32_6: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; VLS-NEXT: vslidedown.vi v8, v9, 2 +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v8, (a0) +; VLS-NEXT: ret %c = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32( %x, i64 6) store <2 x i32> %c, ptr %y ret void } define void @extract_v2i32_nxv16i32_8( %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i32_nxv16i32_8: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 2, e32, m4, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 8 -; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a0) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i32_nxv16i32_8: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, m4, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 8 +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a0) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_nxv16i32_8: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v10, (a0) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i32_nxv16i32_8: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v10, (a0) +; VLS-NEXT: ret %c = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32( %x, i64 8) store <2 x i32> %c, ptr %y ret void @@ -339,40 +345,40 @@ define void @extract_v2i8_nxv2i8_6( %x, ptr %y) { } define void @extract_v8i32_nxv16i32_8( %x, ptr %y) { -; 
CHECK-V-LABEL: extract_v8i32_nxv16i32_8: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 8 -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a0) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v8i32_nxv16i32_8: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 8 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vse32.v v8, (a0) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v8i32_nxv16i32_8: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vs2r.v v10, (a0) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v8i32_nxv16i32_8: +; VLS: # %bb.0: +; VLS-NEXT: vs2r.v v10, (a0) +; VLS-NEXT: ret %c = call <8 x i32> @llvm.vector.extract.v8i32.nxv16i32( %x, i64 8) store <8 x i32> %c, ptr %y ret void } define void @extract_v8i1_v64i1_0(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v8i1_v64i1_0: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: li a2, 64 -; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-V-NEXT: vlm.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vsm.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v8i1_v64i1_0: +; VLA: # %bb.0: +; VLA-NEXT: li a2, 64 +; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; VLA-NEXT: vlm.v v8, (a0) +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vsm.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v8i1_v64i1_0: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vlm.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v8i1_v64i1_0: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; VLS-NEXT: vlm.v v8, (a0) +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vsm.v v8, (a1) +; VLS-NEXT: ret %a = load <64 x 
i1>, ptr %x %c = call <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> %a, i64 0) store <8 x i1> %c, ptr %y @@ -380,26 +386,26 @@ define void @extract_v8i1_v64i1_0(ptr %x, ptr %y) { } define void @extract_v8i1_v64i1_8(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v8i1_v64i1_8: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: li a2, 64 -; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-V-NEXT: vlm.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vsm.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v8i1_v64i1_8: +; VLA: # %bb.0: +; VLA-NEXT: li a2, 64 +; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; VLA-NEXT: vlm.v v8, (a0) +; VLA-NEXT: vsetivli zero, 1, e8, mf2, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 1 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vsm.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v8i1_v64i1_8: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vlm.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v8i1_v64i1_8: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; VLS-NEXT: vlm.v v8, (a0) +; VLS-NEXT: vsetivli zero, 1, e8, mf2, ta, ma +; VLS-NEXT: vslidedown.vi v8, v8, 1 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vsm.v v8, (a1) +; VLS-NEXT: ret %a = load <64 x i1>, ptr %x %c = call <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> %a, i64 8) store <8 x i1> %c, ptr %y @@ -407,26 +413,26 @@ define void @extract_v8i1_v64i1_8(ptr %x, ptr %y) { } define void @extract_v8i1_v64i1_48(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v8i1_v64i1_48: -; CHECK-V: # %bb.0: -; 
CHECK-V-NEXT: li a2, 64 -; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-V-NEXT: vlm.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 6 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vsm.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v8i1_v64i1_48: +; VLA: # %bb.0: +; VLA-NEXT: li a2, 64 +; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; VLA-NEXT: vlm.v v8, (a0) +; VLA-NEXT: vsetivli zero, 1, e8, mf2, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 6 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vsm.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v8i1_v64i1_48: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vlm.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 6 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v8i1_v64i1_48: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; VLS-NEXT: vlm.v v8, (a0) +; VLS-NEXT: vsetivli zero, 1, e8, mf2, ta, ma +; VLS-NEXT: vslidedown.vi v8, v8, 6 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vsm.v v8, (a1) +; VLS-NEXT: ret %a = load <64 x i1>, ptr %x %c = call <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> %a, i64 48) store <8 x i1> %c, ptr %y @@ -508,38 +514,38 @@ define void @extract_v8i1_nxv64i1_192( %x, ptr %y) { } define void @extract_v2i1_v64i1_0(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i1_v64i1_0: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: li a2, 64 -; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-V-NEXT: vlm.v v0, (a0) -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmv.v.i 
v9, 0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-V-NEXT: vmv.v.v v9, v8 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmsne.vi v8, v9, 0 -; CHECK-V-NEXT: vsm.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i1_v64i1_0: +; VLA: # %bb.0: +; VLA-NEXT: li a2, 64 +; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; VLA-NEXT: vlm.v v0, (a0) +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmv.v.i v9, 0 +; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLA-NEXT: vmv.v.v v9, v8 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmsne.vi v8, v9, 0 +; VLA-NEXT: vsm.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_v64i1_0: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vlm.v v0, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i1_v64i1_0: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; VLS-NEXT: vlm.v v0, (a0) +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmv.v.i v9, 0 +; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLS-NEXT: vmv.v.v v9, v8 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmsne.vi v8, v9, 0 +; VLS-NEXT: vsm.v v8, (a1) 
+; VLS-NEXT: ret %a = load <64 x i1>, ptr %x %c = call <2 x i1> @llvm.vector.extract.v2i1.v64i1(<64 x i1> %a, i64 0) store <2 x i1> %c, ptr %y @@ -547,48 +553,48 @@ define void @extract_v2i1_v64i1_0(ptr %x, ptr %y) { } define void @extract_v2i1_v64i1_2(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i1_v64i1_2: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: li a2, 64 -; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-V-NEXT: vlm.v v0, (a0) -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-V-NEXT: vmsne.vi v0, v8, 0 -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmv.v.i v9, 0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-V-NEXT: vmv.v.v v9, v8 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmsne.vi v8, v9, 0 -; CHECK-V-NEXT: vsm.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i1_v64i1_2: +; VLA: # %bb.0: +; VLA-NEXT: li a2, 64 +; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; VLA-NEXT: vlm.v v0, (a0) +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 2 +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmsne.vi v0, v8, 0 +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmv.v.i v9, 0 +; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLA-NEXT: vmv.v.v v9, v8 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmsne.vi v8, v9, 0 +; VLA-NEXT: vsm.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_v64i1_2: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vlm.v v0, (a0) -; 
CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i1_v64i1_2: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; VLS-NEXT: vlm.v v0, (a0) +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; VLS-NEXT: vslidedown.vi v8, v8, 2 +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmsne.vi v0, v8, 0 +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmv.v.i v9, 0 +; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLS-NEXT: vmv.v.v v9, v8 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmsne.vi v8, v9, 0 +; VLS-NEXT: vsm.v v8, (a1) +; VLS-NEXT: ret %a = load <64 x i1>, ptr %x %c = call <2 x i1> @llvm.vector.extract.v2i1.v64i1(<64 x i1> %a, i64 2) store <2 x i1> %c, ptr %y @@ -596,49 +602,49 @@ define void @extract_v2i1_v64i1_2(ptr %x, ptr %y) { } define void @extract_v2i1_v64i1_42(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i1_v64i1_42: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: li a2, 64 -; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-V-NEXT: vlm.v v0, (a0) -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; 
CHECK-V-NEXT: li a0, 42 -; CHECK-V-NEXT: vsetivli zero, 2, e8, m4, ta, ma -; CHECK-V-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-V-NEXT: vmsne.vi v0, v8, 0 -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmv.v.i v9, 0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-V-NEXT: vmv.v.v v9, v8 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmsne.vi v8, v9, 0 -; CHECK-V-NEXT: vsm.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i1_v64i1_42: +; VLA: # %bb.0: +; VLA-NEXT: li a2, 64 +; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; VLA-NEXT: vlm.v v0, (a0) +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: li a0, 42 +; VLA-NEXT: vsetivli zero, 2, e8, m4, ta, ma +; VLA-NEXT: vslidedown.vx v8, v8, a0 +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmsne.vi v0, v8, 0 +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmv.v.i v9, 0 +; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLA-NEXT: vmv.v.v v9, v8 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmsne.vi v8, v9, 0 +; VLA-NEXT: vsm.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_v64i1_42: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vlm.v v0, (a0) -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v10, 10 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; 
CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i1_v64i1_42: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; VLS-NEXT: vlm.v v0, (a0) +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; VLS-NEXT: vslidedown.vi v8, v10, 10 +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmsne.vi v0, v8, 0 +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmv.v.i v9, 0 +; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLS-NEXT: vmv.v.v v9, v8 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmsne.vi v8, v9, 0 +; VLS-NEXT: vsm.v v8, (a1) +; VLS-NEXT: ret %a = load <64 x i1>, ptr %x %c = call <2 x i1> @llvm.vector.extract.v2i1.v64i1(<64 x i1> %a, i64 42) store <2 x i1> %c, ptr %y @@ -665,45 +671,45 @@ define void @extract_v2i1_nxv2i1_0( %x, ptr %y) { } define void @extract_v2i1_nxv2i1_2( %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i1_nxv2i1_2: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-V-NEXT: vmsne.vi v0, v8, 0 -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmv.v.i v9, 0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-V-NEXT: vmv.v.v v9, v8 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmsne.vi v8, v9, 0 -; CHECK-V-NEXT: vsm.v v8, (a0) -; 
CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i1_nxv2i1_2: +; VLA: # %bb.0: +; VLA-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 2 +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmsne.vi v0, v8, 0 +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmv.v.i v9, 0 +; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLA-NEXT: vmv.v.v v9, v8 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmsne.vi v8, v9, 0 +; VLA-NEXT: vsm.v v8, (a0) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_nxv2i1_2: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i1_nxv2i1_2: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; VLS-NEXT: vslidedown.vi v8, v8, 2 +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmsne.vi v0, v8, 0 +; VLS-NEXT: vmv.v.i v8, 0 +; 
VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmv.v.i v9, 0 +; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLS-NEXT: vmv.v.v v9, v8 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmsne.vi v8, v9, 0 +; VLS-NEXT: vsm.v v8, (a0) +; VLS-NEXT: ret %c = call <2 x i1> @llvm.vector.extract.v2i1.nxv2i1( %x, i64 2) store <2 x i1> %c, ptr %y ret void @@ -754,91 +760,91 @@ define void @extract_v2i1_nxv64i1_2( %x, ptr %y) { } define void @extract_v2i1_nxv64i1_42( %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i1_nxv64i1_42: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetvli a1, zero, e8, m8, ta, ma -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: li a1, 42 -; CHECK-V-NEXT: vsetivli zero, 2, e8, m4, ta, ma -; CHECK-V-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-V-NEXT: vmsne.vi v0, v8, 0 -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmv.v.i v9, 0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-V-NEXT: vmv.v.v v9, v8 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmsne.vi v8, v9, 0 -; CHECK-V-NEXT: vsm.v v8, (a0) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i1_nxv64i1_42: +; VLA: # %bb.0: +; VLA-NEXT: vsetvli a1, zero, e8, m8, ta, ma +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: li a1, 42 +; VLA-NEXT: vsetivli zero, 2, e8, m4, ta, ma +; VLA-NEXT: vslidedown.vx v8, v8, a1 +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmsne.vi v0, v8, 0 +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmv.v.i v9, 0 +; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLA-NEXT: vmv.v.v v9, v8 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmsne.vi v8, v9, 0 +; VLA-NEXT: vsm.v v8, (a0) +; VLA-NEXT: ret ; 
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_nxv64i1_42: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetvli a1, zero, e8, m8, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v10, 10 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i1_nxv64i1_42: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a1, zero, e8, m8, ta, ma +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; VLS-NEXT: vslidedown.vi v8, v10, 10 +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmsne.vi v0, v8, 0 +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmv.v.i v9, 0 +; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLS-NEXT: vmv.v.v v9, v8 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmsne.vi v8, v9, 0 +; VLS-NEXT: vsm.v v8, (a0) +; VLS-NEXT: ret %c = call <2 x i1> @llvm.vector.extract.v2i1.nxv64i1( %x, i64 42) store <2 x i1> %c, ptr %y ret void } define void @extract_v2i1_nxv32i1_26( %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i1_nxv32i1_26: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 2, 
e8, m2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 26 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-V-NEXT: vmsne.vi v0, v8, 0 -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmv.v.i v9, 0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-V-NEXT: vmv.v.v v9, v8 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmsne.vi v8, v9, 0 -; CHECK-V-NEXT: vsm.v v8, (a0) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i1_nxv32i1_26: +; VLA: # %bb.0: +; VLA-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 2, e8, m2, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 26 +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmsne.vi v0, v8, 0 +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmv.v.i v9, 0 +; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLA-NEXT: vmv.v.v v9, v8 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmsne.vi v8, v9, 0 +; VLA-NEXT: vsm.v v8, (a0) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_nxv32i1_26: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v9, 10 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8 -; CHECK-KNOWNVLEN128-NEXT: vsetivli 
zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i1_nxv32i1_26: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; VLS-NEXT: vslidedown.vi v8, v9, 10 +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmsne.vi v0, v8, 0 +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmv.v.i v9, 0 +; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLS-NEXT: vmv.v.v v9, v8 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmsne.vi v8, v9, 0 +; VLS-NEXT: vsm.v v8, (a0) +; VLS-NEXT: ret %c = call <2 x i1> @llvm.vector.extract.v2i1.nxv32i1( %x, i64 26) store <2 x i1> %c, ptr %y ret void diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll index efb1f720f2d09..9f0240c53b219 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll @@ -1,9 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV32VLA +; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV64VLA -; RUN: llc -mtriple=riscv32 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+v -early-live-intervals 
-verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV32VLA +; RUN: llc -mtriple=riscv64 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV64VLA + +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV32VLS %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV64VLS %s define @insert_nxv8i32_v2i32_0( %vec, ptr %svp) { ; CHECK-LABEL: insert_nxv8i32_v2i32_0: @@ -45,26 +48,40 @@ define @insert_nxv8i32_v2i32_6( %vec, ptr % } define @insert_nxv8i32_v8i32_0( %vec, ptr %svp) { -; CHECK-LABEL: insert_nxv8i32_v8i32_0: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma -; CHECK-NEXT: vmv.v.v v8, v12 -; CHECK-NEXT: ret +; VLA-LABEL: insert_nxv8i32_v8i32_0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v12, (a0) +; VLA-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; VLA-NEXT: vmv.v.v v8, v12 +; VLA-NEXT: ret +; +; VLS-LABEL: insert_nxv8i32_v8i32_0: +; VLS: # %bb.0: +; VLS-NEXT: vl2re32.v v12, (a0) +; VLS-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; VLS-NEXT: vmv.v.v v8, v12 +; VLS-NEXT: ret %sv = load <8 x i32>, ptr %svp %v = call @llvm.vector.insert.v8i32.nxv8i32( %vec, <8 x i32> %sv, i64 0) ret %v } define @insert_nxv8i32_v8i32_8( %vec, ptr %svp) { -; CHECK-LABEL: insert_nxv8i32_v8i32_8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 16, e32, m4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 -; CHECK-NEXT: ret +; VLA-LABEL: insert_nxv8i32_v8i32_8: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, 
e32, m2, ta, ma +; VLA-NEXT: vle32.v v12, (a0) +; VLA-NEXT: vsetivli zero, 16, e32, m4, tu, ma +; VLA-NEXT: vslideup.vi v8, v12, 8 +; VLA-NEXT: ret +; +; VLS-LABEL: insert_nxv8i32_v8i32_8: +; VLS: # %bb.0: +; VLS-NEXT: vl2re32.v v12, (a0) +; VLS-NEXT: vsetivli zero, 16, e32, m4, tu, ma +; VLS-NEXT: vslideup.vi v8, v12, 8 +; VLS-NEXT: ret %sv = load <8 x i32>, ptr %svp %v = call @llvm.vector.insert.v8i32.nxv8i32( %vec, <8 x i32> %sv, i64 8) ret %v @@ -82,17 +99,27 @@ define @insert_nxv8i32_undef_v2i32_0(ptr %svp) { } define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) { -; CHECK-LABEL: insert_v4i32_v2i32_0: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; CHECK-NEXT: vmv.v.v v9, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v9, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v4i32_v2i32_0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v8, (a1) +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vle32.v v9, (a0) +; VLA-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; VLA-NEXT: vmv.v.v v9, v8 +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vse32.v v9, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v4i32_v2i32_0: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vle32.v v8, (a1) +; VLS-NEXT: vl1re32.v v9, (a0) +; VLS-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; VLS-NEXT: vmv.v.v v9, v8 +; VLS-NEXT: vs1r.v v9, (a0) +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %vec = load <4 x i32>, ptr %vp %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 0) @@ -101,15 +128,25 @@ define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) { } define void @insert_v4i32_v2i32_2(ptr %vp, ptr %svp) { -; CHECK-LABEL: insert_v4i32_v2i32_2: -; CHECK: # %bb.0: -; CHECK-NEXT: 
vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vslideup.vi v9, v8, 2 -; CHECK-NEXT: vse32.v v9, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v4i32_v2i32_2: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v8, (a1) +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vle32.v v9, (a0) +; VLA-NEXT: vslideup.vi v9, v8, 2 +; VLA-NEXT: vse32.v v9, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v4i32_v2i32_2: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vle32.v v8, (a1) +; VLS-NEXT: vl1re32.v v9, (a0) +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v9, v8, 2 +; VLS-NEXT: vs1r.v v9, (a0) +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %vec = load <4 x i32>, ptr %vp %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 2) @@ -118,13 +155,20 @@ define void @insert_v4i32_v2i32_2(ptr %vp, ptr %svp) { } define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr %svp) { -; CHECK-LABEL: insert_v4i32_undef_v2i32_0: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v4i32_undef_v2i32_0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v8, (a1) +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vse32.v v8, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v4i32_undef_v2i32_0: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vle32.v v8, (a1) +; VLS-NEXT: vs1r.v v8, (a0) +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %sv, i64 0) store <4 x i32> %v, ptr %vp @@ -132,17 +176,27 @@ define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr 
%svp) { } define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) { -; CHECK-LABEL: insert_v8i32_v2i32_0: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; CHECK-NEXT: vmv.v.v v10, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vse32.v v10, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v8i32_v2i32_0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v8, (a1) +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v10, (a0) +; VLA-NEXT: vsetivli zero, 2, e32, m2, tu, ma +; VLA-NEXT: vmv.v.v v10, v8 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vse32.v v10, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v8i32_v2i32_0: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vle32.v v8, (a1) +; VLS-NEXT: vl2re32.v v10, (a0) +; VLS-NEXT: vsetivli zero, 2, e32, m2, tu, ma +; VLS-NEXT: vmv.v.v v10, v8 +; VLS-NEXT: vs2r.v v10, (a0) +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %vec = load <8 x i32>, ptr %vp %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 0) @@ -151,17 +205,27 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) { } define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) { -; CHECK-LABEL: insert_v8i32_v2i32_2: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v10, v8, 2 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vse32.v v10, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v8i32_v2i32_2: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v8, (a1) +; VLA-NEXT: vsetivli zero, 8, e32, 
m2, ta, ma +; VLA-NEXT: vle32.v v10, (a0) +; VLA-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; VLA-NEXT: vslideup.vi v10, v8, 2 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vse32.v v10, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v8i32_v2i32_2: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vl2re32.v v8, (a0) +; VLS-NEXT: vle32.v v10, (a1) +; VLS-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; VLS-NEXT: vslideup.vi v8, v10, 2 +; VLS-NEXT: vs2r.v v8, (a0) +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %vec = load <8 x i32>, ptr %vp %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 2) @@ -170,15 +234,25 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) { } define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) { -; CHECK-LABEL: insert_v8i32_v2i32_6: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vslideup.vi v10, v8, 6 -; CHECK-NEXT: vse32.v v10, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v8i32_v2i32_6: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v8, (a1) +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v10, (a0) +; VLA-NEXT: vslideup.vi v10, v8, 6 +; VLA-NEXT: vse32.v v10, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v8i32_v2i32_6: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vl2re32.v v8, (a0) +; VLS-NEXT: vle32.v v10, (a1) +; VLS-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLS-NEXT: vslideup.vi v8, v10, 6 +; VLS-NEXT: vs2r.v v8, (a0) +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %vec = load <8 x i32>, ptr %vp %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 6) @@ -187,14 +261,23 @@ define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) { } define void @insert_v8i32_undef_v2i32_6(ptr %vp, ptr %svp) { -; 
CHECK-LABEL: insert_v8i32_undef_v2i32_6: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v10, v8, 6 -; CHECK-NEXT: vse32.v v10, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v8i32_undef_v2i32_6: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v8, (a1) +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v10, v8, 6 +; VLA-NEXT: vse32.v v10, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v8i32_undef_v2i32_6: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vle32.v v8, (a1) +; VLS-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLS-NEXT: vslideup.vi v10, v8, 6 +; VLS-NEXT: vs2r.v v10, (a0) +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> undef, <2 x i32> %sv, i64 6) store <8 x i32> %v, ptr %vp @@ -239,18 +322,30 @@ define void @insert_v4i16_v2i16_2(ptr %vp, ptr %svp) { } define void @insert_v32i1_v8i1_0(ptr %vp, ptr %svp) { -; CHECK-LABEL: insert_v32i1_v8i1_0: -; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vlm.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vlm.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 1, e8, mf4, tu, ma -; CHECK-NEXT: vmv.v.v v8, v9 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vsm.v v8, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v32i1_v8i1_0: +; VLA: # %bb.0: +; VLA-NEXT: li a2, 32 +; VLA-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; VLA-NEXT: vlm.v v8, (a0) +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vlm.v v9, (a1) +; VLA-NEXT: vsetivli zero, 1, e8, mf4, tu, ma +; VLA-NEXT: vmv.v.v v8, v9 +; VLA-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; VLA-NEXT: vsm.v v8, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v32i1_v8i1_0: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a2, zero, e8, m2, 
ta, ma +; VLS-NEXT: vlm.v v8, (a0) +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vlm.v v9, (a1) +; VLS-NEXT: vsetivli zero, 1, e8, mf4, tu, ma +; VLS-NEXT: vmv.v.v v8, v9 +; VLS-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; VLS-NEXT: vsm.v v8, (a0) +; VLS-NEXT: ret %v = load <32 x i1>, ptr %vp %sv = load <8 x i1>, ptr %svp %c = call <32 x i1> @llvm.vector.insert.v8i1.v32i1(<32 x i1> %v, <8 x i1> %sv, i64 0) @@ -259,18 +354,30 @@ define void @insert_v32i1_v8i1_0(ptr %vp, ptr %svp) { } define void @insert_v32i1_v8i1_16(ptr %vp, ptr %svp) { -; CHECK-LABEL: insert_v32i1_v8i1_16: -; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vlm.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vlm.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 3, e8, mf4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 2 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vsm.v v8, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v32i1_v8i1_16: +; VLA: # %bb.0: +; VLA-NEXT: li a2, 32 +; VLA-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; VLA-NEXT: vlm.v v8, (a0) +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vlm.v v9, (a1) +; VLA-NEXT: vsetivli zero, 3, e8, mf4, tu, ma +; VLA-NEXT: vslideup.vi v8, v9, 2 +; VLA-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; VLA-NEXT: vsm.v v8, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v32i1_v8i1_16: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; VLS-NEXT: vlm.v v8, (a0) +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vlm.v v9, (a1) +; VLS-NEXT: vsetivli zero, 3, e8, mf4, tu, ma +; VLS-NEXT: vslideup.vi v8, v9, 2 +; VLS-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; VLS-NEXT: vsm.v v8, (a0) +; VLS-NEXT: ret %v = load <32 x i1>, ptr %vp %sv = load <8 x i1>, ptr %svp %c = call <32 x i1> @llvm.vector.insert.v8i1.v32i1(<32 x i1> %v, <8 x i1> %sv, i64 16) @@ -358,22 +465,36 @@ define @insert_nxv2i16_v2i16_2( %v, ptr %sv } define @insert_nxv2i1_v4i1_0( %v, 
ptr %svp) { -; CHECK-LABEL: insert_nxv2i1_v4i1_0: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vlm.v v8, (a0) -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v9, 0 -; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 -; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, ma -; CHECK-NEXT: vmv.v.v v9, v8 -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmsne.vi v0, v9, 0 -; CHECK-NEXT: ret +; VLA-LABEL: insert_nxv2i1_v4i1_0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; VLA-NEXT: vlm.v v8, (a0) +; VLA-NEXT: vsetvli a0, zero, e8, mf4, ta, ma +; VLA-NEXT: vmv.v.i v9, 0 +; VLA-NEXT: vmerge.vim v9, v9, 1, v0 +; VLA-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; VLA-NEXT: vmv.v.i v10, 0 +; VLA-NEXT: vmv1r.v v0, v8 +; VLA-NEXT: vmerge.vim v8, v10, 1, v0 +; VLA-NEXT: vsetvli zero, zero, e8, mf4, tu, ma +; VLA-NEXT: vmv.v.v v9, v8 +; VLA-NEXT: vsetvli a0, zero, e8, mf4, ta, ma +; VLA-NEXT: vmsne.vi v0, v9, 0 +; VLA-NEXT: ret +; +; VLS-LABEL: insert_nxv2i1_v4i1_0: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; VLS-NEXT: vlm.v v8, (a0) +; VLS-NEXT: vmv.v.i v9, 0 +; VLS-NEXT: vmerge.vim v10, v9, 1, v0 +; VLS-NEXT: vmv1r.v v0, v8 +; VLS-NEXT: vmerge.vim v8, v9, 1, v0 +; VLS-NEXT: vsetvli zero, zero, e8, mf4, tu, ma +; VLS-NEXT: vmv.v.v v10, v8 +; VLS-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; VLS-NEXT: vmsne.vi v0, v10, 0 +; VLS-NEXT: ret %sv = load <4 x i1>, ptr %svp %c = call @llvm.vector.insert.v4i1.nxv2i1( %v, <4 x i1> %sv, i64 0) ret %c @@ -408,15 +529,24 @@ define @insert_nxv8i1_v8i1_16( %v, ptr %svp) declare @llvm.vector.insert.v2i64.nxv16i64(, <2 x i64>, i64) define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, ptr %out) { -; CHECK-LABEL: insert_v2i64_nxv16i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, 
ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: vsetivli zero, 6, e64, m8, tu, ma -; CHECK-NEXT: vslideup.vi v8, v16, 4 -; CHECK-NEXT: vs8r.v v8, (a2) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v2i64_nxv16i64: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; VLA-NEXT: vle64.v v8, (a0) +; VLA-NEXT: vle64.v v16, (a1) +; VLA-NEXT: vsetivli zero, 6, e64, m8, tu, ma +; VLA-NEXT: vslideup.vi v8, v16, 4 +; VLA-NEXT: vs8r.v v8, (a2) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v2i64_nxv16i64: +; VLS: # %bb.0: +; VLS-NEXT: vl1re64.v v8, (a0) +; VLS-NEXT: vl1re64.v v16, (a1) +; VLS-NEXT: vsetivli zero, 6, e64, m8, tu, ma +; VLS-NEXT: vslideup.vi v8, v16, 4 +; VLS-NEXT: vs8r.v v8, (a2) +; VLS-NEXT: ret %sv0 = load <2 x i64>, ptr %psv0 %sv1 = load <2 x i64>, ptr %psv1 %v0 = call @llvm.vector.insert.v2i64.nxv16i64( undef, <2 x i64> %sv0, i64 0) @@ -426,12 +556,18 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, ptr %out) { } define void @insert_v2i64_nxv16i64_lo0(ptr %psv, ptr %out) { -; CHECK-LABEL: insert_v2i64_nxv16i64_lo0: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vs8r.v v8, (a1) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v2i64_nxv16i64_lo0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; VLA-NEXT: vle64.v v8, (a0) +; VLA-NEXT: vs8r.v v8, (a1) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v2i64_nxv16i64_lo0: +; VLS: # %bb.0: +; VLS-NEXT: vl1re64.v v8, (a0) +; VLS-NEXT: vs8r.v v8, (a1) +; VLS-NEXT: ret %sv = load <2 x i64>, ptr %psv %v = call @llvm.vector.insert.v2i64.nxv16i64( undef, <2 x i64> %sv, i64 0) store %v, ptr %out @@ -439,14 +575,22 @@ define void @insert_v2i64_nxv16i64_lo0(ptr %psv, ptr %out) { } define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) { -; CHECK-LABEL: insert_v2i64_nxv16i64_lo2: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsetivli 
zero, 4, e64, m8, ta, ma -; CHECK-NEXT: vslideup.vi v16, v8, 2 -; CHECK-NEXT: vs8r.v v16, (a1) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v2i64_nxv16i64_lo2: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; VLA-NEXT: vle64.v v8, (a0) +; VLA-NEXT: vsetivli zero, 4, e64, m8, ta, ma +; VLA-NEXT: vslideup.vi v16, v8, 2 +; VLA-NEXT: vs8r.v v16, (a1) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v2i64_nxv16i64_lo2: +; VLS: # %bb.0: +; VLS-NEXT: vl1re64.v v8, (a0) +; VLS-NEXT: vsetivli zero, 4, e64, m8, ta, ma +; VLS-NEXT: vslideup.vi v16, v8, 2 +; VLS-NEXT: vs8r.v v16, (a1) +; VLS-NEXT: ret %sv = load <2 x i64>, ptr %psv %v = call @llvm.vector.insert.v2i64.nxv16i64( undef, <2 x i64> %sv, i64 2) store %v, ptr %out @@ -521,6 +665,127 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) { ; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 80 ; RV64-NEXT: ret +; RV32VLA-LABEL: insert_v2i64_nxv16i64_hi: +; RV32VLA: # %bb.0: +; RV32VLA-NEXT: addi sp, sp, -80 +; RV32VLA-NEXT: .cfi_def_cfa_offset 80 +; RV32VLA-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32VLA-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32VLA-NEXT: .cfi_offset ra, -4 +; RV32VLA-NEXT: .cfi_offset s0, -8 +; RV32VLA-NEXT: addi s0, sp, 80 +; RV32VLA-NEXT: .cfi_def_cfa s0, 0 +; RV32VLA-NEXT: csrr a2, vlenb +; RV32VLA-NEXT: slli a2, a2, 4 +; RV32VLA-NEXT: sub sp, sp, a2 +; RV32VLA-NEXT: andi sp, sp, -64 +; RV32VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32VLA-NEXT: vle64.v v8, (a0) +; RV32VLA-NEXT: addi a0, sp, 128 +; RV32VLA-NEXT: vse64.v v8, (a0) +; RV32VLA-NEXT: csrr a0, vlenb +; RV32VLA-NEXT: slli a0, a0, 3 +; RV32VLA-NEXT: addi a2, sp, 64 +; RV32VLA-NEXT: add a3, a2, a0 +; RV32VLA-NEXT: vl8re64.v v8, (a3) +; RV32VLA-NEXT: vl8re64.v v16, (a2) +; RV32VLA-NEXT: add a0, a1, a0 +; RV32VLA-NEXT: vs8r.v v8, (a0) +; RV32VLA-NEXT: vs8r.v v16, (a1) +; RV32VLA-NEXT: addi sp, s0, -80 +; RV32VLA-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32VLA-NEXT: lw s0, 72(sp) # 
4-byte Folded Reload +; RV32VLA-NEXT: addi sp, sp, 80 +; RV32VLA-NEXT: ret +; +; RV64VLA-LABEL: insert_v2i64_nxv16i64_hi: +; RV64VLA: # %bb.0: +; RV64VLA-NEXT: addi sp, sp, -80 +; RV64VLA-NEXT: .cfi_def_cfa_offset 80 +; RV64VLA-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64VLA-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64VLA-NEXT: .cfi_offset ra, -8 +; RV64VLA-NEXT: .cfi_offset s0, -16 +; RV64VLA-NEXT: addi s0, sp, 80 +; RV64VLA-NEXT: .cfi_def_cfa s0, 0 +; RV64VLA-NEXT: csrr a2, vlenb +; RV64VLA-NEXT: slli a2, a2, 4 +; RV64VLA-NEXT: sub sp, sp, a2 +; RV64VLA-NEXT: andi sp, sp, -64 +; RV64VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64VLA-NEXT: vle64.v v8, (a0) +; RV64VLA-NEXT: addi a0, sp, 128 +; RV64VLA-NEXT: vse64.v v8, (a0) +; RV64VLA-NEXT: csrr a0, vlenb +; RV64VLA-NEXT: slli a0, a0, 3 +; RV64VLA-NEXT: addi a2, sp, 64 +; RV64VLA-NEXT: add a3, a2, a0 +; RV64VLA-NEXT: vl8re64.v v8, (a3) +; RV64VLA-NEXT: vl8re64.v v16, (a2) +; RV64VLA-NEXT: add a0, a1, a0 +; RV64VLA-NEXT: vs8r.v v8, (a0) +; RV64VLA-NEXT: vs8r.v v16, (a1) +; RV64VLA-NEXT: addi sp, s0, -80 +; RV64VLA-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64VLA-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64VLA-NEXT: addi sp, sp, 80 +; RV64VLA-NEXT: ret +; +; RV32VLS-LABEL: insert_v2i64_nxv16i64_hi: +; RV32VLS: # %bb.0: +; RV32VLS-NEXT: addi sp, sp, -80 +; RV32VLS-NEXT: .cfi_def_cfa_offset 80 +; RV32VLS-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32VLS-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32VLS-NEXT: .cfi_offset ra, -4 +; RV32VLS-NEXT: .cfi_offset s0, -8 +; RV32VLS-NEXT: addi s0, sp, 80 +; RV32VLS-NEXT: .cfi_def_cfa s0, 0 +; RV32VLS-NEXT: addi sp, sp, -256 +; RV32VLS-NEXT: andi sp, sp, -64 +; RV32VLS-NEXT: vl1re64.v v8, (a0) +; RV32VLS-NEXT: addi a0, sp, 128 +; RV32VLS-NEXT: vs1r.v v8, (a0) +; RV32VLS-NEXT: addi a0, sp, 64 +; RV32VLS-NEXT: addi a2, sp, 192 +; RV32VLS-NEXT: vl8re64.v v8, (a2) +; RV32VLS-NEXT: vl8re64.v v16, (a0) +; RV32VLS-NEXT: addi a0, a1, 128 +; RV32VLS-NEXT: 
vs8r.v v8, (a0) +; RV32VLS-NEXT: vs8r.v v16, (a1) +; RV32VLS-NEXT: addi sp, s0, -80 +; RV32VLS-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32VLS-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32VLS-NEXT: addi sp, sp, 80 +; RV32VLS-NEXT: ret +; +; RV64VLS-LABEL: insert_v2i64_nxv16i64_hi: +; RV64VLS: # %bb.0: +; RV64VLS-NEXT: addi sp, sp, -80 +; RV64VLS-NEXT: .cfi_def_cfa_offset 80 +; RV64VLS-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64VLS-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64VLS-NEXT: .cfi_offset ra, -8 +; RV64VLS-NEXT: .cfi_offset s0, -16 +; RV64VLS-NEXT: addi s0, sp, 80 +; RV64VLS-NEXT: .cfi_def_cfa s0, 0 +; RV64VLS-NEXT: addi sp, sp, -256 +; RV64VLS-NEXT: andi sp, sp, -64 +; RV64VLS-NEXT: vl1re64.v v8, (a0) +; RV64VLS-NEXT: addi a0, sp, 128 +; RV64VLS-NEXT: vs1r.v v8, (a0) +; RV64VLS-NEXT: addi a0, sp, 192 +; RV64VLS-NEXT: vl8re64.v v8, (a0) +; RV64VLS-NEXT: addi a0, sp, 64 +; RV64VLS-NEXT: vl8re64.v v16, (a0) +; RV64VLS-NEXT: addi a0, a1, 128 +; RV64VLS-NEXT: vs8r.v v8, (a0) +; RV64VLS-NEXT: vs8r.v v16, (a1) +; RV64VLS-NEXT: addi sp, s0, -80 +; RV64VLS-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64VLS-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64VLS-NEXT: addi sp, sp, 80 +; RV64VLS-NEXT: ret %sv = load <2 x i64>, ptr %psv %v = call @llvm.vector.insert.v2i64.nxv16i64( undef, <2 x i64> %sv, i64 8) store %v, ptr %out diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll index 6ef5aa846d6d9..ce8827fe47536 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=riscv32 
-mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s + +; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: concat_2xv4i32: @@ -128,31 +131,51 @@ define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x } define <32 x i32> @concat_2xv16i32(<16 x i32> %a, <16 x i32> %b) { -; CHECK-LABEL: concat_2xv16i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vmv4r.v v16, v12 -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vslideup.vi v8, v16, 16 -; CHECK-NEXT: ret +; VLA-LABEL: concat_2xv16i32: +; VLA: # %bb.0: +; VLA-NEXT: vmv4r.v v16, v12 +; VLA-NEXT: li a0, 32 +; VLA-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; VLA-NEXT: vslideup.vi v8, v16, 16 +; VLA-NEXT: ret +; +; VLS-LABEL: concat_2xv16i32: +; VLS: # %bb.0: +; VLS-NEXT: vmv4r.v v16, v12 +; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; VLS-NEXT: vslideup.vi v8, v16, 16 +; VLS-NEXT: ret %ab = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> ret <32 x i32> %ab } define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { -; CHECK-LABEL: concat_4xv8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vmv2r.v v16, v14 -; CHECK-NEXT: vmv2r.v v24, v12 -; CHECK-NEXT: vmv2r.v v0, v10 -; CHECK-NEXT: vsetivli zero, 16, e32, m8, tu, ma -; CHECK-NEXT: vslideup.vi v8, v0, 8 -; CHECK-NEXT: vsetivli zero, 24, e32, m8, tu, ma -; CHECK-NEXT: vslideup.vi v8, v24, 16 -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vslideup.vi v8, v16, 24 -; CHECK-NEXT: ret +; VLA-LABEL: concat_4xv8i32: +; VLA: # %bb.0: +; 
VLA-NEXT: vmv2r.v v16, v14 +; VLA-NEXT: vmv2r.v v24, v12 +; VLA-NEXT: vmv2r.v v0, v10 +; VLA-NEXT: vsetivli zero, 16, e32, m8, tu, ma +; VLA-NEXT: vslideup.vi v8, v0, 8 +; VLA-NEXT: vsetivli zero, 24, e32, m8, tu, ma +; VLA-NEXT: vslideup.vi v8, v24, 16 +; VLA-NEXT: li a0, 32 +; VLA-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; VLA-NEXT: vslideup.vi v8, v16, 24 +; VLA-NEXT: ret +; +; VLS-LABEL: concat_4xv8i32: +; VLS: # %bb.0: +; VLS-NEXT: vmv2r.v v16, v14 +; VLS-NEXT: vmv2r.v v24, v12 +; VLS-NEXT: vmv2r.v v0, v10 +; VLS-NEXT: vsetivli zero, 16, e32, m8, tu, ma +; VLS-NEXT: vslideup.vi v8, v0, 8 +; VLS-NEXT: vsetivli zero, 24, e32, m8, tu, ma +; VLS-NEXT: vslideup.vi v8, v24, 16 +; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; VLS-NEXT: vslideup.vi v8, v16, 24 +; VLS-NEXT: ret %ab = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> %cd = shufflevector <8 x i32> %c, <8 x i32> %d, <16 x i32> %abcd = shufflevector <16 x i32> %ab, <16 x i32> %cd, <32 x i32> @@ -160,82 +183,128 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x } define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h) { -; CHECK-LABEL: concat_8xv4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: vmv1r.v v16, v15 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v16, v14 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; 
CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v16, v13 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v16, v12 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v0, v11 -; CHECK-NEXT: vmv1r.v v24, v10 -; CHECK-NEXT: vmv1r.v v16, v9 -; CHECK-NEXT: vsetivli zero, 8, e32, m8, tu, ma -; CHECK-NEXT: vslideup.vi v8, v16, 4 -; CHECK-NEXT: vsetivli zero, 12, e32, m8, tu, ma -; CHECK-NEXT: vslideup.vi v8, v24, 8 -; CHECK-NEXT: vsetivli zero, 16, e32, m8, tu, ma -; CHECK-NEXT: vslideup.vi v8, v0, 12 -; CHECK-NEXT: vsetivli zero, 20, e32, m8, tu, ma -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vslideup.vi v8, v16, 16 -; CHECK-NEXT: vsetivli zero, 24, e32, m8, tu, ma -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vslideup.vi v8, v16, 20 -; CHECK-NEXT: vsetivli zero, 28, e32, m8, tu, ma -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vslideup.vi v8, v16, 24 -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vslideup.vi v8, v16, 28 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: add sp, sp, 
a0 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +; VLA-LABEL: concat_8xv4i32: +; VLA: # %bb.0: +; VLA-NEXT: addi sp, sp, -16 +; VLA-NEXT: .cfi_def_cfa_offset 16 +; VLA-NEXT: csrr a0, vlenb +; VLA-NEXT: slli a0, a0, 5 +; VLA-NEXT: sub sp, sp, a0 +; VLA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; VLA-NEXT: vmv1r.v v16, v15 +; VLA-NEXT: csrr a0, vlenb +; VLA-NEXT: li a1, 0 +; VLA-NEXT: slli a0, a0, 3 +; VLA-NEXT: add a1, a1, a0 +; VLA-NEXT: slli a0, a0, 1 +; VLA-NEXT: add a0, a0, a1 +; VLA-NEXT: add a0, sp, a0 +; VLA-NEXT: addi a0, a0, 16 +; VLA-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLA-NEXT: vmv1r.v v16, v14 +; VLA-NEXT: csrr a0, vlenb +; VLA-NEXT: slli a0, a0, 4 +; VLA-NEXT: add a0, sp, a0 +; VLA-NEXT: addi a0, a0, 16 +; VLA-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLA-NEXT: vmv1r.v v16, v13 +; VLA-NEXT: csrr a0, vlenb +; VLA-NEXT: slli a0, a0, 3 +; VLA-NEXT: add a0, sp, a0 +; VLA-NEXT: addi a0, a0, 16 +; VLA-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLA-NEXT: vmv1r.v v16, v12 +; VLA-NEXT: addi a0, sp, 16 +; VLA-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLA-NEXT: vmv1r.v v0, v11 +; VLA-NEXT: vmv1r.v v24, v10 +; VLA-NEXT: vmv1r.v v16, v9 +; VLA-NEXT: vsetivli zero, 8, e32, m8, tu, ma +; VLA-NEXT: vslideup.vi v8, v16, 4 +; VLA-NEXT: vsetivli zero, 12, e32, m8, tu, ma +; VLA-NEXT: vslideup.vi v8, v24, 8 +; VLA-NEXT: vsetivli zero, 16, e32, m8, tu, ma +; VLA-NEXT: vslideup.vi v8, v0, 12 +; VLA-NEXT: vsetivli zero, 20, e32, m8, tu, ma +; VLA-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; VLA-NEXT: vslideup.vi v8, v16, 16 +; VLA-NEXT: vsetivli zero, 24, e32, m8, tu, ma +; VLA-NEXT: csrr a0, vlenb +; VLA-NEXT: slli a0, a0, 3 +; VLA-NEXT: add a0, sp, a0 +; VLA-NEXT: addi a0, a0, 16 +; VLA-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; VLA-NEXT: vslideup.vi v8, v16, 20 +; VLA-NEXT: vsetivli zero, 28, e32, m8, tu, 
ma +; VLA-NEXT: csrr a0, vlenb +; VLA-NEXT: slli a0, a0, 4 +; VLA-NEXT: add a0, sp, a0 +; VLA-NEXT: addi a0, a0, 16 +; VLA-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; VLA-NEXT: vslideup.vi v8, v16, 24 +; VLA-NEXT: li a0, 32 +; VLA-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; VLA-NEXT: csrr a0, vlenb +; VLA-NEXT: li a1, 0 +; VLA-NEXT: slli a0, a0, 3 +; VLA-NEXT: add a1, a1, a0 +; VLA-NEXT: slli a0, a0, 1 +; VLA-NEXT: add a0, a0, a1 +; VLA-NEXT: add a0, sp, a0 +; VLA-NEXT: addi a0, a0, 16 +; VLA-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; VLA-NEXT: vslideup.vi v8, v16, 28 +; VLA-NEXT: csrr a0, vlenb +; VLA-NEXT: slli a0, a0, 5 +; VLA-NEXT: add sp, sp, a0 +; VLA-NEXT: addi sp, sp, 16 +; VLA-NEXT: ret +; +; VLS-LABEL: concat_8xv4i32: +; VLS: # %bb.0: +; VLS-NEXT: addi sp, sp, -16 +; VLS-NEXT: .cfi_def_cfa_offset 16 +; VLS-NEXT: addi sp, sp, -512 +; VLS-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; VLS-NEXT: vmv1r.v v16, v15 +; VLS-NEXT: addi a0, sp, 400 +; VLS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLS-NEXT: vmv1r.v v16, v14 +; VLS-NEXT: addi a0, sp, 272 +; VLS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLS-NEXT: vmv1r.v v16, v13 +; VLS-NEXT: addi a0, sp, 144 +; VLS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLS-NEXT: vmv1r.v v16, v12 +; VLS-NEXT: addi a0, sp, 16 +; VLS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLS-NEXT: vmv1r.v v0, v11 +; VLS-NEXT: vmv1r.v v24, v10 +; VLS-NEXT: vmv1r.v v16, v9 +; VLS-NEXT: vsetivli zero, 8, e32, m8, tu, ma +; VLS-NEXT: vslideup.vi v8, v16, 4 +; VLS-NEXT: vsetivli zero, 12, e32, m8, tu, ma +; VLS-NEXT: vslideup.vi v8, v24, 8 +; VLS-NEXT: vsetivli zero, 16, e32, m8, tu, ma +; VLS-NEXT: vslideup.vi v8, v0, 12 +; VLS-NEXT: vsetivli zero, 20, e32, m8, tu, ma +; VLS-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; VLS-NEXT: vslideup.vi v8, v16, 16 +; VLS-NEXT: vsetivli zero, 24, 
e32, m8, tu, ma +; VLS-NEXT: addi a0, sp, 144 +; VLS-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; VLS-NEXT: vslideup.vi v8, v16, 20 +; VLS-NEXT: vsetivli zero, 28, e32, m8, tu, ma +; VLS-NEXT: addi a0, sp, 272 +; VLS-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; VLS-NEXT: vslideup.vi v8, v16, 24 +; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; VLS-NEXT: addi a0, sp, 400 +; VLS-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; VLS-NEXT: vslideup.vi v8, v16, 28 +; VLS-NEXT: addi sp, sp, 512 +; VLS-NEXT: addi sp, sp, 16 +; VLS-NEXT: ret %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> %abcd = shufflevector <8 x i32> %ab, <8 x i32> %cd, <16 x i32> From f81d5e549f0e02e1bfc5ccaf6341ad35d4ea8e98 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Wed, 28 Feb 2024 08:48:47 +0100 Subject: [PATCH 530/546] [flang] Handle OPTIONAL polymorphic captured in internal procedures (#82042) The current code was doing an unconditional `fir.store %optional_box to %host_link` which caused a crash when %optional_box is absent because is is attempting to copy a descriptor from a null address. Add code to conditionally do the copy at runtime. The polymorphic array case with lower bounds can be handled with the array case that already deals with descriptor argument with a few modifications, just use that. 
--- flang/lib/Lower/HostAssociations.cpp | 64 +++++++++++---- flang/lib/Optimizer/Builder/MutableBox.cpp | 2 +- .../HLFIR/internal-procedures-polymorphic.f90 | 81 +++++++++++++++++++ 3 files changed, 131 insertions(+), 16 deletions(-) create mode 100644 flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90 diff --git a/flang/lib/Lower/HostAssociations.cpp b/flang/lib/Lower/HostAssociations.cpp index a62f7a7e99b6f..44cc0e74e3b52 100644 --- a/flang/lib/Lower/HostAssociations.cpp +++ b/flang/lib/Lower/HostAssociations.cpp @@ -247,9 +247,11 @@ class CapturedCharacterScalars } }; -/// Class defining how polymorphic entities are captured in internal procedures. -/// Polymorphic entities are always boxed as a fir.class box. -class CapturedPolymorphic : public CapturedSymbols { +/// Class defining how polymorphic scalar entities are captured in internal +/// procedures. Polymorphic entities are always boxed as a fir.class box. +/// Polymorphic array can be handled in CapturedArrays directly +class CapturedPolymorphicScalar + : public CapturedSymbols { public: static mlir::Type getType(Fortran::lower::AbstractConverter &converter, const Fortran::semantics::Symbol &sym) { @@ -257,19 +259,50 @@ class CapturedPolymorphic : public CapturedSymbols { } static void instantiateHostTuple(const InstantiateHostTuple &args, Fortran::lower::AbstractConverter &converter, - const Fortran::semantics::Symbol &) { + const Fortran::semantics::Symbol &sym) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + mlir::Location loc = args.loc; mlir::Type typeInTuple = fir::dyn_cast_ptrEleTy(args.addrInTuple.getType()); assert(typeInTuple && "addrInTuple must be an address"); mlir::Value castBox = builder.createConvert(args.loc, typeInTuple, fir::getBase(args.hostValue)); - builder.create(args.loc, castBox, args.addrInTuple); + if (Fortran::semantics::IsOptional(sym)) { + auto isPresent = + builder.create(loc, builder.getI1Type(), castBox); + builder.genIfThenElse(loc, isPresent) + 
.genThen([&]() { + builder.create(loc, castBox, args.addrInTuple); + }) + .genElse([&]() { + mlir::Value null = fir::factory::createUnallocatedBox( + builder, loc, typeInTuple, + /*nonDeferredParams=*/mlir::ValueRange{}); + builder.create(loc, null, args.addrInTuple); + }) + .end(); + } else { + builder.create(loc, castBox, args.addrInTuple); + } } static void getFromTuple(const GetFromTuple &args, Fortran::lower::AbstractConverter &converter, const Fortran::semantics::Symbol &sym, const Fortran::lower::BoxAnalyzer &ba) { - bindCapturedSymbol(sym, args.valueInTuple, converter, args.symMap); + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + mlir::Location loc = args.loc; + mlir::Value box = args.valueInTuple; + if (Fortran::semantics::IsOptional(sym)) { + auto boxTy = box.getType().cast(); + auto eleTy = boxTy.getEleTy(); + if (!fir::isa_ref_type(eleTy)) + eleTy = builder.getRefType(eleTy); + auto addr = builder.create(loc, eleTy, box); + mlir::Value isPresent = builder.genIsNotNullAddr(loc, addr); + auto absentBox = builder.create(loc, boxTy); + box = + builder.create(loc, isPresent, box, absentBox); + } + bindCapturedSymbol(sym, box, converter, args.symMap); } }; @@ -342,7 +375,12 @@ class CapturedArrays : public CapturedSymbols { static mlir::Type getType(Fortran::lower::AbstractConverter &converter, const Fortran::semantics::Symbol &sym) { mlir::Type type = converter.genType(sym); - assert(type.isa() && "must be a sequence type"); + bool isPolymorphic = Fortran::semantics::IsPolymorphic(sym); + assert(type.isa() || + (isPolymorphic && type.isa()) && + "must be a sequence type"); + if (isPolymorphic) + return type; return fir::BoxType::get(type); } @@ -410,13 +448,13 @@ class CapturedArrays : public CapturedSymbols { fir::factory::readBoxValue(builder, loc, boxValue), converter, args.symMap); } else { - // Keep variable as a fir.box. + // Keep variable as a fir.box/fir.class. 
// If this is an optional that is absent, the fir.box needs to be an // AbsentOp result, otherwise it will not work properly with IsPresentOp // (absent boxes are null descriptor addresses, not descriptors containing // a null base address). if (Fortran::semantics::IsOptional(sym)) { - auto boxTy = box.getType().cast(); + auto boxTy = box.getType().cast(); auto eleTy = boxTy.getEleTy(); if (!fir::isa_ref_type(eleTy)) eleTy = builder.getRefType(eleTy); @@ -470,14 +508,10 @@ walkCaptureCategories(T visitor, Fortran::lower::AbstractConverter &converter, ba.analyze(sym); if (Fortran::semantics::IsAllocatableOrPointer(sym)) return CapturedAllocatableAndPointer::visit(visitor, converter, sym, ba); - if (Fortran::semantics::IsPolymorphic(sym)) { - if (ba.isArray() && !ba.lboundIsAllOnes()) - TODO(converter.genLocation(sym.name()), - "polymorphic array with non default lower bound"); - return CapturedPolymorphic::visit(visitor, converter, sym, ba); - } if (ba.isArray()) return CapturedArrays::visit(visitor, converter, sym, ba); + if (Fortran::semantics::IsPolymorphic(sym)) + return CapturedPolymorphicScalar::visit(visitor, converter, sym, ba); if (ba.isChar()) return CapturedCharacterScalars::visit(visitor, converter, sym, ba); assert(ba.isTrivial() && "must be trivial scalar"); diff --git a/flang/lib/Optimizer/Builder/MutableBox.cpp b/flang/lib/Optimizer/Builder/MutableBox.cpp index 4d8860b60915c..d4012e9c3d9d9 100644 --- a/flang/lib/Optimizer/Builder/MutableBox.cpp +++ b/flang/lib/Optimizer/Builder/MutableBox.cpp @@ -674,7 +674,7 @@ void fir::factory::disassociateMutableBox(fir::FirOpBuilder &builder, // 7.3.2.3 point 7. The dynamic type of a disassociated pointer is the // same as its declared type. 
auto boxTy = box.getBoxTy().dyn_cast(); - auto eleTy = fir::dyn_cast_ptrOrBoxEleTy(boxTy.getEleTy()); + auto eleTy = fir::unwrapPassByRefType(boxTy.getEleTy()); mlir::Type derivedType = fir::getDerivedType(eleTy); if (auto recTy = derivedType.dyn_cast()) { fir::runtime::genNullifyDerivedType(builder, loc, box.getAddr(), recTy, diff --git a/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90 b/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90 new file mode 100644 index 0000000000000..8645488290d71 --- /dev/null +++ b/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90 @@ -0,0 +1,81 @@ +! Test lowering of internal procedure capturing OPTIONAL polymorphic +! objects. +! RUN: bbc -emit-hlfir --polymorphic-type -o - %s -I nw | FileCheck %s + + +module captured_optional_polymorphic + type sometype + end type +contains +subroutine test(x, y) + class(sometype), optional :: x + class(sometype), optional :: y(2:) + call internal() +contains + subroutine internal() + if (present(x).and.present(y)) then + print *, same_type_as(x, y) + end if + end subroutine +end +end module + +! CHECK-LABEL: func.func @_QMcaptured_optional_polymorphicPtest( +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare{{.*}}Ex +! CHECK: %[[VAL_3:.*]] = arith.constant 2 : i64 +! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index +! CHECK: %[[VAL_5:.*]] = fir.shift %[[VAL_4]] : (index) -> !fir.shift<1> +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare{{.*}}Ey +! CHECK: %[[VAL_7:.*]] = fir.alloca tuple>, !fir.class>>> +! CHECK: %[[VAL_8:.*]] = arith.constant 0 : i32 +! CHECK: %[[VAL_9:.*]] = fir.coordinate_of %[[VAL_7]], %[[VAL_8]] +! CHECK: %[[VAL_10:.*]] = fir.is_present %[[VAL_2]]#1 : (!fir.class>) -> i1 +! CHECK: fir.if %[[VAL_10]] { +! CHECK: fir.store %[[VAL_2]]#1 to %[[VAL_9]] : !fir.ref>> +! CHECK: } else { +! CHECK: %[[VAL_11:.*]] = fir.zero_bits !fir.ref> +! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_11]] : (!fir.ref>) -> !fir.class> +! 
CHECK: fir.store %[[VAL_12]] to %[[VAL_9]] : !fir.ref>> +! CHECK: } +! CHECK: %[[VAL_13:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_14:.*]] = fir.coordinate_of %[[VAL_7]], %[[VAL_13]] +! CHECK: %[[VAL_15:.*]] = fir.is_present %[[VAL_6]]#1 : (!fir.class>>) -> i1 +! CHECK: fir.if %[[VAL_15]] { +! CHECK: %[[VAL_16:.*]] = fir.shift %[[VAL_4]] : (index) -> !fir.shift<1> +! CHECK: %[[VAL_17:.*]] = fir.rebox %[[VAL_6]]#1(%[[VAL_16]]) : (!fir.class>>, !fir.shift<1>) -> !fir.class>> +! CHECK: fir.store %[[VAL_17]] to %[[VAL_14]] : !fir.ref>>> +! CHECK: } else { +! CHECK: %[[VAL_18:.*]] = fir.type_desc !fir.type<_QMcaptured_optional_polymorphicTsometype> +! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_14]] : (!fir.ref>>>) -> !fir.ref> +! CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_18]] : (!fir.tdesc>) -> !fir.ref +! CHECK: %[[VAL_21:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_22:.*]] = arith.constant 0 : i32 +! CHECK: %[[VAL_23:.*]] = fir.call @_FortranAPointerNullifyDerived(%[[VAL_19]], %[[VAL_20]], %[[VAL_21]], %[[VAL_22]]) fastmath : (!fir.ref>, !fir.ref, i32, i32) -> none +! CHECK: } +! CHECK: fir.call @_QMcaptured_optional_polymorphicFtestPinternal(%[[VAL_7]]) + +! CHECK-LABEL: func.func{{.*}} @_QMcaptured_optional_polymorphicFtestPinternal( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref> +! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 +! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] +! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref>> +! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.class>) -> !fir.ref> +! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>) -> i64 +! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_7:.*]] = arith.cmpi ne, %[[VAL_5]], %[[VAL_6]] : i64 +! CHECK: %[[VAL_8:.*]] = fir.absent !fir.class> +! CHECK: %[[VAL_9:.*]] = arith.select %[[VAL_7]], %[[VAL_3]], %[[VAL_8]] : !fir.class> +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_9]] {fortran_attrs = #fir.var_attrs, {{.*}}Ex +! 
CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_12:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_11]] +! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]] : !fir.ref>>> +! CHECK: %[[VAL_14:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_14]] +! CHECK: %[[VAL_16:.*]] = fir.box_addr %[[VAL_13]] +! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (!fir.ref>>) -> i64 +! CHECK: %[[VAL_18:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_17]], %[[VAL_18]] : i64 +! CHECK: %[[VAL_20:.*]] = fir.absent !fir.class>> +! CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_19]], %[[VAL_13]], %[[VAL_20]] : !fir.class>> +! CHECK: %[[VAL_22:.*]] = fir.shift %[[VAL_15]]#0 : (index) -> !fir.shift<1> +! CHECK: %[[VAL_23:.*]]:2 = hlfir.declare %[[VAL_21]](%[[VAL_22]]) {fortran_attrs = #fir.var_attrs, {{.*}}Ey From 9617da88ab961145047076c45bb2bb1ac4513634 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 28 Feb 2024 15:58:55 +0800 Subject: [PATCH 531/546] [RISCV] Use a ta vslideup if inserting over end of InterSubVT (#83230) The description in #83146 is slightly inaccurate: it relaxes a tail undisturbed vslideup to tail agnostic if we are inserting over the entire tail of the vector **and** we didn't shrink the LMUL of the vector being inserted into. This handles the case where we did shrink down the LMUL via InterSubVT by checking if we inserted over the entire tail of InterSubVT, the actual type that we're performing the vslideup on, not VecVT. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 ++-- llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll | 2 +- llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll | 10 +++++----- llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll | 12 ++++++------ 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e95e21bda687e..dde1882f5eea8 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -9732,9 +9732,9 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, ElementCount::getScalable(RemIdx) + SubVecVT.getVectorElementCount(); VL = computeVLMax(SubVecVT, DL, DAG); - // Use tail agnostic policy if we're inserting over Vec's tail. + // Use tail agnostic policy if we're inserting over InterSubVT's tail. unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; - if (EndIndex == VecVT.getVectorElementCount()) + if (EndIndex == InterSubVT.getVectorElementCount()) Policy = RISCVII::TAIL_AGNOSTIC; // If we're inserting into the lowest elements, use a tail undisturbed diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll index a2d02b6bb641b..76aa2b913c652 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll @@ -474,7 +474,7 @@ define @extract_nxv6f16_nxv12f16_6( %in) ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vslidedown.vx v12, v9, a0 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v12, v10, a0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll index d377082761736..b15896580d425 100644 --- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll @@ -227,7 +227,7 @@ define @insert_nxv16i32_nxv1i32_1( %vec, ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v16, a0 ; CHECK-NEXT: ret %v = call @llvm.vector.insert.nxv1i32.nxv16i32( %vec, %subvec, i64 1) @@ -306,7 +306,7 @@ define @insert_nxv16i8_nxv1i8_7( %vec, @llvm.vector.insert.nxv1i8.nxv16i8( %vec, %subvec, i64 7) @@ -319,7 +319,7 @@ define @insert_nxv16i8_nxv1i8_15( %vec, @llvm.vector.insert.nxv1i8.nxv16i8( %vec, %subvec, i64 15) @@ -344,7 +344,7 @@ define @insert_nxv32f16_nxv2f16_2( %vec ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v16, a0 ; CHECK-NEXT: ret %v = call @llvm.vector.insert.nxv2f16.nxv32f16( %vec, %subvec, i64 2) @@ -357,7 +357,7 @@ define @insert_nxv32f16_nxv2f16_26( %ve ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v14, v16, a0 ; CHECK-NEXT: ret %v = call @llvm.vector.insert.nxv2f16.nxv32f16( %vec, %subvec, i64 26) diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll index 515d77109af9f..6d42b15273cf8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll @@ -916,7 +916,7 @@ define half @vreduce_ord_fadd_nxv6f16( %v, half %s) { ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v9, v10, a0 ; CHECK-NEXT: 
vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 @@ -938,11 +938,11 @@ define half @vreduce_ord_fadd_nxv10f16( %v, half %s) { ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v10, v12, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma ; CHECK-NEXT: vmv.v.v v11, v12 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v11, v12, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfmv.s.f v12, fa0 @@ -1002,7 +1002,7 @@ define half @vreduce_fadd_nxv6f16( %v, half %s) { ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v9, v10, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 @@ -1025,11 +1025,11 @@ define half @vreduce_fmin_nxv10f16( %v) { ; CHECK-NEXT: vlse16.v v12, (a1), zero ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v10, v12, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma ; CHECK-NEXT: vmv.v.v v11, v12 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v11, v12, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v8 From 2b545108ffcb188b69d1a5e37081494d231e1456 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 28 Feb 2024 08:32:32 +0000 Subject: [PATCH 532/546] [GlobalISel] Add a TargetLowering variable to IRTranslator. NFC (#83009) This prevents us from getting the variable multiple times. 
--- .../llvm/CodeGen/GlobalISel/IRTranslator.h | 1 + llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 74 +++++++------------ 2 files changed, 28 insertions(+), 47 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h index 5454df02914af..bfac54a65c5b4 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -612,6 +612,7 @@ class IRTranslator : public MachineFunctionPass { AAResults *AA = nullptr; AssumptionCache *AC = nullptr; const TargetLibraryInfo *LibInfo = nullptr; + const TargetLowering *TLI = nullptr; FunctionLoweringInfo FuncInfo; // True when either the Target Machine specifies no optimizations or the diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 38bb808dd5bd5..7c986dbbc2c7c 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -596,8 +596,6 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) { const Value *CondVal = BrInst.getCondition(); MachineBasicBlock *Succ1MBB = &getMBB(*BrInst.getSuccessor(1)); - const auto &TLI = *MF->getSubtarget().getTargetLowering(); - // If this is a series of conditions that are or'd or and'd together, emit // this as a sequence of branches instead of setcc's with and/or operations. 
// As long as jumps are not expensive (exceptions for multi-use logic ops, @@ -617,7 +615,7 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) { // jle foo using namespace PatternMatch; const Instruction *CondI = dyn_cast(CondVal); - if (!TLI.isJumpExpensive() && CondI && CondI->hasOneUse() && + if (!TLI->isJumpExpensive() && CondI && CondI->hasOneUse() && !BrInst.hasMetadata(LLVMContext::MD_unpredictable)) { Instruction::BinaryOps Opcode = (Instruction::BinaryOps)0; Value *Vec; @@ -1385,9 +1383,8 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { return true; } - auto &TLI = *MF->getSubtarget().getTargetLowering(); MachineMemOperand::Flags Flags = - TLI.getLoadMemOperandFlags(LI, *DL, AC, LibInfo); + TLI->getLoadMemOperandFlags(LI, *DL, AC, LibInfo); if (AA && !(Flags & MachineMemOperand::MOInvariant)) { if (AA->pointsToConstantMemory( MemoryLocation(Ptr, LocationSize::precise(StoreSize), AAInfo))) { @@ -1434,8 +1431,7 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) { return true; } - auto &TLI = *MF->getSubtarget().getTargetLowering(); - MachineMemOperand::Flags Flags = TLI.getStoreMemOperandFlags(SI, *DL); + MachineMemOperand::Flags Flags = TLI->getStoreMemOperandFlags(SI, *DL); for (unsigned i = 0; i < Vals.size(); ++i) { Register Addr; @@ -1779,8 +1775,7 @@ void IRTranslator::getStackGuard(Register DstReg, auto MIB = MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD, {DstReg}, {}); - auto &TLI = *MF->getSubtarget().getTargetLowering(); - Value *Global = TLI.getSDagStackGuard(*MF->getFunction().getParent()); + Value *Global = TLI->getSDagStackGuard(*MF->getFunction().getParent()); if (!Global) return; @@ -2111,9 +2106,8 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, // does. Simplest intrinsic ever! 
return true; case Intrinsic::vastart: { - auto &TLI = *MF->getSubtarget().getTargetLowering(); Value *Ptr = CI.getArgOperand(0); - unsigned ListSize = TLI.getVaListSizeInBits(*DL) / 8; + unsigned ListSize = TLI->getVaListSizeInBits(*DL) / 8; Align Alignment = getKnownAlignment(Ptr, *DL); MIRBuilder.buildInstr(TargetOpcode::G_VASTART, {}, {getOrCreateVReg(*Ptr)}) @@ -2189,14 +2183,13 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return translateFixedPointIntrinsic(TargetOpcode::G_UDIVFIXSAT, CI, MIRBuilder); case Intrinsic::fmuladd: { const TargetMachine &TM = MF->getTarget(); - const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); Register Dst = getOrCreateVReg(CI); Register Op0 = getOrCreateVReg(*CI.getArgOperand(0)); Register Op1 = getOrCreateVReg(*CI.getArgOperand(1)); Register Op2 = getOrCreateVReg(*CI.getArgOperand(2)); if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict && - TLI.isFMAFasterThanFMulAndFAdd(*MF, - TLI.getValueType(*DL, CI.getType()))) { + TLI->isFMAFasterThanFMulAndFAdd(*MF, + TLI->getValueType(*DL, CI.getType()))) { // TODO: Revisit this to see if we should move this part of the // lowering to the combiner. MIRBuilder.buildFMA(Dst, Op0, Op1, Op2, @@ -2254,10 +2247,9 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, getStackGuard(getOrCreateVReg(CI), MIRBuilder); return true; case Intrinsic::stackprotector: { - const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL); Register GuardVal; - if (TLI.useLoadStackGuardNode()) { + if (TLI->useLoadStackGuardNode()) { GuardVal = MRI->createGenericVirtualRegister(PtrTy); getStackGuard(GuardVal, MIRBuilder); } else @@ -2635,10 +2627,9 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { } // Add a MachineMemOperand if it is a target mem intrinsic. 
- const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); TargetLowering::IntrinsicInfo Info; // TODO: Add a GlobalISel version of getTgtMemIntrinsic. - if (TLI.getTgtMemIntrinsic(Info, CI, *MF, ID)) { + if (TLI->getTgtMemIntrinsic(Info, CI, *MF, ID)) { Align Alignment = Info.align.value_or( DL->getABITypeAlign(Info.memVT.getTypeForEVT(F->getContext()))); LLT MemTy = Info.memVT.isSimple() @@ -2818,10 +2809,9 @@ bool IRTranslator::translateLandingPad(const User &U, // If there aren't registers to copy the values into (e.g., during SjLj // exceptions), then don't bother. - auto &TLI = *MF->getSubtarget().getTargetLowering(); const Constant *PersonalityFn = MF->getFunction().getPersonalityFn(); - if (TLI.getExceptionPointerRegister(PersonalityFn) == 0 && - TLI.getExceptionSelectorRegister(PersonalityFn) == 0) + if (TLI->getExceptionPointerRegister(PersonalityFn) == 0 && + TLI->getExceptionSelectorRegister(PersonalityFn) == 0) return true; // If landingpad's return type is token type, we don't create DAG nodes @@ -2852,7 +2842,7 @@ bool IRTranslator::translateLandingPad(const User &U, assert(Tys.size() == 2 && "Only two-valued landingpads are supported"); // Mark exception register as live in. 
- Register ExceptionReg = TLI.getExceptionPointerRegister(PersonalityFn); + Register ExceptionReg = TLI->getExceptionPointerRegister(PersonalityFn); if (!ExceptionReg) return false; @@ -2860,7 +2850,7 @@ bool IRTranslator::translateLandingPad(const User &U, ArrayRef ResRegs = getOrCreateVRegs(LP); MIRBuilder.buildCopy(ResRegs[0], ExceptionReg); - Register SelectorReg = TLI.getExceptionSelectorRegister(PersonalityFn); + Register SelectorReg = TLI->getExceptionSelectorRegister(PersonalityFn); if (!SelectorReg) return false; @@ -2986,8 +2976,7 @@ bool IRTranslator::translateExtractElement(const User &U, Register Res = getOrCreateVReg(U); Register Val = getOrCreateVReg(*U.getOperand(0)); - const auto &TLI = *MF->getSubtarget().getTargetLowering(); - unsigned PreferredVecIdxWidth = TLI.getVectorIdxTy(*DL).getSizeInBits(); + unsigned PreferredVecIdxWidth = TLI->getVectorIdxTy(*DL).getSizeInBits(); Register Idx; if (auto *CI = dyn_cast(U.getOperand(1))) { if (CI->getBitWidth() != PreferredVecIdxWidth) { @@ -3039,8 +3028,7 @@ bool IRTranslator::translateAtomicCmpXchg(const User &U, MachineIRBuilder &MIRBuilder) { const AtomicCmpXchgInst &I = cast(U); - auto &TLI = *MF->getSubtarget().getTargetLowering(); - auto Flags = TLI.getAtomicMemOperandFlags(I, *DL); + auto Flags = TLI->getAtomicMemOperandFlags(I, *DL); auto Res = getOrCreateVRegs(I); Register OldValRes = Res[0]; @@ -3061,8 +3049,7 @@ bool IRTranslator::translateAtomicCmpXchg(const User &U, bool IRTranslator::translateAtomicRMW(const User &U, MachineIRBuilder &MIRBuilder) { const AtomicRMWInst &I = cast(U); - auto &TLI = *MF->getSubtarget().getTargetLowering(); - auto Flags = TLI.getAtomicMemOperandFlags(I, *DL); + auto Flags = TLI->getAtomicMemOperandFlags(I, *DL); Register Res = getOrCreateVReg(I); Register Addr = getOrCreateVReg(*I.getPointerOperand()); @@ -3302,8 +3289,7 @@ bool IRTranslator::translate(const Instruction &Inst) { CurBuilder->setDebugLoc(Inst.getDebugLoc()); 
CurBuilder->setPCSections(Inst.getMetadata(LLVMContext::MD_pcsections)); - auto &TLI = *MF->getSubtarget().getTargetLowering(); - if (TLI.fallBackToDAGISel(Inst)) + if (TLI->fallBackToDAGISel(Inst)) return false; switch (Inst.getOpcode()) { @@ -3454,9 +3440,8 @@ bool IRTranslator::finalizeBasicBlock(const BasicBlock &BB, // Check if we need to generate stack-protector guard checks. StackProtector &SP = getAnalysis(); if (SP.shouldEmitSDCheck(BB)) { - const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); bool FunctionBasedInstrumentation = - TLI.getSSPStackGuardCheck(*MF->getFunction().getParent()); + TLI->getSSPStackGuardCheck(*MF->getFunction().getParent()); SPDescriptor.initialize(&BB, &MBB, FunctionBasedInstrumentation); } // Handle stack protector. @@ -3501,10 +3486,9 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD, MachineBasicBlock *ParentBB) { CurBuilder->setInsertPt(*ParentBB, ParentBB->end()); // First create the loads to the guard/stack slot for the comparison. - const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); Type *PtrIRTy = PointerType::getUnqual(MF->getFunction().getContext()); const LLT PtrTy = getLLTForType(*PtrIRTy, *DL); - LLT PtrMemTy = getLLTForMVT(TLI.getPointerMemTy(*DL)); + LLT PtrMemTy = getLLTForMVT(TLI->getPointerMemTy(*DL)); MachineFrameInfo &MFI = ParentBB->getParent()->getFrameInfo(); int FI = MFI.getStackProtectorIndex(); @@ -3522,13 +3506,13 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD, MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile) .getReg(0); - if (TLI.useStackGuardXorFP()) { + if (TLI->useStackGuardXorFP()) { LLVM_DEBUG(dbgs() << "Stack protector xor'ing with FP not yet implemented"); return false; } // Retrieve guard check function, nullptr if instrumentation is inlined. 
- if (const Function *GuardCheckFn = TLI.getSSPStackGuardCheck(M)) { + if (const Function *GuardCheckFn = TLI->getSSPStackGuardCheck(M)) { // This path is currently untestable on GlobalISel, since the only platform // that needs this seems to be Windows, and we fall back on that currently. // The code still lives here in case that changes. @@ -3563,13 +3547,13 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD, // If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD. // Otherwise, emit a volatile load to retrieve the stack guard value. - if (TLI.useLoadStackGuardNode()) { + if (TLI->useLoadStackGuardNode()) { Guard = MRI->createGenericVirtualRegister(LLT::scalar(PtrTy.getSizeInBits())); getStackGuard(Guard, *CurBuilder); } else { // TODO: test using android subtarget when we support @llvm.thread.pointer. - const Value *IRGuard = TLI.getSDagStackGuard(M); + const Value *IRGuard = TLI->getSDagStackGuard(M); Register GuardPtr = getOrCreateVReg(*IRGuard); Guard = CurBuilder @@ -3593,13 +3577,12 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD, bool IRTranslator::emitSPDescriptorFailure(StackProtectorDescriptor &SPD, MachineBasicBlock *FailureBB) { CurBuilder->setInsertPt(*FailureBB, FailureBB->end()); - const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); const RTLIB::Libcall Libcall = RTLIB::STACKPROTECTOR_CHECK_FAIL; - const char *Name = TLI.getLibcallName(Libcall); + const char *Name = TLI->getLibcallName(Libcall); CallLowering::CallLoweringInfo Info; - Info.CallConv = TLI.getLibcallCallingConv(Libcall); + Info.CallConv = TLI->getLibcallCallingConv(Libcall); Info.Callee = MachineOperand::CreateES(Name); Info.OrigRet = {Register(), Type::getVoidTy(MF->getFunction().getContext()), 0}; @@ -3662,6 +3645,7 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { bool EnableCSE = EnableCSEInIRTranslator.getNumOccurrences() ? 
EnableCSEInIRTranslator : TPC->isGISelCSEEnabled(); + TLI = MF->getSubtarget().getTargetLowering(); if (EnableCSE) { EntryBuilder = std::make_unique(CurMF); @@ -3696,12 +3680,8 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { LibInfo = &getAnalysis().getTLI(F); FuncInfo.CanLowerReturn = CLI->checkReturnTypeForCallConv(*MF); - const auto &TLI = *MF->getSubtarget().getTargetLowering(); - SL = std::make_unique(this, FuncInfo); - SL->init(TLI, TM, *DL); - - + SL->init(*TLI, TM, *DL); assert(PendingPHIs.empty() && "stale PHIs"); From a4fff36b6cf64d0afa965a8ef4927145c5558124 Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Wed, 28 Feb 2024 09:32:03 +0100 Subject: [PATCH 533/546] [bazel] Add "include/" for libc includes for 04e8653f189bf3d65680c7fb3b3033ad82903ee9 #83199 --- utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl index 17eb30c8e458c..7d815bc4a2299 100644 --- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl @@ -17,6 +17,7 @@ def libc_common_copts(): libc_include_path = paths.join(root_label.workspace_root, root_label.package) return [ "-I" + libc_include_path, + "-I" + paths.join(libc_include_path, "include"), "-DLIBC_NAMESPACE=" + LIBC_NAMESPACE, ] From 619ee20b3911f9a481a75a64704c80aef16af9d0 Mon Sep 17 00:00:00 2001 From: "Oleksandr \"Alex\" Zinenko" Date: Wed, 28 Feb 2024 09:48:15 +0100 Subject: [PATCH 534/546] [mlir] add an example of using transform dialect standalone (#82623) Transform dialect interpreter is designed to be usable outside of the pass pipeline, as the main program transformation driver, e.g., for languages with explicit schedules. Provide an example of such usage with a couple of tests. 
--- mlir/examples/CMakeLists.txt | 1 + mlir/examples/transform-opt/CMakeLists.txt | 26 ++ mlir/examples/transform-opt/README.md | 40 ++ .../transform-opt/mlir-transform-opt.cpp | 389 ++++++++++++++++++ mlir/test/CMakeLists.txt | 1 + mlir/test/Examples/transform-opt/empty.mlir | 12 + .../Examples/transform-opt/external-decl.mlir | 18 + .../Examples/transform-opt/external-def.mlir | 8 + mlir/test/Examples/transform-opt/pass.mlir | 19 + .../transform-opt/self-contained.mlir | 21 + .../Examples/transform-opt/syntax-error.mlir | 5 + mlir/test/lit.cfg.py | 1 + 12 files changed, 541 insertions(+) create mode 100644 mlir/examples/transform-opt/CMakeLists.txt create mode 100644 mlir/examples/transform-opt/README.md create mode 100644 mlir/examples/transform-opt/mlir-transform-opt.cpp create mode 100644 mlir/test/Examples/transform-opt/empty.mlir create mode 100644 mlir/test/Examples/transform-opt/external-decl.mlir create mode 100644 mlir/test/Examples/transform-opt/external-def.mlir create mode 100644 mlir/test/Examples/transform-opt/pass.mlir create mode 100644 mlir/test/Examples/transform-opt/self-contained.mlir create mode 100644 mlir/test/Examples/transform-opt/syntax-error.mlir diff --git a/mlir/examples/CMakeLists.txt b/mlir/examples/CMakeLists.txt index d256bf1a5cbb1..2a1cac34d8c29 100644 --- a/mlir/examples/CMakeLists.txt +++ b/mlir/examples/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(toy) add_subdirectory(transform) +add_subdirectory(transform-opt) add_subdirectory(minimal-opt) diff --git a/mlir/examples/transform-opt/CMakeLists.txt b/mlir/examples/transform-opt/CMakeLists.txt new file mode 100644 index 0000000000000..8e23555d0b5d7 --- /dev/null +++ b/mlir/examples/transform-opt/CMakeLists.txt @@ -0,0 +1,26 @@ +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) +get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) +get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS) + +set(LIBS + MLIRAnalysis + MLIRIR + MLIRParser + 
MLIRSupport + MLIRTransformDialect + MLIRTransformDialectTransforms + MLIRTransforms + ${dialect_libs} + ${conversion_libs} + ${extension_libs} +) + +add_mlir_tool(mlir-transform-opt + mlir-transform-opt.cpp + + DEPENDS + ${LIBS} +) +target_link_libraries(mlir-transform-opt PRIVATE ${LIBS}) +llvm_update_compile_flags(mlir-transform-opt) +mlir_check_all_link_libraries(mlir-transform-opt) diff --git a/mlir/examples/transform-opt/README.md b/mlir/examples/transform-opt/README.md new file mode 100644 index 0000000000000..e9c8cc0173c7b --- /dev/null +++ b/mlir/examples/transform-opt/README.md @@ -0,0 +1,40 @@ +# Standalone Transform Dialect Interpreter + +This is an example of using the Transform dialect interpreter functionality standalone, that is, outside of the regular pass pipeline. The example is a +binary capable of processing MLIR source files similar to `mlir-opt` and other +optimizer drivers, with the entire transformation process driven by a Transform +dialect script. This script can be embedded into the source file or provided in +a separate MLIR source file. + +Either the input module or the transform module must contain a top-level symbol +named `__transform_main`, which is used as the entry point to the transformation +script. + +```sh +mlir-transform-opt payload_with_embedded_transform.mlir +mlir-transform-opt payload.mlir -transform=transform.mlir +``` + +The name of the entry point can be overridden using command-line options. + +```sh +mlir-transform-opt payload-mlir -transform-entry-point=another_entry_point +``` + +Transform scripts can reference symbols defined in other source files, called +libraries, which can be supplied to the binary through command-line options. +Libraries will be embedded into the main transformation module by the tool and +the interpreter will process everything as a single module. A debug option is +available to see the contents of the transform module before it goes into the interpreter. 
+ +```sh +mlir-transform-opt payload.mlir -transform=transform.mlir \ + -transform-library=external_definitions_1.mlir \ + -transform-library=external_definitions_2.mlir \ + -dump-library-module +``` + +Check out the [Transform dialect +tutorial](https://mlir.llvm.org/docs/Tutorials/transform/) as well as +[documentation](https://mlir.llvm.org/docs/Dialects/Transform/) to learn more +about the dialect. diff --git a/mlir/examples/transform-opt/mlir-transform-opt.cpp b/mlir/examples/transform-opt/mlir-transform-opt.cpp new file mode 100644 index 0000000000000..41a17f18726b1 --- /dev/null +++ b/mlir/examples/transform-opt/mlir-transform-opt.cpp @@ -0,0 +1,389 @@ +//===- mlir-transform-opt.cpp -----------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/Utils.h" +#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h" +#include "mlir/IR/AsmState.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/DialectRegistry.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/InitAllDialects.h" +#include "mlir/InitAllExtensions.h" +#include "mlir/InitAllPasses.h" +#include "mlir/Parser/Parser.h" +#include "mlir/Support/FileUtilities.h" +#include "mlir/Tools/mlir-opt/MlirOptMain.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/ToolOutputFile.h" +#include + +namespace { + +using namespace llvm; + +/// Structure containing command line options for the tool, these will get +/// initialized when an instance is created. 
+struct MlirTransformOptCLOptions { + cl::opt allowUnregisteredDialects{ + "allow-unregistered-dialect", + cl::desc("Allow operations coming from an unregistered dialect"), + cl::init(false)}; + + cl::opt verifyDiagnostics{ + "verify-diagnostics", + cl::desc("Check that emitted diagnostics match expected-* lines " + "on the corresponding line"), + cl::init(false)}; + + cl::opt payloadFilename{cl::Positional, cl::desc(""), + cl::init("-")}; + + cl::opt outputFilename{"o", cl::desc("Output filename"), + cl::value_desc("filename"), + cl::init("-")}; + + cl::opt transformMainFilename{ + "transform", + cl::desc("File containing entry point of the transform script, if " + "different from the input file"), + cl::value_desc("filename"), cl::init("")}; + + cl::list transformLibraryFilenames{ + "transform-library", cl::desc("File(s) containing definitions of " + "additional transform script symbols")}; + + cl::opt transformEntryPoint{ + "transform-entry-point", + cl::desc("Name of the entry point transform symbol"), + cl::init(mlir::transform::TransformDialect::kTransformEntryPointSymbolName + .str())}; + + cl::opt disableExpensiveChecks{ + "disable-expensive-checks", + cl::desc("Disables potentially expensive checks in the transform " + "interpreter, providing more speed at the expense of " + "potential memory problems and silent corruptions"), + cl::init(false)}; + + cl::opt dumpLibraryModule{ + "dump-library-module", + cl::desc("Prints the combined library module before the output"), + cl::init(false)}; +}; +} // namespace + +/// "Managed" static instance of the command-line options structure. This makes +/// them locally-scoped and explicitly initialized/deinitialized. While this is +/// not strictly necessary in the tool source file that is not being used as a +/// library (where the options would pollute the global list of options), it is +/// good practice to follow this. +static llvm::ManagedStatic clOptions; + +/// Explicitly registers command-line options. 
+static void registerCLOptions() { *clOptions; } + +namespace { +/// A wrapper class for source managers diagnostic. This provides both unique +/// ownership and virtual function-like overload for a pair of +/// inheritance-related classes that do not use virtual functions. +class DiagnosticHandlerWrapper { +public: + /// Kind of the diagnostic handler to use. + enum class Kind { EmitDiagnostics, VerifyDiagnostics }; + + /// Constructs the diagnostic handler of the specified kind of the given + /// source manager and context. + DiagnosticHandlerWrapper(Kind kind, llvm::SourceMgr &mgr, + mlir::MLIRContext *context) { + if (kind == Kind::EmitDiagnostics) + handler = new mlir::SourceMgrDiagnosticHandler(mgr, context); + else + handler = new mlir::SourceMgrDiagnosticVerifierHandler(mgr, context); + } + + /// This object is non-copyable but movable. + DiagnosticHandlerWrapper(const DiagnosticHandlerWrapper &) = delete; + DiagnosticHandlerWrapper(DiagnosticHandlerWrapper &&other) = default; + DiagnosticHandlerWrapper & + operator=(const DiagnosticHandlerWrapper &) = delete; + DiagnosticHandlerWrapper &operator=(DiagnosticHandlerWrapper &&) = default; + + /// Verifies the captured "expected-*" diagnostics if required. + mlir::LogicalResult verify() const { + if (auto *ptr = + handler.dyn_cast()) { + return ptr->verify(); + } + return mlir::success(); + } + + /// Destructs the object of the same type as allocated. + ~DiagnosticHandlerWrapper() { + if (auto *ptr = handler.dyn_cast()) { + delete ptr; + } else { + delete handler.get(); + } + } + +private: + /// Internal storage is a type-safe union. + llvm::PointerUnion + handler; +}; + +/// MLIR has deeply rooted expectations that the LLVM source manager contains +/// exactly one buffer, until at least the lexer level. This class wraps +/// multiple LLVM source managers each managing a buffer to match MLIR's +/// expectations while still providing a centralized handling mechanism. 
+class TransformSourceMgr { +public: + /// Constructs the source manager indicating whether diagnostic messages will + /// be verified later on. + explicit TransformSourceMgr(bool verifyDiagnostics) + : verifyDiagnostics(verifyDiagnostics) {} + + /// Deconstructs the source manager. Note that `checkResults` must have been + /// called on this instance before deconstructing it. + ~TransformSourceMgr() { + assert(resultChecked && "must check the result of diagnostic handlers by " + "running TransformSourceMgr::checkResult"); + } + + /// Parses the given buffer and creates the top-level operation of the kind + /// specified as template argument in the given context. Additional parsing + /// options may be provided. + template + mlir::OwningOpRef parseBuffer(std::unique_ptr buffer, + mlir::MLIRContext &context, + const mlir::ParserConfig &config) { + // Create a single-buffer LLVM source manager. Note that `unique_ptr` allows + // the code below to capture a reference to the source manager in such a way + // that it is not invalidated when the vector contents is eventually + // reallocated. + llvm::SourceMgr &mgr = + *sourceMgrs.emplace_back(std::make_unique()); + mgr.AddNewSourceBuffer(std::move(buffer), llvm::SMLoc()); + + // Choose the type of diagnostic handler depending on whether diagnostic + // verification needs to happen and store it. + if (verifyDiagnostics) { + diagHandlers.emplace_back( + DiagnosticHandlerWrapper::Kind::VerifyDiagnostics, mgr, &context); + } else { + diagHandlers.emplace_back(DiagnosticHandlerWrapper::Kind::EmitDiagnostics, + mgr, &context); + } + + // Defer to MLIR's parser. + return mlir::parseSourceFile(mgr, config); + } + + /// If diagnostic message verification has been requested upon construction of + /// this source manager, performs the verification, reports errors and returns + /// the result of the verification. Otherwise passes through the given value. 
+ mlir::LogicalResult checkResult(mlir::LogicalResult result) { + resultChecked = true; + if (!verifyDiagnostics) + return result; + + return mlir::failure(llvm::any_of(diagHandlers, [](const auto &handler) { + return mlir::failed(handler.verify()); + })); + } + +private: + /// Indicates whether diagnostic message verification is requested. + const bool verifyDiagnostics; + + /// Indicates that diagnostic message verification has taken place, and the + /// deconstruction is therefore safe. + bool resultChecked = false; + + /// Storage for per-buffer source managers and diagnostic handlers. These are + /// wrapped into unique pointers in order to make it safe to capture + /// references to these objects: if the vector is reallocated, the unique + /// pointer objects are moved by the pointer addresses won't change. Also, for + /// handlers, this allows to store the pointer to the base class. + SmallVector> sourceMgrs; + SmallVector diagHandlers; +}; +} // namespace + +/// Trivial wrapper around `applyTransforms` that doesn't support extra mapping +/// and doesn't enforce the entry point transform ops being top-level. +static mlir::LogicalResult +applyTransforms(mlir::Operation *payloadRoot, + mlir::transform::TransformOpInterface transformRoot, + const mlir::transform::TransformOptions &options) { + return applyTransforms(payloadRoot, transformRoot, {}, options, + /*enforceToplevelTransformOp=*/false); +} + +/// Applies transforms indicated in the transform dialect script to the input +/// buffer. The transform script may be embedded in the input buffer or as a +/// separate buffer. The transform script may have external symbols, the +/// definitions of which must be provided in transform library buffers. If the +/// application is successful, prints the transformed input buffer into the +/// given output stream. Additional configuration options are derived from +/// command-line options. 
+static mlir::LogicalResult processPayloadBuffer( + raw_ostream &os, std::unique_ptr inputBuffer, + std::unique_ptr transformBuffer, + MutableArrayRef> transformLibraries, + mlir::DialectRegistry ®istry) { + + // Initialize the MLIR context, and various configurations. + mlir::MLIRContext context(registry, mlir::MLIRContext::Threading::DISABLED); + context.allowUnregisteredDialects(clOptions->allowUnregisteredDialects); + mlir::ParserConfig config(&context); + TransformSourceMgr sourceMgr( + /*verifyDiagnostics=*/clOptions->verifyDiagnostics); + + // Parse the input buffer that will be used as transform payload. + mlir::OwningOpRef payloadRoot = + sourceMgr.parseBuffer(std::move(inputBuffer), context, config); + if (!payloadRoot) + return sourceMgr.checkResult(mlir::failure()); + + // Identify the module containing the transform script entry point. This may + // be the same module as the input or a separate module. In the former case, + // make a copy of the module so it can be modified freely. Modification may + // happen in the script itself (at which point it could be rewriting itself + // during interpretation, leading to tricky memory errors) or by embedding + // library modules in the script. + mlir::OwningOpRef transformRoot; + if (transformBuffer) { + transformRoot = sourceMgr.parseBuffer( + std::move(transformBuffer), context, config); + if (!transformRoot) + return sourceMgr.checkResult(mlir::failure()); + } else { + transformRoot = cast(payloadRoot->clone()); + } + + // Parse and merge the libraries into the main transform module. + for (auto &&transformLibrary : transformLibraries) { + mlir::OwningOpRef libraryModule = + sourceMgr.parseBuffer(std::move(transformLibrary), + context, config); + + if (!libraryModule || + mlir::failed(mlir::transform::detail::mergeSymbolsInto( + *transformRoot, std::move(libraryModule)))) + return sourceMgr.checkResult(mlir::failure()); + } + + // If requested, dump the combined transform module. 
+ if (clOptions->dumpLibraryModule) + transformRoot->dump(); + + // Find the entry point symbol. Even if it had originally been in the payload + // module, it was cloned into the transform module so only look there. + mlir::transform::TransformOpInterface entryPoint = + mlir::transform::detail::findTransformEntryPoint( + *transformRoot, mlir::ModuleOp(), clOptions->transformEntryPoint); + if (!entryPoint) + return sourceMgr.checkResult(mlir::failure()); + + // Apply the requested transformations. + mlir::transform::TransformOptions transformOptions; + transformOptions.enableExpensiveChecks(!clOptions->disableExpensiveChecks); + if (mlir::failed(applyTransforms(*payloadRoot, entryPoint, transformOptions))) + return sourceMgr.checkResult(mlir::failure()); + + // Print the transformed result and check the captured diagnostics if + // requested. + payloadRoot->print(os); + return sourceMgr.checkResult(mlir::success()); +} + +/// Tool entry point. +static mlir::LogicalResult runMain(int argc, char **argv) { + // Register all upstream dialects and extensions. Specific uses are advised + // not to register all dialects indiscriminately but rather hand-pick what is + // necessary for their use case. + mlir::DialectRegistry registry; + mlir::registerAllDialects(registry); + mlir::registerAllExtensions(registry); + mlir::registerAllPasses(); + + // Explicitly register the transform dialect. This is not strictly necessary + // since it has been already registered as part of the upstream dialect list, + // but useful for example purposes for cases when dialects to register are + // hand-picked. The transform dialect must be registered. + registry.insert(); + + // Register various command-line options. Note that the LLVM initializer + // object is a RAII that ensures correct deconstruction of command-line option + // objects inside ManagedStatic. 
+ llvm::InitLLVM y(argc, argv); + mlir::registerAsmPrinterCLOptions(); + mlir::registerMLIRContextCLOptions(); + registerCLOptions(); + llvm::cl::ParseCommandLineOptions(argc, argv, + "Minimal Transform dialect driver\n"); + + // Try opening the main input file. + std::string errorMessage; + std::unique_ptr payloadFile = + mlir::openInputFile(clOptions->payloadFilename, &errorMessage); + if (!payloadFile) { + llvm::errs() << errorMessage << "\n"; + return mlir::failure(); + } + + // Try opening the output file. + std::unique_ptr outputFile = + mlir::openOutputFile(clOptions->outputFilename, &errorMessage); + if (!outputFile) { + llvm::errs() << errorMessage << "\n"; + return mlir::failure(); + } + + // Try opening the main transform file if provided. + std::unique_ptr transformRootFile; + if (!clOptions->transformMainFilename.empty()) { + if (clOptions->transformMainFilename == clOptions->payloadFilename) { + llvm::errs() << "warning: " << clOptions->payloadFilename + << " is provided as both payload and transform file\n"; + } else { + transformRootFile = + mlir::openInputFile(clOptions->transformMainFilename, &errorMessage); + if (!transformRootFile) { + llvm::errs() << errorMessage << "\n"; + return mlir::failure(); + } + } + } + + // Try opening transform library files if provided. 
+ SmallVector> transformLibraries; + transformLibraries.reserve(clOptions->transformLibraryFilenames.size()); + for (llvm::StringRef filename : clOptions->transformLibraryFilenames) { + transformLibraries.emplace_back( + mlir::openInputFile(filename, &errorMessage)); + if (!transformLibraries.back()) { + llvm::errs() << errorMessage << "\n"; + return mlir::failure(); + } + } + + return processPayloadBuffer(outputFile->os(), std::move(payloadFile), + std::move(transformRootFile), transformLibraries, + registry); +} + +int main(int argc, char **argv) { + return mlir::asMainReturnCode(runMain(argc, argv)); +} diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 74921544c5557..baf07ea1f010a 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -173,6 +173,7 @@ if(LLVM_BUILD_EXAMPLES) transform-opt-ch3 transform-opt-ch4 mlir-minimal-opt + mlir-transform-opt ) if(MLIR_ENABLE_EXECUTION_ENGINE) list(APPEND MLIR_TEST_DEPENDS diff --git a/mlir/test/Examples/transform-opt/empty.mlir b/mlir/test/Examples/transform-opt/empty.mlir new file mode 100644 index 0000000000000..b525769db6882 --- /dev/null +++ b/mlir/test/Examples/transform-opt/empty.mlir @@ -0,0 +1,12 @@ +// RUN: mlir-transform-opt %s --transform=%p/self-contained.mlir | FileCheck %s +// RUN: mlir-transform-opt %s --transform=%p/external-decl.mlir --verify-diagnostics +// RUN: mlir-transform-opt %s --transform=%p/external-def.mlir --transform-entry-point=external_def | FileCheck %s --check-prefix=EXTERNAL +// RUN: mlir-transform-opt %s --transform=%p/external-decl.mlir --transform-library=%p/external-def.mlir | FileCheck %s --check-prefix=EXTERNAL +// RUN: mlir-transform-opt %s --transform=%p/syntax-error.mlir --verify-diagnostics +// RUN: mlir-transform-opt %s --transform=%p/self-contained.mlir --transform-library=%p/syntax-error.mlir --verify-diagnostics +// RUN: mlir-transform-opt %s --transform=%p/self-contained.mlir --transform-library=%p/external-def.mlir 
--transform-library=%p/syntax-error.mlir --verify-diagnostics + +// CHECK: IR printer: in self-contained +// EXTERNAL: IR printer: external_def +// CHECK-NOT: @__transform_main +module {} diff --git a/mlir/test/Examples/transform-opt/external-decl.mlir b/mlir/test/Examples/transform-opt/external-decl.mlir new file mode 100644 index 0000000000000..5a73735892429 --- /dev/null +++ b/mlir/test/Examples/transform-opt/external-decl.mlir @@ -0,0 +1,18 @@ +// This test just needs to parse. Note that the diagnostic message below will +// be produced in *another* multi-file test, do *not* -verify-diagnostics here. +// RUN: mlir-opt %s + +// RUN: mlir-transform-opt %s --transform-library=%p/external-def.mlir | FileCheck %s + +module attributes {transform.with_named_sequence} { + // The definition should not be printed here. + // CHECK: @external_def + // CHECK-NOT: transform.print + transform.named_sequence private @external_def(%root: !transform.any_op {transform.readonly}) + + transform.named_sequence private @__transform_main(%root: !transform.any_op) { + // expected-error @below {{unresolved external named sequence}} + transform.include @external_def failures(propagate) (%root) : (!transform.any_op) -> () + transform.yield + } +} diff --git a/mlir/test/Examples/transform-opt/external-def.mlir b/mlir/test/Examples/transform-opt/external-def.mlir new file mode 100644 index 0000000000000..9dc4fbbdd6b61 --- /dev/null +++ b/mlir/test/Examples/transform-opt/external-def.mlir @@ -0,0 +1,8 @@ +// RUN: mlir-opt %s + +module attributes {transform.with_named_sequence} { + transform.named_sequence @external_def(%root: !transform.any_op {transform.readonly}) { + transform.print %root { name = "external_def" } : !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Examples/transform-opt/pass.mlir b/mlir/test/Examples/transform-opt/pass.mlir new file mode 100644 index 0000000000000..5c7c8bf1e256d --- /dev/null +++ b/mlir/test/Examples/transform-opt/pass.mlir @@ -0,0 +1,19 
@@ +// RUN: mlir-transform-opt %s | FileCheck %s + +module attributes {transform.with_named_sequence} { + // CHECK-LABEL: @return_42 + // CHECK: %[[C42:.+]] = arith.constant 42 + // CHECK: return %[[C42]] + func.func @return_42() -> i32 { + %0 = arith.constant 21 : i32 + %1 = arith.constant 2 : i32 + %2 = arith.muli %0, %1 : i32 + return %2 : i32 + } + + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %arg1 = transform.apply_registered_pass "canonicalize" to %arg0 : (!transform.any_op) -> !transform.any_op + transform.print %arg1 : !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Examples/transform-opt/self-contained.mlir b/mlir/test/Examples/transform-opt/self-contained.mlir new file mode 100644 index 0000000000000..b9a93af61b8bb --- /dev/null +++ b/mlir/test/Examples/transform-opt/self-contained.mlir @@ -0,0 +1,21 @@ +// RUN: mlir-transform-opt %s | FileCheck %s +// RUN: mlir-transform-opt %s --transform=%s | FileCheck %s +// RUN: mlir-transform-opt %s --transform=%p/external-decl.mlir --verify-diagnostics +// RUN: mlir-transform-opt %s --transform=%p/external-def.mlir --transform-entry-point=external_def | FileCheck %s --check-prefix=EXTERNAL +// RUN: mlir-transform-opt %s --transform=%p/external-decl.mlir --transform-library=%p/external-def.mlir | FileCheck %s --check-prefix=EXTERNAL +// RUN: mlir-transform-opt %s --transform=%p/syntax-error.mlir --verify-diagnostics + +// CHECK: IR printer: in self-contained +// EXTERNAL: IR printer: external_def + +// The first occurrence comes from the print operation and the second is the +// roundtrip output. However, we shouldn't have the symbol duplicated because +// of library merging. 
+// CHECK-COUNT-2: @__transform_main +// CHECK-NOT: @__transform_main +module attributes {transform.with_named_sequence} { + transform.named_sequence private @__transform_main(%root: !transform.any_op) { + transform.print %root { name = "in self-contained" } : !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Examples/transform-opt/syntax-error.mlir b/mlir/test/Examples/transform-opt/syntax-error.mlir new file mode 100644 index 0000000000000..89f1d472fe895 --- /dev/null +++ b/mlir/test/Examples/transform-opt/syntax-error.mlir @@ -0,0 +1,5 @@ +// RUN: mlir-opt %s --verify-diagnostics +// This file is used as additional input. + +// expected-error @below {{expected operation name in quotes}} +module { diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index 904dfb680a040..7636ef30c2d3e 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -161,6 +161,7 @@ def add_runtime(name): ToolSubst("transform-opt-ch2", unresolved="ignore"), ToolSubst("transform-opt-ch3", unresolved="ignore"), ToolSubst("transform-opt-ch4", unresolved="ignore"), + ToolSubst("mlir-transform-opt", unresolved="ignore"), ToolSubst("%mlir_lib_dir", config.mlir_lib_dir, unresolved="ignore"), ToolSubst("%mlir_src_dir", config.mlir_src_root, unresolved="ignore"), ] From 49c399c2d113df1654b09c9b5afa38924829a8fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 28 Feb 2024 07:38:22 +0100 Subject: [PATCH 535/546] [clang][Interp] Toplevel destructors may fail We used to run them, but not check if they failed. If they do, the expression is invalid, even if we already have a result. I do have a suspicion that we need to manually call destroyLocals() in more places (everywhere basically?), but I'll wait with that until I have a reproducer at hand. 
--- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 22 ++++++++++-------- clang/lib/AST/Interp/ByteCodeExprGen.h | 29 ++++++++++++++++-------- clang/lib/AST/Interp/EvalEmitter.cpp | 5 +++- clang/lib/AST/Interp/EvaluationResult.h | 3 ++- clang/test/AST/Interp/cxx20.cpp | 22 ++++++++++++++++-- 5 files changed, 59 insertions(+), 22 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index f193f959d3a6c..b151f8d0d7a79 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -2587,7 +2587,10 @@ bool ByteCodeExprGen::visitExpr(const Expr *E) { if (!this->emitFinishInit(E)) return false; - return this->emitRetValue(E); + // We are destroying the locals AFTER the Ret op. + // The Ret op needs to copy the (alive) values, but the + // destructors may still turn the entire expression invalid. + return this->emitRetValue(E) && RootScope.destroyLocals(); } return false; @@ -3414,14 +3417,15 @@ bool ByteCodeExprGen::emitRecordDestruction(const Record *R) { // Now emit the destructor and recurse into base classes. 
if (const CXXDestructorDecl *Dtor = R->getDestructor(); Dtor && !Dtor->isTrivial()) { - if (const Function *DtorFunc = getFunction(Dtor)) { - assert(DtorFunc->hasThisPointer()); - assert(DtorFunc->getNumParams() == 1); - if (!this->emitDupPtr(SourceInfo{})) - return false; - if (!this->emitCall(DtorFunc, 0, SourceInfo{})) - return false; - } + const Function *DtorFunc = getFunction(Dtor); + if (!DtorFunc) + return false; + assert(DtorFunc->hasThisPointer()); + assert(DtorFunc->getNumParams() == 1); + if (!this->emitDupPtr(SourceInfo{})) + return false; + if (!this->emitCall(DtorFunc, 0, SourceInfo{})) + return false; } for (const Record::Base &Base : llvm::reverse(R->bases())) { diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h index 5b3b533dba387..acbbcc3dc9619 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.h +++ b/clang/lib/AST/Interp/ByteCodeExprGen.h @@ -332,7 +332,7 @@ template class VariableScope { } virtual void emitDestruction() {} - virtual void emitDestructors() {} + virtual bool emitDestructors() { return true; } VariableScope *getParent() const { return Parent; } protected: @@ -356,13 +356,18 @@ template class LocalScope : public VariableScope { } /// Overriden to support explicit destruction. - void emitDestruction() override { + void emitDestruction() override { destroyLocals(); } + + /// Explicit destruction of local variables. 
+ bool destroyLocals() { if (!Idx) - return; - this->emitDestructors(); + return true; + + bool Success = this->emitDestructors(); this->Ctx->emitDestroy(*Idx, SourceInfo{}); removeStoredOpaqueValues(); this->Idx = std::nullopt; + return Success; } void addLocal(const Scope::Local &Local) override { @@ -374,19 +379,25 @@ template class LocalScope : public VariableScope { this->Ctx->Descriptors[*Idx].emplace_back(Local); } - void emitDestructors() override { + bool emitDestructors() override { if (!Idx) - return; + return true; // Emit destructor calls for local variables of record // type with a destructor. for (Scope::Local &Local : this->Ctx->Descriptors[*Idx]) { if (!Local.Desc->isPrimitive() && !Local.Desc->isPrimitiveArray()) { - this->Ctx->emitGetPtrLocal(Local.Offset, SourceInfo{}); - this->Ctx->emitDestruction(Local.Desc); - this->Ctx->emitPopPtr(SourceInfo{}); + if (!this->Ctx->emitGetPtrLocal(Local.Offset, SourceInfo{})) + return false; + + if (!this->Ctx->emitDestruction(Local.Desc)) + return false; + + if (!this->Ctx->emitPopPtr(SourceInfo{})) + return false; removeIfStoredOpaqueValue(Local); } } + return true; } void removeStoredOpaqueValues() { diff --git a/clang/lib/AST/Interp/EvalEmitter.cpp b/clang/lib/AST/Interp/EvalEmitter.cpp index 9cae25f5c4d64..c9c2bf9b145b2 100644 --- a/clang/lib/AST/Interp/EvalEmitter.cpp +++ b/clang/lib/AST/Interp/EvalEmitter.cpp @@ -38,8 +38,11 @@ EvaluationResult EvalEmitter::interpretExpr(const Expr *E, this->ConvertResultToRValue = ConvertResultToRValue; EvalResult.setSource(E); - if (!this->visitExpr(E) && EvalResult.empty()) + if (!this->visitExpr(E)) { + // EvalResult may already have a result set, but something failed + // after that (e.g. evaluating destructors). 
EvalResult.setInvalid(); + } return std::move(this->EvalResult); } diff --git a/clang/lib/AST/Interp/EvaluationResult.h b/clang/lib/AST/Interp/EvaluationResult.h index 28e1ae6ba3e7a..ecf2250074cc9 100644 --- a/clang/lib/AST/Interp/EvaluationResult.h +++ b/clang/lib/AST/Interp/EvaluationResult.h @@ -72,7 +72,8 @@ class EvaluationResult final { Kind = LValue; } void setInvalid() { - assert(empty()); + // We are NOT asserting empty() here, since setting it to invalid + // is allowed even if there is already a result. Kind = Invalid; } void setValid() { diff --git a/clang/test/AST/Interp/cxx20.cpp b/clang/test/AST/Interp/cxx20.cpp index 5c9c625796510..b24b0c8a3ba0e 100644 --- a/clang/test/AST/Interp/cxx20.cpp +++ b/clang/test/AST/Interp/cxx20.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fcxx-exceptions -fexperimental-new-constant-interpreter -std=c++20 -verify %s -// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -verify=ref %s +// RUN: %clang_cc1 -fcxx-exceptions -fexperimental-new-constant-interpreter -std=c++20 -verify=both,expected -fcxx-exceptions %s +// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -verify=both,ref -fcxx-exceptions %s void test_alignas_operand() { alignas(8) char dummy; @@ -799,3 +799,21 @@ void f2() { // access info for unnamed bit-field } } + +namespace FailingDestructor { + struct D { + int n; + bool can_destroy; + + constexpr ~D() { + if (!can_destroy) + throw "oh no"; + } + }; + template + void f() {} // both-note {{invalid explicitly-specified argument}} + + void g() { + f(); // both-error {{no matching function}} + } +} From 9f99eda1208787364b1a381b2d4e146fc4868cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 28 Feb 2024 07:50:44 +0100 Subject: [PATCH 536/546] [clang][Interp][NFC] Convert test to verify=expected,both style --- clang/test/AST/Interp/cxx20.cpp | 156 +++++++++++--------------------- 1 file changed, 52 insertions(+), 104 deletions(-) diff --git a/clang/test/AST/Interp/cxx20.cpp 
b/clang/test/AST/Interp/cxx20.cpp index b24b0c8a3ba0e..2c28e53784c5c 100644 --- a/clang/test/AST/Interp/cxx20.cpp +++ b/clang/test/AST/Interp/cxx20.cpp @@ -58,13 +58,10 @@ static_assert(pointerAssign2() == 12, ""); constexpr int unInitLocal() { int a; - return a; // ref-note {{read of uninitialized object}} \ - // expected-note {{read of uninitialized object}} + return a; // both-note {{read of uninitialized object}} } -static_assert(unInitLocal() == 0, ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'unInitLocal()'}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'unInitLocal()'}} \ +static_assert(unInitLocal() == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'unInitLocal()'}} constexpr int initializedLocal() { int a; @@ -75,25 +72,19 @@ static_assert(initializedLocal() == 20); constexpr int initializedLocal2() { int a[2]; - return *a; // expected-note {{read of uninitialized object is not allowed in a constant expression}} \ - // ref-note {{read of uninitialized object is not allowed in a constant expression}} + return *a; // both-note {{read of uninitialized object is not allowed in a constant expression}} } -static_assert(initializedLocal2() == 20); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} +static_assert(initializedLocal2() == 20); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} struct Int { int a; }; constexpr int initializedLocal3() { Int i; - return i.a; // ref-note {{read of uninitialized object is not allowed in a constant expression}} \ - // expected-note {{read of uninitialized object}} + return i.a; // both-note {{read of uninitialized object is not allowed in a constant expression}} } -static_assert(initializedLocal3() == 20); // expected-error 
{{not an integral constant expression}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} +static_assert(initializedLocal3() == 20); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} @@ -137,22 +128,16 @@ static_assert(!b4); // ref-error {{not an integral constant expression}} \ namespace UninitializedFields { class A { public: - int a; // expected-note 4{{subobject declared here}} \ - // ref-note 4{{subobject declared here}} + int a; // both-note 4{{subobject declared here}} constexpr A() {} }; - constexpr A a; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'a' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'a' is not initialized}} - constexpr A aarr[2]; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'a' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'a' is not initialized}} + constexpr A a; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'a' is not initialized}} + constexpr A aarr[2]; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'a' is not initialized}} class F { public: - int f; // expected-note 3{{subobject declared here}} \ - // ref-note 3{{subobject declared here}} + int f; // both-note 3{{subobject declared here}} constexpr F() {} constexpr F(bool b) { @@ -161,26 +146,19 @@ namespace UninitializedFields { } }; - constexpr F foo[2] = {true}; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'f' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'f' is not initialized}} - constexpr F foo2[3] = {true, false, true}; 
// expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'f' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'f' is not initialized}} - constexpr F foo3[3] = {true, true, F()}; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'f' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'f' is not initialized}} + constexpr F foo[2] = {true}; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'f' is not initialized}} + constexpr F foo2[3] = {true, false, true}; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'f' is not initialized}} + constexpr F foo3[3] = {true, true, F()}; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'f' is not initialized}} class Base { public: bool b; - int a; // expected-note {{subobject declared here}} \ - // ref-note {{subobject declared here}} + int a; // both-note {{subobject declared here}} constexpr Base() : b(true) {} }; @@ -188,56 +166,44 @@ namespace UninitializedFields { public: constexpr Derived() : Base() {} }; - constexpr Derived D; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'a' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'a' is not initialized}} + constexpr Derived D; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'a' is not initialized}} class C2 { public: A a; constexpr C2() {} }; - constexpr C2 c2; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'a' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 
'a' is not initialized}} + constexpr C2 c2; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'a' is not initialized}} class C3 { public: A a[2]; constexpr C3() {} }; - constexpr C3 c3; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'a' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'a' is not initialized}} + constexpr C3 c3; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'a' is not initialized}} class C4 { public: - bool B[2][3]; // expected-note {{subobject declared here}} \ - // ref-note {{subobject declared here}} + bool B[2][3]; // both-note {{subobject declared here}} constexpr C4(){} }; - constexpr C4 c4; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'B' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'B' is not initialized}} + constexpr C4 c4; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'B' is not initialized}} }; namespace ConstThis { class Foo { - const int T = 12; // expected-note {{declared const here}} \ - // ref-note {{declared const here}} + const int T = 12; // both-note {{declared const here}} int a; public: constexpr Foo() { this->a = 10; - T = 13; // expected-error {{cannot assign to non-static data member 'T' with const-qualified type}} \ - // ref-error {{cannot assign to non-static data member 'T' with const-qualified type}} + T = 13; // both-error {{cannot assign to non-static data member 'T' with const-qualified type}} } }; - constexpr Foo F; // expected-error {{must be initialized by a constant expression}} \ - // ref-error {{must be initialized by a constant expression}} + constexpr Foo F; // both-error {{must be initialized by a constant expression}} class FooDtor { @@ 
-264,8 +230,7 @@ namespace ConstThis { constexpr ctor_test() { if (Good) a = 10; - int local = 100 / a; // expected-note {{division by zero}} \ - // ref-note {{division by zero}} + int local = 100 / a; // both-note {{division by zero}} } }; @@ -277,22 +242,17 @@ namespace ConstThis { constexpr ~dtor_test() { if (Good) a = 10; - int local = 100 / a; // expected-note {{division by zero}} \ - // ref-note {{division by zero}} + int local = 100 / a; // both-note {{division by zero}} } }; constexpr ctor_test good_ctor; constexpr dtor_test good_dtor; - constexpr ctor_test bad_ctor; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{in call to}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{in call to}} - constexpr dtor_test bad_dtor; // expected-error {{must have constant destruction}} \ - // expected-note {{in call to}} \ - // ref-error {{must have constant destruction}} \ - // ref-note {{in call to}} + constexpr ctor_test bad_ctor; // both-error {{must be initialized by a constant expression}} \ + // both-note {{in call to}} + constexpr dtor_test bad_dtor; // both-error {{must have constant destruction}} \ + // both-note {{in call to}} }; namespace BaseInit { @@ -311,10 +271,8 @@ namespace BaseInit { }; static_assert(Final{1, 2, 3}.c == 3, ""); // OK - static_assert(Final{1, 2, 3}.a == 0, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{read of uninitialized object}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{read of uninitialized object}} + static_assert(Final{1, 2, 3}.a == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{read of uninitialized object}} struct Mixin { @@ -333,10 +291,8 @@ namespace BaseInit { static_assert(Final2{1, 2, 3}.c == 3, ""); // OK static_assert(Final2{1, 2, 3}.b == 2, ""); // OK - static_assert(Final2{1, 2, 3}.a == 0, ""); // expected-error {{not an integral constant 
expression}} \ - // expected-note {{read of uninitialized object}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{read of uninitialized object}} + static_assert(Final2{1, 2, 3}.a == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{read of uninitialized object}} struct Mixin3 { @@ -352,10 +308,8 @@ namespace BaseInit { static_assert(Final3{1, 2, 3}.c == 3, ""); // OK static_assert(Final3{1, 2, 3}.b == 2, ""); // OK - static_assert(Final3{1, 2, 3}.a == 0, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{read of uninitialized object}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{read of uninitialized object}} + static_assert(Final3{1, 2, 3}.a == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{read of uninitialized object}} }; namespace Destructors { @@ -633,16 +587,13 @@ namespace ImplicitFunction { /// The operator= call here will fail and the diagnostics should be fine. 
b = a; // ref-note {{subobject 'a' is not initialized}} \ - // ref-note {{in call to}} \ // expected-note {{read of uninitialized object}} \ - // expected-note {{in call to}} + // both-note {{in call to}} return 1; } - static_assert(callMe() == 1, ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'callMe()'}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'callMe()'}} + static_assert(callMe() == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'callMe()'}} } /// FIXME: Unfortunately, the similar tests in test/SemaCXX/{compare-cxx2a.cpp use member pointers, @@ -680,8 +631,7 @@ namespace ThreeWayCmp { static_assert(1.0 <=> 2.f == -1, ""); static_assert(1.0 <=> 1.0 == 0, ""); static_assert(2.0 <=> 1.0 == 1, ""); - constexpr int k = (1 <=> 1, 0); // expected-warning {{comparison result unused}} \ - // ref-warning {{comparison result unused}} + constexpr int k = (1 <=> 1, 0); // both-warning {{comparison result unused}} static_assert(k== 0, ""); /// Pointers. 
@@ -690,10 +640,8 @@ namespace ThreeWayCmp { constexpr const int *pa1 = &a[1]; constexpr const int *pa2 = &a[2]; constexpr const int *pb1 = &b[1]; - static_assert(pa1 <=> pb1 != 0, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{has unspecified value}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{has unspecified value}} + static_assert(pa1 <=> pb1 != 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{has unspecified value}} \ static_assert(pa1 <=> pa1 == 0, ""); static_assert(pa1 <=> pa2 == -1, ""); static_assert(pa2 <=> pa1 == 1, ""); From e59ed6683470244118d67a0366e9e90fef1c8b7d Mon Sep 17 00:00:00 2001 From: Sven van Haastregt Date: Fri, 23 Feb 2024 06:50:14 +0100 Subject: [PATCH 537/546] Include `LLVMSPIRVLib.h` in `SPIRVError.cpp` (#2377) The prototype for `getErrorMessage()` is declared in `LLVMSPIRVLib.h`, but `SPIRVError.cpp` defining that function did not include `LLVMSPIRVLib.h`. This can be problematic for builds that use `-fvisibility=hidden`. 
Fixes https://github.com/KhronosGroup/SPIRV-LLVM-Translator/issues/2376 Original commit: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/81f78d24db36622 --- llvm-spirv/lib/SPIRV/libSPIRV/SPIRVError.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVError.cpp b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVError.cpp index 44c63128b61ae..9c40039b6ebd6 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVError.cpp +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVError.cpp @@ -36,6 +36,8 @@ // //===----------------------------------------------------------------------===// +#include "LLVMSPIRVLib.h" + #include "SPIRVError.h" #include From 6fb3494be92037eefae3d73fd9f258cba4aec125 Mon Sep 17 00:00:00 2001 From: bwlodarcz Date: Mon, 26 Feb 2024 17:03:11 +0100 Subject: [PATCH 538/546] Struct as result type in OpGroupNonUniformShuffleDown translation fix (#2339) The contract between some frontends and Translator is that calls to functions which are matching to hardcoded name are converted to specific Ops in SPIR-V. The translation from LLVM to SPIR-V of OpGroupNonUnfiromShuffleDown call was done incorrectly. The struct member which is correct target for this op was left unwrapped which resulted in SPIR-V nonconforming with specification. The result of this were later problems with e.g. SPIR-V to LLVM translation. 
Original commit: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/6b3ec4166a63039 --- llvm-spirv/lib/SPIRV/SPIRVWriter.cpp | 23 ++++++++++ .../test/group_non_uniform_shuffle_down.ll | 44 +++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 llvm-spirv/test/group_non_uniform_shuffle_down.ll diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp index cf05c8ea2b7e1..3403c458c19da 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp @@ -6411,6 +6411,29 @@ LLVMToSPIRVBase::transBuiltinToInstWithoutDecoration(Op OC, CallInst *CI, return BM->addCompositeConstructInst(transType(CI->getType()), Operands, BB); } + case OpGroupNonUniformShuffleDown: { + Function *F = CI->getCalledFunction(); + if (F->arg_size() && F->getArg(0)->hasStructRetAttr()) { + StructType *St = cast(F->getParamStructRetType(0)); + assert(isSYCLHalfType(St) || isSYCLBfloat16Type(St)); + SPIRVValue *InValue = + transValue(CI->getArgOperand(0)->stripPointerCasts(), BB); + SPIRVId ScopeId = transValue(CI->getArgOperand(1), BB)->getId(); + SPIRVValue *Delta = transValue(CI->getArgOperand(3), BB); + SPIRVValue *Composite0 = BM->addLoadInst(InValue, {}, BB); + Type *MemberTy = St->getElementType(0); + SPIRVType *ElementTy = transType(MemberTy); + SPIRVValue *Element0 = + BM->addCompositeExtractInst(ElementTy, Composite0, {0}, BB); + SPIRVValue *Src = + BM->addGroupInst(OpGroupNonUniformShuffleDown, ElementTy, + static_cast(ScopeId), {Element0, Delta}, BB); + SPIRVValue *Composite1 = + BM->addCompositeInsertInst(Src, Composite0, {0}, BB); + return BM->addStoreInst(InValue, Composite1, {}, BB); + } + [[fallthrough]]; + } default: { if (isCvtOpCode(OC) && OC != OpGenericCastToPtrExplicit) { return BM->addUnaryInst(OC, transScavengedType(CI), diff --git a/llvm-spirv/test/group_non_uniform_shuffle_down.ll b/llvm-spirv/test/group_non_uniform_shuffle_down.ll new file mode 100644 index 
0000000000000..27bd3d98ef9f9 --- /dev/null +++ b/llvm-spirv/test/group_non_uniform_shuffle_down.ll @@ -0,0 +1,44 @@ +; RUN: llvm-as %s -o %t.bc +; RUN: llvm-spirv %t.bc -spirv-text -o - | FileCheck --check-prefix CHECK-SPIRV %s +; RUN: llvm-spirv %t.bc -o %t.spv +; RUN: spirv-val %t.spv +; RUN: llvm-spirv -r %t.spv -o %t.rev.bc +; RUN: llvm-dis %t.rev.bc -o - | FileCheck --check-prefix CHECK-LLVM %s + +; CHECK-SPIRV-DAG: TypeInt [[#I32:]] 32 0 +; CHECK-SPIRV-DAG: Constant [[#I32]] [[#CONST_I32_3:]] 3 +; CHECK-SPIRV-DAG: Constant [[#I32]] [[#CONST_I32_8:]] 8 +; CHECK-SPIRV-DAG: TypeFloat [[#HALF:]] 16 +; CHECK-SPIRV-DAG: TypeStruct [[#S_HALF:]] [[#HALF]] +; CHECK-SPIRV-DAG: TypePointer [[#PTR_S_HALF:]] {{[0-9]+}} [[#S_HALF]] + +target triple = "spir64-unknown-unknown" + +%"class.sycl::_V1::detail::half_impl::half" = type { half } + +define spir_func void @test_group_non_uniform_shuffle_down() { +entry: + %agg.tmp.i.i = alloca %"class.sycl::_V1::detail::half_impl::half", align 2 + %ref.tmp.i = alloca %"class.sycl::_V1::detail::half_impl::half", align 2 + %ref.tmp.ascast.i = addrspacecast ptr %ref.tmp.i to ptr addrspace(4) + call spir_func void @_Z34__spirv_GroupNonUniformShuffleDownIN4sycl3_V16detail9half_impl4halfEET_N5__spv5Scope4FlagES5_j(ptr addrspace(4) dead_on_unwind writable sret(%"class.sycl::_V1::detail::half_impl::half") align 2 %ref.tmp.ascast.i, i32 noundef 3, ptr noundef nonnull byval(%"class.sycl::_V1::detail::half_impl::half") align 2 %agg.tmp.i.i, i32 noundef 8) + ret void +} + +; CHECK-SPIRV: Variable {{[0-9]+}} {{[0-9]+}} +; CHECK-SPIRV: Variable [[#PTR_S_HALF]] [[#VAR_0:]] +; CHECK-SPIRV: Load [[#S_HALF]] [[#COMP_0:]] [[#VAR_0]] +; CHECK-SPIRV: CompositeExtract [[#HALF]] [[#ELEM_0:]] [[#COMP_0]] 0 +; CHECK-SPIRV: GroupNonUniformShuffleDown [[#HALF]] [[#ELEM_1:]] [[#CONST_I32_3]] [[#ELEM_0]] [[#CONST_I32_8]] +; CHECK-SPIRV: CompositeInsert [[#S_HALF]] [[#COMP_1:]] [[#ELEM_1]] [[#COMP_0]] 0 +; CHECK-SPIRV: Store [[#VAR_0]] [[#COMP_1]] + +; 
CHECK-LLVM: [[ALLOCA_0:%[a-z0-9.]+]] = alloca %"class.sycl::_V1::detail::half_impl::half", align 2 +; CHECK-LLVM: [[ALLOCA_1:%[a-z0-9.]+]] = alloca %"class.sycl::_V1::detail::half_impl::half", align 2 +; CHECK-LLVM: [[LOAD_0:%[a-z0-9.]+]] = load %"class.sycl::_V1::detail::half_impl::half", ptr [[ALLOCA_1]], align 2 +; CHECK-LLVM: [[EXTRACT_0:%[a-z0-9.]+]] = extractvalue %"class.sycl::_V1::detail::half_impl::half" [[LOAD_0]], 0 +; CHECK-LLVM: [[CALL_0:%[a-z0-9.]+]] = call spir_func half @_Z22sub_group_shuffle_downDhj(half [[EXTRACT_0]], i32 8) #0 +; CHECK-LLVM: [[INSERT_0:%[a-z0-9.]+]] = insertvalue %"class.sycl::_V1::detail::half_impl::half" [[LOAD_0]], half [[CALL_0]], 0 +; CHECK-LLVM: store %"class.sycl::_V1::detail::half_impl::half" [[INSERT_0]], ptr [[ALLOCA_1]], align 2 + +declare dso_local spir_func void @_Z34__spirv_GroupNonUniformShuffleDownIN4sycl3_V16detail9half_impl4halfEET_N5__spv5Scope4FlagES5_j(ptr addrspace(4) dead_on_unwind writable sret(%"class.sycl::_V1::detail::half_impl::half") align 2, i32 noundef, ptr noundef byval(%"class.sycl::_V1::detail::half_impl::half") align 2, i32 noundef) local_unnamed_addr \ No newline at end of file From 3d89fc6cfd1627cf81be8c57b86f2c92e036bed7 Mon Sep 17 00:00:00 2001 From: Viktoria Maximova Date: Mon, 26 Feb 2024 18:37:25 +0100 Subject: [PATCH 539/546] Support SPV_INTEL_maximum_registers extension (#2344) Spec: KhronosGroup/SPIRV-Registry#235 Original commit: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/92ea64b77207d28 --- llvm-spirv/include/LLVMSPIRVExtensions.inc | 1 + llvm-spirv/lib/SPIRV/SPIRVReader.cpp | 44 ++++++++++ llvm-spirv/lib/SPIRV/SPIRVWriter.cpp | 37 +++++++- llvm-spirv/lib/SPIRV/SPIRVWriter.h | 1 + llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.cpp | 3 + llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h | 19 ++++- llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h | 6 ++ .../lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h | 3 + .../lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h | 9 ++ 
llvm-spirv/lib/SPIRV/libSPIRV/SPIRVStream.cpp | 1 + llvm-spirv/lib/SPIRV/libSPIRV/SPIRVStream.h | 1 + .../lib/SPIRV/libSPIRV/spirv_internal.hpp | 20 ++++- .../registerallocmode_maxreg_extension.ll | 85 +++++++++++++++++++ 13 files changed, 226 insertions(+), 4 deletions(-) create mode 100644 llvm-spirv/test/extensions/INTEL/SPV_INTEL_maximum_registers/registerallocmode_maxreg_extension.ll diff --git a/llvm-spirv/include/LLVMSPIRVExtensions.inc b/llvm-spirv/include/LLVMSPIRVExtensions.inc index a96941f0b0544..c9af495691090 100644 --- a/llvm-spirv/include/LLVMSPIRVExtensions.inc +++ b/llvm-spirv/include/LLVMSPIRVExtensions.inc @@ -71,3 +71,4 @@ EXT(SPV_INTEL_fp_max_error) EXT(SPV_INTEL_cache_controls) EXT(SPV_INTEL_subgroup_requirements) EXT(SPV_INTEL_task_sequence) +EXT(SPV_INTEL_maximum_registers) diff --git a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp index a8cf517e39594..486af805bf0b6 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp @@ -4234,6 +4234,50 @@ bool SPIRVToLLVM::transMetadata() { F->setMetadata(kSPIR2MD::IntelFPGAIPInterface, MDNode::get(*Context, InterfaceMDVec)); } + if (auto *EM = BF->getExecutionMode( + internal::ExecutionModeMaximumRegistersINTEL)) { + NamedMDNode *ExecModeMD = + M->getOrInsertNamedMetadata(kSPIRVMD::ExecutionMode); + + SmallVector ValueVec; + ValueVec.push_back(ConstantAsMetadata::get(F)); + ValueVec.push_back( + ConstantAsMetadata::get(getUInt32(M, EM->getExecutionMode()))); + ValueVec.push_back( + ConstantAsMetadata::get(getUInt32(M, EM->getLiterals()[0]))); + ExecModeMD->addOperand(MDNode::get(*Context, ValueVec)); + } + if (auto *EM = BF->getExecutionMode( + internal::ExecutionModeMaximumRegistersIdINTEL)) { + NamedMDNode *ExecModeMD = + M->getOrInsertNamedMetadata(kSPIRVMD::ExecutionMode); + + SmallVector ValueVec; + ValueVec.push_back(ConstantAsMetadata::get(F)); + ValueVec.push_back( + ConstantAsMetadata::get(getUInt32(M, 
EM->getExecutionMode()))); + + auto *ExecOp = BF->getModule()->getValue(EM->getLiterals()[0]); + ValueVec.push_back( + MDNode::get(*Context, ConstantAsMetadata::get(cast( + transValue(ExecOp, nullptr, nullptr))))); + ExecModeMD->addOperand(MDNode::get(*Context, ValueVec)); + } + if (auto *EM = BF->getExecutionMode( + internal::ExecutionModeNamedMaximumRegistersINTEL)) { + NamedMDNode *ExecModeMD = + M->getOrInsertNamedMetadata(kSPIRVMD::ExecutionMode); + + SmallVector ValueVec; + ValueVec.push_back(ConstantAsMetadata::get(F)); + ValueVec.push_back( + ConstantAsMetadata::get(getUInt32(M, EM->getExecutionMode()))); + + assert(EM->getLiterals()[0] == 0 && + "Invalid named maximum number of registers"); + ValueVec.push_back(MDString::get(*Context, "AutoINTEL")); + ExecModeMD->addOperand(MDNode::get(*Context, ValueVec)); + } } NamedMDNode *MemoryModelMD = M->getOrInsertNamedMetadata(kSPIRVMD::MemoryModel); diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp index 3403c458c19da..1549ba73fb542 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp @@ -981,7 +981,10 @@ SPIRVFunction *LLVMToSPIRVBase::transFunctionDecl(Function *F) { transFPGAFunctionMetadata(BF, F); - transFunctionMetadataAsUserSemanticDecoration(BF, F); + if (BM->isAllowedToUseExtension(ExtensionID::SPV_INTEL_maximum_registers)) + transFunctionMetadataAsExecutionMode(BF, F); + else + transFunctionMetadataAsUserSemanticDecoration(BF, F); transAuxDataInst(BF, F); @@ -1136,6 +1139,38 @@ void LLVMToSPIRVBase::transFPGAFunctionMetadata(SPIRVFunction *BF, transMetadataDecorations(FDecoMD, BF); } +void LLVMToSPIRVBase::transFunctionMetadataAsExecutionMode(SPIRVFunction *BF, + Function *F) { + SmallVector RegisterAllocModeMDs; + F->getMetadata("RegisterAllocMode", RegisterAllocModeMDs); + + for (unsigned I = 0; I < RegisterAllocModeMDs.size(); I++) { + auto *RegisterAllocMode = RegisterAllocModeMDs[I]->getOperand(0).get(); + if 
(isa(RegisterAllocMode)) { + StringRef Str = getMDOperandAsString(RegisterAllocModeMDs[I], 0); + internal::InternalNamedMaximumNumberOfRegisters NamedValue = + SPIRVNamedMaximumNumberOfRegistersNameMap::rmap(Str.str()); + BF->addExecutionMode(BM->add(new SPIRVExecutionMode( + OpExecutionMode, BF, + internal::ExecutionModeNamedMaximumRegistersINTEL, NamedValue))); + } else if (isa(RegisterAllocMode)) { + auto *RegisterAllocNodeMDOp = + getMDOperandAsMDNode(RegisterAllocModeMDs[I], 0); + int Num = getMDOperandAsInt(RegisterAllocNodeMDOp, 0); + auto *Const = + BM->addConstant(transType(Type::getInt32Ty(F->getContext())), Num); + BF->addExecutionMode(BM->add(new SPIRVExecutionModeId( + BF, internal::ExecutionModeMaximumRegistersIdINTEL, Const->getId()))); + } else { + int64_t RegisterAllocVal = + mdconst::dyn_extract(RegisterAllocMode)->getZExtValue(); + BF->addExecutionMode(BM->add(new SPIRVExecutionMode( + OpExecutionMode, BF, internal::ExecutionModeMaximumRegistersINTEL, + RegisterAllocVal))); + } + } +} + void LLVMToSPIRVBase::transFunctionMetadataAsUserSemanticDecoration( SPIRVFunction *BF, Function *F) { if (auto *RegisterAllocModeMD = F->getMetadata("RegisterAllocMode")) { diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.h b/llvm-spirv/lib/SPIRV/SPIRVWriter.h index 0978c3f3a6caa..1ca30ff756075 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVWriter.h +++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.h @@ -131,6 +131,7 @@ class LLVMToSPIRVBase : protected BuiltinCallHelper { SPIRVFunction *transFunctionDecl(Function *F); void transVectorComputeMetadata(Function *F); void transFPGAFunctionMetadata(SPIRVFunction *BF, Function *F); + void transFunctionMetadataAsExecutionMode(SPIRVFunction *BF, Function *F); void transFunctionMetadataAsUserSemanticDecoration(SPIRVFunction *BF, Function *F); void transAuxDataInst(SPIRVFunction *BF, Function *F); diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.cpp b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.cpp index 470061d903586..8a60be3951724 100644 
--- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.cpp +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.cpp @@ -661,6 +661,9 @@ void SPIRVExecutionMode::decode(std::istream &I) { case ExecutionModeRegisterMapInterfaceINTEL: case ExecutionModeStreamingInterfaceINTEL: case spv::internal::ExecutionModeNamedSubgroupSizeINTEL: + case internal::ExecutionModeMaximumRegistersINTEL: + case internal::ExecutionModeMaximumRegistersIdINTEL: + case internal::ExecutionModeNamedMaximumRegistersINTEL: WordLiterals.resize(1); break; default: diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h index 78127500ab851..22be73bdf3adc 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEntry.h @@ -696,6 +696,17 @@ class SPIRVExecutionMode : public SPIRVAnnotation { } } + std::optional getRequiredExtension() const override { + switch (static_cast(ExecMode)) { + case internal::ExecutionModeMaximumRegistersINTEL: + case internal::ExecutionModeMaximumRegistersIdINTEL: + case internal::ExecutionModeNamedMaximumRegistersINTEL: + return ExtensionID::SPV_INTEL_maximum_registers; + default: + return {}; + } + } + protected: _SPIRV_DCL_ENCDEC SPIRVExecutionModeKind ExecMode; @@ -757,6 +768,11 @@ class SPIRVComponentExecutionModes { return IsDenorm(EMK) || IsRoundingMode(EMK) || IsFPMode(EMK) || IsOtherFP(EMK); }; + auto IsMaxRegisters = [&](auto EMK) { + return EMK == internal::ExecutionModeMaximumRegistersINTEL || + EMK == internal::ExecutionModeMaximumRegistersIdINTEL || + EMK == internal::ExecutionModeNamedMaximumRegistersINTEL; + }; auto IsCompatible = [&](SPIRVExecutionMode *EM0, SPIRVExecutionMode *EM1) { if (EM0->getTargetId() != EM1->getTargetId()) return true; @@ -770,7 +786,8 @@ class SPIRVComponentExecutionModes { return true; return !(IsDenorm(EMK0) && IsDenorm(EMK1)) && !(IsRoundingMode(EMK0) && IsRoundingMode(EMK1)) && - !(IsFPMode(EMK0) && IsFPMode(EMK1)); + !(IsFPMode(EMK0) && IsFPMode(EMK1)) && + 
!(IsMaxRegisters(EMK0) && IsMaxRegisters(EMK1)); }; for (auto I = ExecModes.begin(); I != ExecModes.end(); ++I) { assert(IsCompatible(ExecMode, (*I).second) && diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h index 9a2c3bfce6eac..314b79fa16cef 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h @@ -293,6 +293,12 @@ template <> inline void SPIRVMap::init() { {CapabilityVectorComputeINTEL}); ADD_VEC_INIT(internal::ExecutionModeNamedSubgroupSizeINTEL, {internal::CapabilitySubgroupRequirementsINTEL}); + ADD_VEC_INIT(internal::ExecutionModeMaximumRegistersINTEL, + {internal::CapabilityRegisterLimitsINTEL}); + ADD_VEC_INIT(internal::ExecutionModeMaximumRegistersIdINTEL, + {internal::CapabilityRegisterLimitsINTEL}); + ADD_VEC_INIT(internal::ExecutionModeNamedMaximumRegistersINTEL, + {internal::CapabilityRegisterLimitsINTEL}); } template <> inline void SPIRVMap::init() { diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h index 080b8bcd868ff..0b9114bd34fa6 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h @@ -73,6 +73,9 @@ inline bool isValid(spv::ExecutionModel V) { case ExecutionModelCallableKHR: case ExecutionModeRegisterMapInterfaceINTEL: case ExecutionModeStreamingInterfaceINTEL: + case internal::ExecutionModeMaximumRegistersINTEL: + case internal::ExecutionModeMaximumRegistersIdINTEL: + case internal::ExecutionModeNamedMaximumRegistersINTEL: return true; default: return false; diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h index ff704c63b3696..4b2d4c7039a36 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h @@ -688,6 +688,7 @@ template <> inline void SPIRVMap::init() { 
add(internal::CapabilitySubgroupRequirementsINTEL, "SubgroupRequirementsINTEL"); add(internal::CapabilityTaskSequenceINTEL, "TaskSequenceINTEL"); + add(internal::CapabilityRegisterLimitsINTEL, "RegisterLimitsINTEL"); } SPIRV_DEF_NAMEMAP(Capability, SPIRVCapabilityNameMap) @@ -711,6 +712,14 @@ template <> inline void SPIRVMap::init() { } SPIRV_DEF_NAMEMAP(HostAccessQualifier, SPIRVHostAccessQualifierNameMap) +template <> +inline void +SPIRVMap::init() { + add(internal::NamedMaximumNumberOfRegistersAutoINTEL, "AutoINTEL"); +} +SPIRV_DEF_NAMEMAP(internal::InternalNamedMaximumNumberOfRegisters, + SPIRVNamedMaximumNumberOfRegistersNameMap); + } /* namespace SPIRV */ #endif // SPIRV_LIBSPIRV_SPIRVNAMEMAPENUM_H diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVStream.cpp b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVStream.cpp index 7b785b5b55057..b13220ae32e0b 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVStream.cpp +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVStream.cpp @@ -147,6 +147,7 @@ SPIRV_DEF_ENCDEC(SPIRVDebugExtOpKind) SPIRV_DEF_ENCDEC(NonSemanticAuxDataOpKind) SPIRV_DEF_ENCDEC(InitializationModeQualifier) SPIRV_DEF_ENCDEC(HostAccessQualifier) +SPIRV_DEF_ENCDEC(internal::InternalNamedMaximumNumberOfRegisters) SPIRV_DEF_ENCDEC(LinkageType) // Read a string with padded 0's at the end so that they form a stream of diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVStream.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVStream.h index 0a788044a642c..478692ccdccaf 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVStream.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVStream.h @@ -231,6 +231,7 @@ SPIRV_DEC_ENCDEC(SPIRVDebugExtOpKind) SPIRV_DEC_ENCDEC(NonSemanticAuxDataOpKind) SPIRV_DEC_ENCDEC(InitializationModeQualifier) SPIRV_DEC_ENCDEC(HostAccessQualifier) +SPIRV_DEC_ENCDEC(internal::InternalNamedMaximumNumberOfRegisters) SPIRV_DEC_ENCDEC(LinkageType) const SPIRVEncoder &operator<<(const SPIRVEncoder &O, const std::string &Str); diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/spirv_internal.hpp 
b/llvm-spirv/lib/SPIRV/libSPIRV/spirv_internal.hpp index 3b686d32cc30b..5aa9b39253167 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/spirv_internal.hpp +++ b/llvm-spirv/lib/SPIRV/libSPIRV/spirv_internal.hpp @@ -124,14 +124,18 @@ enum InternalCapability { ICapabilityJointMatrixPackedInt2ComponentTypeINTEL = 6438, ICapabilityJointMatrixPackedInt4ComponentTypeINTEL = 6439, ICapabilityCacheControlsINTEL = 6441, - ICapabilitySubgroupRequirementsINTEL = 6445 + ICapabilitySubgroupRequirementsINTEL = 6445, + ICapRegisterLimitsINTEL = 6460 }; enum InternalFunctionControlMask { IFunctionControlOptNoneINTELMask = 0x10000 }; enum InternalExecutionMode { IExecModeFastCompositeKernelINTEL = 6088, - IExecModeNamedSubgroupSizeINTEL = 6446 + IExecModeNamedSubgroupSizeINTEL = 6446, + IExecModeMaximumRegistersINTEL = 6461, + IExecModeMaximumRegistersIdINTEL = 6462, + IExecModeNamedMaximumRegistersINTEL = 6463 }; constexpr LinkageType LinkageTypeInternal = @@ -159,6 +163,10 @@ enum InternalBuiltIn { IBuiltInGlobalHWThreadIDINTEL = 6136, }; +enum InternalNamedMaximumNumberOfRegisters { + NamedMaximumNumberOfRegistersAutoINTEL = 0, +}; + #define _SPIRV_OP(x, y) constexpr x x##y = static_cast(I##x##y); _SPIRV_OP(Capability, JointMatrixINTEL) _SPIRV_OP(Capability, JointMatrixWIInstructionsINTEL) @@ -292,12 +300,20 @@ constexpr Capability CapabilityBfloat16ConversionINTEL = static_cast(ICapBfloat16ConversionINTEL); constexpr Capability CapabilityGlobalVariableDecorationsINTEL = static_cast(ICapGlobalVariableDecorationsINTEL); +constexpr Capability CapabilityRegisterLimitsINTEL = + static_cast(ICapRegisterLimitsINTEL); constexpr FunctionControlMask FunctionControlOptNoneINTELMask = static_cast(IFunctionControlOptNoneINTELMask); constexpr ExecutionMode ExecutionModeFastCompositeKernelINTEL = static_cast(IExecModeFastCompositeKernelINTEL); +constexpr ExecutionMode ExecutionModeMaximumRegistersINTEL = + static_cast(IExecModeMaximumRegistersINTEL); +constexpr ExecutionMode 
ExecutionModeMaximumRegistersIdINTEL = + static_cast(IExecModeMaximumRegistersIdINTEL); +constexpr ExecutionMode ExecutionModeNamedMaximumRegistersINTEL = + static_cast(IExecModeNamedMaximumRegistersINTEL); constexpr ExecutionMode ExecutionModeNamedSubgroupSizeINTEL = static_cast(IExecModeNamedSubgroupSizeINTEL); diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_maximum_registers/registerallocmode_maxreg_extension.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_maximum_registers/registerallocmode_maxreg_extension.ll new file mode 100644 index 0000000000000..1dfc768ffebe0 --- /dev/null +++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_maximum_registers/registerallocmode_maxreg_extension.ll @@ -0,0 +1,85 @@ +; RUN: llvm-as %s -o %t.bc +; RUN: llvm-spirv -spirv-text --spirv-ext=+SPV_INTEL_maximum_registers %t.bc +; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV +; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_maximum_registers -o %t.spv +; RUN: llvm-spirv -r %t.spv -spirv-target-env=SPV-IR -o - | llvm-dis -o %t.rev.ll +; RUN: FileCheck < %t.rev.ll %s --check-prefix=CHECK-LLVM + +; CHECK-SPIRV: EntryPoint [[#]] [[#FUNC0:]] "main_l3" +; CHECK-SPIRV: EntryPoint [[#]] [[#FUNC1:]] "main_l6" +; CHECK-SPIRV: EntryPoint [[#]] [[#FUNC2:]] "main_l9" +; CHECK-SPIRV: EntryPoint [[#]] [[#FUNC3:]] "main_l13" +; CHECK-SPIRV: EntryPoint [[#]] [[#FUNC4:]] "main_l19" + +; CHECK-SPIRV: ExecutionMode [[#FUNC0]] 6461 2 +; CHECK-SPIRV: ExecutionMode [[#FUNC1]] 6461 1 +; CHECK-SPIRV: ExecutionMode [[#FUNC2]] 6463 0 +; CHECK-SPIRV: ExecutionModeId [[#FUNC3]] 6462 [[#Const3:]] +; CHECK-SPIRV: TypeInt [[#TypeInt:]] 32 0 +; CHECK-SPIRV: Constant [[#TypeInt]] [[#Const3]] 3 + +; CHECK-SPIRV-NOT: ExecutionMode [[#FUNC4]] + +; CHECK-LLVM: !spirv.ExecutionMode = !{![[#FLAG0:]], ![[#FLAG1:]], ![[#FLAG2:]], ![[#FLAG3:]]} +; CHECK-LLVM: ![[#FLAG0]] = !{ptr @main_l3, i32 6461, i32 2} +; CHECK-LLVM: ![[#FLAG1]] = !{ptr @main_l6, i32 6461, i32 1} +; CHECK-LLVM: ![[#FLAG2]] = !{ptr @main_l9, i32 
6463, !"AutoINTEL"} +; CHECK-LLVM: ![[#FLAG3]] = !{ptr @main_l13, i32 6462, ![[#VAL:]]} +; CHECK-LLVM: ![[#VAL]] = !{i32 3} + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64" + +; Function Attrs: noinline nounwind optnone +define weak dso_local spir_kernel void @main_l3() #0 !RegisterAllocMode !10 { +newFuncRoot: + ret void +} + +; Function Attrs: noinline nounwind optnone +define weak dso_local spir_kernel void @main_l6() #0 !RegisterAllocMode !11 { +newFuncRoot: + ret void +} + +; Function Attrs: noinline nounwind optnone +define weak dso_local spir_kernel void @main_l9() #0 !RegisterAllocMode !12 { +newFuncRoot: + ret void +} + +; Function Attrs: noinline nounwind optnone +define weak dso_local spir_kernel void @main_l13() #0 !RegisterAllocMode !13 { +newFuncRoot: + ret void +} + +; Function Attrs: noinline nounwind optnone +define weak dso_local spir_kernel void @main_l19() #0 { +newFuncRoot: + ret void +} + +attributes #0 = { noinline nounwind optnone } + + +!opencl.compiler.options = !{!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0} +!spirv.Source = !{!2, !3, !3, !3, !3, !3, !2, !3, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2, !2} +!llvm.module.flags = !{!4, !5, !6, !7, !8} +!spirv.MemoryModel = !{!9, !9, !9, !9, !9, !9} +!spirv.ExecutionMode = !{} + +!0 = !{} +!2 = !{i32 4, i32 200000} +!3 = !{i32 3, i32 200000} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 7, !"openmp", i32 50} +!6 = !{i32 7, !"openmp-device", i32 50} +!7 = !{i32 8, !"PIC Level", i32 2} +!8 = !{i32 7, !"frame-pointer", i32 2} +!9 = !{i32 2, i32 2} +!10 = !{i32 2} +!11 = !{i32 1} +!12 = !{!"AutoINTEL"} +!13 = !{!14} +!14 = !{i32 3} From 1d53bf42604a6dd6f5364746f878e1e6ec4288e5 Mon Sep 17 00:00:00 2001 From: Sven van Haastregt Date: Tue, 27 Feb 2024 14:57:40 +0100 Subject: [PATCH 540/546] Fix OpGroupNonUniformBroadcast version requirement (#2378) 
For SPIR-V 1.4 and earlier, the Id operand of OpGroupNonUniformBroadcast must come from a constant instruction. Ensure that any emitted SPIR-V module using a non-constant Id operand declares SPIR-V version 1.5. Bail out if the maximum SPIR-V version was restricted to < 1.5, to avoid producing invalid SPIR-V. Original commit: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/64fe75e388dc3b7 --- .../lib/SPIRV/libSPIRV/SPIRVInstruction.h | 28 +++++++++++++++++++ .../test/transcoding/subgroup_spirv_1_5.cl | 17 +++++++++++ 2 files changed, 45 insertions(+) create mode 100644 llvm-spirv/test/transcoding/subgroup_spirv_1_5.cl diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h index 2b1887682f458..b387745e24703 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h @@ -276,6 +276,9 @@ class SPIRVInstTemplateBase : public SPIRVInstruction { } else SPIRVEntry::setWordCount(WC); Ops = TheOps; + // The required SPIR-V version depends on the operands for some + // instructions. + updateModuleVersion(); } void setWordCount(SPIRVWord TheWordCount) override { SPIRVEntry::setWordCount(TheWordCount); @@ -295,6 +298,11 @@ class SPIRVInstTemplateBase : public SPIRVInstruction { /// Get operand as value. /// If the operand is a literal, return it as a uint32 constant. + const SPIRVValue *getOpValue(int I) const { + return isOperandLiteral(I) ? Module->getLiteralAsConstant(Ops[I]) + : getValue(Ops[I]); + } + SPIRVValue *getOpValue(int I) { return isOperandLiteral(I) ?
Module->getLiteralAsConstant(Ops[I]) : getValue(Ops[I]); @@ -315,6 +323,10 @@ class SPIRVInstTemplateBase : public SPIRVInstruction { return Operands; } + virtual const SPIRVValue *getOperand(unsigned I) const { + return getOpValue(I); + } + virtual SPIRVValue *getOperand(unsigned I) { return getOpValue(I); } @@ -2503,6 +2515,22 @@ class SPIRVGroupNonUniformBallotInst : public SPIRVInstTemplateBase { SPIRVCapVec getRequiredCapability() const override { return getVec(CapabilityGroupNonUniformBallot); } + + SPIRVWord getRequiredSPIRVVersion() const override { + switch (OpCode) { + case OpGroupNonUniformBroadcast: { + assert(Ops.size() == 3 && "Expecting (Execution, Value, Id) operands"); + if (!isConstantOpCode(getOperand(2)->getOpCode())) { + // Before version 1.5, Id must come from a constant instruction. + return static_cast(VersionNumber::SPIRV_1_5); + } + break; + } + default: + break; + } + return static_cast(VersionNumber::SPIRV_1_3); + } }; #define _SPIRV_OP(x, ...) \ diff --git a/llvm-spirv/test/transcoding/subgroup_spirv_1_5.cl b/llvm-spirv/test/transcoding/subgroup_spirv_1_5.cl new file mode 100644 index 0000000000000..be7dbf59a49d6 --- /dev/null +++ b/llvm-spirv/test/transcoding/subgroup_spirv_1_5.cl @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -triple spir-unknown-unknown -O1 -cl-std=CL2.0 -fdeclare-opencl-builtins -finclude-default-header -emit-llvm-bc %s -o %t.bc +// RUN: not llvm-spirv --spirv-max-version=1.4 %t.bc 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s +// RUN: llvm-spirv %t.bc -o %t.spv +// RUN: spirv-val %t.spv +// RUN: llvm-spirv -r %t.spv -o %t.rev.bc +// RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM + +// Before SPIR-V 1.5, the Id operand of OpGroupNonUniformBroadcast must come from a constant instruction. 
+// CHECK-ERROR: RequiresVersion: Cannot fulfill SPIR-V version restriction: +// CHECK-ERROR-NEXT: SPIR-V version was restricted to at most 1.4 (66560) but a construct from the input requires SPIR-V version 1.5 (66816) or above + +// CHECK-LLVM-LABEL: @test +// CHECK-LLVM: call spir_func i16 @_Z31sub_group_non_uniform_broadcasttj(i16 %a, i32 %id) + +kernel void test(short a, uint id, global short *res) { + res[0] = sub_group_non_uniform_broadcast(a, id); +} From 4ddf13bf01088fc0e88c25c31567d998d1a64a6a Mon Sep 17 00:00:00 2001 From: Dmitry Sidorov Date: Tue, 27 Feb 2024 20:24:00 +0100 Subject: [PATCH 541/546] Fix function with unused sret parameter (#2381) Usually sret parameters are accessed by a memory instruction, from which would tell SPIRVTypeScavenger which type to use for this function parameter. But if sret parameter is unused later in the module scavenger would fail attempting to deduce type from the mangled name. Signed-off-by: Sidorov, Dmitry Original commit: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/f71747917eb56ef --- llvm-spirv/lib/SPIRV/SPIRVTypeScavenger.cpp | 3 +- .../transcoding/unused-sret-opaque-ptr.ll | 30 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 llvm-spirv/test/transcoding/unused-sret-opaque-ptr.ll diff --git a/llvm-spirv/lib/SPIRV/SPIRVTypeScavenger.cpp b/llvm-spirv/lib/SPIRV/SPIRVTypeScavenger.cpp index eb4240362f2b6..cf75eb8d1d0be 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVTypeScavenger.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVTypeScavenger.cpp @@ -734,7 +734,8 @@ void SPIRVTypeScavenger::deduceFunctionType(Function &F) { for (Argument *Arg : PointerArgs) { if (auto *Ty = dyn_cast(ParamTypes[Arg->getArgNo()])) - TypeArgument(Arg, Ty); + if (!Arg->hasAttribute(Attribute::StructRet)) + TypeArgument(Arg, Ty); } } } diff --git a/llvm-spirv/test/transcoding/unused-sret-opaque-ptr.ll b/llvm-spirv/test/transcoding/unused-sret-opaque-ptr.ll new file mode 100644 index 
0000000000000..ef1e7e8cd64fd --- /dev/null +++ b/llvm-spirv/test/transcoding/unused-sret-opaque-ptr.ll @@ -0,0 +1,30 @@ +; RUN: llvm-as %s -o %t.bc +; RUN: llvm-spirv %t.bc -spirv-text -o %t.txt +; RUN: FileCheck < %t.txt %s --check-prefix=CHECK-SPIRV +; RUN: llvm-spirv %t.bc -o %t.spv +; RUN: spirv-val %t.spv +; RUN: llvm-spirv -r %t.spv -o %t.rev.bc +; RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM + +; CHECK-SPIRV-DAG: Name [[#Fun:]] "_Z3booi" +; CHECK-SPIRV-DAG: Decorate [[#Param:]] FuncParamAttr 3 +; CHECK-SPIRV-DAG: TypePointer [[#PtrTy:]] [[#]] [[#StructTy:]] +; CHECK-SPIRV-DAG: TypeStruct [[#StructTy]] +; CHECK-SPIRV: Function [[#]] [[#Fun]] +; CHECK-SPIRV: FunctionParameter [[#PtrTy:]] [[#Param]] + +; CHECK-LLVM: call spir_func void @_Z3booi(ptr sret(%struct.Example) align 8 + +source_filename = "/app/example.cpp" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir-unknown-unknown" + +%struct.Example = type { } + +define spir_func i32 @foo() { + %1 = alloca %struct.Example, align 8 + call void @_Z3booi(ptr sret(%struct.Example) align 8 %1, i32 noundef 42) + ret i32 0 +} + +declare void @_Z3booi(ptr sret(%struct.Example) align 8, i32 noundef) From d053f45da7ddf635c9580052333f99c9aeb81b12 Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy <89994100+VyacheslavLevytskyy@users.noreply.github.com> Date: Wed, 28 Feb 2024 14:14:59 +0100 Subject: [PATCH 542/546] Out of bounds cooperative matrix load/store/fill work with legacy TypeJointMatrixINTEL (#2318) This PR is to ensure that CooperativeMatrixLoadCheckedINTEL, CooperativeMatrixStoreCheckedINTEL and CooperativeMatrixConstructCheckedINTEL instructions can correctly accept TypeJointMatrixINTEL as an input during both forward and reverse translation. 
Original commit: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/632c3f626990f36 --- llvm-spirv/lib/SPIRV/SPIRVReader.cpp | 1 + .../cooperative_matrix_checked.ll | 25 +-- .../joint_matrix_checked.ll | 158 ++++++++++++++++++ 3 files changed, 172 insertions(+), 12 deletions(-) create mode 100644 llvm-spirv/test/extensions/INTEL/SPV_INTEL_joint_matrix/joint_matrix_checked.ll diff --git a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp index 486af805bf0b6..659f007b43f99 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp @@ -3297,6 +3297,7 @@ Instruction *SPIRVToLLVM::transSPIRVBuiltinFromInst(SPIRVInstruction *BI, case OpSUDotAccSatKHR: case internal::OpJointMatrixLoadINTEL: case OpCooperativeMatrixLoadKHR: + case internal::OpCooperativeMatrixLoadCheckedINTEL: case internal::OpTaskSequenceCreateINTEL: AddRetTypePostfix = true; break; diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_joint_matrix/cooperative_matrix_checked.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_joint_matrix/cooperative_matrix_checked.ll index 7dd3e7e9bdc94..f6e4ba727c935 100644 --- a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_joint_matrix/cooperative_matrix_checked.ll +++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_joint_matrix/cooperative_matrix_checked.ll @@ -9,6 +9,7 @@ ; RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM ; CHECK-SPIRV-DAG: Capability CooperativeMatrixKHR +; CHECK-SPIRV-DAG: Capability CooperativeMatrixCheckedInstructionsINTEL ; CHECK-SPIRV-DAG: Extension "SPV_KHR_cooperative_matrix" ; CHECK-SPIRV-DAG: Extension "SPV_INTEL_joint_matrix" ; CHECK-SPIRV-DAG: TypeInt [[#Int8Ty:]] 8 0 @@ -30,12 +31,12 @@ ; CHECK-SPIRV: CooperativeMatrixMulAddKHR [[#MatTy1]] ; CHECK-SPIRV: CooperativeMatrixStoreCheckedINTEL -; CHECK-LLVM: call spir_func target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) @_Z46__spirv_CooperativeMatrixConstructCheckedINTELiilli(i32 4, i32 4, i64 12, i64 12, 
i32 %_arg_Initvalue) -; CHECK-LLVM: call spir_func target("spirv.CooperativeMatrixKHR", i8, 3, 12, 48, 0) @_Z41__spirv_CooperativeMatrixLoadCheckedINTELPU3AS4ciiillli(ptr addrspace(4) %[[MatrixPtr:[%0-9a-z.]+]], i32 0, i32 0, i32 0, i64 12, i64 48, i64 %_arg_K, i32 1) +; CHECK-LLVM: call spir_func target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) @_Z46__spirv_CooperativeMatrixConstructCheckedINTELiiiii(i32 4, i32 4, i32 12, i32 12, i32 %_arg_Initvalue) +; CHECK-LLVM: call spir_func target("spirv.CooperativeMatrixKHR", i8, 3, 12, 48, 0) @_Z95__spirv_CooperativeMatrixLoadCheckedINTEL_RPU3AS144__spirv_CooperativeMatrixKHR__char_3_12_48_0PU3AS4ciiiiili(ptr addrspace(4) %[[MatrixPtr:[%0-9a-z.]+]], i32 0, i32 0, i32 0, i32 12, i32 48, i64 %_arg_K, i32 1) ; CHECK-LLVM: call spir_func i32 @_Z34__spirv_CooperativeMatrixLengthKHRPU3AS144__spirv_CooperativeMatrixKHR__char_3_12_48_0(target("spirv.CooperativeMatrixKHR", i8, 3, 12, 48, 0) -; CHECK-LLVM: call spir_func target("spirv.CooperativeMatrixKHR", i8, 2, 48, 12, 1) @_Z41__spirv_CooperativeMatrixLoadCheckedINTELPU3AS4ciiilll +; CHECK-LLVM: call spir_func target("spirv.CooperativeMatrixKHR", i8, 2, 48, 12, 1) @_Z95__spirv_CooperativeMatrixLoadCheckedINTEL_RPU3AS144__spirv_CooperativeMatrixKHR__char_2_48_12_1PU3AS4ciiiiil ; CHECK-LLVM: call spir_func target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) @_Z34__spirv_CooperativeMatrixMulAddKHRPU3AS144__spirv_CooperativeMatrixKHR__char_3_12_48_0PU3AS144__spirv_CooperativeMatrixKHR__char_2_48_12_1PU3AS144__spirv_CooperativeMatrixKHR__uint_3_12_12_2i(target("spirv.CooperativeMatrixKHR", i8, 3, 12, 48, 0) %{{.*}}, target("spirv.CooperativeMatrixKHR", i8, 2, 48, 12, 1) %{{.*}}, target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) -; CHECK-LLVM: call spir_func void @_Z42__spirv_CooperativeMatrixStoreCheckedINTELPU3AS4iiiPU3AS144__spirv_CooperativeMatrixKHR__uint_3_12_12_2illli(ptr addrspace(4) %{{.*}}, i32 0, i32 0, target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) 
+; CHECK-LLVM: call spir_func void @_Z42__spirv_CooperativeMatrixStoreCheckedINTELPU3AS4iiiPU3AS144__spirv_CooperativeMatrixKHR__uint_3_12_12_2iiili(ptr addrspace(4) %{{.*}}, i32 0, i32 0, target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) ; ModuleID = 'test-matrix-opaque.bc' source_filename = "matrix-int8-test.cpp" @@ -77,7 +78,7 @@ entry: %cmp.i58.i = icmp ult i64 %5, 2147483648 %sub5.i = sub nsw i64 %2, %5 call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %sub_c.sroa.0.i) - %call.i.i = tail call spir_func noundef target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) @_Z46__spirv_CooperativeMatrixConstructCheckedINTEL(i32 noundef 4, i32 noundef 4, i64 noundef 12, i64 noundef 12, i32 noundef %_arg_Initvalue) #4 + %call.i.i = tail call spir_func noundef target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) @_Z46__spirv_CooperativeMatrixConstructCheckedINTEL(i32 noundef 4, i32 noundef 4, i32 noundef 12, i32 noundef 12, i32 noundef %_arg_Initvalue) #4 store target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) %call.i.i, ptr %sub_c.sroa.0.i, align 8 %mul.i = mul nsw i64 %sub.i, 12 %div2452.i = lshr i64 %sub5.i, 4 @@ -102,14 +103,14 @@ for.body.i: ; preds = %for.cond.i %conv13.i = zext i32 %mul12.i to i64 %add.ptr.i96.i = getelementptr inbounds i8, ptr addrspace(1) %add.ptr.i93.i, i64 %conv13.i %call.ascast.i66.i = addrspacecast ptr addrspace(1) %add.ptr.i96.i to ptr addrspace(4) - %call1.i.i = tail call spir_func noundef target("spirv.CooperativeMatrixKHR", i8, 3, 12, 48, 0) @_Z41__spirv_CooperativeMatrixLoadCheckedINTEL_1(ptr addrspace(4) noundef %call.ascast.i66.i, i32 noundef 0, i32 noundef 0, i32 noundef 0, i64 noundef 12, i64 noundef 48, i64 noundef %_arg_K, i32 noundef 1) #4 + %call1.i.i = tail call spir_func noundef target("spirv.CooperativeMatrixKHR", i8, 3, 12, 48, 0) @_Z41__spirv_CooperativeMatrixLoadCheckedINTEL_1(ptr addrspace(4) noundef %call.ascast.i66.i, i32 noundef 0, i32 noundef 0, i32 noundef 0, i32 noundef 12, i32 noundef 48, i64 
noundef %_arg_K, i32 noundef 1) #4 %len = tail call spir_func noundef i32 @_Z34__spirv_CooperativeMatrixLengthKHR(target("spirv.CooperativeMatrixKHR", i8, 3, 12, 48, 0) %call1.i.i) %div20.i = mul nsw i32 %k.0.i, 12 %conv21.i = zext i32 %div20.i to i64 %mul23.i = mul i64 %mul22.i, %conv21.i %add.ptr.i111.i = getelementptr i8, ptr addrspace(1) %add.ptr.i108140.i, i64 %mul23.i %call.ascast.i72.i = addrspacecast ptr addrspace(1) %add.ptr.i111.i to ptr addrspace(4) - %call1.i73.i = tail call spir_func noundef target("spirv.CooperativeMatrixKHR", i8, 2, 48, 12, 1) @_Z41__spirv_CooperativeMatrixLoadCheckedINTEL_2(ptr addrspace(4) noundef %call.ascast.i72.i, i32 noundef 0, i32 noundef 0, i32 noundef 0, i64 noundef 48, i64 noundef 12, i64 noundef %mul22.i) #4 + %call1.i73.i = tail call spir_func noundef target("spirv.CooperativeMatrixKHR", i8, 2, 48, 12, 1) @_Z41__spirv_CooperativeMatrixLoadCheckedINTEL_2(ptr addrspace(4) noundef %call.ascast.i72.i, i32 noundef 0, i32 noundef 0, i32 noundef 0, i32 noundef 48, i32 noundef 12, i64 noundef %mul22.i) #4 call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %ref.tmp29.sroa.0.i) %sub_c.sroa.0.i.0.sub_c.sroa.0.i.0.sub_c.sroa.0.0.sub_c.sroa.0.0.sub_c.sroa.0.0.125.i = load target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2), ptr %sub_c.sroa.0.i, align 8 %call.i77.i = tail call spir_func noundef target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) @_Z34__spirv_CooperativeMatrixMulAddKHR(target("spirv.CooperativeMatrixKHR", i8, 3, 12, 48, 0) noundef %call1.i.i, target("spirv.CooperativeMatrixKHR", i8, 2, 48, 12, 1) noundef %call1.i73.i, target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) noundef %sub_c.sroa.0.i.0.sub_c.sroa.0.i.0.sub_c.sroa.0.0.sub_c.sroa.0.0.sub_c.sroa.0.0.125.i, i32 noundef 12) #4 @@ -127,27 +128,27 @@ _ZZZ15matrix_multiplyIiaLm24ELm96ELm24ELm96ELm24ELm24EEvR10big_matrixIT_XT5_EXT6 %add.ptr.i81.i = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i.i, i64 %mul39.i %call.ascast.i.i = addrspacecast ptr 
addrspace(1) %add.ptr.i81.i to ptr addrspace(4) %sub_c.sroa.0.i.0.sub_c.sroa.0.i.0.sub_c.sroa.0.0.sub_c.sroa.0.0.sub_c.sroa.0.0..i = load target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2), ptr %sub_c.sroa.0.i, align 8 - tail call spir_func void @_Z42__spirv_CooperativeMatrixStoreCheckedINTEL(ptr addrspace(4) noundef %call.ascast.i.i, i32 noundef 0, i32 noundef 0, target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) noundef %sub_c.sroa.0.i.0.sub_c.sroa.0.i.0.sub_c.sroa.0.0.sub_c.sroa.0.0.sub_c.sroa.0.0..i, i32 noundef 0, i64 noundef 12, i64 noundef 12, i64 noundef %_arg_N, i32 noundef 1) #4 + tail call spir_func void @_Z42__spirv_CooperativeMatrixStoreCheckedINTEL(ptr addrspace(4) noundef %call.ascast.i.i, i32 noundef 0, i32 noundef 0, target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) noundef %sub_c.sroa.0.i.0.sub_c.sroa.0.i.0.sub_c.sroa.0.0.sub_c.sroa.0.0.sub_c.sroa.0.0..i, i32 noundef 0, i32 noundef 12, i32 noundef 12, i64 noundef %_arg_N, i32 noundef 1) #4 call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %sub_c.sroa.0.i) ret void } ; Function Attrs: convergent -declare dso_local spir_func noundef target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) @_Z46__spirv_CooperativeMatrixConstructCheckedINTEL(i32 noundef, i32 noundef, i64 noundef, i64 noundef, i32 noundef) local_unnamed_addr #2 +declare dso_local spir_func noundef target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) @_Z46__spirv_CooperativeMatrixConstructCheckedINTEL(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef) local_unnamed_addr #2 declare dso_local spir_func noundef i32 @_Z34__spirv_CooperativeMatrixLengthKHR(target("spirv.CooperativeMatrixKHR", i8, 3, 12, 48, 0) noundef) ; Function Attrs: convergent -declare dso_local spir_func noundef target("spirv.CooperativeMatrixKHR", i8, 3, 12, 48, 0) @_Z41__spirv_CooperativeMatrixLoadCheckedINTEL_1(ptr addrspace(4) noundef, i32 noundef, i32 noundef, i32 noundef, i64 noundef, i64 noundef, i64 noundef, i32 noundef) 
local_unnamed_addr #2 +declare dso_local spir_func noundef target("spirv.CooperativeMatrixKHR", i8, 3, 12, 48, 0) @_Z41__spirv_CooperativeMatrixLoadCheckedINTEL_1(ptr addrspace(4) noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, i64 noundef, i32 noundef) local_unnamed_addr #2 ; Function Attrs: convergent -declare dso_local spir_func noundef target("spirv.CooperativeMatrixKHR", i8, 2, 48, 12, 1) @_Z41__spirv_CooperativeMatrixLoadCheckedINTEL_2(ptr addrspace(4) noundef, i32 noundef, i32 noundef, i32 noundef, i64 noundef, i64 noundef, i64 noundef) local_unnamed_addr #2 +declare dso_local spir_func noundef target("spirv.CooperativeMatrixKHR", i8, 2, 48, 12, 1) @_Z41__spirv_CooperativeMatrixLoadCheckedINTEL_2(ptr addrspace(4) noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, i64 noundef) local_unnamed_addr #2 ; Function Attrs: convergent declare dso_local spir_func noundef target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) @_Z34__spirv_CooperativeMatrixMulAddKHR(target("spirv.CooperativeMatrixKHR", i8, 3, 12, 48, 0) noundef, target("spirv.CooperativeMatrixKHR", i8, 2, 48, 12, 1) noundef, target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) noundef, i32 noundef) local_unnamed_addr #2 ; Function Attrs: convergent -declare dso_local spir_func void @_Z42__spirv_CooperativeMatrixStoreCheckedINTEL(ptr addrspace(4) noundef, i32 noundef, i32 noundef, target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) noundef, i32 noundef, i64 noundef, i64 noundef, i64 noundef, i32 noundef) local_unnamed_addr #2 +declare dso_local spir_func void @_Z42__spirv_CooperativeMatrixStoreCheckedINTEL(ptr addrspace(4) noundef, i32 noundef, i32 noundef, target("spirv.CooperativeMatrixKHR", i32, 3, 12, 12, 2) noundef, i32 noundef, i32 noundef, i32 noundef, i64 noundef, i32 noundef) local_unnamed_addr #2 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) declare void @llvm.lifetime.start.p0(i64 immarg, ptr 
nocapture) #3 diff --git a/llvm-spirv/test/extensions/INTEL/SPV_INTEL_joint_matrix/joint_matrix_checked.ll b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_joint_matrix/joint_matrix_checked.ll new file mode 100644 index 0000000000000..d948844ef4820 --- /dev/null +++ b/llvm-spirv/test/extensions/INTEL/SPV_INTEL_joint_matrix/joint_matrix_checked.ll @@ -0,0 +1,158 @@ +; This is an adapted copy of test/extensions/INTEL/SPV_INTEL_joint_matrix/cooperative_matrix_checked.ll + +; RUN: llvm-as < %s -o %t.bc +; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_joint_matrix -o %t.spv +; RUN: llvm-spirv %t.spv -to-text -o %t.spt +; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV + +; RUN: llvm-spirv -r %t.spv -o %t.rev.bc +; RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM + +; CHECK-SPIRV-DAG: Capability JointMatrixINTEL +; CHECK-SPIRV-DAG: Capability CooperativeMatrixKHR +; CHECK-SPIRV-DAG: Capability CooperativeMatrixCheckedInstructionsINTEL +; CHECK-SPIRV: Extension "SPV_INTEL_joint_matrix" +; CHECK-SPIRV-NOT: Extension "SPV_KHR_cooperative_matrix" +; CHECK-SPIRV-DAG: TypeInt [[#Int8Ty:]] 8 0 +; CHECK-SPIRV-DAG: TypeInt [[#Int32Ty:]] 32 0 +; CHECK-SPIRV-DAG: Constant [[#Int32Ty]] [[#Const12:]] 12 +; CHECK-SPIRV-DAG: Constant [[#Int32Ty]] [[#Const3:]] 3 +; CHECK-SPIRV-DAG: Constant [[#Int32Ty]] [[#Const2:]] 2 +; CHECK-SPIRV-DAG: Constant [[#Int32Ty]] [[#Const0:]] 0 +; CHECK-SPIRV-DAG: Constant [[#Int32Ty]] [[#Const48:]] 48 +; CHECK-SPIRV-DAG: Constant [[#Int32Ty]] [[#Const1:]] 1 +; CHECK-SPIRV-DAG: TypeJointMatrixINTEL [[#MatTy1:]] [[#Int32Ty]] [[#Const12]] [[#Const12]] [[#Const3]] [[#Const3]] [[#Const2]] +; CHECK-SPIRV-DAG: TypeJointMatrixINTEL [[#MatTy2:]] [[#Int8Ty]] [[#Const12]] [[#Const48]] [[#Const0]] [[#Const3]] [[#Const0]] +; CHECK-SPIRV-DAG: TypeJointMatrixINTEL [[#MatTy3:]] [[#Int8Ty]] [[#Const48]] [[#Const12]] [[#Const2]] [[#Const3]] [[#Const1]] +; CHECK-SPIRV: CooperativeMatrixConstructCheckedINTEL [[#MatTy1]] +; CHECK-SPIRV: 
CooperativeMatrixLoadCheckedINTEL [[#MatTy2]] [[#Load1:]] +; CHECK-SPIRV: CooperativeMatrixLoadCheckedINTEL [[#MatTy3]] +; CHECK-SPIRV: JointMatrixMadINTEL [[#MatTy1]] +; CHECK-SPIRV: CooperativeMatrixStoreCheckedINTEL + +; CHECK-LLVM: call spir_func target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2) @_Z46__spirv_CooperativeMatrixConstructCheckedINTELiiiii(i32 4, i32 4, i32 12, i32 12, i32 %_arg_Initvalue) +; CHECK-LLVM: call spir_func target("spirv.JointMatrixINTEL", i8, 12, 48, 0, 3, 0) @_Z93__spirv_CooperativeMatrixLoadCheckedINTEL{{[a-zA-Z0-9_]*}}(ptr addrspace(4) %[[MatrixPtr:[%0-9a-z.]+]], i32 0, i32 0, i32 0, i32 12, i32 48, i64 %_arg_K, i32 1) +; CHECK-LLVM: call spir_func target("spirv.JointMatrixINTEL", i8, 48, 12, 2, 3, 1) @_Z93__spirv_CooperativeMatrixLoadCheckedINTEL +; CHECK-LLVM: call spir_func target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2) @_Z27__spirv_JointMatrixMadINTEL{{[a-zA-Z0-9_]*}}(target("spirv.JointMatrixINTEL", i8, 12, 48, 0, 3, 0) %{{.*}}, target("spirv.JointMatrixINTEL", i8, 48, 12, 2, 3, 1) %{{.*}}, target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2) +; CHECK-LLVM: call spir_func void @_Z42__spirv_CooperativeMatrixStoreChecked{{[a-zA-Z0-9_]*}}(ptr addrspace(4) %{{.*}}, i32 0, i32 0, target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2) + +; ModuleID = 'test-matrix-opaque.bc' +source_filename = "matrix-int8-test.cpp" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +%"class.sycl::_V1::range" = type { %"class.sycl::_V1::detail::array" } +%"class.sycl::_V1::detail::array" = type { [2 x i64] } +%"class.sycl::_V1::id" = type { %"class.sycl::_V1::detail::array" } + +$_ZTSZZ15matrix_multiply = comdat any + +@__spirv_BuiltInGlobalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32 +@__spirv_BuiltInLocalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant 
<3 x i64>, align 32 + +; Function Attrs: convergent norecurse +define weak_odr dso_local spir_kernel void @_ZTSZZ15matrix_multiply(ptr addrspace(1) noundef align 1 %_arg_accA, ptr addrspace(1) noundef align 1 %_arg_accB, ptr noundef byval(%"class.sycl::_V1::range") align 8 %_arg_accB5, ptr noundef byval(%"class.sycl::_V1::id") align 8 %_arg_accB6, ptr addrspace(1) noundef align 4 %_arg_accC, i64 noundef %_arg_N, i64 noundef %_arg_K, i32 noundef %_arg_Initvalue) local_unnamed_addr #0 comdat { +entry: + %sub_c.sroa.0.i = alloca target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2), align 8 + %ref.tmp29.sroa.0.i = alloca target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2), align 8 + %agg.tmp15.sroa.0.sroa.2.0..sroa_idx = getelementptr inbounds %"class.sycl::_V1::range", ptr %_arg_accB5, i64 0, i32 0, i32 0, i64 1 + %agg.tmp15.sroa.0.sroa.2.0.copyload = load i64, ptr %agg.tmp15.sroa.0.sroa.2.0..sroa_idx, align 8 + %agg.tmp16.sroa.0.sroa.0.0.copyload = load i64, ptr %_arg_accB6, align 8 + %agg.tmp16.sroa.0.sroa.2.0..sroa_idx = getelementptr inbounds %"class.sycl::_V1::id", ptr %_arg_accB6, i64 0, i32 0, i32 0, i64 1 + %agg.tmp16.sroa.0.sroa.2.0.copyload = load i64, ptr %agg.tmp16.sroa.0.sroa.2.0..sroa_idx, align 8 + %mul.i4.i.i.i.i45 = mul i64 %agg.tmp16.sroa.0.sroa.0.0.copyload, %agg.tmp15.sroa.0.sroa.2.0.copyload + %add.i6.i.i.i.i46 = add i64 %mul.i4.i.i.i.i45, %agg.tmp16.sroa.0.sroa.2.0.copyload + %add.ptr.i47 = getelementptr inbounds i8, ptr addrspace(1) %_arg_accB, i64 %add.i6.i.i.i.i46 + %0 = load <3 x i64>, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId, align 32 + %1 = extractelement <3 x i64> %0, i64 1 + %2 = extractelement <3 x i64> %0, i64 0 + %3 = load <3 x i64>, ptr addrspace(1) @__spirv_BuiltInLocalInvocationId, align 32 + %4 = extractelement <3 x i64> %3, i64 1 + %5 = extractelement <3 x i64> %3, i64 0 + %cmp.i.i = icmp ult i64 %1, 2147483648 + %cmp.i54.i = icmp ult i64 %2, 2147483648 + %cmp.i56.i = icmp ult i64 %4, 2147483648 + %sub.i = sub nsw 
i64 %1, %4 + %cmp.i58.i = icmp ult i64 %5, 2147483648 + %sub5.i = sub nsw i64 %2, %5 + call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %sub_c.sroa.0.i) + %call.i.i = tail call spir_func noundef target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2) @_Z46__spirv_CooperativeMatrixConstructCheckedINTEL(i32 noundef 4, i32 noundef 4, i32 noundef 12, i32 noundef 12, i32 noundef %_arg_Initvalue) #4 + store target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2) %call.i.i, ptr %sub_c.sroa.0.i, align 8 + %mul.i = mul nsw i64 %sub.i, 12 + %div2452.i = lshr i64 %sub5.i, 4 + %mul26.i = mul i64 %div2452.i, 48 + %div.i = udiv i64 %_arg_K, 48 + %mul11.i = mul i64 %mul.i, %_arg_K + %add.ptr.i93.i = getelementptr inbounds i8, ptr addrspace(1) %_arg_accA, i64 %mul11.i + %idx.neg.i.i104.i = sub i64 0, %add.i6.i.i.i.i46 + %add.ptr.i.i105141.i = getelementptr i8, ptr addrspace(1) %add.ptr.i47, i64 %mul26.i + %mul22.i = shl i64 %_arg_N, 2 + %add.ptr.i108140.i = getelementptr i8, ptr addrspace(1) %add.ptr.i.i105141.i, i64 %idx.neg.i.i104.i + br label %for.cond.i + +for.cond.i: ; preds = %for.body.i, %entry + %k.0.i = phi i32 [ 0, %entry ], [ %add.i, %for.body.i ] + %conv.i = zext i32 %k.0.i to i64 + %cmp.i = icmp ugt i64 %div.i, %conv.i + br i1 %cmp.i, label %for.body.i, label %_ZZZ15matrix_multiplyIiaLm24ELm96ELm24ELm96ELm24ELm24EEvR10big_matrixIT_XT5_EXT6_EERS0_IT0_XT1_EXT2_EERS0_IS4_XT3_EXT4_EEENKUlRN4sycl3_V17handlerEE_clESC_ENKUlNSA_7nd_itemILi2EEEE_clESF_.exit + +for.body.i: ; preds = %for.cond.i + %mul12.i = mul nsw i32 %k.0.i, 48 + %conv13.i = zext i32 %mul12.i to i64 + %add.ptr.i96.i = getelementptr inbounds i8, ptr addrspace(1) %add.ptr.i93.i, i64 %conv13.i + %call.ascast.i66.i = addrspacecast ptr addrspace(1) %add.ptr.i96.i to ptr addrspace(4) + %call1.i.i = tail call spir_func noundef target("spirv.JointMatrixINTEL", i8, 12, 48, 0, 3, 0) @_Z41__spirv_CooperativeMatrixLoadCheckedINTEL_1(ptr addrspace(4) noundef %call.ascast.i66.i, i32 noundef 0, i32 noundef 0, i32 noundef 
0, i32 noundef 12, i32 noundef 48, i64 noundef %_arg_K, i32 noundef 1) #4 + %div20.i = mul nsw i32 %k.0.i, 12 + %conv21.i = zext i32 %div20.i to i64 + %mul23.i = mul i64 %mul22.i, %conv21.i + %add.ptr.i111.i = getelementptr i8, ptr addrspace(1) %add.ptr.i108140.i, i64 %mul23.i + %call.ascast.i72.i = addrspacecast ptr addrspace(1) %add.ptr.i111.i to ptr addrspace(4) + %call1.i73.i = tail call spir_func noundef target("spirv.JointMatrixINTEL", i8, 48, 12, 2, 3, 1) @_Z41__spirv_CooperativeMatrixLoadCheckedINTEL_2(ptr addrspace(4) noundef %call.ascast.i72.i, i32 noundef 0, i32 noundef 0, i32 noundef 0, i32 noundef 48, i32 noundef 12, i64 noundef %mul22.i) #4 + call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %ref.tmp29.sroa.0.i) + %sub_tostore2 = load target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2), ptr %sub_c.sroa.0.i, align 8 + %call.i77.i = tail call spir_func noundef target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2) @_Z27__spirv_JointMatrixMadINTEL(target("spirv.JointMatrixINTEL", i8, 12, 48, 0, 3, 0) noundef %call1.i.i, target("spirv.JointMatrixINTEL", i8, 48, 12, 2, 3, 1) noundef %call1.i73.i, target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2) noundef %sub_tostore2, i32 noundef 12) #4 + store target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2) %call.i77.i, ptr %ref.tmp29.sroa.0.i, align 8 + %ref.tmp29.sroa.0.i.0.ref.tmp29.sroa.0.i.0.ref.tmp29.sroa.0.0.ref.tmp29.sroa.0.0.ref.tmp29.sroa.0.0..i = load i64, ptr %ref.tmp29.sroa.0.i, align 8 + store i64 %ref.tmp29.sroa.0.i.0.ref.tmp29.sroa.0.i.0.ref.tmp29.sroa.0.0.ref.tmp29.sroa.0.0.ref.tmp29.sroa.0.0..i, ptr %sub_c.sroa.0.i, align 8 + call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %ref.tmp29.sroa.0.i) + %add.i = add nuw nsw i32 %k.0.i, 1 + br label %for.cond.i + +_ZZZ15matrix_multiplyIiaLm24ELm96ELm24ELm96ELm24ELm24EEvR10big_matrixIT_XT5_EXT6_EERS0_IT0_XT1_EXT2_EERS0_IS4_XT3_EXT4_EEENKUlRN4sycl3_V17handlerEE_clESC_ENKUlNSA_7nd_itemILi2EEEE_clESF_.exit: ; preds = %for.cond.i + %mul37.i = mul 
i64 %mul.i, %_arg_N + %add.ptr.i.i = getelementptr inbounds i32, ptr addrspace(1) %_arg_accC, i64 %mul37.i + %mul39.i = mul nuw i64 %div2452.i, 12 + %add.ptr.i81.i = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i.i, i64 %mul39.i + %call.ascast.i.i = addrspacecast ptr addrspace(1) %add.ptr.i81.i to ptr addrspace(4) + %sub_tostore = load target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2), ptr %sub_c.sroa.0.i, align 8 + tail call spir_func void @_Z42__spirv_CooperativeMatrixStoreCheckedINTEL(ptr addrspace(4) noundef %call.ascast.i.i, i32 noundef 0, i32 noundef 0, target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2) noundef %sub_tostore, i32 noundef 0, i32 noundef 12, i32 noundef 12, i64 noundef %_arg_N, i32 noundef 1) #4 + call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %sub_c.sroa.0.i) + ret void +} + +; Function Attrs: convergent +declare dso_local spir_func noundef target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2) @_Z46__spirv_CooperativeMatrixConstructCheckedINTEL(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef) local_unnamed_addr #2 + +; Function Attrs: convergent +declare dso_local spir_func noundef target("spirv.JointMatrixINTEL", i8, 12, 48, 0, 3, 0) @_Z41__spirv_CooperativeMatrixLoadCheckedINTEL_1(ptr addrspace(4) noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, i64 noundef, i32 noundef) local_unnamed_addr #2 + +; Function Attrs: convergent +declare dso_local spir_func noundef target("spirv.JointMatrixINTEL", i8, 48, 12, 2, 3, 1) @_Z41__spirv_CooperativeMatrixLoadCheckedINTEL_2(ptr addrspace(4) noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, i64 noundef) local_unnamed_addr #2 + +; Function Attrs: convergent +declare dso_local spir_func noundef target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2) @_Z27__spirv_JointMatrixMadINTEL(target("spirv.JointMatrixINTEL", i8, 12, 48, 0, 3, 0) noundef, target("spirv.JointMatrixINTEL", i8, 48, 12, 2, 3, 1) noundef, 
target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2) noundef, i32 noundef) local_unnamed_addr #2 + +; Function Attrs: convergent +declare dso_local spir_func void @_Z42__spirv_CooperativeMatrixStoreCheckedINTEL(ptr addrspace(4) noundef, i32 noundef, i32 noundef, target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2) noundef, i32 noundef, i32 noundef, i32 noundef, i64 noundef, i32 noundef) local_unnamed_addr #2 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #3 + +attributes #0 = { convergent norecurse "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="matrix-int8-test.cpp" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } +attributes #2 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #3 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #4 = { convergent } From 17f3a1af4072ca61e4f3ff4938533a3b5d50479d Mon Sep 17 00:00:00 2001 From: Dmitry Sidorov Date: Wed, 28 Feb 2024 17:10:00 +0100 Subject: [PATCH 543/546] Fix GV initialized with undef translation (#2383) Previously the translator would create OpVariable with init field pointing to OpUndef. That is not conformant with the specification, so when LLVM GV is initialized with undef - we should skip initializer in SPIR-V. Later in the reader it's likely to become zero initialized, but it depends on the linkage and storage class. Checking correctness of this logic goes outside of this patch. 
Signed-off-by: Sidorov, Dmitry Original commit: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/c621918b0e68e16 --- .../lib/SPIRV/libSPIRV/SPIRVInstruction.h | 2 +- llvm-spirv/test/transcoding/undef-gv.ll | 34 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 llvm-spirv/test/transcoding/undef-gv.ll diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h index b387745e24703..ecfef98e53824 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h @@ -458,7 +458,7 @@ class SPIRVVariable : public SPIRVInstruction { : SPIRVInstruction(TheInitializer ? 5 : 4, OpVariable, TheType, TheId, TheBB, TheM), StorageClass(TheStorageClass) { - if (TheInitializer) + if (TheInitializer && !TheInitializer->isUndef()) Initializer.push_back(TheInitializer->getId()); Name = TheName; validate(); diff --git a/llvm-spirv/test/transcoding/undef-gv.ll b/llvm-spirv/test/transcoding/undef-gv.ll new file mode 100644 index 0000000000000..c88b17fb2aaaf --- /dev/null +++ b/llvm-spirv/test/transcoding/undef-gv.ll @@ -0,0 +1,34 @@ +; RUN: llvm-as %s -o %t.bc +; RUN: llvm-spirv %t.bc -spirv-text -o %t.spt +; RUN: FileCheck < %t.spt %s --check-prefix=CHECK-SPIRV +; RUN: llvm-spirv %t.bc -o %t.spv +; RUN: spirv-val %t.spv +; RUN: llvm-spirv -r %t.spv -o %t.bc +; RUN: llvm-dis < %t.bc | FileCheck %s --check-prefix=CHECK-LLVM + +; CHECK-SPIRV: Decorate [[#Var:]] LinkageAttributes "v" Export +; CHECK-SPIRV: Variable [[#]] [[#Var]] [[#]] {{$}} +; CHECK-SPIRV-NOT: OpUndef + +; CHECK-LLVM: @v = common addrspace(1) global i32 0, align 4 + +; ModuleID = 'after.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir64-unknown-unknown" + +@v = 
addrspace(1) global i32 undef, align 4 + +!spirv.MemoryModel = !{!0} +!opencl.enable.FP_CONTRACT = !{} +!spirv.Source = !{!1} +!opencl.spir.version = !{!2} +!opencl.ocl.version = !{!2} +!opencl.used.extensions = !{!3} +!opencl.used.optional.core.features = !{!3} +!spirv.Generator = !{!4} + +!0 = !{i32 2, i32 2} +!1 = !{i32 3, i32 200000} +!2 = !{i32 2, i32 0} +!3 = !{} +!4 = !{i16 6, i16 14} From cbe3792950b4d93fd6105a09104423d03efd64a8 Mon Sep 17 00:00:00 2001 From: LU-JOHN <111294400+LU-JOHN@users.noreply.github.com> Date: Thu, 29 Feb 2024 06:13:19 -0600 Subject: [PATCH 544/546] Add --spirv-ext-inst option (#2382) Add option --spirv-ext-inst=[none|OpenCL.std] to control what external instructions can be used when translating form LLVM IR to SPIR-V. Original commit: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/2cd8b788edc53ed --- llvm-spirv/include/LLVMSPIRVOpts.h | 12 ++++++++ llvm-spirv/lib/SPIRV/SPIRVWriter.cpp | 3 +- llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h | 2 ++ llvm-spirv/test/llvm-intrinsics/fmuladd.ll | 16 +++++++++-- llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp | 31 ++++++++++++++++++++- 5 files changed, 59 insertions(+), 5 deletions(-) diff --git a/llvm-spirv/include/LLVMSPIRVOpts.h b/llvm-spirv/include/LLVMSPIRVOpts.h index 25d357fd12e3d..f5e301fb8b5e7 100644 --- a/llvm-spirv/include/LLVMSPIRVOpts.h +++ b/llvm-spirv/include/LLVMSPIRVOpts.h @@ -99,6 +99,8 @@ enum class ExtensionID : uint32_t { Last, }; +enum class ExtInst : uint32_t { None, OpenCL }; + enum class BIsRepresentation : uint32_t { OpenCL12, OpenCL20, SPIRVFriendlyIR }; enum class FPContractMode : uint32_t { On, Off, Fast }; @@ -180,6 +182,14 @@ class TranslatorOpts { return true; } + void setExtInst(ExtInst Value) { + // --spirv-ext-inst supersedes --spirv-replace-fmuladd-with-ocl-mad + ReplaceLLVMFmulAddWithOpenCLMad = false; + ExtInstValue = Value; + } + + ExtInst getExtInst() const { return ExtInstValue; } + void setDesiredBIsRepresentation(BIsRepresentation Value) { 
DesiredRepresentationOfBIs = Value; } @@ -238,6 +248,8 @@ class TranslatorOpts { // SPIR-V to LLVM translation options bool GenKernelArgNameMD = false; std::unordered_map ExternalSpecialization; + // Extended instruction set to use when translating from LLVM IR to SPIR-V + ExtInst ExtInstValue = ExtInst::None; // Representation of built-ins, which should be used while translating from // SPIR-V to back to LLVM IR BIsRepresentation DesiredRepresentationOfBIs = BIsRepresentation::OpenCL12; diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp index 1549ba73fb542..930cbef303874 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp @@ -4271,7 +4271,8 @@ SPIRVValue *LLVMToSPIRVBase::transIntrinsicInst(IntrinsicInst *II, // If allowed, let's replace llvm.fmuladd.* with mad from OpenCL extended // instruction set, as it has the same semantic for FULL_PROFILE OpenCL // devices (implementation-defined for EMBEDDED_PROFILE). 
- if (BM->shouldReplaceLLVMFmulAddWithOpenCLMad()) { + if (BM->shouldReplaceLLVMFmulAddWithOpenCLMad() || + BM->getExtInst() == SPIRV::ExtInst::OpenCL) { std::vector Ops{transValue(II->getArgOperand(0), BB), transValue(II->getArgOperand(1), BB), transValue(II->getArgOperand(2), BB)}; diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h index 196f3a90a9884..4ec78631fce91 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.h @@ -570,6 +570,8 @@ class SPIRVModule { return SPIRVEIS_Debug; } + ExtInst getExtInst() const { return TranslationOpts.getExtInst(); } + BIsRepresentation getDesiredBIsRepresentation() const { return TranslationOpts.getDesiredBIsRepresentation(); } diff --git a/llvm-spirv/test/llvm-intrinsics/fmuladd.ll b/llvm-spirv/test/llvm-intrinsics/fmuladd.ll index 15210f3eb444d..c92d190f9b6d2 100644 --- a/llvm-spirv/test/llvm-intrinsics/fmuladd.ll +++ b/llvm-spirv/test/llvm-intrinsics/fmuladd.ll @@ -1,13 +1,23 @@ ; RUN: llvm-as %s -o %t.bc ; RUN: llvm-spirv %t.bc -o %t-default.spv -; RUN: llvm-spirv %t.bc --spirv-replace-fmuladd-with-ocl-mad=true -o %t-replace.spv -; RUN: llvm-spirv %t.bc --spirv-replace-fmuladd-with-ocl-mad=false -o %t-break.spv +; RUN: llvm-spirv %t-default.spv -to-text -o - | FileCheck %s --check-prefixes=COMMON,REPLACE + +; preferred option for controlling fmuladd generation +; RUN: llvm-spirv %t.bc --spirv-ext-inst=OpenCL.std -o %t-replace.spv +; RUN: llvm-spirv %t.bc --spirv-ext-inst=none -o %t-break.spv ; RUN: spirv-val %t-replace.spv ; RUN: spirv-val %t-break.spv -; RUN: llvm-spirv %t-default.spv -to-text -o - | FileCheck %s --check-prefixes=COMMON,REPLACE ; RUN: llvm-spirv %t-replace.spv -to-text -o - | FileCheck %s --check-prefixes=COMMON,REPLACE ; RUN: llvm-spirv %t-break.spv -to-text -o - | FileCheck %s --check-prefixes=COMMON,BREAK +; legacy option for controlling fmuladd generation +; RUN: llvm-spirv %t.bc 
--spirv-replace-fmuladd-with-ocl-mad=true -o %t-replace.legacy.spv +; RUN: llvm-spirv %t.bc --spirv-replace-fmuladd-with-ocl-mad=false -o %t-break.legacy.spv +; RUN: spirv-val %t-replace.legacy.spv +; RUN: spirv-val %t-break.legacy.spv +; RUN: llvm-spirv %t-replace.legacy.spv -to-text -o - | FileCheck %s --check-prefixes=COMMON,REPLACE +; RUN: llvm-spirv %t-break.legacy.spv -to-text -o - | FileCheck %s --check-prefixes=COMMON,BREAK + ; COMMON-NOT: llvm.fmuladd ; COMMON: TypeFloat [[f32:[0-9]+]] 32 diff --git a/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp b/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp index cca27bdf6a870..35a7c4d7c0f2e 100644 --- a/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp +++ b/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp @@ -135,6 +135,17 @@ static cl::opt SPIRVGenKernelArgNameMD( cl::desc("Enable generating OpenCL kernel argument name " "metadata")); +static cl::opt ExtInst( + "spirv-ext-inst", + cl::desc("Specify the extended instruction set to use when " + "translating from a LLVM intrinsic function to SPIR-V. 
" + "If none, some LLVM intrinsic functions will be emulated."), + cl::values(clEnumValN(SPIRV::ExtInst::None, "none", + "No extended instructions"), + clEnumValN(SPIRV::ExtInst::OpenCL, "OpenCL.std", + "OpenCL.std extended instruction set")), + cl::init(SPIRV::ExtInst::None)); + static cl::opt BIsRepresentation( "spirv-target-env", cl::desc("Specify a representation of different SPIR-V Instructions which " @@ -253,7 +264,7 @@ static cl::opt DebugEIS( static cl::opt SPIRVReplaceLLVMFmulAddWithOpenCLMad( "spirv-replace-fmuladd-with-ocl-mad", cl::desc("Allow replacement of llvm.fmuladd.* intrinsic with OpenCL mad " - "instruction from OpenCL extended instruction set"), + "instruction from OpenCL extended instruction set (deprecated)"), cl::init(true)); static cl::opt SPIRVBuiltinFormat( @@ -708,6 +719,24 @@ int main(int Ac, char **Av) { return Ret; SPIRV::TranslatorOpts Opts(MaxSPIRVVersion, ExtensionsStatus); + + if (ExtInst.getNumOccurrences() != 0) { + if (ExtInst.getNumOccurrences() > 1) { + errs() << "Error: --spirv-ext-inst cannot be used more than once\n"; + return -1; + } else if (SPIRVReplaceLLVMFmulAddWithOpenCLMad.getNumOccurrences()) { + errs() + << "Error: --spirv-ext-inst and --spirv-replace-fmuladd-with-ocl-mad " + "cannot be used together. --spirv-replace-fmuladd-with-ocl-mad " + "is deprecated and --spirv-ext-inst is preferred.\n"; + return -1; + } else if (IsReverse) { + errs() << "Note: --spirv-ext-inst option ignored as it only " + "affects translation from LLVM IR to SPIR-V"; + } + Opts.setExtInst(ExtInst); + } + if (BIsRepresentation.getNumOccurrences() != 0) { if (!IsReverse) { errs() << "Note: --spirv-target-env option ignored as it only " From 5c1437631e3b22cccc72d9e0a0d492ec5e7429a5 Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Thu, 29 Feb 2024 16:22:04 -0800 Subject: [PATCH 545/546] Revert "[HIP] Allow partial linking for `-fgpu-rdc` (#81700)" This reverts commit 33a6ce18373ffd1457ebd54e930b6f02fe4c39c1. 
There is bug in the implementation, John Lu suggested to revert it first. --- clang/lib/CodeGen/CGCUDANV.cpp | 22 +- clang/lib/CodeGen/CodeGenModule.cpp | 10 +- clang/lib/Driver/OffloadBundler.cpp | 40 +-- clang/lib/Driver/ToolChains/HIPUtility.cpp | 258 +----------------- clang/test/CMakeLists.txt | 1 - clang/test/CodeGenCUDA/device-stub.cu | 10 +- .../test/CodeGenCUDA/host-used-device-var.cu | 5 +- clang/test/Driver/Inputs/hip.h | 25 -- clang/test/Driver/clang-offload-bundler.c | 13 +- clang/test/Driver/hip-partial-link.hip | 97 ------- clang/test/Driver/hip-toolchain-rdc.hip | 38 +-- 11 files changed, 50 insertions(+), 469 deletions(-) delete mode 100644 clang/test/Driver/Inputs/hip.h delete mode 100644 clang/test/Driver/hip-partial-link.hip diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 2a0c50f05b545..3996108b64a91 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -762,10 +762,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // to contain the fat binary but will be populated somewhere else, // e.g. by lld through link script. FatBinStr = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int8Ty, - /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, - "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr, - llvm::GlobalVariable::NotThreadLocal); + CGM.getModule(), CGM.Int8Ty, + /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, + "__hip_fatbin", nullptr, + llvm::GlobalVariable::NotThreadLocal); cast(FatBinStr)->setSection(FatbinConstantName); } @@ -818,8 +818,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // thread safety of the loaded program. Therefore we can assume sequential // execution of constructor functions here. if (IsHIP) { - auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage - : llvm::GlobalValue::ExternalLinkage; + auto Linkage = CudaGpuBinary ? 
llvm::GlobalValue::InternalLinkage : + llvm::GlobalValue::LinkOnceAnyLinkage; llvm::BasicBlock *IfBlock = llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); llvm::BasicBlock *ExitBlock = @@ -828,11 +828,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // of HIP ABI. GpuBinaryHandle = new llvm::GlobalVariable( TheModule, PtrTy, /*isConstant=*/false, Linkage, - /*Initializer=*/ - CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr, - CudaGpuBinary - ? "__hip_gpubin_handle" - : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash()); + /*Initializer=*/llvm::ConstantPointerNull::get(PtrTy), + "__hip_gpubin_handle"); + if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage) + GpuBinaryHandle->setComdat( + CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName())); GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); // Prevent the weak symbol in different shared libraries being merged. if (Linkage != llvm::GlobalValue::InternalLinkage) diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 10696b6a87da6..a9890aa9c5a8f 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -959,15 +959,7 @@ void CodeGenModule::Release() { llvm::ConstantArray::get(ATy, UsedArray), "__clang_gpu_used_external"); addCompilerUsedGlobal(GV); } - if (LangOpts.HIP) { - // Emit a unique ID so that host and device binaries from the same - // compilation unit can be associated. 
- auto *GV = new llvm::GlobalVariable( - getModule(), Int8Ty, false, llvm::GlobalValue::ExternalLinkage, - llvm::Constant::getNullValue(Int8Ty), - "__hip_cuid_" + getContext().getCUIDHash()); - addCompilerUsedGlobal(GV); - } + emitLLVMUsed(); if (SanStats) SanStats->finish(); diff --git a/clang/lib/Driver/OffloadBundler.cpp b/clang/lib/Driver/OffloadBundler.cpp index 3b083f88719ae..8e61c064c53af 100644 --- a/clang/lib/Driver/OffloadBundler.cpp +++ b/clang/lib/Driver/OffloadBundler.cpp @@ -733,15 +733,8 @@ class ObjectFileHandler final : public FileHandler { StringRef Content = *ContentOrErr; // Copy fat object contents to the output when extracting host bundle. - std::string ModifiedContent; - if (Content.size() == 1u && Content.front() == 0) { - auto HostBundleOrErr = getHostBundle(); - if (!HostBundleOrErr) - return HostBundleOrErr.takeError(); - - ModifiedContent = std::move(*HostBundleOrErr); - Content = ModifiedContent; - } + if (Content.size() == 1u && Content.front() == 0) + Content = StringRef(Input.getBufferStart(), Input.getBufferSize()); OS.write(Content.data(), Content.size()); return Error::success(); @@ -863,35 +856,6 @@ class ObjectFileHandler final : public FileHandler { } return Error::success(); } - - Expected getHostBundle() { - TempFileHandlerRAII TempFiles; - - auto ModifiedObjPathOrErr = TempFiles.Create(std::nullopt); - if (!ModifiedObjPathOrErr) - return ModifiedObjPathOrErr.takeError(); - StringRef ModifiedObjPath = *ModifiedObjPathOrErr; - - BumpPtrAllocator Alloc; - StringSaver SS{Alloc}; - SmallVector ObjcopyArgs{"llvm-objcopy"}; - - ObjcopyArgs.push_back("--regex"); - ObjcopyArgs.push_back("--remove-section=__CLANG_OFFLOAD_BUNDLE__.*"); - ObjcopyArgs.push_back("--"); - ObjcopyArgs.push_back(BundlerConfig.InputFileNames.front()); - ObjcopyArgs.push_back(ModifiedObjPath); - - if (Error Err = executeObjcopy(BundlerConfig.ObjcopyPath, ObjcopyArgs)) - return std::move(Err); - - auto BufOrErr = MemoryBuffer::getFile(ModifiedObjPath); - if 
(!BufOrErr) - return createStringError(BufOrErr.getError(), - "Failed to read back the modified object file"); - - return BufOrErr->get()->getBuffer().str(); - } }; /// Handler for text files. The bundled file will have the following format. diff --git a/clang/lib/Driver/ToolChains/HIPUtility.cpp b/clang/lib/Driver/ToolChains/HIPUtility.cpp index fcecf2e1313bb..f692458b775de 100644 --- a/clang/lib/Driver/ToolChains/HIPUtility.cpp +++ b/clang/lib/Driver/ToolChains/HIPUtility.cpp @@ -9,24 +9,13 @@ #include "HIPUtility.h" #include "CommonArgs.h" #include "clang/Driver/Compilation.h" -#include "clang/Driver/Options.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Object/Archive.h" -#include "llvm/Object/ObjectFile.h" -#include "llvm/Support/MD5.h" -#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Triple.h" -#include -#include -using namespace clang; using namespace clang::driver; using namespace clang::driver::tools; using namespace llvm::opt; -using llvm::dyn_cast; #if defined(_WIN32) || defined(_WIN64) #define NULL_FILE "nul" @@ -47,169 +36,6 @@ static std::string normalizeForBundler(const llvm::Triple &T, : T.normalize(); } -// Collect undefined __hip_fatbin* and __hip_gpubin_handle* symbols from all -// input object or archive files. 
-class HIPUndefinedFatBinSymbols { -public: - HIPUndefinedFatBinSymbols(const Compilation &C) - : C(C), DiagID(C.getDriver().getDiags().getCustomDiagID( - DiagnosticsEngine::Error, - "Error collecting HIP undefined fatbin symbols: %0")), - Quiet(C.getArgs().hasArg(options::OPT__HASH_HASH_HASH)), - Verbose(C.getArgs().hasArg(options::OPT_v)) { - populateSymbols(); - if (Verbose) { - for (auto Name : FatBinSymbols) - llvm::errs() << "Found undefined HIP fatbin symbol: " << Name << "\n"; - for (auto Name : GPUBinHandleSymbols) - llvm::errs() << "Found undefined HIP gpubin handle symbol: " << Name - << "\n"; - } - } - - const std::set &getFatBinSymbols() const { - return FatBinSymbols; - } - - const std::set &getGPUBinHandleSymbols() const { - return GPUBinHandleSymbols; - } - -private: - const Compilation &C; - unsigned DiagID; - bool Quiet; - bool Verbose; - std::set FatBinSymbols; - std::set GPUBinHandleSymbols; - std::set DefinedFatBinSymbols; - std::set DefinedGPUBinHandleSymbols; - const std::string FatBinPrefix = "__hip_fatbin"; - const std::string GPUBinHandlePrefix = "__hip_gpubin_handle"; - - void populateSymbols() { - std::deque WorkList; - std::set Visited; - - for (const auto &Action : C.getActions()) - WorkList.push_back(Action); - - while (!WorkList.empty()) { - const Action *CurrentAction = WorkList.front(); - WorkList.pop_front(); - - if (!CurrentAction || !Visited.insert(CurrentAction).second) - continue; - - if (const auto *IA = dyn_cast(CurrentAction)) { - std::string ID = IA->getId().str(); - if (!ID.empty()) { - ID = llvm::utohexstr(llvm::MD5Hash(ID), /*LowerCase=*/true); - FatBinSymbols.insert(Twine(FatBinPrefix + "_" + ID).str()); - GPUBinHandleSymbols.insert( - Twine(GPUBinHandlePrefix + "_" + ID).str()); - continue; - } - if (IA->getInputArg().getNumValues() == 0) - continue; - const char *Filename = IA->getInputArg().getValue(); - if (!Filename) - continue; - auto BufferOrErr = llvm::MemoryBuffer::getFile(Filename); - // Input action could be 
options to linker, therefore, ignore it - // if cannot read it. If it turns out to be a file that cannot be read, - // the error will be caught by the linker. - if (!BufferOrErr) - continue; - - processInput(BufferOrErr.get()->getMemBufferRef()); - } else - WorkList.insert(WorkList.end(), CurrentAction->getInputs().begin(), - CurrentAction->getInputs().end()); - } - } - - void processInput(const llvm::MemoryBufferRef &Buffer) { - // Try processing as object file first. - auto ObjFileOrErr = llvm::object::ObjectFile::createObjectFile(Buffer); - if (ObjFileOrErr) { - processSymbols(**ObjFileOrErr); - return; - } - - // Then try processing as archive files. - llvm::consumeError(ObjFileOrErr.takeError()); - auto ArchiveOrErr = llvm::object::Archive::create(Buffer); - if (ArchiveOrErr) { - llvm::Error Err = llvm::Error::success(); - llvm::object::Archive &Archive = *ArchiveOrErr.get(); - for (auto &Child : Archive.children(Err)) { - auto ChildBufOrErr = Child.getMemoryBufferRef(); - if (ChildBufOrErr) - processInput(*ChildBufOrErr); - else - errorHandler(ChildBufOrErr.takeError()); - } - - if (Err) - errorHandler(std::move(Err)); - return; - } - - // Ignore other files. 
- llvm::consumeError(ArchiveOrErr.takeError()); - } - - void processSymbols(const llvm::object::ObjectFile &Obj) { - for (const auto &Symbol : Obj.symbols()) { - auto FlagOrErr = Symbol.getFlags(); - if (!FlagOrErr) { - errorHandler(FlagOrErr.takeError()); - continue; - } - - auto NameOrErr = Symbol.getName(); - if (!NameOrErr) { - errorHandler(NameOrErr.takeError()); - continue; - } - llvm::StringRef Name = *NameOrErr; - - bool isUndefined = - FlagOrErr.get() & llvm::object::SymbolRef::SF_Undefined; - bool isFatBinSymbol = Name.starts_with(FatBinPrefix); - bool isGPUBinHandleSymbol = Name.starts_with(GPUBinHandlePrefix); - - // Handling for defined symbols - if (!isUndefined) { - if (isFatBinSymbol) { - DefinedFatBinSymbols.insert(Name.str()); - FatBinSymbols.erase(Name.str()); - } else if (isGPUBinHandleSymbol) { - DefinedGPUBinHandleSymbols.insert(Name.str()); - GPUBinHandleSymbols.erase(Name.str()); - } - continue; - } - - // Add undefined symbols if they are not in the defined sets - if (isFatBinSymbol && - DefinedFatBinSymbols.find(Name.str()) == DefinedFatBinSymbols.end()) - FatBinSymbols.insert(Name.str()); - else if (isGPUBinHandleSymbol && - DefinedGPUBinHandleSymbols.find(Name.str()) == - DefinedGPUBinHandleSymbols.end()) - GPUBinHandleSymbols.insert(Name.str()); - } - } - - void errorHandler(llvm::Error Err) { - if (Quiet) - return; - C.getDriver().Diag(DiagID) << llvm::toString(std::move(Err)); - } -}; - // Construct a clang-offload-bundler command to bundle code objects for // different devices into a HIP fat binary. 
void HIP::constructHIPFatbinCommand(Compilation &C, const JobAction &JA, @@ -304,84 +130,26 @@ void HIP::constructGenerateObjFileFromHIPFatBinary( auto HostTriple = C.getSingleOffloadToolChain()->getTriple(); - HIPUndefinedFatBinSymbols Symbols(C); - - std::string PrimaryHipFatbinSymbol; - std::string PrimaryGpuBinHandleSymbol; - bool FoundPrimaryHipFatbinSymbol = false; - bool FoundPrimaryGpuBinHandleSymbol = false; - - std::vector AliasHipFatbinSymbols; - std::vector AliasGpuBinHandleSymbols; - - // Iterate through symbols to find the primary ones and collect others for - // aliasing - for (const auto &Symbol : Symbols.getFatBinSymbols()) { - if (!FoundPrimaryHipFatbinSymbol) { - PrimaryHipFatbinSymbol = Symbol; - FoundPrimaryHipFatbinSymbol = true; - } else - AliasHipFatbinSymbols.push_back(Symbol); - } - - for (const auto &Symbol : Symbols.getGPUBinHandleSymbols()) { - if (!FoundPrimaryGpuBinHandleSymbol) { - PrimaryGpuBinHandleSymbol = Symbol; - FoundPrimaryGpuBinHandleSymbol = true; - } else - AliasGpuBinHandleSymbols.push_back(Symbol); - } - // Add MC directives to embed target binaries. We ensure that each // section and image is 16-byte aligned. This is not mandatory, but // increases the likelihood of data to be aligned with a cache block // in several main host machines. 
ObjStream << "# HIP Object Generator\n"; ObjStream << "# *** Automatically generated by Clang ***\n"; - if (FoundPrimaryGpuBinHandleSymbol) { - // Define the first gpubin handle symbol - if (HostTriple.isWindowsMSVCEnvironment()) - ObjStream << " .section .hip_gpubin_handle,\"dw\"\n"; - else { - ObjStream << " .protected " << PrimaryGpuBinHandleSymbol << "\n"; - ObjStream << " .type " << PrimaryGpuBinHandleSymbol << ",@object\n"; - ObjStream << " .section .hip_gpubin_handle,\"aw\"\n"; - } - ObjStream << " .globl " << PrimaryGpuBinHandleSymbol << "\n"; - ObjStream << " .p2align 3\n"; // Align 8 - ObjStream << PrimaryGpuBinHandleSymbol << ":\n"; - ObjStream << " .zero 8\n"; // Size 8 - - // Generate alias directives for other gpubin handle symbols - for (const auto &AliasSymbol : AliasGpuBinHandleSymbols) { - ObjStream << " .globl " << AliasSymbol << "\n"; - ObjStream << " .set " << AliasSymbol << "," << PrimaryGpuBinHandleSymbol - << "\n"; - } - } - if (FoundPrimaryHipFatbinSymbol) { - // Define the first fatbin symbol - if (HostTriple.isWindowsMSVCEnvironment()) - ObjStream << " .section .hip_fatbin,\"dw\"\n"; - else { - ObjStream << " .protected " << PrimaryHipFatbinSymbol << "\n"; - ObjStream << " .type " << PrimaryHipFatbinSymbol << ",@object\n"; - ObjStream << " .section .hip_fatbin,\"a\",@progbits\n"; - } - ObjStream << " .globl " << PrimaryHipFatbinSymbol << "\n"; - ObjStream << " .p2align " << llvm::Log2(llvm::Align(HIPCodeObjectAlign)) - << "\n"; - // Generate alias directives for other fatbin symbols - for (const auto &AliasSymbol : AliasHipFatbinSymbols) { - ObjStream << " .globl " << AliasSymbol << "\n"; - ObjStream << " .set " << AliasSymbol << "," << PrimaryHipFatbinSymbol - << "\n"; - } - ObjStream << PrimaryHipFatbinSymbol << ":\n"; - ObjStream << " .incbin "; - llvm::sys::printArg(ObjStream, BundleFile, /*Quote=*/true); - ObjStream << "\n"; + if (HostTriple.isWindowsMSVCEnvironment()) { + ObjStream << " .section .hip_fatbin, \"dw\"\n"; + } else { + 
ObjStream << " .protected __hip_fatbin\n"; + ObjStream << " .type __hip_fatbin,@object\n"; + ObjStream << " .section .hip_fatbin,\"a\",@progbits\n"; } + ObjStream << " .globl __hip_fatbin\n"; + ObjStream << " .p2align " << llvm::Log2(llvm::Align(HIPCodeObjectAlign)) + << "\n"; + ObjStream << "__hip_fatbin:\n"; + ObjStream << " .incbin "; + llvm::sys::printArg(ObjStream, BundleFile, /*Quote=*/true); + ObjStream << "\n"; if (HostTriple.isOSLinux() && HostTriple.isOSBinFormatELF()) ObjStream << " .section .note.GNU-stack, \"\", @progbits\n"; ObjStream.flush(); diff --git a/clang/test/CMakeLists.txt b/clang/test/CMakeLists.txt index 398ae006583a6..20ca2309128c5 100644 --- a/clang/test/CMakeLists.txt +++ b/clang/test/CMakeLists.txt @@ -140,7 +140,6 @@ if( NOT CLANG_BUILT_STANDALONE ) llvm-strip llvm-symbolizer llvm-windres - obj2yaml opt split-file yaml2obj diff --git a/clang/test/CodeGenCUDA/device-stub.cu b/clang/test/CodeGenCUDA/device-stub.cu index 60304647bd4c5..d7a7b1bb9fe95 100644 --- a/clang/test/CodeGenCUDA/device-stub.cu +++ b/clang/test/CodeGenCUDA/device-stub.cu @@ -50,19 +50,21 @@ // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fgpu-rdc -fcuda-include-gpubinary %t -o - -x hip \ // RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,RDC,HIP,HIPEF -// RUN: %clang_cc1 -cuid=123 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\ +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\ // RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=ALL,LNX,NORDC,HIP,HIPNEF // RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -aux-triple amdgcn -emit-llvm %s \ // RUN: -fcuda-include-gpubinary %t -o - -x hip\ // RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,WIN -// RUN: %clang_cc1 -cuid=123 -triple x86_64-pc-windows-msvc -aux-triple amdgcn -emit-llvm %s \ +// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -aux-triple amdgcn -emit-llvm %s \ // RUN: -o - -x hip\ // RUN: | 
FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,WIN,HIP,HIPNEF #include "Inputs/cuda.h" +// HIPNEF: $__hip_gpubin_handle = comdat any + #ifndef NOGLOBALS // NORDC-DAG: @device_var = internal global i32 // RDC-DAG: @device_var = global i32 @@ -159,7 +161,7 @@ __device__ void device_use() { // * constant unnamed string with GPU binary // CUDA: @[[FATBIN:.*]] = private constant{{.*}} c"GPU binary would be here.", // HIPEF: @[[FATBIN:.*]] = private constant{{.*}} c"GPU binary would be here.",{{.*}}align 4096 -// HIPNEF: @[[FATBIN:__hip_fatbin_[0-9a-f]+]] = external constant i8, section ".hip_fatbin" +// HIPNEF: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin" // CUDANORDC-SAME: section ".nv_fatbin", align 8 // CUDARDC-SAME: section "__nv_relfatbin", align 8 // * constant struct that wraps GPU binary @@ -175,7 +177,7 @@ __device__ void device_use() { // HIP-SAME: section ".hipFatBinSegment" // * variable to save GPU binary handle after initialization // CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global ptr null -// HIPNEF: @__[[PREFIX]]_gpubin_handle_{{[0-9a-f]+}} = external hidden global ptr, align 8 +// HIPNEF: @__[[PREFIX]]_gpubin_handle = linkonce hidden global ptr null // * constant unnamed string with NVModuleID // CUDARDC: [[MODULE_ID_GLOBAL:@.*]] = private constant // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32 diff --git a/clang/test/CodeGenCUDA/host-used-device-var.cu b/clang/test/CodeGenCUDA/host-used-device-var.cu index 5328660c9dc9d..7cb31aff84264 100644 --- a/clang/test/CodeGenCUDA/host-used-device-var.cu +++ b/clang/test/CodeGenCUDA/host-used-device-var.cu @@ -1,9 +1,9 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -x hip %s \ // RUN: -std=c++17 -O3 -mllvm -amdgpu-internalize-symbols -emit-llvm -o - \ -// RUN: -cuid=123 | FileCheck -check-prefix=DEV %s +// RUN: | FileCheck -check-prefix=DEV %s // RUN: %clang_cc1 -triple 
x86_64-unknown-linux-gnu -x hip %s \ -// RUN: -std=c++17 -O3 -emit-llvm -o - -cuid=123 | FileCheck -check-prefix=HOST %s +// RUN: -std=c++17 -O3 -emit-llvm -o - | FileCheck -check-prefix=HOST %s // Negative tests. @@ -187,7 +187,6 @@ public: // DEV-SAME: {{^[^@]*}} @_ZL2u3 // DEV-SAME: {{^[^@]*}} @_ZZ4fun1vE11static_var1 // DEV-SAME: {{^[^@]*}} @_ZZZN21TestStaticVarInLambda3funEvENKUlPcE_clES0_E4var2 -// DEV-SAME: {{^[^@]*}} @__hip_cuid_{{[0-9a-f]+}} // DEV-SAME: {{^[^@]*}} @constexpr_var2b // DEV-SAME: {{^[^@]*}} @inline_var // DEV-SAME: {{^[^@]*}} @u1 diff --git a/clang/test/Driver/Inputs/hip.h b/clang/test/Driver/Inputs/hip.h deleted file mode 100644 index 5be772a7b3413..0000000000000 --- a/clang/test/Driver/Inputs/hip.h +++ /dev/null @@ -1,25 +0,0 @@ -/* Minimal declarations for HIP support. Testing purposes only. */ - -#define __constant__ __attribute__((constant)) -#define __device__ __attribute__((device)) -#define __global__ __attribute__((global)) -#define __host__ __attribute__((host)) -#define __shared__ __attribute__((shared)) -#define __managed__ __attribute__((managed)) - -struct dim3 { - unsigned x, y, z; - __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {} -}; - -typedef struct hipStream *hipStream_t; -typedef enum hipError {} hipError_t; -int hipConfigureCall(dim3 gridSize, dim3 blockSize, unsigned long long sharedSize = 0, - hipStream_t stream = 0); -extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize, - unsigned long long sharedSize = 0, - hipStream_t stream = 0); -extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim, - dim3 blockDim, void **args, - unsigned long long sharedMem, - hipStream_t stream); diff --git a/clang/test/Driver/clang-offload-bundler.c b/clang/test/Driver/clang-offload-bundler.c index 6e3369df3b922..3cd48cab0ab20 100644 --- a/clang/test/Driver/clang-offload-bundler.c +++ b/clang/test/Driver/clang-offload-bundler.c @@ -11,7 +11,6 @@ // RUN: 
%clang -O0 -target %itanium_abi_triple %s -c -emit-llvm -o %t.bc // RUN: %clang -O0 -target %itanium_abi_triple %s -S -o %t.s // RUN: %clang -O0 -target %itanium_abi_triple %s -c -o %t.o -// RUN: obj2yaml %t.o > %t.o.yaml // RUN: %clang -O0 -target %itanium_abi_triple %s -emit-ast -o %t.ast // @@ -321,13 +320,11 @@ // RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -input=%t.o -input=%t.tgt1 -input=%t.tgt2 -output=%t.bundle3.o // RUN: clang-offload-bundler -type=o -input=%t.bundle3.o -list | FileCheck -check-prefix=CKLST %s // RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -output=%t.res.o -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.bundle3.o -unbundle -// RUN: obj2yaml %t.res.o > %t.res.o.yaml -// RUN: diff %t.o.yaml %t.res.o.yaml +// RUN: diff %t.bundle3.o %t.res.o // RUN: diff %t.tgt1 %t.res.tgt1 // RUN: diff %t.tgt2 %t.res.tgt2 // RUN: clang-offload-bundler -type=o -targets=openmp-powerpc64le-ibm-linux-gnu,host-%itanium_abi_triple,openmp-x86_64-pc-linux-gnu -output=%t.res.tgt1 -output=%t.res.o -output=%t.res.tgt2 -input=%t.bundle3.o -unbundle -// RUN: obj2yaml %t.res.o > %t.res.o.yaml -// RUN: diff %t.o.yaml %t.res.o.yaml +// RUN: diff %t.bundle3.o %t.res.o // RUN: diff %t.tgt1 %t.res.tgt1 // RUN: diff %t.tgt2 %t.res.tgt2 // RUN: clang-offload-bundler -type=o -targets=openmp-powerpc64le-ibm-linux-gnu -output=%t.res.tgt1 -input=%t.bundle3.o -unbundle @@ -336,13 +333,11 @@ // Check if we can unbundle a file with no magic strings. 
// RUN: clang-offload-bundler -type=o -input=%t.o -list | FileCheck -check-prefix=CKLST2 --allow-empty %s // RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu -output=%t.res.o -output=%t.res.tgt1 -output=%t.res.tgt2 -input=%t.o -unbundle -allow-missing-bundles -// RUN: obj2yaml %t.res.o > %t.res.o.yaml -// RUN: diff %t.o.yaml %t.res.o.yaml +// RUN: diff %t.o %t.res.o // RUN: diff %t.empty %t.res.tgt1 // RUN: diff %t.empty %t.res.tgt2 // RUN: clang-offload-bundler -type=o -targets=openmp-powerpc64le-ibm-linux-gnu,host-%itanium_abi_triple,openmp-x86_64-pc-linux-gnu -output=%t.res.tgt1 -output=%t.res.o -output=%t.res.tgt2 -input=%t.o -unbundle -allow-missing-bundles -// RUN: obj2yaml %t.res.o > %t.res.o.yaml -// RUN: diff %t.o.yaml %t.res.o.yaml +// RUN: diff %t.o %t.res.o // RUN: diff %t.empty %t.res.tgt1 // RUN: diff %t.empty %t.res.tgt2 diff --git a/clang/test/Driver/hip-partial-link.hip b/clang/test/Driver/hip-partial-link.hip deleted file mode 100644 index a1d31f9a65195..0000000000000 --- a/clang/test/Driver/hip-partial-link.hip +++ /dev/null @@ -1,97 +0,0 @@ -// REQUIRES: x86-registered-target, amdgpu-registered-target, lld, system-linux - -// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu \ -// RUN: --offload-arch=gfx906 -c -nostdinc -nogpuinc -nohipwrapperinc \ -// RUN: -nogpulib -fgpu-rdc -I%S/Inputs %s -o %t.1.o - -// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -DLIB \ -// RUN: --offload-arch=gfx906 -c -nostdinc -nogpuinc -nohipwrapperinc \ -// RUN: -nogpulib -fgpu-rdc -I%S/Inputs %s -o %t.2.o - -// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -DMAIN \ -// RUN: --offload-arch=gfx906 -c -nostdinc -nogpuinc -nohipwrapperinc \ -// RUN: -nogpulib -fgpu-rdc -I%S/Inputs %s -o %t.main.o - -// RUN: llvm-nm %t.1.o | FileCheck -check-prefix=OBJ1 %s -// OBJ1: B __hip_cuid_[[ID:[0-9a-f]+]] -// OBJ1: U __hip_fatbin_[[ID]] -// OBJ1: U __hip_gpubin_handle_[[ID]] - -// RUN: 
llvm-nm %t.2.o | FileCheck -check-prefix=OBJ2 %s -// OBJ2: B __hip_cuid_[[ID:[0-9a-f]+]] -// OBJ2: U __hip_fatbin_[[ID]] -// OBJ2: U __hip_gpubin_handle_[[ID]] - -// Link %t.1.o and %t.2.o by -r and then link with %t.main.o - -// RUN: %clang -v --target=x86_64-unknown-linux-gnu \ -// RUN: --hip-link -fgpu-rdc --offload-arch=gfx906 \ -// RUN: -r -fuse-ld=lld -nostdlib %t.1.o %t.2.o -o %t.lib.o \ -// RUN: 2>&1 | FileCheck -check-prefix=LD-R %s -// LD-R: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID1:[0-9a-f]+]] -// LD-R: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID2:[0-9a-f]+]] -// LD-R: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID1]] -// LD-R: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID2]] -// LD-R: "{{.*}}/clang-offload-bundler" {{.*}}-unbundle -// LD-R: "{{.*}}/lld" -flavor gnu -m elf64_amdgpu -// LD-R: "{{.*}}/clang-offload-bundler" -// LD-R: "{{.*}}/llvm-mc" -triple x86_64-unknown-linux-gnu -// LD-R: "{{.*}}/ld.lld" {{.*}} -r - -// RUN: llvm-nm %t.lib.o | FileCheck -check-prefix=OBJ %s -// OBJ: B __hip_cuid_[[ID1:[0-9a-f]+]] -// OBJ: B __hip_cuid_[[ID2:[0-9a-f]+]] -// OBJ: R __hip_fatbin_[[ID1]] -// OBJ: R __hip_fatbin_[[ID2]] -// OBJ: D __hip_gpubin_handle_[[ID1]] -// OBJ: D __hip_gpubin_handle_[[ID2]] - -// RUN: %clang -v --target=x86_64-unknown-linux-gnu \ -// RUN: --hip-link -no-hip-rt -fgpu-rdc --offload-arch=gfx906 \ -// RUN: -fuse-ld=lld -nostdlib -r %t.main.o %t.lib.o -o %t.final.o \ -// RUN: 2>&1 | FileCheck -check-prefix=LINK-O %s -// LINK-O-NOT: Found undefined HIP {{.*}}symbol - -// Generate a static lib with %t.1.o and %t.2.o then link with %t.main.o - -// RUN: %clang -v --target=x86_64-unknown-linux-gnu \ -// RUN: --hip-link -fgpu-rdc --offload-arch=gfx906 \ -// RUN: --emit-static-lib -fuse-ld=lld -nostdlib %t.1.o %t.2.o -o %t.a \ -// RUN: 2>&1 | FileCheck -check-prefix=STATIC %s -// STATIC: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID1:[0-9a-f]+]] -// STATIC: Found undefined HIP 
fatbin symbol: __hip_fatbin_[[ID2:[0-9a-f]+]] -// STATIC: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID1]] -// STATIC: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID2]] -// STATIC: "{{.*}}/clang-offload-bundler" {{.*}}-unbundle -// STATIC: "{{.*}}/lld" -flavor gnu -m elf64_amdgpu -// STATIC: "{{.*}}/clang-offload-bundler" -// STATIC: "{{.*}}/llvm-mc" -triple x86_64-unknown-linux-gnu -// STATIC: "{{.*}}/llvm-ar" - -// RUN: %clang -v --target=x86_64-unknown-linux-gnu \ -// RUN: --hip-link -no-hip-rt -fgpu-rdc --offload-arch=gfx906 \ -// RUN: -fuse-ld=lld -nostdlib -r %t.main.o %t.a -o %t.final.o \ -// RUN: 2>&1 | FileCheck -check-prefix=LINK-A %s -// LINK-A-NOT: Found undefined HIP {{.*}}symbol - -#include "hip.h" - -#ifdef LIB -__device__ int x; -__device__ void libfun() { - x = 1; -} -#elif !defined(MAIN) -__device__ void libfun(); -__global__ void kern() { - libfun(); -} -void run() { - kern<<<1,1>>>(); -} -#else -extern void run(); -int main() { - run(); -} -#endif diff --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip index d19d8ccd6cb29..1827531f9cab7 100644 --- a/clang/test/Driver/hip-toolchain-rdc.hip +++ b/clang/test/Driver/hip-toolchain-rdc.hip @@ -1,7 +1,7 @@ // REQUIRES: x86-registered-target // REQUIRES: amdgpu-registered-target -// RUN: %clang -### --target=x86_64-linux-gnu -v \ +// RUN: %clang -### --target=x86_64-linux-gnu \ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ // RUN: --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \ // RUN: --hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ @@ -12,7 +12,7 @@ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LNX %s -// RUN: %clang -### --target=x86_64-pc-windows-msvc -v \ +// RUN: %clang -### --target=x86_64-pc-windows-msvc \ // RUN: -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \ // RUN: --hip-device-lib=lib1.bc --hip-device-lib=lib2.bc \ // RUN: 
--hip-device-lib-path=%S/Inputs/hip_multiple_inputs/lib1 \ @@ -23,31 +23,15 @@ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck -check-prefixes=CHECK,MSVC %s -// check HIP fatbin and gpubin handle symbols and code object alignment in dumped llvm-mc input -// CHECK: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID1:[0-9a-f]+]] -// CHECK: Found undefined HIP fatbin symbol: __hip_fatbin_[[ID2:[0-9a-f]+]] -// CHECK: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID1]] -// CHECK: Found undefined HIP gpubin handle symbol: __hip_gpubin_handle_[[ID2]] -// LNX: .protected __hip_gpubin_handle_[[ID1]] -// LNX: .type __hip_gpubin_handle_[[ID1]] -// LNX-LABEL: .section .hip_gpubin_handle,"aw" -// MSVC-LABEL: .section .hip_gpubin_handle,"dw" -// CHECK: .globl __hip_gpubin_handle_[[ID1]] -// CHECK-NEXT: .p2align 3 -// CHECK-NEXT:__hip_gpubin_handle_[[ID1]]: -// CHECK-NEXT: .zero 8 -// CHECK-NEXT: .globl __hip_gpubin_handle_[[ID2]] -// CHECK-NEXT: .set __hip_gpubin_handle_[[ID2]],__hip_gpubin_handle_[[ID1]] -// LNX: .protected __hip_fatbin_[[ID1]] -// LNX: .type __hip_fatbin_[[ID1]],@object -// LNX-LABEL: .section .hip_fatbin,"a",@progbits -// MSVC-LABEL: .section .hip_fatbin,"dw" -// CHECK: .globl __hip_fatbin_[[ID1]] -// CHECK-NEXT: .p2align 12 -// CHECK-NEXT: .globl __hip_fatbin_[[ID2]] -// CHECK-NEXT: .set __hip_fatbin_[[ID2]],__hip_fatbin_[[ID1]] -// CHECK-NEXT: __hip_fatbin_[[ID1]]: -// CHECK-NEXT: .incbin "[[BUNDLE:.*hipfb]]" +// check code object alignment in dumped llvm-mc input +// LNX: .protected __hip_fatbin +// LNX: .type __hip_fatbin,@object +// LNX: .section .hip_fatbin,"a",@progbits +// MSVC: .section .hip_fatbin, "dw" +// CHECK: .globl __hip_fatbin +// CHECK: .p2align 12 +// CHECK: __hip_fatbin: +// CHECK: .incbin "[[BUNDLE:.*hipfb]]" // LNX: .section .note.GNU-stack, "", @progbits // MSVC-NOT: .note.GNU-stack From bbf3800adf1209d062ddb49555bb4f36120575ad Mon Sep 17 00:00:00 2001 From: sys_ce_bb Date: Thu, 29 Feb 2024 
16:42:21 -0800 Subject: [PATCH 546/546] Revert "Fix OpGroupNonUniformBroadcast version requirement (#2378)" This reverts commit 1d53bf42604a6dd6f5364746f878e1e6ec4288e5. --- .../lib/SPIRV/libSPIRV/SPIRVInstruction.h | 28 ------------------- .../test/transcoding/subgroup_spirv_1_5.cl | 17 ----------- 2 files changed, 45 deletions(-) delete mode 100644 llvm-spirv/test/transcoding/subgroup_spirv_1_5.cl diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h index ecfef98e53824..cfd71cfccc923 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVInstruction.h @@ -276,9 +276,6 @@ class SPIRVInstTemplateBase : public SPIRVInstruction { } else SPIRVEntry::setWordCount(WC); Ops = TheOps; - // The required SPIR-V version depends on the operands for some - // instructions. - updateModuleVersion(); } void setWordCount(SPIRVWord TheWordCount) override { SPIRVEntry::setWordCount(TheWordCount); @@ -298,11 +295,6 @@ class SPIRVInstTemplateBase : public SPIRVInstruction { /// Get operand as value. /// If the operand is a literal, return it as a uint32 constant. - const SPIRVValue *getOpValue(int I) const { - return isOperandLiteral(I) ? Module->getLiteralAsConstant(Ops[I]) - : getValue(Ops[I]); - } - SPIRVValue *getOpValue(int I) { return isOperandLiteral(I) ? 
Module->getLiteralAsConstant(Ops[I]) : getValue(Ops[I]); @@ -323,10 +315,6 @@ class SPIRVInstTemplateBase : public SPIRVInstruction { return Operands; } - virtual const SPIRVValue *getOperand(unsigned I) const { - return getOpValue(I); - } - virtual SPIRVValue *getOperand(unsigned I) { return getOpValue(I); } @@ -2515,22 +2503,6 @@ class SPIRVGroupNonUniformBallotInst : public SPIRVInstTemplateBase { SPIRVCapVec getRequiredCapability() const override { return getVec(CapabilityGroupNonUniformBallot); } - - SPIRVWord getRequiredSPIRVVersion() const override { - switch (OpCode) { - case OpGroupNonUniformBroadcast: { - assert(Ops.size() == 3 && "Expecting (Execution, Value, Id) operands"); - if (!isConstantOpCode(getOperand(2)->getOpCode())) { - // Before version 1.5, Id must come from a constant instruction. - return static_cast(VersionNumber::SPIRV_1_5); - } - break; - } - default: - break; - } - return static_cast(VersionNumber::SPIRV_1_3); - } }; #define _SPIRV_OP(x, ...) \ diff --git a/llvm-spirv/test/transcoding/subgroup_spirv_1_5.cl b/llvm-spirv/test/transcoding/subgroup_spirv_1_5.cl deleted file mode 100644 index be7dbf59a49d6..0000000000000 --- a/llvm-spirv/test/transcoding/subgroup_spirv_1_5.cl +++ /dev/null @@ -1,17 +0,0 @@ -// RUN: %clang_cc1 -triple spir-unknown-unknown -O1 -cl-std=CL2.0 -fdeclare-opencl-builtins -finclude-default-header -emit-llvm-bc %s -o %t.bc -// RUN: not llvm-spirv --spirv-max-version=1.4 %t.bc 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s -// RUN: llvm-spirv %t.bc -o %t.spv -// RUN: spirv-val %t.spv -// RUN: llvm-spirv -r %t.spv -o %t.rev.bc -// RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM - -// Before SPIR-V 1.5, the Id operand of OpGroupNonUniformBroadcast must come from a constant instruction. 
-// CHECK-ERROR: RequiresVersion: Cannot fulfill SPIR-V version restriction: -// CHECK-ERROR-NEXT: SPIR-V version was restricted to at most 1.4 (66560) but a construct from the input requires SPIR-V version 1.5 (66816) or above - -// CHECK-LLVM-LABEL: @test -// CHECK-LLVM: call spir_func i16 @_Z31sub_group_non_uniform_broadcasttj(i16 %a, i32 %id) - -kernel void test(short a, uint id, global short *res) { - res[0] = sub_group_non_uniform_broadcast(a, id); -}

**, uint64_t) { \ diff --git a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp index 0bc90b281b7ce..731abcbbf1f39 100644 --- a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp +++ b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp @@ -52,7 +52,6 @@ #include "mlir/ExecutionEngine/SparseTensor/ArithmeticUtils.h" #include "mlir/ExecutionEngine/SparseTensor/COO.h" -#include "mlir/ExecutionEngine/SparseTensor/ErrorHandling.h" #include "mlir/ExecutionEngine/SparseTensor/File.h" #include "mlir/ExecutionEngine/SparseTensor/Storage.h" @@ -139,8 +138,8 @@ extern "C" { return ptr; \ } \ } \ - MLIR_SPARSETENSOR_FATAL("unknown action: %d\n", \ - static_cast(action)); \ + fprintf(stderr, "unknown action %d\n", static_cast(action)); \ + exit(1); \ } #define CASE_SECSAME(p, v, P, V) CASE(p, p, v, P, P, V) @@ -283,10 +282,10 @@ void *_mlir_ciface_newSparseTensor( // NOLINT CASE_SECSAME(OverheadType::kU64, PrimaryType::kC32, uint64_t, complex32); // Unsupported case (add above if needed). 
- MLIR_SPARSETENSOR_FATAL( - "unsupported combination of types: \n", - static_cast(posTp), static_cast(crdTp), - static_cast(valTp)); + fprintf(stderr, "unsupported combination of types: \n", + static_cast(posTp), static_cast(crdTp), + static_cast(valTp)); + exit(1); } #undef CASE #undef CASE_SECSAME @@ -468,8 +467,10 @@ char *getTensorFilename(index_type id) { char var[bufSize]; snprintf(var, bufSize, "TENSOR%" PRIu64, id); char *env = getenv(var); - if (!env) - MLIR_SPARSETENSOR_FATAL("Environment variable %s is not set\n", var); + if (!env) { + fprintf(stderr, "Environment variable %s is not set\n", var); + exit(1); + } return env; } diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 853d136d9478f..59ee03d9a3213 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -9300,7 +9300,6 @@ cc_library( hdrs = [ "include/mlir/ExecutionEngine/SparseTensor/ArithmeticUtils.h", "include/mlir/ExecutionEngine/SparseTensor/COO.h", - "include/mlir/ExecutionEngine/SparseTensor/ErrorHandling.h", "include/mlir/ExecutionEngine/SparseTensor/File.h", "include/mlir/ExecutionEngine/SparseTensor/MapRef.h", "include/mlir/ExecutionEngine/SparseTensor/Storage.h", From ae91a427ac8f9fc7368ec052995cec6a6aeb8ea8 Mon Sep 17 00:00:00 2001 From: Timothy Herchen Date: Fri, 23 Feb 2024 12:49:05 -0800 Subject: [PATCH 208/546] [X86][MC] Reject out-of-range control and debug registers encoded with APX (#82584) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #82557. APX specification states that the high bits found in REX2 used to encode GPRs can also be used to encode control and debug registers, although all of them will #UD. Therefore, when disassembling we reject attempts to create control or debug registers with a value of 16 or more. 
See page 22 of the [specification](https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html): > Note that the R, X and B register identifiers can also address non-GPR register types, such as vector registers, control registers and debug registers. When any of them does, the highest-order bits REX2.R4, REX2.X4 or REX2.B4 are generally ignored, except when the register being addressed is a control or debug register. [...] The exception is that REX2.R4 and REX2.R3 [*sic*] are not ignored when the R register identifier addresses a control or debug register. Furthermore, if any attempt is made to access a non-existent control register (CR*) or debug register (DR*) using the REX2 prefix and one of the following instructions: “MOV CR*, r64”, “MOV r64, CR*”, “MOV DR*, r64”, “MOV r64, DR*”. #UD is raised. The invalid encodings are 64-bit only because `0xd5` is a valid instruction in 32-bit mode. --- llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp | 4 ++++ llvm/test/MC/Disassembler/X86/x86-64-err.txt | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index 5f85261361066..dbc2cef39d868 100644 --- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -819,8 +819,12 @@ static int readModRM(struct InternalInstruction *insn) { *valid = 0; \ return prefix##_ES + (index & 7); \ case TYPE_DEBUGREG: \ + if (index > 15) \ + *valid = 0; \ return prefix##_DR0 + index; \ case TYPE_CONTROLREG: \ + if (index > 15) \ + *valid = 0; \ return prefix##_CR0 + index; \ case TYPE_MVSIBX: \ return prefix##_XMM0 + index; \ diff --git a/llvm/test/MC/Disassembler/X86/x86-64-err.txt b/llvm/test/MC/Disassembler/X86/x86-64-err.txt index 3eca239e60f5c..2d6c3e86ceaba 100644 --- a/llvm/test/MC/Disassembler/X86/x86-64-err.txt +++ b/llvm/test/MC/Disassembler/X86/x86-64-err.txt @@ 
-5,6 +5,10 @@ # 32: into 0xce +# 64: invalid instruction encoding +0xd5,0xc5,0x20,0xef +# 64: invalid instruction encoding +0xd5,0xc5,0x21,0xef # 64: invalid instruction encoding 0xc4,0x62,0xf9,0x18,0x20 # 64: invalid instruction encoding From 99f31bab86c53ed5094f57ff8a05a6ea2c8e0c38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 23 Feb 2024 12:56:24 -0800 Subject: [PATCH 209/546] [flang][cuda] Fix semantic for the CONSTANT attribute (#82821) Object with the CONSTANT attribute cannot be declared in the host subprogram. It can be declared in a module or a device subprogram. Adapt the semantic check to trigger the error in host subprogram. --- flang/lib/Semantics/check-declarations.cpp | 7 ++++++- flang/test/Lower/CUDA/cuda-data-attribute.cuf | 10 ---------- flang/test/Semantics/cuf03.cuf | 6 ++++++ 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index 2db3f9a27d8f4..e9adc086402d6 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -920,7 +920,12 @@ void CheckHelper::CheckObjectEntity( auto attr{*details.cudaDataAttr()}; switch (attr) { case common::CUDADataAttr::Constant: - if (IsAllocatableOrPointer(symbol) || symbol.attrs().test(Attr::TARGET)) { + if (subpDetails && !inDeviceSubprogram) { + messages_.Say( + "Object '%s' with ATTRIBUTES(CONSTANT) may not be declared in a host subprogram"_err_en_US, + symbol.name()); + } else if (IsAllocatableOrPointer(symbol) || + symbol.attrs().test(Attr::TARGET)) { messages_.Say( "Object '%s' with ATTRIBUTES(CONSTANT) may not be allocatable, pointer, or target"_err_en_US, symbol.name()); diff --git a/flang/test/Lower/CUDA/cuda-data-attribute.cuf b/flang/test/Lower/CUDA/cuda-data-attribute.cuf index 
7596c6b21efb0..94aa62352c2a0 100644 --- a/flang/test/Lower/CUDA/cuda-data-attribute.cuf +++ b/flang/test/Lower/CUDA/cuda-data-attribute.cuf @@ -16,30 +16,20 @@ module cuda_var contains subroutine local_var_attrs - real, constant :: rc real, device :: rd real, allocatable, managed :: rm real, allocatable, pinned :: rp end subroutine ! CHECK-LABEL: func.func @_QMcuda_varPlocal_var_attrs() -! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFlocal_var_attrsErc"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFlocal_var_attrsErd"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMcuda_varFlocal_var_attrsErm"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMcuda_varFlocal_var_attrsErp"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) -! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFlocal_var_attrsErc"} : (!fir.ref) -> !fir.ref ! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFlocal_var_attrsErd"} : (!fir.ref) -> !fir.ref ! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMcuda_varFlocal_var_attrsErm"} : (!fir.ref>>) -> !fir.ref>> ! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMcuda_varFlocal_var_attrsErp"} : (!fir.ref>>) -> !fir.ref>> -subroutine dummy_arg_constant(dc) - real, constant :: dc -end subroutine -! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_constant( -! CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "dc", fir.cuda_attr = #fir.cuda} -! 
CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] {cuda_attr = #fir.cuda, uniq_name = "_QMcuda_varFdummy_arg_constantEdc"} : (!fir.ref) -> (!fir.ref, !fir.ref) - subroutine dummy_arg_device(dd) real, device :: dd end subroutine diff --git a/flang/test/Semantics/cuf03.cuf b/flang/test/Semantics/cuf03.cuf index bebfdadbdbb16..4260d7edd626f 100644 --- a/flang/test/Semantics/cuf03.cuf +++ b/flang/test/Semantics/cuf03.cuf @@ -55,5 +55,11 @@ module m real, managed :: ma(n) ! ok !WARNING: Pointer 'dp' may not be associated in a device subprogram real, device, pointer :: dp + real, constant :: rc ! ok + end subroutine + + subroutine host() + !ERROR: Object 'rc' with ATTRIBUTES(CONSTANT) may not be declared in a host subprogram + real, constant :: rc end subroutine end module From 5c90527b436d1c7d753c187aff1b45e6c8da1af6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 23 Feb 2024 12:56:41 -0800 Subject: [PATCH 210/546] [flang][cuda] Allow object with SHARED attribute as definable (#82822) A semantic error was raised in device subprogram like: ``` attributes(global) subroutine devsubr2() real, shared :: rs rs = 1 end subroutine ``` Object with the SHARED attribute can be can be read or written by all threads in the block. 
https://docs.nvidia.com/hpc-sdk/archive/24.1/compilers/cuda-fortran-prog-guide/index.html#cfpg-var-qual-attr-shared --- flang/lib/Semantics/definable.cpp | 5 +++-- flang/test/Semantics/cuf03.cuf | 7 +++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/flang/lib/Semantics/definable.cpp b/flang/lib/Semantics/definable.cpp index 2c57efbb40cd1..5c3fa905d6072 100644 --- a/flang/lib/Semantics/definable.cpp +++ b/flang/lib/Semantics/definable.cpp @@ -160,9 +160,10 @@ static std::optional WhyNotDefinableBase(parser::CharBlock at, "'%s' is a host-associated allocatable and is not definable in a device subprogram"_err_en_US, original); } else if (*cudaDataAttr != common::CUDADataAttr::Device && - *cudaDataAttr != common::CUDADataAttr::Managed) { + *cudaDataAttr != common::CUDADataAttr::Managed && + *cudaDataAttr != common::CUDADataAttr::Shared) { return BlameSymbol(at, - "'%s' is not device or managed data and is not definable in a device subprogram"_err_en_US, + "'%s' is not device or managed or shared data and is not definable in a device subprogram"_err_en_US, original); } } else if (!isOwnedByDeviceCode) { diff --git a/flang/test/Semantics/cuf03.cuf b/flang/test/Semantics/cuf03.cuf index 4260d7edd626f..41bfbb7678136 100644 --- a/flang/test/Semantics/cuf03.cuf +++ b/flang/test/Semantics/cuf03.cuf @@ -62,4 +62,11 @@ module m !ERROR: Object 'rc' with ATTRIBUTES(CONSTANT) may not be declared in a host subprogram real, constant :: rc end subroutine + + attributes(global) subroutine devsubr2() + real, shared :: rs + + rs = 1 ! 
ok + end subroutine + end module From 47aee8b56d65e2bac5c7128424ff06134e454d83 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 23 Feb 2024 15:27:47 -0600 Subject: [PATCH 211/546] =?UTF-8?q?[flang][OpenMP]=20Set=20OpenMP=20attrib?= =?UTF-8?q?utes=20in=20MLIR=20module=20in=20bbc=20before=20lo=E2=80=A6=20(?= =?UTF-8?q?#82774)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …wering Right now attributes like OpenMP version or target attributes for offload are set after lowering in bbc. The flang frontend sets them before lowering, making them available in the lowering process. This change sets them before lowering in bbc as well. --- flang/tools/bbc/bbc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index e701fdeb227a4..73d740ff43942 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -360,7 +360,6 @@ static mlir::LogicalResult convertFortranSourceToMLIR( semanticsContext.targetCharacteristics(), parsing.allCooked(), targetTriple, kindMap, loweringOptions, envDefaults, semanticsContext.languageFeatures(), targetMachine); - burnside.lower(parseTree, semanticsContext); mlir::ModuleOp mlirModule = burnside.getModule(); if (enableOpenMP) { if (enableOpenMPGPU && !enableOpenMPDevice) { @@ -376,6 +375,7 @@ static mlir::LogicalResult convertFortranSourceToMLIR( setOffloadModuleInterfaceAttributes(mlirModule, offloadModuleOpts); setOpenMPVersionAttribute(mlirModule, setOpenMPVersion); } + burnside.lower(parseTree, semanticsContext); std::error_code ec; std::string outputName = outputFilename; if (!outputName.size()) From dcf4ca558ce5c323ce9b0af93acc0c832024eb3c Mon Sep 17 00:00:00 2001 From: agozillon Date: Fri, 23 Feb 2024 22:59:41 +0100 Subject: [PATCH 212/546] [OpenMP][MLIR][OMPIRBuilder] Add a small optional constant alloca raise function pass to finalize, utilised in convertTarget (#78818) This patch seeks to add a mechanism 
to raise constant (not ConstantExpr or runtime/dynamic) sized allocations into the entry block for select functions that have been inserted into a list for processing. This processing occurs during the finalize call, after OutlinedInfo regions have completed. This currently has only been utilised for createOutlinedFunction, which is triggered for TargetOp generation in the OpenMP MLIR dialect lowering to LLVM-IR. This currently is required for Target kernels generated by createOutlinedFunction to avoid subsequent optimization passes doing some unintentional malformed optimizations for AMD kernels (unsure if it occurs for other vendors). If the allocas are generated inside of the kernel and are not in the entry block and are subsequently passed to a function this can lead to required instructions being erased or manipulated in a way that causes the kernel to run into a HSA access error. This fix is related to a series of problems found in: https://github.com/llvm/llvm-project/issues/74603 This problem primarily presents itself for Flang's HLFIR AssignOp currently, when utilised with a scalar temporary constant on the RHS and a descriptor type on the LHS. It will generate a call to a runtime function, wrap the RHS temporary in a newly allocated descriptor (an llvm struct), and pass both the LHS and RHS descriptor into the runtime function call. This will currently be embedded into the middle of the target region in the user entry block, which means the allocas are also embedded in the middle, which seems to pose issues when later passes are executed. This issue may present itself in other HLFIR operations or unrelated operations that generate allocas as a by product, but for the moment, this one test case is the only scenario I've found this problem. Perhaps this is not the appropriate fix, I am very open to other suggestions, I've tried a few others (at varying levels of the flang/mlir compiler flow), but this one is the smallest and least intrusive change set. 
The other two, that come to mind (but I've not fully looked into, the former I tried a little with blocks but it had a few issues I'd need to think through): - Having a proper alloca only block (or region) generated for TargetOps that we could merge into the entry block that's generated by convertTarget's createOutlinedFunction. - Or diverging a little from Clang's current target generation and using the CodeExtractor to generate the user code as an outlined function region invoked from the kernel we make, with our kernel arguments passed into it. Similar to the current parallel generation. I am not sure how well this would intermingle with the existing parallel generation though that's layered in. Both of these methods seem like quite a divergence from the current status quo, which I am not entirely sure is merited for the small test this change aims to fix. --- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 5 + llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 51 ++++++ .../Frontend/OpenMPIRBuilderTest.cpp | 150 ++++++++++++++++++ .../omptarget-constant-alloca-raise.mlir | 43 +++++ 4 files changed, 249 insertions(+) create mode 100644 mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 4cca4f4d6bc1c..589a9066ac57a 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1506,6 +1506,11 @@ class OpenMPIRBuilder { /// Collection of regions that need to be outlined during finalization. SmallVector OutlineInfos; + /// A collection of candidate target functions that's constant allocas will + /// attempt to be raised on a call of finalize after all currently enqueued + /// outline info's have been processed. + SmallVector ConstantAllocaRaiseCandidates; + /// Collection of owned canonical loop objects that eventually need to be /// free'd. 
std::forward_list LoopInfos; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 2e96772772e6e..09f59c81123ea 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -633,6 +633,29 @@ Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) { void OpenMPIRBuilder::initialize() { initializeTypes(M); } +static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, + Function *Function) { + BasicBlock &EntryBlock = Function->getEntryBlock(); + Instruction *MoveLocInst = EntryBlock.getFirstNonPHI(); + + // Loop over blocks looking for constant allocas, skipping the entry block + // as any allocas there are already in the desired location. + for (auto Block = std::next(Function->begin(), 1); Block != Function->end(); + Block++) { + for (auto Inst = Block->getReverseIterator()->begin(); + Inst != Block->getReverseIterator()->end();) { + if (auto *AllocaInst = dyn_cast_if_present(Inst)) { + Inst++; + if (!isa(AllocaInst->getArraySize())) + continue; + AllocaInst->moveBeforePreserving(MoveLocInst); + } else { + Inst++; + } + } + } +} + void OpenMPIRBuilder::finalize(Function *Fn) { SmallPtrSet ParallelRegionBlockSet; SmallVector Blocks; @@ -737,6 +760,28 @@ void OpenMPIRBuilder::finalize(Function *Fn) { // Remove work items that have been completed. OutlineInfos = std::move(DeferredOutlines); + // The createTarget functions embeds user written code into + // the target region which may inject allocas which need to + // be moved to the entry block of our target or risk malformed + // optimisations by later passes, this is only relevant for + // the device pass which appears to be a little more delicate + // when it comes to optimisations (however, we do not block on + // that here, it's up to the inserter to the list to do so). 
+ // This notbaly has to occur after the OutlinedInfo candidates + // have been extracted so we have an end product that will not + // be implicitly adversely affected by any raises unless + // intentionally appended to the list. + // NOTE: This only does so for ConstantData, it could be extended + // to ConstantExpr's with further effort, however, they should + // largely be folded when they get here. Extending it to runtime + // defined/read+writeable allocation sizes would be non-trivial + // (need to factor in movement of any stores to variables the + // allocation size depends on, as well as the usual loads, + // otherwise it'll yield the wrong result after movement) and + // likely be more suitable as an LLVM optimisation pass. + for (Function *F : ConstantAllocaRaiseCandidates) + raiseUserConstantDataAllocasToEntryBlock(Builder, F); + EmitMetadataErrorReportFunctionTy &&ErrorReportFn = [](EmitMetadataErrorKind Kind, const TargetRegionEntryInfo &EntryInfo) -> void { @@ -5043,6 +5088,12 @@ static Function *createOutlinedFunction( BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock(); + // As we embed the user code in the middle of our target region after we + // generate entry code, we must move what allocas we can into the entry + // block to avoid possible breaking optimisations for device + if (OMPBuilder.Config.isTargetDevice()) + OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func); + // Insert target deinit call in the device compilation pass. 
Builder.restoreIP(CBFunc(Builder.saveIP(), Builder.saveIP())); if (OMPBuilder.Config.isTargetDevice()) diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 7b54ae675f0c1..d923b25fda9f9 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -5989,6 +5989,156 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { EXPECT_TRUE(isa(ExitBlock->getFirstNonPHI())); } +TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.setConfig( + OpenMPIRBuilderConfig(true, false, false, false, false, false, false)); + OMPBuilder.initialize(); + + F->setName("func"); + IRBuilder<> Builder(BB); + OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL}); + + LoadInst *Value = nullptr; + StoreInst *TargetStore = nullptr; + llvm::SmallVector CapturedArgs = { + Constant::getNullValue(PointerType::get(Ctx, 0))}; + + auto SimpleArgAccessorCB = + [&](llvm::Argument &Arg, llvm::Value *Input, llvm::Value *&RetVal, + llvm::OpenMPIRBuilder::InsertPointTy AllocaIP, + llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) { + if (!OMPBuilder.Config.isTargetDevice()) { + RetVal = cast(&Arg); + return CodeGenIP; + } + + Builder.restoreIP(AllocaIP); + + llvm::Value *Addr = Builder.CreateAlloca( + Arg.getType()->isPointerTy() + ? 
Arg.getType() + : Type::getInt64Ty(Builder.getContext()), + OMPBuilder.M.getDataLayout().getAllocaAddrSpace()); + llvm::Value *AddrAscast = + Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, Input->getType()); + Builder.CreateStore(&Arg, AddrAscast); + + Builder.restoreIP(CodeGenIP); + + RetVal = Builder.CreateLoad(Arg.getType(), AddrAscast); + + return Builder.saveIP(); + }; + + llvm::OpenMPIRBuilder::MapInfosTy CombinedInfos; + auto GenMapInfoCB = [&](llvm::OpenMPIRBuilder::InsertPointTy codeGenIP) + -> llvm::OpenMPIRBuilder::MapInfosTy & { + CreateDefaultMapInfos(OMPBuilder, CapturedArgs, CombinedInfos); + return CombinedInfos; + }; + + llvm::Value *RaiseAlloca = nullptr; + + auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP, + OpenMPIRBuilder::InsertPointTy CodeGenIP) + -> OpenMPIRBuilder::InsertPointTy { + Builder.restoreIP(CodeGenIP); + RaiseAlloca = Builder.CreateAlloca(Builder.getInt32Ty()); + Value = Builder.CreateLoad(Type::getInt32Ty(Ctx), CapturedArgs[0]); + TargetStore = Builder.CreateStore(Value, RaiseAlloca); + return Builder.saveIP(); + }; + + IRBuilder<>::InsertPoint EntryIP(&F->getEntryBlock(), + F->getEntryBlock().getFirstInsertionPt()); + TargetRegionEntryInfo EntryInfo("parent", /*DeviceID=*/1, /*FileID=*/2, + /*Line=*/3, /*Count=*/0); + + Builder.restoreIP( + OMPBuilder.createTarget(Loc, EntryIP, EntryIP, EntryInfo, /*NumTeams=*/-1, + /*NumThreads=*/0, CapturedArgs, GenMapInfoCB, + BodyGenCB, SimpleArgAccessorCB)); + + Builder.CreateRetVoid(); + OMPBuilder.finalize(); + + // Check outlined function + EXPECT_FALSE(verifyModule(*M, &errs())); + EXPECT_NE(TargetStore, nullptr); + Function *OutlinedFn = TargetStore->getFunction(); + EXPECT_NE(F, OutlinedFn); + + EXPECT_TRUE(OutlinedFn->hasWeakODRLinkage()); + // Account for the "implicit" first argument. 
+ EXPECT_EQ(OutlinedFn->getName(), "__omp_offloading_1_2_parent_l3"); + EXPECT_EQ(OutlinedFn->arg_size(), 2U); + EXPECT_TRUE(OutlinedFn->getArg(1)->getType()->isPointerTy()); + + // Check entry block, to see if we have raised our alloca + // from the body to the entry block. + auto &EntryBlock = OutlinedFn->getEntryBlock(); + + // Check that we have moved our alloca created in the + // BodyGenCB function, to the top of the function. + Instruction *Alloca1 = EntryBlock.getFirstNonPHI(); + EXPECT_NE(Alloca1, nullptr); + EXPECT_TRUE(isa(Alloca1)); + EXPECT_EQ(Alloca1, RaiseAlloca); + + // Verify we have not altered the rest of the function + // inappropriately with our alloca movement. + auto *Alloca2 = Alloca1->getNextNode(); + EXPECT_TRUE(isa(Alloca2)); + auto *Store2 = Alloca2->getNextNode(); + EXPECT_TRUE(isa(Store2)); + + auto *InitCall = dyn_cast(Store2->getNextNode()); + EXPECT_NE(InitCall, nullptr); + EXPECT_EQ(InitCall->getCalledFunction()->getName(), "__kmpc_target_init"); + EXPECT_EQ(InitCall->arg_size(), 2U); + EXPECT_TRUE(isa(InitCall->getArgOperand(0))); + auto *KernelEnvGV = cast(InitCall->getArgOperand(0)); + EXPECT_TRUE(isa(KernelEnvGV->getInitializer())); + auto *KernelEnvC = cast(KernelEnvGV->getInitializer()); + EXPECT_TRUE(isa(KernelEnvC->getAggregateElement(0U))); + auto *ConfigC = cast(KernelEnvC->getAggregateElement(0U)); + EXPECT_EQ(ConfigC->getAggregateElement(0U), + ConstantInt::get(Type::getInt8Ty(Ctx), true)); + EXPECT_EQ(ConfigC->getAggregateElement(1U), + ConstantInt::get(Type::getInt8Ty(Ctx), true)); + EXPECT_EQ(ConfigC->getAggregateElement(2U), + ConstantInt::get(Type::getInt8Ty(Ctx), OMP_TGT_EXEC_MODE_GENERIC)); + + auto *EntryBlockBranch = EntryBlock.getTerminator(); + EXPECT_NE(EntryBlockBranch, nullptr); + EXPECT_EQ(EntryBlockBranch->getNumSuccessors(), 2U); + + // Check user code block + auto *UserCodeBlock = EntryBlockBranch->getSuccessor(0); + EXPECT_EQ(UserCodeBlock->getName(), "user_code.entry"); + auto *Load1 = 
UserCodeBlock->getFirstNonPHI(); + EXPECT_TRUE(isa(Load1)); + auto *Load2 = Load1->getNextNode(); + EXPECT_TRUE(isa(Load2)); + EXPECT_EQ(Load2, Value); + EXPECT_EQ(Load2->getNextNode(), TargetStore); + auto *Deinit = TargetStore->getNextNode(); + EXPECT_NE(Deinit, nullptr); + + auto *DeinitCall = dyn_cast(Deinit); + EXPECT_NE(DeinitCall, nullptr); + EXPECT_EQ(DeinitCall->getCalledFunction()->getName(), "__kmpc_target_deinit"); + EXPECT_EQ(DeinitCall->arg_size(), 0U); + + EXPECT_TRUE(isa(DeinitCall->getNextNode())); + + // Check exit block + auto *ExitBlock = EntryBlockBranch->getSuccessor(1); + EXPECT_EQ(ExitBlock->getName(), "worker.exit"); + EXPECT_TRUE(isa(ExitBlock->getFirstNonPHI())); +} + TEST_F(OpenMPIRBuilderTest, CreateTask) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M); diff --git a/mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir b/mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir new file mode 100644 index 0000000000000..be521fbe1f01a --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir @@ -0,0 +1,43 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// A small condensed version of a problem requiring constant alloca raising in +// Target Region Entries for user injected code, found in an issue in the Flang +// compiler. Certain LLVM IR optimisation passes will perform runtime breaking +// transformations on allocations not found to be in the entry block, current +// OpenMP dialect lowering of TargetOp's will inject user allocations after +// compiler generated entry code, in a seperate block, this test checks that +// a small function which attempts to raise some of these (specifically +// constant sized) allocations performs its task reasonably in these +// scenarios. 
+ +module attributes {omp.is_target_device = true} { + llvm.func @_QQmain() attributes {omp.declare_target = #omp.declaretarget} { + %1 = llvm.mlir.constant(1 : i64) : i64 + %2 = llvm.alloca %1 x !llvm.struct<(ptr)> : (i64) -> !llvm.ptr + %3 = omp.map_info var_ptr(%2 : !llvm.ptr, !llvm.struct<(ptr)>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr + omp.target map_entries(%3 -> %arg0 : !llvm.ptr) { + ^bb0(%arg0: !llvm.ptr): + %4 = llvm.mlir.constant(1 : i32) : i32 + %5 = llvm.alloca %4 x !llvm.struct<(ptr)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %6 = llvm.mlir.constant(50 : i32) : i32 + %7 = llvm.mlir.constant(1 : i64) : i64 + %8 = llvm.alloca %7 x i32 : (i64) -> !llvm.ptr + llvm.store %6, %8 : i32, !llvm.ptr + %9 = llvm.mlir.undef : !llvm.struct<(ptr)> + %10 = llvm.insertvalue %8, %9[0] : !llvm.struct<(ptr)> + llvm.store %10, %5 : !llvm.struct<(ptr)>, !llvm.ptr + %88 = llvm.call @_ExternalCall(%arg0, %5) : (!llvm.ptr, !llvm.ptr) -> !llvm.struct<()> + omp.terminator + } + llvm.return + } + llvm.func @_ExternalCall(!llvm.ptr, !llvm.ptr) -> !llvm.struct<()> +} + +// CHECK: define weak_odr protected void @{{.*}}QQmain_l{{.*}}({{.*}}, {{.*}}) { +// CHECK-NEXT: entry: +// CHECK-NEXT: %[[MOVED_ALLOCA1:.*]] = alloca { ptr }, align 8 +// CHECK-NEXT: %[[MOVED_ALLOCA2:.*]] = alloca i32, i64 1, align 4 +// CHECK-NEXT: %[[MAP_ARG_ALLOCA:.*]] = alloca ptr, align 8 + +// CHECK: user_code.entry: ; preds = %entry From 25940956e68ec82d841e5748565e7250580e1d36 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 23 Feb 2024 14:00:15 -0800 Subject: [PATCH 213/546] Replace ArchSpec::PiecewiseCompare() with Triple::operator==() (#82804) Looking ast the definition of both functions this is *almost* an NFC change, except that Triple also looks at the SubArch (important) and ObjectFormat (less so). This fixes a bug that only manifests with how Xcode uses the SBAPI to attach to a process by name: it guesses the architecture based on the system. 
If the system is arm64 and the Process is arm64e Target fails to update the triple because it deemed the two to be equivalent. rdar://123338218 --- lldb/include/lldb/Utility/ArchSpec.h | 5 ---- lldb/source/Target/Target.cpp | 8 +----- lldb/source/Utility/ArchSpec.cpp | 17 ----------- lldb/test/API/macosx/arm64e-attach/Makefile | 2 ++ .../macosx/arm64e-attach/TestArm64eAttach.py | 28 +++++++++++++++++++ lldb/test/API/macosx/arm64e-attach/main.c | 2 ++ 6 files changed, 33 insertions(+), 29 deletions(-) create mode 100644 lldb/test/API/macosx/arm64e-attach/Makefile create mode 100644 lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py create mode 100644 lldb/test/API/macosx/arm64e-attach/main.c diff --git a/lldb/include/lldb/Utility/ArchSpec.h b/lldb/include/lldb/Utility/ArchSpec.h index a226a3a5a9b71..50830b889b911 100644 --- a/lldb/include/lldb/Utility/ArchSpec.h +++ b/lldb/include/lldb/Utility/ArchSpec.h @@ -505,11 +505,6 @@ class ArchSpec { bool IsFullySpecifiedTriple() const; - void PiecewiseTripleCompare(const ArchSpec &other, bool &arch_different, - bool &vendor_different, bool &os_different, - bool &os_version_different, - bool &env_different) const; - /// Detect whether this architecture uses thumb code exclusively /// /// Some embedded ARM chips (e.g. 
the ARM Cortex M0-7 line) can only execute diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index e17bfcb5d5e2a..e982a30a3ae4f 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -1568,14 +1568,8 @@ bool Target::SetArchitecture(const ArchSpec &arch_spec, bool set_platform, if (m_arch.GetSpec().IsCompatibleMatch(other)) { compatible_local_arch = true; - bool arch_changed, vendor_changed, os_changed, os_ver_changed, - env_changed; - m_arch.GetSpec().PiecewiseTripleCompare(other, arch_changed, - vendor_changed, os_changed, - os_ver_changed, env_changed); - - if (!arch_changed && !vendor_changed && !os_changed && !env_changed) + if (m_arch.GetSpec().GetTriple() == other.GetTriple()) replace_local_arch = false; } } diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index fb0e985a0d565..07ef435ef451d 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -1421,23 +1421,6 @@ bool ArchSpec::IsFullySpecifiedTriple() const { return true; } -void ArchSpec::PiecewiseTripleCompare( - const ArchSpec &other, bool &arch_different, bool &vendor_different, - bool &os_different, bool &os_version_different, bool &env_different) const { - const llvm::Triple &me(GetTriple()); - const llvm::Triple &them(other.GetTriple()); - - arch_different = (me.getArch() != them.getArch()); - - vendor_different = (me.getVendor() != them.getVendor()); - - os_different = (me.getOS() != them.getOS()); - - os_version_different = (me.getOSMajorVersion() != them.getOSMajorVersion()); - - env_different = (me.getEnvironment() != them.getEnvironment()); -} - bool ArchSpec::IsAlwaysThumbInstructions() const { std::string Status; if (GetTriple().getArch() == llvm::Triple::arm || diff --git a/lldb/test/API/macosx/arm64e-attach/Makefile b/lldb/test/API/macosx/arm64e-attach/Makefile new file mode 100644 index 0000000000000..c9319d6e6888a --- /dev/null +++ 
b/lldb/test/API/macosx/arm64e-attach/Makefile @@ -0,0 +1,2 @@ +C_SOURCES := main.c +include Makefile.rules diff --git a/lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py b/lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py new file mode 100644 index 0000000000000..0dc8700ed02dd --- /dev/null +++ b/lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py @@ -0,0 +1,28 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestArm64eAttach(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + # On Darwin systems, arch arm64e means ARMv8.3 with ptrauth ABI used. + @skipIf(archs=no_match(["arm64e"])) + def test(self): + # Skip this test if not running on AArch64 target that supports PAC + if not self.isAArch64PAuth(): + self.skipTest("Target must support pointer authentication.") + self.build() + popen = self.spawnSubprocess(self.getBuildArtifact(), []) + error = lldb.SBError() + # This simulates how Xcode attaches to a process by pid/name. 
+ target = self.dbg.CreateTarget("", "arm64", "", True, error) + listener = lldb.SBListener("my.attach.listener") + process = target.AttachToProcessWithID(listener, popen.pid, error) + self.assertSuccess(error) + self.assertTrue(process, PROCESS_IS_VALID) + self.assertEqual(target.GetTriple().split('-')[0], "arm64e", + "target triple is updated correctly") + error = process.Kill() + self.assertSuccess(error) diff --git a/lldb/test/API/macosx/arm64e-attach/main.c b/lldb/test/API/macosx/arm64e-attach/main.c new file mode 100644 index 0000000000000..7baf2ffd8f289 --- /dev/null +++ b/lldb/test/API/macosx/arm64e-attach/main.c @@ -0,0 +1,2 @@ +int getchar(); +int main() { return getchar(); } From 69c0b2febe01108f50db6e8ed21cd8b2e6088caf Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 23 Feb 2024 16:34:00 -0600 Subject: [PATCH 214/546] [libc][NFC] Remove all trailing spaces from libc (#82831) Summary: There are a lot of random trailing spaces on various lines. This patch just got rid of all of them with `sed 's/\ \+$//g'`.
--- libc/AOR_v20.02/string/arm/memchr.S | 8 +- libc/CMakeLists.txt | 2 +- libc/benchmarks/automemcpy/README.md | 4 +- .../cmake/modules/CheckCompilerFeatures.cmake | 2 +- .../modules/LLVMLibCCompileOptionRules.cmake | 2 +- libc/cmake/modules/LLVMLibCLibraryRules.cmake | 8 +- libc/cmake/modules/LLVMLibCObjectRules.cmake | 2 +- libc/cmake/modules/LLVMLibCTestRules.cmake | 14 +- libc/config/darwin/x86_64/entrypoints.txt | 2 +- libc/config/linux/x86_64/exclude.txt | 4 +- libc/config/windows/entrypoints.txt | 2 +- libc/docs/contributing.rst | 2 +- libc/docs/date_and_time.rst | 6 +- libc/docs/dev/clang_tidy_checks.rst | 2 +- libc/docs/dev/config_options.rst | 2 +- libc/docs/dev/printf_behavior.rst | 4 +- libc/docs/full_cross_build.rst | 2 +- libc/docs/gpu/motivation.rst | 30 +-- libc/docs/gpu/rpc.rst | 204 +++++++++--------- libc/docs/gpu/testing.rst | 4 +- libc/docs/gpu/using.rst | 18 +- libc/docs/libc_search.rst | 6 +- libc/docs/math/index.rst | 6 +- libc/docs/math/log.rst | 6 +- libc/docs/porting.rst | 2 +- libc/docs/stdio.rst | 2 +- libc/docs/strings.rst | 14 +- libc/include/fcntl.h.def | 2 +- libc/include/sched.h.def | 2 +- libc/include/spawn.h.def | 2 +- libc/spec/bsd_ext.td | 2 +- libc/spec/gnu_ext.td | 10 +- libc/spec/llvm_libc_ext.td | 2 +- libc/spec/posix.td | 4 +- libc/src/__support/HashTable/CMakeLists.txt | 2 +- libc/src/math/generic/CMakeLists.txt | 2 +- libc/src/search/hsearch/CMakeLists.txt | 2 +- libc/src/stdio/printf_core/CMakeLists.txt | 2 +- libc/src/stdio/scanf_core/CMakeLists.txt | 2 +- libc/src/stdlib/CMakeLists.txt | 6 +- libc/src/wchar/CMakeLists.txt | 2 +- libc/startup/linux/CMakeLists.txt | 6 +- libc/test/integration/scudo/CMakeLists.txt | 2 +- libc/test/src/__support/CMakeLists.txt | 2 +- libc/test/src/fenv/CMakeLists.txt | 2 +- .../math/differential_testing/CMakeLists.txt | 2 +- libc/test/src/math/smoke/CMakeLists.txt | 2 +- libc/test/utils/UnitTest/CMakeLists.txt | 4 +- libc/utils/MPFRWrapper/CMakeLists.txt | 8 +- 
libc/utils/gpu/server/CMakeLists.txt | 2 +- libc/utils/mathtools/GenerateHPDConstants.py | 24 +-- libc/utils/mathtools/ryu_tablegen.py | 2 +- 52 files changed, 229 insertions(+), 229 deletions(-) diff --git a/libc/AOR_v20.02/string/arm/memchr.S b/libc/AOR_v20.02/string/arm/memchr.S index a9bba052fb7ee..c03d471f61a6b 100644 --- a/libc/AOR_v20.02/string/arm/memchr.S +++ b/libc/AOR_v20.02/string/arm/memchr.S @@ -49,7 +49,7 @@ __memchr_arm: and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char cmp r2,#16 @ If it's short don't bother with anything clever - blt 20f + blt 20f tst r0, #7 @ If it's already aligned skip the next bit beq 10f @@ -62,7 +62,7 @@ __memchr_arm: beq 50f @ If it matches exit found tst r0, #7 bne 5b @ If not aligned yet then do next byte - + 10: @ At this point, we are aligned, we know we have at least 8 bytes to work with push {r4,r5,r6,r7} @@ -71,7 +71,7 @@ __memchr_arm: bic r4, r2, #7 @ Number of double words to work with mvns r7, #0 @ all F's movs r3, #0 - + 15: ldmia r0!,{r5,r6} subs r4, r4, #8 @@ -87,7 +87,7 @@ __memchr_arm: pop {r4,r5,r6,r7} and r1,r1,#0xff @ Get r1 back to a single character from the expansion above and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done - + 20: cbz r2, 40f @ 0 length or hit the end already then not found diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index 6a57fcec26e47..75fcc91757b80 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -162,7 +162,7 @@ elseif(LIBC_CMAKE_VERBOSE_LOGGING) message(STATUS "Path for config files is: ${LIBC_CONFIG_PATH}") endif() -# option(LIBC_ENABLE_WIDE_CHARACTERS +# option(LIBC_ENABLE_WIDE_CHARACTERS # "Whether to enable wide character functions on supported platforms. This may # also set flags to enable or disable wide character support within other # functions (e.g. printf)." 
ON) diff --git a/libc/benchmarks/automemcpy/README.md b/libc/benchmarks/automemcpy/README.md index 88d0b7ece9b9f..8583368993ef0 100644 --- a/libc/benchmarks/automemcpy/README.md +++ b/libc/benchmarks/automemcpy/README.md @@ -17,7 +17,7 @@ git clone https://github.com/Z3Prover/z3.git python scripts/mk_make.py --prefix= cd build make -j -make install +make install ``` ## Configuration @@ -68,7 +68,7 @@ Make sure to save the results of the benchmark as a json file. By default, each function is benchmarked for at least one second, here we lower it to 200ms. - `--benchmark_filter="BM_Memset|BM_Bzero"` - + By default, all functions are benchmarked, here we restrict them to `memset` and `bzero`. Other options might be useful, use `--help` for more information. diff --git a/libc/cmake/modules/CheckCompilerFeatures.cmake b/libc/cmake/modules/CheckCompilerFeatures.cmake index 9789d72f99dc4..c3f50df1dda53 100644 --- a/libc/cmake/modules/CheckCompilerFeatures.cmake +++ b/libc/cmake/modules/CheckCompilerFeatures.cmake @@ -57,7 +57,7 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES) if(${feature} STREQUAL "float128") set(LIBC_COMPILER_HAS_FLOAT128 TRUE) elseif(${feature} STREQUAL "fixed_point") - set(LIBC_COMPILER_HAS_FIXED_POINT TRUE) + set(LIBC_COMPILER_HAS_FIXED_POINT TRUE) endif() endif() endforeach() diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index 408e25b3469c0..c7ccd392354cb 100644 --- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -6,7 +6,7 @@ function(_get_compile_options_from_flags output_var) endif() check_flag(ADD_SSE4_2_FLAG ${ROUND_OPT_FLAG} ${flags}) check_flag(ADD_EXPLICIT_SIMD_OPT_FLAG ${EXPLICIT_SIMD_OPT_FLAG} ${flags}) - + if(LLVM_COMPILER_IS_GCC_COMPATIBLE) if(ADD_FMA_FLAG) if(LIBC_TARGET_ARCHITECTURE_IS_X86) diff --git a/libc/cmake/modules/LLVMLibCLibraryRules.cmake b/libc/cmake/modules/LLVMLibCLibraryRules.cmake 
index 9fba51f8ee7f4..c376faafcf2d7 100644 --- a/libc/cmake/modules/LLVMLibCLibraryRules.cmake +++ b/libc/cmake/modules/LLVMLibCLibraryRules.cmake @@ -125,7 +125,7 @@ function(add_gpu_entrypoint_library target_name base_target_name) OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin" COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/binary COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER} - "${prefix},file=$" -o + "${prefix},file=$" -o ${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin DEPENDS ${dep} ${base_target_name} COMMENT "Packaging LLVM offloading binary for '${object}'" @@ -142,7 +142,7 @@ function(add_gpu_entrypoint_library target_name base_target_name) COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp DEPENDS ${dep} ${dep}.__gpubin__ ${base_target_name} ) - add_custom_target(${dep}.__stub__ + add_custom_target(${dep}.__stub__ DEPENDS ${dep}.__gpubin__ "${CMAKE_CURRENT_BINARY_DIR}/stubs/${name}.cpp") add_library(${dep}.__fatbin__ @@ -151,9 +151,9 @@ function(add_gpu_entrypoint_library target_name base_target_name) ) # This is always compiled for the LLVM host triple instead of the native GPU - # triple that is used by default in the build. + # triple that is used by default in the build. target_compile_options(${dep}.__fatbin__ BEFORE PRIVATE -nostdlib) - target_compile_options(${dep}.__fatbin__ PRIVATE + target_compile_options(${dep}.__fatbin__ PRIVATE --target=${LLVM_HOST_TRIPLE} "SHELL:-Xclang -fembed-offload-object=${CMAKE_CURRENT_BINARY_DIR}/binary/${name}.gpubin") add_dependencies(${dep}.__fatbin__ diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake index 78536f4eec55a..8a84c82206ba6 100644 --- a/libc/cmake/modules/LLVMLibCObjectRules.cmake +++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake @@ -284,7 +284,7 @@ function(create_entrypoint_object fq_target_name) # The NVPTX target cannot use LTO for the internal targets used for testing. 
if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) - target_compile_options(${internal_target_name} PRIVATE + target_compile_options(${internal_target_name} PRIVATE -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE}) endif() diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 9d1e426a5b7b3..76ce6754bd733 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -318,8 +318,8 @@ function(add_libc_fuzzer target_name) target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) - target_link_libraries(${fq_target_name} PRIVATE - ${link_object_files} + target_link_libraries(${fq_target_name} PRIVATE + ${link_object_files} ${LIBC_FUZZER_LINK_LIBRARIES} ) @@ -352,7 +352,7 @@ endif() # system libc are linked in to the final executable. The final exe is fully # statically linked. The libc that the final exe links to consists of only # the object files of the DEPENDS targets. -# +# # Usage: # add_integration_test( # @@ -462,7 +462,7 @@ function(add_integration_test test_name) target_compile_options(${fq_build_target_name} PRIVATE ${compile_options}) if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) - target_link_options(${fq_build_target_name} PRIVATE + target_link_options(${fq_build_target_name} PRIVATE ${LIBC_COMPILE_OPTIONS_DEFAULT} -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static @@ -470,7 +470,7 @@ function(add_integration_test test_name) elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) # We need to use the internal object versions for NVPTX. 
set(internal_suffix ".__internal__") - target_link_options(${fq_build_target_name} PRIVATE + target_link_options(${fq_build_target_name} PRIVATE ${LIBC_COMPILE_OPTIONS_DEFAULT} "-Wl,--suppress-stack-size-warning" -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static @@ -645,7 +645,7 @@ function(add_libc_hermetic_test test_name) endforeach() if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) - target_link_options(${fq_build_target_name} PRIVATE + target_link_options(${fq_build_target_name} PRIVATE ${LIBC_COMPILE_OPTIONS_DEFAULT} -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static @@ -653,7 +653,7 @@ function(add_libc_hermetic_test test_name) elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) # We need to use the internal object versions for NVPTX. set(internal_suffix ".__internal__") - target_link_options(${fq_build_target_name} PRIVATE + target_link_options(${fq_build_target_name} PRIVATE ${LIBC_COMPILE_OPTIONS_DEFAULT} "-Wl,--suppress-stack-size-warning" -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static diff --git a/libc/config/darwin/x86_64/entrypoints.txt b/libc/config/darwin/x86_64/entrypoints.txt index 91493cb77b1d8..5a1a6a15ef30c 100644 --- a/libc/config/darwin/x86_64/entrypoints.txt +++ b/libc/config/darwin/x86_64/entrypoints.txt @@ -16,7 +16,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.ctype.toascii libc.src.ctype.tolower libc.src.ctype.toupper - + # string.h entrypoints libc.src.string.bcmp libc.src.string.bzero diff --git a/libc/config/linux/x86_64/exclude.txt b/libc/config/linux/x86_64/exclude.txt index efe3eb9f4671b..2c218b753b176 100644 --- a/libc/config/linux/x86_64/exclude.txt +++ b/libc/config/linux/x86_64/exclude.txt @@ -9,12 +9,12 @@ try_compile( ) if(NOT has_sys_random) - list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS + list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS libc.src.sys.stat.stat ) # If we're doing a fullbuild we provide the random header ourselves. 
if(NOT LLVM_LIBC_FULL_BUILD) - list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS + list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS libc.src.sys.random.getrandom ) endif() diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt index 5c3a2e287b952..1c9ed7bbcfed6 100644 --- a/libc/config/windows/entrypoints.txt +++ b/libc/config/windows/entrypoints.txt @@ -116,7 +116,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.acoshf libc.src.math.asinf libc.src.math.asinhf - libc.src.math.atanf + libc.src.math.atanf libc.src.math.atanhf libc.src.math.copysign libc.src.math.copysignf diff --git a/libc/docs/contributing.rst b/libc/docs/contributing.rst index 50151348f8805..b92575ed4108f 100644 --- a/libc/docs/contributing.rst +++ b/libc/docs/contributing.rst @@ -29,7 +29,7 @@ a list of open projects that one can start with: #. One is about adding CMake facilities to optionally link the libc's overlay static archive (see :ref:`overlay_mode`) with other LLVM tools/executables. #. The other is about putting plumbing in place to release the overlay static - archive (see :ref:`overlay_mode`) as part of the LLVM binary releases. + archive (see :ref:`overlay_mode`) as part of the LLVM binary releases. #. **Implement Linux syscall wrappers** - A large portion of the POSIX API can be implemented as syscall wrappers on Linux. 
A good number have already been diff --git a/libc/docs/date_and_time.rst b/libc/docs/date_and_time.rst index 8d25ea6e94613..303dd3fa12df2 100644 --- a/libc/docs/date_and_time.rst +++ b/libc/docs/date_and_time.rst @@ -26,7 +26,7 @@ Implementation Status * To check date and time functions enabled for Windows: - - `windows-x86_64 `_ + - `windows-x86_64 `_ - windows-aarch64 - to be added @@ -38,11 +38,11 @@ Implementation Status * To check date and time functions enabled for GPU: - - `gpu-entrypoints `_ + - `gpu-entrypoints `_ * To check date and time functions enabled for embedded system: - - `barebone-aarch32 `_ + - `barebone-aarch32 `_ - barebone-riscv32 - to be added diff --git a/libc/docs/dev/clang_tidy_checks.rst b/libc/docs/dev/clang_tidy_checks.rst index 8ad806993a043..3feb5375ef113 100644 --- a/libc/docs/dev/clang_tidy_checks.rst +++ b/libc/docs/dev/clang_tidy_checks.rst @@ -75,7 +75,7 @@ a public header with non-namespaced functions like ``string.h`` is included. This check ensures any function call resolves to a function within the LIBC_NAMESPACE namespace. -There are exceptions for the following functions: +There are exceptions for the following functions: ``__errno_location`` so that ``errno`` can be set; ``malloc``, ``calloc``, ``realloc``, ``aligned_alloc``, and ``free`` since they are always external and can be intercepted. diff --git a/libc/docs/dev/config_options.rst b/libc/docs/dev/config_options.rst index db70342ed74ce..47f4baef8ebf1 100644 --- a/libc/docs/dev/config_options.rst +++ b/libc/docs/dev/config_options.rst @@ -66,7 +66,7 @@ example, the option-dictionary is: { "LIBC_CONF_PRINTF_DISABLE_FLOAT": { "value": false, - "doc": + "doc": }, ... 
} diff --git a/libc/docs/dev/printf_behavior.rst b/libc/docs/dev/printf_behavior.rst index bc60aa43ee2b6..7128c738d1924 100644 --- a/libc/docs/dev/printf_behavior.rst +++ b/libc/docs/dev/printf_behavior.rst @@ -2,7 +2,7 @@ Printf Behavior Under All Conditions ==================================== -Introduction: +Introduction: ============= On the "defining undefined behavior" page, I said you should write down your decisions regarding undefined behavior in your functions. This is that document @@ -102,7 +102,7 @@ behavior. LIBC_COPT_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE ------------------------------------------------- When set, the float to string decimal conversion algorithm will use a larger -table to accelerate long double conversions. This larger table is around 5MB of +table to accelerate long double conversions. This larger table is around 5MB of size when compiled. LIBC_COPT_FLOAT_TO_STR_USE_DYADIC_FLOAT diff --git a/libc/docs/full_cross_build.rst b/libc/docs/full_cross_build.rst index cb824730603ad..f06464534f152 100644 --- a/libc/docs/full_cross_build.rst +++ b/libc/docs/full_cross_build.rst @@ -87,7 +87,7 @@ After configuring the build with the above ``cmake`` command, one can build the the libc for the target with the following command: .. code-block:: sh - + $> ninja libc libm The above ``ninja`` command will build the libc static archives ``libc.a`` and diff --git a/libc/docs/gpu/motivation.rst b/libc/docs/gpu/motivation.rst index 171287c3f996f..7e5336dbbe5de 100644 --- a/libc/docs/gpu/motivation.rst +++ b/libc/docs/gpu/motivation.rst @@ -11,14 +11,14 @@ Motivation and Limitations Motivation ========== -This project aims to provide a large subset of the C standard library to users -of GPU accelerators. We deliberately choose to only implement a subset of the C -library as some features are not expressly useful or easily implemented on the -GPU. This will be discussed further in `Limitations `_. 
-The main motivation behind this project is to provide the well understood C +This project aims to provide a large subset of the C standard library to users +of GPU accelerators. We deliberately choose to only implement a subset of the C +library as some features are not expressly useful or easily implemented on the +GPU. This will be discussed further in `Limitations `_. +The main motivation behind this project is to provide the well understood C library as a firm base for GPU development. -The main idea behind this project is that programming GPUs can be as +The main idea behind this project is that programming GPUs can be as straightforward as programming on CPUs. This project aims to validate the GPU as a more general-purpose target. The implementations here will also enable more complex implementations of other libraries on the GPU, such as ``libc++``. @@ -31,10 +31,10 @@ toolchain. We also aim to provide these functions in a format compatible with offloading in ``Clang`` so that we can treat the C library for the GPU as a standard static library. -A valuable use for providing C library features on the GPU is for testing. For -this reason we build `tests on the GPU `_ that can run a unit -test as if it were being run on the CPU. This also helps users port applications -that traditionally were run on the CPU. With this support, we can expand test +A valuable use for providing C library features on the GPU is for testing. For +this reason we build `tests on the GPU `_ that can run a unit +test as if it were being run on the CPU. This also helps users port applications +that traditionally were run on the CPU. With this support, we can expand test coverage for the GPU backend to the existing LLVM C library tests. .. _libc_gpu_limitations: @@ -43,9 +43,9 @@ Limitations =========== We only implement a subset of the standard C library. The GPU does not -currently support thread local variables in all cases, so variables like -``errno`` are not provided. 
Furthermore, the GPU under the OpenCL execution -model cannot safely provide a mutex interface. This means that features like -file buffering are not implemented on the GPU. We can also not easily provide -threading features on the GPU due to the execution model so these will be +currently support thread local variables in all cases, so variables like +``errno`` are not provided. Furthermore, the GPU under the OpenCL execution +model cannot safely provide a mutex interface. This means that features like +file buffering are not implemented on the GPU. We can also not easily provide +threading features on the GPU due to the execution model so these will be ignored, as will features like ``locale`` or ``time``. diff --git a/libc/docs/gpu/rpc.rst b/libc/docs/gpu/rpc.rst index fb738138568f6..7b0b35af4da88 100644 --- a/libc/docs/gpu/rpc.rst +++ b/libc/docs/gpu/rpc.rst @@ -11,62 +11,62 @@ Remote Procedure Calls Remote Procedure Call Implementation ==================================== -Traditionally, the C library abstracts over several functions that interface -with the platform's operating system through system calls. The GPU however does +Traditionally, the C library abstracts over several functions that interface +with the platform's operating system through system calls. The GPU however does not provide an operating system that can handle target dependent operations. -Instead, we implemented remote procedure calls to interface with the host's +Instead, we implemented remote procedure calls to interface with the host's operating system while executing on a GPU. -We implemented remote procedure calls using unified virtual memory to create a -shared communicate channel between the two processes. This memory is often -pinned memory that can be accessed asynchronously and atomically by multiple -processes simultaneously. This supports means that we can simply provide mutual -exclusion on a shared better to swap work back and forth between the host system -and the GPU. 
We can then use this to create a simple client-server protocol +We implemented remote procedure calls using unified virtual memory to create a +shared communicate channel between the two processes. This memory is often +pinned memory that can be accessed asynchronously and atomically by multiple +processes simultaneously. This supports means that we can simply provide mutual +exclusion on a shared better to swap work back and forth between the host system +and the GPU. We can then use this to create a simple client-server protocol using this shared memory. -This work treats the GPU as a client and the host as a server. The client -initiates a communication while the server listens for them. In order to -communicate between the host and the device, we simply maintain a buffer of -memory and two mailboxes. One mailbox is write-only while the other is -read-only. This exposes three primitive operations: using the buffer, giving -away ownership, and waiting for ownership. This is implemented as a half-duplex -transmission channel between the two sides. We decided to assign ownership of -the buffer to the client when the inbox and outbox bits are equal and to the +This work treats the GPU as a client and the host as a server. The client +initiates a communication while the server listens for them. In order to +communicate between the host and the device, we simply maintain a buffer of +memory and two mailboxes. One mailbox is write-only while the other is +read-only. This exposes three primitive operations: using the buffer, giving +away ownership, and waiting for ownership. This is implemented as a half-duplex +transmission channel between the two sides. We decided to assign ownership of +the buffer to the client when the inbox and outbox bits are equal and to the server when they are not. 
-In order to make this transmission channel thread-safe, we abstract ownership of -the given mailbox pair and buffer around a port, effectively acting as a lock -and an index into the allocated buffer slice. The server and device have -independent locks around the given port. In this scheme, the buffer can be used -to communicate intent and data generically with the server. We them simply +In order to make this transmission channel thread-safe, we abstract ownership of +the given mailbox pair and buffer around a port, effectively acting as a lock +and an index into the allocated buffer slice. The server and device have +independent locks around the given port. In this scheme, the buffer can be used +to communicate intent and data generically with the server. We them simply provide multiple copies of this protocol and expose them as multiple ports. -If this were simply a standard CPU system, this would be sufficient. However, -GPUs have my unique architectural challenges. First, GPU threads execute in -lock-step with each other in groups typically called warps or wavefronts. We -need to target the smallest unit of independent parallelism, so the RPC -interface needs to handle an entire group of threads at once. This is done by -increasing the size of the buffer and adding a thread mask argument so the -server knows which threads are active when it handles the communication. Second, -GPUs generally have no forward progress guarantees. In order to guarantee we do -not encounter deadlocks while executing it is required that the number of ports -matches the maximum amount of hardware parallelism on the device. It is also -very important that the thread mask remains consistent while interfacing with +If this were simply a standard CPU system, this would be sufficient. However, +GPUs have my unique architectural challenges. First, GPU threads execute in +lock-step with each other in groups typically called warps or wavefronts. 
We +need to target the smallest unit of independent parallelism, so the RPC +interface needs to handle an entire group of threads at once. This is done by +increasing the size of the buffer and adding a thread mask argument so the +server knows which threads are active when it handles the communication. Second, +GPUs generally have no forward progress guarantees. In order to guarantee we do +not encounter deadlocks while executing it is required that the number of ports +matches the maximum amount of hardware parallelism on the device. It is also +very important that the thread mask remains consistent while interfacing with the port. .. image:: ./rpc-diagram.svg :width: 75% :align: center -The above diagram outlines the architecture of the RPC interface. For clarity -the following list will explain the operations done by the client and server +The above diagram outlines the architecture of the RPC interface. For clarity +the following list will explain the operations done by the client and server respectively when initiating a communication. First, a communication from the perspective of the client: * The client searches for an available port and claims the lock. -* The client checks that the port is still available to the current device and +* The client checks that the port is still available to the current device and continues if so. * The client writes its data to the fixed-size packet and toggles its outbox. * The client waits until its inbox matches its outbox. @@ -75,51 +75,51 @@ First, a communication from the perspective of the client: Now, the same communication from the perspective of the server: -* The server searches for an available port with pending work and claims the +* The server searches for an available port with pending work and claims the lock. * The server checks that the port is still available to the current device. 
-* The server reads the opcode to perform the expected operation, in this +* The server reads the opcode to perform the expected operation, in this case a receive and then send. * The server reads the data from the fixed-size packet. * The server writes its data to the fixed-size packet and toggles its outbox. -* The server closes the port and continues searching for ports that need to be +* The server closes the port and continues searching for ports that need to be serviced -This architecture currently requires that the host periodically checks the RPC -server's buffer for ports with pending work. Note that a port can be closed -without waiting for its submitted work to be completed. This allows us to model -asynchronous operations that do not need to wait until the server has completed -them. If an operation requires more data than the fixed size buffer, we simply +This architecture currently requires that the host periodically checks the RPC +server's buffer for ports with pending work. Note that a port can be closed +without waiting for its submitted work to be completed. This allows us to model +asynchronous operations that do not need to wait until the server has completed +them. If an operation requires more data than the fixed size buffer, we simply send multiple packets back and forth in a streaming fashion. Server Library -------------- -The RPC server's basic functionality is provided by the LLVM C library. A static -library called ``libllvmlibc_rpc_server.a`` includes handling for the basic -operations, such as printing or exiting. This has a small API that handles +The RPC server's basic functionality is provided by the LLVM C library. A static +library called ``libllvmlibc_rpc_server.a`` includes handling for the basic +operations, such as printing or exiting. This has a small API that handles setting up the unified buffer and an interface to check the opcodes. 
-Some operations are too divergent to provide generic implementations for, such -as allocating device accessible memory. For these cases, we provide a callback -registration scheme to add a custom handler for any given opcode through the -port API. More information can be found in the installed header +Some operations are too divergent to provide generic implementations for, such +as allocating device accessible memory. For these cases, we provide a callback +registration scheme to add a custom handler for any given opcode through the +port API. More information can be found in the installed header ``/include/llvmlibc_rpc_server.h``. Client Example -------------- -The Client API is not currently exported by the LLVM C library. This is -primarily due to being written in C++ and relying on internal data structures. -It uses a simple send and receive interface with a fixed-size packet. The -following example uses the RPC interface to call a function pointer on the +The Client API is not currently exported by the LLVM C library. This is +primarily due to being written in C++ and relying on internal data structures. +It uses a simple send and receive interface with a fixed-size packet. The +following example uses the RPC interface to call a function pointer on the server. -This code first opens a port with the given opcode to facilitate the -communication. It then copies over the argument struct to the server using the -``send_n`` interface to stream arbitrary bytes. The next send operation provides -the server with the function pointer that will be executed. The final receive -operation is a no-op and simply forces the client to wait until the server is +This code first opens a port with the given opcode to facilitate the +communication. It then copies over the argument struct to the server using the +``send_n`` interface to stream arbitrary bytes. The next send operation provides +the server with the function pointer that will be executed. 
The final receive +operation is a no-op and simply forces the client to wait until the server is done. It can be omitted if asynchronous execution is desired. .. code-block:: c++ @@ -137,23 +137,23 @@ done. It can be omitted if asynchronous execution is desired. Server Example -------------- -This example shows the server-side handling of the previous client example. When -the server is checked, if there are any ports with pending work it will check -the opcode and perform the appropriate action. In this case, the action is to +This example shows the server-side handling of the previous client example. When +the server is checked, if there are any ports with pending work it will check +the opcode and perform the appropriate action. In this case, the action is to call a function pointer provided by the client. -In this example, the server simply runs forever in a separate thread for -brevity's sake. Because the client is a GPU potentially handling several threads -at once, the server needs to loop over all the active threads on the GPU. We -abstract this into the ``lane_size`` variable, which is simply the device's warp -or wavefront size. The identifier is simply the threads index into the current -warp or wavefront. We allocate memory to copy the struct data into, and then -call the given function pointer with that copied data. The final send simply -signals completion and uses the implicit thread mask to delete the temporary +In this example, the server simply runs forever in a separate thread for +brevity's sake. Because the client is a GPU potentially handling several threads +at once, the server needs to loop over all the active threads on the GPU. We +abstract this into the ``lane_size`` variable, which is simply the device's warp +or wavefront size. The identifier is simply the threads index into the current +warp or wavefront. We allocate memory to copy the struct data into, and then +call the given function pointer with that copied data. 
The final send simply +signals completion and uses the implicit thread mask to delete the temporary data. .. code-block:: c++ - + for(;;) { auto port = server.try_open(index); if (!port) @@ -181,11 +181,11 @@ data. CUDA Server Example ------------------- -The following code shows an example of using the exported RPC interface along -with the C library to manually configure a working server using the CUDA -language. Other runtimes can use the presence of the ``__llvm_libc_rpc_client`` -in the GPU executable as an indicator for whether or not the server can be -checked. These details should ideally be handled by the GPU language runtime, +The following code shows an example of using the exported RPC interface along +with the C library to manually configure a working server using the CUDA +language. Other runtimes can use the presence of the ``__llvm_libc_rpc_client`` +in the GPU executable as an indicator for whether or not the server can be +checked. These details should ideally be handled by the GPU language runtime, but the following example shows how it can be used by a standard user. .. code-block:: cuda @@ -193,30 +193,30 @@ but the following example shows how it can be used by a standard user. #include #include #include - + #include - + [[noreturn]] void handle_error(cudaError_t err) { fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } - + [[noreturn]] void handle_error(rpc_status_t err) { fprintf(stderr, "RPC error: %d\n", err); exit(EXIT_FAILURE); } - + // The handle to the RPC client provided by the C library. extern "C" __device__ void *__llvm_libc_rpc_client; - + __global__ void get_client_ptr(void **ptr) { *ptr = __llvm_libc_rpc_client; } - + // Obtain the RPC client's handle from the device. The CUDA language cannot look // up the symbol directly like the driver API, so we launch a kernel to read it. 
void *get_rpc_client() { void *rpc_client = nullptr; void **rpc_client_d = nullptr; - + if (cudaError_t err = cudaMalloc(&rpc_client_d, sizeof(void *))) handle_error(err); get_client_ptr<<<1, 1>>>(rpc_client_d); @@ -227,7 +227,7 @@ but the following example shows how it can be used by a standard user. handle_error(err); return rpc_client; } - + // Routines to allocate mapped memory that both the host and the device can // access asychonrously to communicate with eachother. void *alloc_host(size_t size, void *) { @@ -236,64 +236,64 @@ but the following example shows how it can be used by a standard user. handle_error(err); return sharable_ptr; }; - + void free_host(void *ptr, void *) { if (cudaError_t err = cudaFreeHost(ptr)) handle_error(err); } - + // The device-side overload of the standard C function to call. extern "C" __device__ int puts(const char *); - + // Calls the C library function from the GPU C library. __global__ void hello() { puts("Hello world!"); } - + int main() { int device = 0; // Initialize the RPC server to run on a single device. if (rpc_status_t err = rpc_init(/*num_device=*/1)) handle_error(err); - + // Initialize the RPC server to run on the given device. if (rpc_status_t err = rpc_server_init(device, RPC_MAXIMUM_PORT_COUNT, /*warp_size=*/32, alloc_host, /*data=*/nullptr)) handle_error(err); - + // Initialize the RPC client by copying the buffer to the device's handle. void *rpc_client = get_rpc_client(); if (cudaError_t err = cudaMemcpy(rpc_client, rpc_get_client_buffer(device), rpc_get_client_size(), cudaMemcpyHostToDevice)) handle_error(err); - + cudaStream_t stream; if (cudaError_t err = cudaStreamCreate(&stream)) handle_error(err); - + // Execute the kernel. hello<<<1, 1, 0, stream>>>(); - + // While the kernel is executing, check the RPC server for work to do. 
while (cudaStreamQuery(stream) == cudaErrorNotReady) if (rpc_status_t err = rpc_handle_server(device)) handle_error(err); - + // Shut down the server running on the given device. if (rpc_status_t err = rpc_server_shutdown(device, free_host, /*data=*/nullptr)) handle_error(err); - + // Shut down the entire RPC server interface. if (rpc_status_t err = rpc_shutdown()) handle_error(err); - + return EXIT_SUCCESS; } -The above code must be compiled in CUDA's relocatable device code mode and with -the advanced offloading driver to link in the library. Currently this can be -done with the following invocation. Using LTO avoids the overhead normally +The above code must be compiled in CUDA's relocatable device code mode and with +the advanced offloading driver to link in the library. Currently this can be +done with the following invocation. Using LTO avoids the overhead normally associated with relocatable device code linking. .. code-block:: sh @@ -307,6 +307,6 @@ associated with relocatable device code linking. Extensions ---------- -We describe which operation the RPC server should take with a 16-bit opcode. We -consider the first 32768 numbers to be reserved while the others are free to +We describe which operation the RPC server should take with a 16-bit opcode. We +consider the first 32768 numbers to be reserved while the others are free to use. diff --git a/libc/docs/gpu/testing.rst b/libc/docs/gpu/testing.rst index f793e938e9806..9842a67528361 100644 --- a/libc/docs/gpu/testing.rst +++ b/libc/docs/gpu/testing.rst @@ -18,8 +18,8 @@ Testing Infrastructure ====================== The testing support in LLVM's libc implementation for GPUs is designed to mimic -the standard unit tests as much as possible. We use the :ref:`libc_gpu_rpc` -support to provide the necessary utilities like printing from the GPU. Execution +the standard unit tests as much as possible. We use the :ref:`libc_gpu_rpc` +support to provide the necessary utilities like printing from the GPU. 
Execution is performed by emitting a ``_start`` kernel from the GPU that is then called by an external loader utility. This is an example of how this can be done manually: diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst index 79b9116c38ed2..1a48c8a3bcba3 100644 --- a/libc/docs/gpu/using.rst +++ b/libc/docs/gpu/using.rst @@ -14,11 +14,11 @@ Building the GPU library LLVM's libc GPU support *must* be built with an up-to-date ``clang`` compiler due to heavy reliance on ``clang``'s GPU support. This can be done automatically -using the LLVM runtimes support. The GPU build is done using cross-compilation -to the GPU architecture. This project currently supports AMD and NVIDIA GPUs -which can be targeted using the appropriate target name. The following -invocation will enable a cross-compiling build for the GPU architecture and -enable the ``libc`` project only for them. +using the LLVM runtimes support. The GPU build is done using cross-compilation +to the GPU architecture. This project currently supports AMD and NVIDIA GPUs +which can be targeted using the appropriate target name. The following +invocation will enable a cross-compiling build for the GPU architecture and +enable the ``libc`` project only for them. .. code-block:: sh @@ -29,8 +29,8 @@ enable the ``libc`` project only for them. -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" \ -DLLVM_ENABLE_RUNTIMES="openmp" \ -DCMAKE_BUILD_TYPE= \ # Select build type - -DCMAKE_INSTALL_PREFIX= \ # Where 'libcgpu.a' will live - -DRUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES=libc \ + -DCMAKE_INSTALL_PREFIX= \ # Where 'libcgpu.a' will live + -DRUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES=libc \ -DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES=libc \ -DLLVM_RUNTIME_TARGETS=default;amdgcn-amd-amdhsa;nvptx64-nvidia-cuda $> ninja install @@ -40,8 +40,8 @@ toolchain, we list them in ``LLVM_ENABLE_PROJECTS``. 
To ensure ``libc`` is built using a compatible compiler and to support ``openmp`` offloading, we list them in ``LLVM_ENABLE_RUNTIMES`` to build them after the enabled projects using the newly built compiler. ``CMAKE_INSTALL_PREFIX`` specifies the installation -directory in which to install the ``libcgpu-nvptx.a`` and ``libcgpu-amdgpu.a`` -libraries and headers along with LLVM. The generated headers will be placed in +directory in which to install the ``libcgpu-nvptx.a`` and ``libcgpu-amdgpu.a`` +libraries and headers along with LLVM. The generated headers will be placed in ``include/``. Usage diff --git a/libc/docs/libc_search.rst b/libc/docs/libc_search.rst index 605efa8a8e8ea..4a7ee288dd43e 100644 --- a/libc/docs/libc_search.rst +++ b/libc/docs/libc_search.rst @@ -29,7 +29,7 @@ Type Name Available ============================ ========= ACTION |check| ENTRY |check| -VISIT +VISIT ============================ ========= POSIX Standard Functions @@ -42,8 +42,8 @@ hcreate |check| hdestroy |check| hsearch |check| insque |check| -lfind -lsearch +lfind +lsearch remque |check| tdelete tfind diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index c586fe6664e27..8d584338c10f3 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -81,7 +81,7 @@ Implementation Status * To check math functions enabled for Windows: - - `windows-x86_64 `_ + - `windows-x86_64 `_ - windows-aarch64 - to be added @@ -93,11 +93,11 @@ Implementation Status * To check math functions enabled for GPU: - - `gpu-entrypoints `_ + - `gpu-entrypoints `_ * To check math functions enabled for embedded system: - - `baremetal-aarch32 `_ + - `baremetal-aarch32 `_ - baremetal-riscv32 - to be added diff --git a/libc/docs/math/log.rst b/libc/docs/math/log.rst index 5be880c24f847..82d2e2be6fb59 100644 --- a/libc/docs/math/log.rst +++ b/libc/docs/math/log.rst @@ -121,7 +121,7 @@ Hence we have the following bound on `s`: In order for `s` to exist, we need that: .. 
math:: - \frac{C - (k + 1) 2^{-M}}{1 + (k + 1) 2^{-M}} > + \frac{C - (k + 1) 2^{-M}}{1 + (k + 1) 2^{-M}} > \frac{-C - k 2^{-M}}{1 + k 2^{-M}} which is equivalent to: @@ -135,7 +135,7 @@ side of `\text{(C1)}` is bounded by: .. math:: 2^{-M - 1} > \frac{2^{-M - 1}}{1 + (2k + 1) 2^{-M - 1}} \geq - \frac{2^{-M - 1}}{1 + (2^{M + 1} - 1) 2^{-M - 1}} > 2^{-M - 2}. + \frac{2^{-M - 1}}{1 + (2^{M + 1} - 1) 2^{-M - 1}} > 2^{-M - 2}. Hence, from `\text{(C1)}`, being an exact power of 2, `C = 2^{-N}` is bounded below by: @@ -427,7 +427,7 @@ to look-up for the reduction constant `s_{i, k}`. In other word, `k` is given by the formula: .. math:: - k = \left\lfloor 2^{N_i + M_i} u_i \right\rfloor + k = \left\lfloor 2^{N_i + M_i} u_i \right\rfloor Notice that our reduction constant `s_{i, k}` must work for all `u_i` in the interval `I = \{ v: k 2^{-N_i - M_i} \leq v < (k + 1) 2^{-N_i - M_i} \}`, diff --git a/libc/docs/porting.rst b/libc/docs/porting.rst index a3b318d361392..ef7a2ff5cc875 100644 --- a/libc/docs/porting.rst +++ b/libc/docs/porting.rst @@ -30,7 +30,7 @@ If you are starting to bring up LLVM's libc on a new operating system, the first step is to add a directory for that OS in the ``libc/config`` directory. Both `Linux `_ and `Windows `_, -the two operating systems on which LLVM's libc is being actively developed, +the two operating systems on which LLVM's libc is being actively developed, have their own config directory. .. note:: Windows development is not as active as the development on Linux. diff --git a/libc/docs/stdio.rst b/libc/docs/stdio.rst index 1063aa2432af9..4fd6b71a09171 100644 --- a/libc/docs/stdio.rst +++ b/libc/docs/stdio.rst @@ -61,7 +61,7 @@ funlockfile |check| Operations on system files ========================== -These functions operate on files on the host's system, without using the +These functions operate on files on the host's system, without using the ``FILE`` object type. They only take the name of the file being operated on. 
============= ========= diff --git a/libc/docs/strings.rst b/libc/docs/strings.rst index d7e415d371cfb..427644c407ae8 100644 --- a/libc/docs/strings.rst +++ b/libc/docs/strings.rst @@ -36,7 +36,7 @@ Function Name Available ============= ========= bzero |check| bcmp |check| -bcopy |check| +bcopy |check| memcpy |check| memset |check| memcmp |check| @@ -99,14 +99,14 @@ These functions are not in strings.h, but are still primarily string functions, and are therefore tracked along with the rest of the string functions. -The String to float functions were implemented using the Eisel-Lemire algorithm +The String to float functions were implemented using the Eisel-Lemire algorithm (read more about the algorithm here: `The Eisel-Lemire ParseNumberF64 Algorithm `_). This improved the performance of string to float and double, and allowed it to complete this comprehensive test 15% faster than glibc: `Parse Number FXX Test Data -`_. The test was done +`_. The test was done with LLVM-libc built on 2022-04-14 and Debian GLibc version 2.33-6. The targets -``libc_str_to_float_comparison_test`` and +``libc_str_to_float_comparison_test`` and ``libc_system_str_to_float_comparison_test`` were built and run on the test data 10 times each, skipping the first run since it was an outlier. @@ -142,7 +142,7 @@ strerror_r |check| Localized String Functions ========================== -These functions require locale.h, and will be finished when locale support is +These functions require locale.h, and will be finished when locale support is implemented in LLVM-libc. ============= ========= @@ -160,7 +160,7 @@ Many String functions have an equivalent _s version, which is intended to be more secure and safe than the previous standard. These functions add runtime error detection and overflow protection. While they can be seen as an improvement, adoption remains relatively low among users. 
In addition, they are -being considered for removal, see +being considered for removal, see `Field Experience With Annex K — Bounds Checking Interfaces -`_. For these reasons, +`_. For these reasons, there is no ongoing work to implement them. diff --git a/libc/include/fcntl.h.def b/libc/include/fcntl.h.def index 83d4477d00643..b11645d18c5bd 100644 --- a/libc/include/fcntl.h.def +++ b/libc/include/fcntl.h.def @@ -14,4 +14,4 @@ %%public_api() -#endif // LLVM_LIBC_FCNTL_H +#endif // LLVM_LIBC_FCNTL_H diff --git a/libc/include/sched.h.def b/libc/include/sched.h.def index eb5832690da14..3b2d5e3308593 100644 --- a/libc/include/sched.h.def +++ b/libc/include/sched.h.def @@ -14,4 +14,4 @@ %%public_api() -#endif // LLVM_LIBC_SCHED_H +#endif // LLVM_LIBC_SCHED_H diff --git a/libc/include/spawn.h.def b/libc/include/spawn.h.def index 95c24bba0df36..368ebff17ca12 100644 --- a/libc/include/spawn.h.def +++ b/libc/include/spawn.h.def @@ -13,4 +13,4 @@ %%public_api() -#endif // LLVM_LIBC_SPAWN_H +#endif // LLVM_LIBC_SPAWN_H diff --git a/libc/spec/bsd_ext.td b/libc/spec/bsd_ext.td index c52d5bf10c80a..50ca8b919ff2c 100644 --- a/libc/spec/bsd_ext.td +++ b/libc/spec/bsd_ext.td @@ -4,7 +4,7 @@ def BsdExtensions : StandardSpec<"BSDExtensions"> { [], // Macros [], // Types [], // Enumerations - [ + [ FunctionSpec< "strlcat", RetValSpec, diff --git a/libc/spec/gnu_ext.td b/libc/spec/gnu_ext.td index 1b1b35b4d028b..add07d75050df 100644 --- a/libc/spec/gnu_ext.td +++ b/libc/spec/gnu_ext.td @@ -101,8 +101,8 @@ def GnuExtensions : StandardSpec<"GNUExtensions"> { FunctionSpec< "hcreate_r", RetValSpec, - [ - ArgSpec, + [ + ArgSpec, ArgSpec ] >, @@ -117,7 +117,7 @@ def GnuExtensions : StandardSpec<"GNUExtensions"> { "hsearch_r", RetValSpec, [ - ArgSpec, + ArgSpec, ArgSpec, ArgSpec, ArgSpec @@ -207,8 +207,8 @@ def GnuExtensions : StandardSpec<"GNUExtensions"> { [], // Enumerations [ FunctionSpec< - "qsort_r", - RetValSpec, + "qsort_r", + RetValSpec, [ArgSpec, ArgSpec, ArgSpec, ArgSpec, ArgSpec] 
>, ] diff --git a/libc/spec/llvm_libc_ext.td b/libc/spec/llvm_libc_ext.td index 5e1ae2cfd060d..2fd0c5f78fb8a 100644 --- a/libc/spec/llvm_libc_ext.td +++ b/libc/spec/llvm_libc_ext.td @@ -36,7 +36,7 @@ def LLVMLibcExt : StandardSpec<"llvm_libc_ext"> { >, ] >; - + HeaderSpec Sched = HeaderSpec< "sched.h", [], // Macros diff --git a/libc/spec/posix.td b/libc/spec/posix.td index 92d0f8edb2994..70be692e208a8 100644 --- a/libc/spec/posix.td +++ b/libc/spec/posix.td @@ -1319,7 +1319,7 @@ def POSIX : StandardSpec<"POSIX"> { "hsearch", RetValSpec, [ - ArgSpec, + ArgSpec, ArgSpec ] >, @@ -1339,7 +1339,7 @@ def POSIX : StandardSpec<"POSIX"> { ] >, ] - >; + >; HeaderSpec Termios = HeaderSpec< "termios.h", diff --git a/libc/src/__support/HashTable/CMakeLists.txt b/libc/src/__support/HashTable/CMakeLists.txt index 6fa90c780c90d..c79ee9ab9f414 100644 --- a/libc/src/__support/HashTable/CMakeLists.txt +++ b/libc/src/__support/HashTable/CMakeLists.txt @@ -14,7 +14,7 @@ list(FIND TARGET_ENTRYPOINT_NAME_LIST getrandom getrandom_index) if (NOT ${getrandom_index} EQUAL -1) message(STATUS "Using getrandom for hashtable randomness") set(randomness_compile_flags -DLIBC_HASHTABLE_USE_GETRANDOM) - set(randomness_extra_depends + set(randomness_extra_depends libc.src.sys.random.getrandom libc.src.errno.errno) endif() diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 07885ae6a3e24..2ef13168bce8f 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -1437,7 +1437,7 @@ add_entrypoint_object( ../sqrtf128.h DEPENDS libc.src.__support.macros.properties.float - libc.src.__support.FPUtil.sqrt + libc.src.__support.FPUtil.sqrt COMPILE_OPTIONS -O3 ) diff --git a/libc/src/search/hsearch/CMakeLists.txt b/libc/src/search/hsearch/CMakeLists.txt index 17289f03d0628..60a562d748e2a 100644 --- a/libc/src/search/hsearch/CMakeLists.txt +++ b/libc/src/search/hsearch/CMakeLists.txt @@ -1,7 +1,7 @@ add_object_library( global SRCS - 
global.cpp + global.cpp HDRS global.h ) diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt index 8da274395526d..9b81a7abb6cfd 100644 --- a/libc/src/stdio/printf_core/CMakeLists.txt +++ b/libc/src/stdio/printf_core/CMakeLists.txt @@ -110,7 +110,7 @@ add_object_library( ) if(NOT (TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD) - # Not all platforms have a file implementation. If file is unvailable, and a + # Not all platforms have a file implementation. If file is unvailable, and a # full build is requested, then we must skip all file based printf sections. return() endif() diff --git a/libc/src/stdio/scanf_core/CMakeLists.txt b/libc/src/stdio/scanf_core/CMakeLists.txt index 060dc2d239eee..b3445300059fa 100644 --- a/libc/src/stdio/scanf_core/CMakeLists.txt +++ b/libc/src/stdio/scanf_core/CMakeLists.txt @@ -75,7 +75,7 @@ add_object_library( ) if(NOT (TARGET libc.src.__support.File.file) AND LLVM_LIBC_FULL_BUILD) - # Not all platforms have a file implementation. If file is unvailable, and a + # Not all platforms have a file implementation. If file is unvailable, and a # full build is requested, then we must skip all file based printf sections. 
return() endif() diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index ce08635df3145..bd0bcffe0045d 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -268,7 +268,7 @@ if(LLVM_LIBC_INCLUDE_SCUDO) set(SCUDO_DEPS "") include(${LIBC_SOURCE_DIR}/../compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake) - + # scudo distinguishes riscv32 and riscv64, so we need to translate the architecture set(LIBC_TARGET_ARCHITECTURE_FOR_SCUDO ${LIBC_TARGET_ARCHITECTURE}) if(LIBC_TARGET_ARCHITECTURE_IS_RISCV64) @@ -278,7 +278,7 @@ if(LLVM_LIBC_INCLUDE_SCUDO) endif() if(NOT (LIBC_TARGET_ARCHITECTURE_FOR_SCUDO IN_LIST ALL_SCUDO_STANDALONE_SUPPORTED_ARCH)) - message(FATAL_ERROR "Architecture ${LIBC_TARGET_ARCHITECTURE_FOR_SCUDO} is not supported by SCUDO. + message(FATAL_ERROR "Architecture ${LIBC_TARGET_ARCHITECTURE_FOR_SCUDO} is not supported by SCUDO. Either disable LLVM_LIBC_INCLUDE_SCUDO or change your target architecture.") endif() @@ -290,7 +290,7 @@ if(LLVM_LIBC_INCLUDE_SCUDO) RTGwpAsanBacktraceLibc.${LIBC_TARGET_ARCHITECTURE_FOR_SCUDO} RTGwpAsanSegvHandler.${LIBC_TARGET_ARCHITECTURE_FOR_SCUDO} ) - + add_entrypoint_external( malloc DEPENDS diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt index 7af881dac79e8..1d62399c1fe9d 100644 --- a/libc/src/wchar/CMakeLists.txt +++ b/libc/src/wchar/CMakeLists.txt @@ -5,7 +5,7 @@ add_entrypoint_object( wctob.cpp HDRS wctob.h - DEPENDS + DEPENDS libc.include.stdio libc.include.wchar libc.src.__support.wctype_utils diff --git a/libc/startup/linux/CMakeLists.txt b/libc/startup/linux/CMakeLists.txt index a287bc4d633d4..68f68ff45aa9e 100644 --- a/libc/startup/linux/CMakeLists.txt +++ b/libc/startup/linux/CMakeLists.txt @@ -37,13 +37,13 @@ function(merge_relocatable_object name) add_dependencies(${fq_name} ${relocatable_target}) target_link_libraries(${fq_name} INTERFACE ${fq_link_libraries}) set_target_properties( - ${fq_name} + ${fq_name} PROPERTIES 
LINKER_LANGUAGE CXX IMPORTED_OBJECTS ${CMAKE_CURRENT_BINARY_DIR}/${name}.o TARGET_TYPE ${OBJECT_LIBRARY_TARGET_TYPE} DEPS "${fq_link_libraries}" - ) + ) endfunction() function(add_startup_object name) @@ -56,7 +56,7 @@ function(add_startup_object name) ) get_fq_target_name(${name} fq_target_name) - + add_object_library( ${name} SRCS ${ADD_STARTUP_OBJECT_SRC} diff --git a/libc/test/integration/scudo/CMakeLists.txt b/libc/test/integration/scudo/CMakeLists.txt index 4fb8b656f89c9..8a085b618044b 100644 --- a/libc/test/integration/scudo/CMakeLists.txt +++ b/libc/test/integration/scudo/CMakeLists.txt @@ -54,5 +54,5 @@ target_link_libraries(libc-gwp-asan-uaf-should-crash add_custom_command(TARGET libc-scudo-integration-test POST_BUILD COMMAND $ - COMMENT "Run the test after it is built." + COMMENT "Run the test after it is built." VERBATIM) diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index 53fa1323d18b7..c5634866f839b 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -182,7 +182,7 @@ if(NOT LIBC_TARGET_OS_IS_GPU) POST_BUILD COMMAND $ ${float_test_file} DEPENDS ${float_test_file} - COMMENT "Test the strtof and strtod implementations against precomputed results." + COMMENT "Test the strtof and strtod implementations against precomputed results." VERBATIM) endif() diff --git a/libc/test/src/fenv/CMakeLists.txt b/libc/test/src/fenv/CMakeLists.txt index d7b9104cf8304..ba338bb6c7318 100644 --- a/libc/test/src/fenv/CMakeLists.txt +++ b/libc/test/src/fenv/CMakeLists.txt @@ -87,7 +87,7 @@ add_libc_unittest( if (NOT (LLVM_USE_SANITIZER OR (${LIBC_TARGET_OS} STREQUAL "windows") OR (${LIBC_TARGET_OS} STREQUAL "darwin"))) - # Sanitizers don't like SIGFPE. So, we will run the + # Sanitizers don't like SIGFPE. So, we will run the # tests which raise SIGFPE only in non-sanitizer builds. # The tests are also disabled for Windows and MacOS as they fail currently. 
# TODO: Investigate and fix the windows failures and enable them for Windows diff --git a/libc/test/src/math/differential_testing/CMakeLists.txt b/libc/test/src/math/differential_testing/CMakeLists.txt index 72bc2f8fb16aa..878f81f1d573c 100644 --- a/libc/test/src/math/differential_testing/CMakeLists.txt +++ b/libc/test/src/math/differential_testing/CMakeLists.txt @@ -61,7 +61,7 @@ function(add_diff_binary target_name) set_target_properties(${fq_target_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - + if(DIFF_CXX_STANDARD) set_target_properties( ${fq_target_name} diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 2d24b5a76b013..0d6b33bd7d66e 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -1164,7 +1164,7 @@ if(NOT LIBC_TARGET_OS_IS_GPU) libc.src.math.fmaxl libc.src.__support.FPUtil.fp_bits ) - + add_fp_unittest( fmaxf128_test SUITE diff --git a/libc/test/utils/UnitTest/CMakeLists.txt b/libc/test/utils/UnitTest/CMakeLists.txt index 3b917e06cde21..d3af8245515b9 100644 --- a/libc/test/utils/UnitTest/CMakeLists.txt +++ b/libc/test/utils/UnitTest/CMakeLists.txt @@ -11,7 +11,7 @@ add_libc_unittest( SRCS testfilter_test.cpp DEPENDS - # TODO(michaelrj): Remove this dependancy. It's only here because all unit + # TODO(michaelrj): Remove this dependancy. It's only here because all unit # tests must have at least one dependancy. - libc.src.__support.CPP.bit + libc.src.__support.CPP.bit ) diff --git a/libc/utils/MPFRWrapper/CMakeLists.txt b/libc/utils/MPFRWrapper/CMakeLists.txt index 6f44ca0d786c8..2f2b0ac09df9a 100644 --- a/libc/utils/MPFRWrapper/CMakeLists.txt +++ b/libc/utils/MPFRWrapper/CMakeLists.txt @@ -7,14 +7,14 @@ if(LIBC_TESTS_CAN_USE_MPFR) target_compile_options(libcMPFRWrapper PRIVATE -O3) if (LLVM_LIBC_FULL_BUILD) # It is not easy to make libcMPFRWrapper a standalone library because gmp.h may unconditionally - # pull in some STL headers. 
As a result, targets using this library will need to link against + # pull in some STL headers. As a result, targets using this library will need to link against # C++ and unwind libraries. Since we are using MPFR anyway, we directly specifies the GNU toolchain. target_link_libraries(libcMPFRWrapper PUBLIC -lstdc++ -lgcc_s) endif() add_dependencies( - libcMPFRWrapper - libc.src.__support.CPP.string_view - libc.src.__support.CPP.type_traits + libcMPFRWrapper + libc.src.__support.CPP.string_view + libc.src.__support.CPP.type_traits libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.fpbits_str LibcTest.unit diff --git a/libc/utils/gpu/server/CMakeLists.txt b/libc/utils/gpu/server/CMakeLists.txt index 745a248d9b63d..8712f24de84f7 100644 --- a/libc/utils/gpu/server/CMakeLists.txt +++ b/libc/utils/gpu/server/CMakeLists.txt @@ -14,7 +14,7 @@ target_compile_definitions(llvmlibc_rpc_server PUBLIC # This utility needs to be compiled for the host system when cross compiling. if(LLVM_RUNTIMES_TARGET OR LIBC_TARGET_TRIPLE) - target_compile_options(llvmlibc_rpc_server PUBLIC + target_compile_options(llvmlibc_rpc_server PUBLIC --target=${LLVM_HOST_TRIPLE}) target_link_libraries(llvmlibc_rpc_server PUBLIC "--target=${LLVM_HOST_TRIPLE}") diff --git a/libc/utils/mathtools/GenerateHPDConstants.py b/libc/utils/mathtools/GenerateHPDConstants.py index c37b35e000c53..642c379f001d5 100644 --- a/libc/utils/mathtools/GenerateHPDConstants.py +++ b/libc/utils/mathtools/GenerateHPDConstants.py @@ -1,17 +1,17 @@ from math import * """ -This script is used to generate a table used by +This script is used to generate a table used by libc/src/__support/high_precision_decimal.h. For the ith entry in the table there are two values (indexed starting at 0). The first value is the number of digits longer the second value would be if multiplied by 2^i. -The second value is the smallest number that would create that number of -additional digits (which in base ten is always 5^i). 
Anything less creates one +The second value is the smallest number that would create that number of +additional digits (which in base ten is always 5^i). Anything less creates one fewer digit. -As an example, the 3rd entry in the table is {1, "125"}. This means that if +As an example, the 3rd entry in the table is {1, "125"}. This means that if 125 is multiplied by 2^3 = 8, it will have exactly one more digit. Multiplying it out we get 125 * 8 = 1000. 125 is the smallest number that gives that extra digit, for example 124 * 8 = 992, and all larger 3 digit numbers @@ -19,17 +19,17 @@ This makes sense because 5^3 * 2^3 = 10^3, the smallest 4 digit number. For numbers with more digits we can ignore the digits past what's in the second -value, since the most significant digits determine how many extra digits there -will be. Looking at the previous example, if we have 1000, and we look at just -the first 3 digits (since 125 has 3 digits), we see that 100 < 125, so we get -one fewer than 1 extra digits, which is 0. -Multiplying it out we get 1000 * 8 = 8000, which fits the expectation. -Another few quick examples: +value, since the most significant digits determine how many extra digits there +will be. Looking at the previous example, if we have 1000, and we look at just +the first 3 digits (since 125 has 3 digits), we see that 100 < 125, so we get +one fewer than 1 extra digits, which is 0. +Multiplying it out we get 1000 * 8 = 8000, which fits the expectation. +Another few quick examples: For 1255, 125 !< 125, so 1 digit more: 1255 * 8 = 10040 For 9999, 999 !< 125, so 1 digit more: 9999 * 8 = 79992 -Now let's try an example with the 10th entry: {4, "9765625"}. This one means -that 9765625 * 2^10 will have 4 extra digits. +Now let's try an example with the 10th entry: {4, "9765625"}. This one means +that 9765625 * 2^10 will have 4 extra digits. 
Let's skip straight to the examples: For 1, 1 < 9765625, so 4-1=3 extra digits: 1 * 2^10 = 1024, 1 digit to 4 digits is a difference of 3. For 9765624, 9765624 < 9765625 so 3 extra digits: 9765624 * 1024 = 9999998976, 7 digits to 10 digits is a difference of 3. diff --git a/libc/utils/mathtools/ryu_tablegen.py b/libc/utils/mathtools/ryu_tablegen.py index 91e2029e41771..02806434b7e96 100644 --- a/libc/utils/mathtools/ryu_tablegen.py +++ b/libc/utils/mathtools/ryu_tablegen.py @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// -This file is used to generate the tables of values in +This file is used to generate the tables of values in src/__support/ryu_constants.h and ryu_long_double constants.h. To use it, set the constants at the top of the file to the values you want to use for the Ryu algorithm, then run this file. It will output the appropriate tables to stdout, From 87fadb3929163752f650a1fc08d5fb13ee4c1a3f Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Fri, 23 Feb 2024 14:45:22 -0800 Subject: [PATCH 215/546] [lldb] Correctly annotate threads at a bp site as hitting it (#82709) This is next in my series of "fix the racey tests that fail on greendragon" addressing the failure of TestConcurrentManyBreakpoints.py where we set a breakpoint in a function that 100 threads execute, and we check that we hit the breakpoint 100 times. But sometimes it is only hit 99 times, and the test fails. When we hit a software breakpoint, the pc value for the thread is the address of the breakpoint instruction - as if it had not been hit yet. And because a user might ADD a breakpoint for the current pc from the commandline, when we go to resume execution, any thread that is sitting at a breakpoint site will be silently advanced past the breakpoint instruction (disable bp, instruction step that thread, re-enable bp) before resuming -- whether that thread has hit its breakpoint or not. 
What this test is exposing is that there is another corner case, a thread that is sitting at a breakpoint site but has not yet executed the breakpoint instruction. The thread will have no stop reason, no mach exception, so it will not be recorded as having hit the breakpoint (because it hasn't yet). But when we resume execution, because it is sitting at a breakpoint site, we advance past it and miss the breakpoint hit. In 2016 Abhishek Aggarwal handled a similar issue with a patch in `ProcessGDBRemote::SetThreadStopInfo()`, adding a breakpoint StopInfo for a thread sitting at a breakpoint site that has no stop reason. debugserver's `jThreadsInfo` would not correctly execute Abhishek's code though because it would respond with `"reason":"none"` for a thread with no stop reason, and `SetThreadStopInfo()` expected an empty reason here. The first part of my patch is to clear the `reason` if it is `"none"` so we flow through the code correctly. On Darwin, though, our stop reply packet (Txx...) includes the `threads`, `thread-pcs`, and `jstopinfo` keys, which give us the tids for all current threads, the pc values for those threads, and `jstopinfo` has a JSON dictionary with the mach exceptions for all threads that have a mach exception. In `ProcessGDBRemote::CalculateThreadStopInfo()` we set the StopInfo for each thread for a private stop and if we have `jstopinfo` it is the source of all the StopInfos. I have to add the same logic here, to give the thread a breakpoint StopInfo even though it hasn't executed the breakpoint yet. In this case we are very early in thread construction and I only have the information in the Txx stop reply packet -- tids, pcs, and jstopinfo, so I can't use the normal general mechanisms of going through the RegisterContext to get the pc, it's a bit different. 
If I hack debugserver to not issue `jstopinfo`, `CalculateThreadStopInfo` will fall back to sending `qThreadStopInfo` for each thread and going through `ProcessGDBRemote::SetThreadStopInfo()` to set the stop infos (and with the `reason:none` fix, use Abhishek's code). rdar://110549165 --- lldb/include/lldb/Target/Thread.h | 21 ++++++++---- .../Process/gdb-remote/ProcessGDBRemote.cpp | 33 +++++++++++++++---- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/lldb/include/lldb/Target/Thread.h b/lldb/include/lldb/Target/Thread.h index 96ca95ad233ff..1efef93b17ded 100644 --- a/lldb/include/lldb/Target/Thread.h +++ b/lldb/include/lldb/Target/Thread.h @@ -1163,13 +1163,20 @@ class Thread : public std::enable_shared_from_this, void CalculatePublicStopInfo(); - // Ask the thread subclass to set its stop info. - // - // Thread subclasses should call Thread::SetStopInfo(...) with the reason the - // thread stopped. - // - // \return - // True if Thread::SetStopInfo(...) was called, false otherwise. + /// Ask the thread subclass to set its stop info. + /// + /// Thread subclasses should call Thread::SetStopInfo(...) with the reason the + /// thread stopped. + /// + /// A thread that is sitting at a breakpoint site, but has not yet executed + /// the breakpoint instruction, should have a breakpoint-hit StopInfo set. + /// When execution is resumed, any thread sitting at a breakpoint site will + /// instruction-step over the breakpoint instruction silently, and we will + /// never record this breakpoint as being hit, updating the hit count, + /// possibly executing breakpoint commands or conditions. + /// + /// \return + /// True if Thread::SetStopInfo(...) was called, false otherwise. virtual bool CalculateStopInfo() = 0; // Gets the temporary resume state for a thread. 
diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 629b191f3117a..3dc40ee6bb9ff 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -1600,6 +1600,26 @@ bool ProcessGDBRemote::CalculateThreadStopInfo(ThreadGDBRemote *thread) { // has no stop reason. thread->GetRegisterContext()->InvalidateIfNeeded(true); if (!GetThreadStopInfoFromJSON(thread, m_jstopinfo_sp)) { + // If a thread is stopped at a breakpoint site, set that as the stop + // reason even if it hasn't executed the breakpoint instruction yet. + // We will silently step over the breakpoint when we resume execution + // and miss the fact that this thread hit the breakpoint. + const size_t num_thread_ids = m_thread_ids.size(); + for (size_t i = 0; i < num_thread_ids; i++) { + if (m_thread_ids[i] == thread->GetID() && m_thread_pcs.size() > i) { + addr_t pc = m_thread_pcs[i]; + lldb::BreakpointSiteSP bp_site_sp = + thread->GetProcess()->GetBreakpointSiteList().FindByAddress(pc); + if (bp_site_sp) { + if (bp_site_sp->ValidForThisThread(*thread)) { + thread->SetStopInfo( + StopInfo::CreateStopReasonWithBreakpointSiteID( + *thread, bp_site_sp->GetID())); + return true; + } + } + } + } thread->SetStopInfo(StopInfoSP()); } return true; @@ -1722,7 +1742,9 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo( } else { bool handled = false; bool did_exec = false; - if (!reason.empty()) { + // debugserver can send reason = "none" which is equivalent + // to no reason. 
+ if (!reason.empty() && reason != "none") { if (reason == "trace") { addr_t pc = thread_sp->GetRegisterContext()->GetPC(); lldb::BreakpointSiteSP bp_site_sp = @@ -1864,11 +1886,10 @@ ThreadSP ProcessGDBRemote::SetThreadStopInfo( lldb::BreakpointSiteSP bp_site_sp = thread_sp->GetProcess()->GetBreakpointSiteList().FindByAddress(pc); - // If the current pc is a breakpoint site then the StopInfo should be - // set to Breakpoint even though the remote stub did not set it as such. - // This can happen when the thread is involuntarily interrupted (e.g. - // due to stops on other threads) just as it is about to execute the - // breakpoint instruction. + // If a thread is stopped at a breakpoint site, set that as the stop + // reason even if it hasn't executed the breakpoint instruction yet. + // We will silently step over the breakpoint when we resume execution + // and miss the fact that this thread hit the breakpoint. if (bp_site_sp && bp_site_sp->ValidForThisThread(*thread_sp)) { thread_sp->SetStopInfo(StopInfo::CreateStopReasonWithBreakpointSiteID( *thread_sp, bp_site_sp->GetID())); From 3f91bdfdd50aa4eaf1d3e49cf797220cfeccaf16 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 23 Feb 2024 15:25:49 -0800 Subject: [PATCH 216/546] Revert "Replace ArchSpec::PiecewiseCompare() with Triple::operator==()" This reverts commit 5e6bed8c0ea2f7fe380127763c8f753adae0fc1b while investigating the bots. 
--- lldb/include/lldb/Utility/ArchSpec.h | 5 ++++ lldb/source/Target/Target.cpp | 8 +++++- lldb/source/Utility/ArchSpec.cpp | 17 +++++++++++ lldb/test/API/macosx/arm64e-attach/Makefile | 2 -- .../macosx/arm64e-attach/TestArm64eAttach.py | 28 ------------------- lldb/test/API/macosx/arm64e-attach/main.c | 2 -- 6 files changed, 29 insertions(+), 33 deletions(-) delete mode 100644 lldb/test/API/macosx/arm64e-attach/Makefile delete mode 100644 lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py delete mode 100644 lldb/test/API/macosx/arm64e-attach/main.c diff --git a/lldb/include/lldb/Utility/ArchSpec.h b/lldb/include/lldb/Utility/ArchSpec.h index 50830b889b911..a226a3a5a9b71 100644 --- a/lldb/include/lldb/Utility/ArchSpec.h +++ b/lldb/include/lldb/Utility/ArchSpec.h @@ -505,6 +505,11 @@ class ArchSpec { bool IsFullySpecifiedTriple() const; + void PiecewiseTripleCompare(const ArchSpec &other, bool &arch_different, + bool &vendor_different, bool &os_different, + bool &os_version_different, + bool &env_different) const; + /// Detect whether this architecture uses thumb code exclusively /// /// Some embedded ARM chips (e.g. 
the ARM Cortex M0-7 line) can only execute diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index e982a30a3ae4f..e17bfcb5d5e2a 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -1568,8 +1568,14 @@ bool Target::SetArchitecture(const ArchSpec &arch_spec, bool set_platform, if (m_arch.GetSpec().IsCompatibleMatch(other)) { compatible_local_arch = true; + bool arch_changed, vendor_changed, os_changed, os_ver_changed, + env_changed; - if (m_arch.GetSpec().GetTriple() == other.GetTriple()) + m_arch.GetSpec().PiecewiseTripleCompare(other, arch_changed, + vendor_changed, os_changed, + os_ver_changed, env_changed); + + if (!arch_changed && !vendor_changed && !os_changed && !env_changed) replace_local_arch = false; } } diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index 07ef435ef451d..fb0e985a0d565 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -1421,6 +1421,23 @@ bool ArchSpec::IsFullySpecifiedTriple() const { return true; } +void ArchSpec::PiecewiseTripleCompare( + const ArchSpec &other, bool &arch_different, bool &vendor_different, + bool &os_different, bool &os_version_different, bool &env_different) const { + const llvm::Triple &me(GetTriple()); + const llvm::Triple &them(other.GetTriple()); + + arch_different = (me.getArch() != them.getArch()); + + vendor_different = (me.getVendor() != them.getVendor()); + + os_different = (me.getOS() != them.getOS()); + + os_version_different = (me.getOSMajorVersion() != them.getOSMajorVersion()); + + env_different = (me.getEnvironment() != them.getEnvironment()); +} + bool ArchSpec::IsAlwaysThumbInstructions() const { std::string Status; if (GetTriple().getArch() == llvm::Triple::arm || diff --git a/lldb/test/API/macosx/arm64e-attach/Makefile b/lldb/test/API/macosx/arm64e-attach/Makefile deleted file mode 100644 index c9319d6e6888a..0000000000000 --- a/lldb/test/API/macosx/arm64e-attach/Makefile +++ 
/dev/null @@ -1,2 +0,0 @@ -C_SOURCES := main.c -include Makefile.rules diff --git a/lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py b/lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py deleted file mode 100644 index 0dc8700ed02dd..0000000000000 --- a/lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py +++ /dev/null @@ -1,28 +0,0 @@ -import lldb -from lldbsuite.test.decorators import * -from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil - - -class TestArm64eAttach(TestBase): - NO_DEBUG_INFO_TESTCASE = True - - # On Darwin systems, arch arm64e means ARMv8.3 with ptrauth ABI used. - @skipIf(archs=no_match(["arm64e"])) - def test(self): - # Skip this test if not running on AArch64 target that supports PAC - if not self.isAArch64PAuth(): - self.skipTest("Target must support pointer authentication.") - self.build() - popen = self.spawnSubprocess(self.getBuildArtifact(), []) - error = lldb.SBError() - # This simulates how Xcode attaches to a process by pid/name. 
- target = self.dbg.CreateTarget("", "arm64", "", True, error) - listener = lldb.SBListener("my.attach.listener") - process = target.AttachToProcessWithID(listener, popen.pid, error) - self.assertSuccess(error) - self.assertTrue(process, PROCESS_IS_VALID) - self.assertEqual(target.GetTriple().split('-')[0], "arm64e", - "target triple is updated correctly") - error = process.Kill() - self.assertSuccess(error) diff --git a/lldb/test/API/macosx/arm64e-attach/main.c b/lldb/test/API/macosx/arm64e-attach/main.c deleted file mode 100644 index 7baf2ffd8f289..0000000000000 --- a/lldb/test/API/macosx/arm64e-attach/main.c +++ /dev/null @@ -1,2 +0,0 @@ -int getchar(); -int main() { return getchar(); } From 775bd60363353b78657967c80f0f109cdb65cf8f Mon Sep 17 00:00:00 2001 From: Visoiu Mistrih Francis <890283+francisvm@users.noreply.github.com> Date: Fri, 23 Feb 2024 15:44:57 -0800 Subject: [PATCH 217/546] [RISCV] Add scheduling info for Zcmp (#82719) The order of the entries in the list is: outs, ins, Defs, Uses, implicit-defs, implicit uses, where the last two are added programatically during codegen depending on the registers saved/restored and are not described in the TD files. 
--- llvm/lib/Target/RISCV/RISCVInstrInfoZc.td | 32 +++++++++++---- llvm/test/CodeGen/RISCV/cm_mvas_mvsa.mir | 48 +++++++++++++++++++++++ 2 files changed, 73 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/cm_mvas_mvsa.mir diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td index a3ec2e5667ee5..2c8451c5c4ceb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td @@ -181,29 +181,47 @@ def C_SH : CStoreH_rri<0b100011, 0b0, "c.sh">, // Zcmp let DecoderNamespace = "RVZcmp", Predicates = [HasStdExtZcmp], - Defs = [X10, X11], hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { + hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +let Defs = [X10, X11] in def CM_MVA01S : RVInst16CA<0b101011, 0b11, 0b10, (outs), - (ins SR07:$rs1, SR07:$rs2), "cm.mva01s", "$rs1, $rs2">; + (ins SR07:$rs1, SR07:$rs2), "cm.mva01s", "$rs1, $rs2">, + Sched<[WriteIALU, WriteIALU, ReadIALU, ReadIALU]>; +let Uses = [X10, X11] in def CM_MVSA01 : RVInst16CA<0b101011, 0b01, 0b10, (outs SR07:$rs1, SR07:$rs2), - (ins), "cm.mvsa01", "$rs1, $rs2">; + (ins), "cm.mvsa01", "$rs1, $rs2">, + Sched<[WriteIALU, WriteIALU, ReadIALU, ReadIALU]>; } // DecoderNamespace = "RVZcmp", Predicates = [HasStdExtZcmp]... 
let DecoderNamespace = "RVZcmp", Predicates = [HasStdExtZcmp] in { let hasSideEffects = 0, mayLoad = 0, mayStore = 1, Uses = [X2], Defs = [X2] in -def CM_PUSH : RVInstZcCPPP<0b11000, "cm.push">; +def CM_PUSH : RVInstZcCPPP<0b11000, "cm.push">, + Sched<[WriteIALU, ReadIALU, ReadStoreData, ReadStoreData, + ReadStoreData, ReadStoreData, ReadStoreData, ReadStoreData, + ReadStoreData, ReadStoreData, ReadStoreData, ReadStoreData, + ReadStoreData, ReadStoreData, ReadStoreData]>; let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isReturn = 1, Uses = [X2], Defs = [X2] in -def CM_POPRET : RVInstZcCPPP<0b11110, "cm.popret">; +def CM_POPRET : RVInstZcCPPP<0b11110, "cm.popret">, + Sched<[WriteIALU, WriteLDW, WriteLDW, WriteLDW, WriteLDW, + WriteLDW, WriteLDW, WriteLDW, WriteLDW, WriteLDW, + WriteLDW, WriteLDW, WriteLDW, WriteLDW, ReadIALU]>; let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isReturn = 1, Uses = [X2], Defs = [X2, X10] in -def CM_POPRETZ : RVInstZcCPPP<0b11100, "cm.popretz">; +def CM_POPRETZ : RVInstZcCPPP<0b11100, "cm.popretz">, + Sched<[WriteIALU, WriteIALU, WriteLDW, WriteLDW, WriteLDW, + WriteLDW, WriteLDW, WriteLDW, WriteLDW, WriteLDW, + WriteLDW, WriteLDW, WriteLDW, WriteLDW, WriteLDW, + ReadIALU]>; let hasSideEffects = 0, mayLoad = 1, mayStore = 0, Uses = [X2], Defs = [X2] in -def CM_POP : RVInstZcCPPP<0b11010, "cm.pop">; +def CM_POP : RVInstZcCPPP<0b11010, "cm.pop">, + Sched<[WriteIALU, WriteLDW, WriteLDW, WriteLDW, WriteLDW, + WriteLDW, WriteLDW, WriteLDW, WriteLDW, WriteLDW, WriteLDW, + WriteLDW, WriteLDW, WriteLDW, ReadIALU]>; } // DecoderNamespace = "RVZcmp", Predicates = [HasStdExtZcmp]... 
let DecoderNamespace = "RVZcmt", Predicates = [HasStdExtZcmt], diff --git a/llvm/test/CodeGen/RISCV/cm_mvas_mvsa.mir b/llvm/test/CodeGen/RISCV/cm_mvas_mvsa.mir new file mode 100644 index 0000000000000..d1c919f69f702 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/cm_mvas_mvsa.mir @@ -0,0 +1,48 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -mtriple=riscv32 -verify-machineinstrs -run-pass=riscv-move-merge -simplify-mir -o - %s | FileCheck -check-prefixes=CHECK32I %s +# RUN: llc -mtriple=riscv32 -mattr=+zcmp -verify-machineinstrs -run-pass=riscv-move-merge -simplify-mir -o - %s | FileCheck -check-prefixes=CHECK32ZCMP %s +# RUN: llc -mtriple=riscv64 -verify-machineinstrs -run-pass=riscv-move-merge -simplify-mir -o - %s | FileCheck -check-prefixes=CHECK64I %s +# RUN: llc -mtriple=riscv64 -mattr=+zcmp -verify-machineinstrs -run-pass=riscv-move-merge -simplify-mir -o - %s | FileCheck -check-prefixes=CHECK64ZCMP %s +--- +name: zcmp_mv +tracksRegLiveness: true +body: | + bb.0: + liveins: $x11, $x10 + ; CHECK32I-LABEL: name: zcmp_mv + ; CHECK32I: liveins: $x11, $x10 + ; CHECK32I-NEXT: {{ $}} + ; CHECK32I-NEXT: $x8 = ADDI $x11, 0 + ; CHECK32I-NEXT: $x9 = ADDI $x10, 0 + ; CHECK32I-NEXT: $x10 = ADDI killed $x9, 0 + ; CHECK32I-NEXT: $x11 = ADDI $x8, 0 + ; CHECK32I-NEXT: PseudoRET + ; + ; CHECK32ZCMP-LABEL: name: zcmp_mv + ; CHECK32ZCMP: liveins: $x11, $x10 + ; CHECK32ZCMP-NEXT: {{ $}} + ; CHECK32ZCMP-NEXT: $x9, $x8 = CM_MVSA01 implicit $x10, implicit $x11 + ; CHECK32ZCMP-NEXT: CM_MVA01S killed $x9, $x8, implicit-def $x10, implicit-def $x11 + ; CHECK32ZCMP-NEXT: PseudoRET + ; + ; CHECK64I-LABEL: name: zcmp_mv + ; CHECK64I: liveins: $x11, $x10 + ; CHECK64I-NEXT: {{ $}} + ; CHECK64I-NEXT: $x8 = ADDI $x11, 0 + ; CHECK64I-NEXT: $x9 = ADDI $x10, 0 + ; CHECK64I-NEXT: $x10 = ADDI killed $x9, 0 + ; CHECK64I-NEXT: $x11 = ADDI $x8, 0 + ; CHECK64I-NEXT: PseudoRET + ; + ; CHECK64ZCMP-LABEL: name: zcmp_mv + ; CHECK64ZCMP: 
liveins: $x11, $x10 + ; CHECK64ZCMP-NEXT: {{ $}} + ; CHECK64ZCMP-NEXT: $x9, $x8 = CM_MVSA01 implicit $x10, implicit $x11 + ; CHECK64ZCMP-NEXT: CM_MVA01S killed $x9, $x8, implicit-def $x10, implicit-def $x11 + ; CHECK64ZCMP-NEXT: PseudoRET + $x8 = ADDI $x11, 0 + $x9 = ADDI $x10, 0 + $x10 = ADDI killed $x9, 0 + $x11 = ADDI $x8, 0 + PseudoRET +... From 10c48a772742b7afe665a815b7eba2047f17dc4b Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Fri, 23 Feb 2024 15:58:32 -0800 Subject: [PATCH 218/546] [llvm-shlib] Change libLLVM-$MAJOR.so symlink to point to versioned SO (#82660) This symlink was added in 91a384621e5b762d9c173ffd247cfeadd5f436a2 to maintain backwards compatibility, but it needs to point to libLLVM.so.$MAJOR.$MINOR rather than libLLVM.so. This works better for distros that ship libLLVM.so and libLLVM.so.$MAJOR.$MINOR in separate packages and also prevents mistakes like libLLVM-19.so -> libLLVM.so -> libLLVM.so.18.1 Fixes #82647 --- llvm/cmake/modules/AddLLVM.cmake | 8 ++++++-- llvm/tools/llvm-shlib/CMakeLists.txt | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index 5fc663d10b8f7..3bc78b0dc9355 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -2074,7 +2074,7 @@ function(add_lit_testsuites project directory) endfunction() function(llvm_install_library_symlink name dest type) - cmake_parse_arguments(ARG "" "COMPONENT" "" ${ARGN}) + cmake_parse_arguments(ARG "" "COMPONENT;SOVERSION" "" ${ARGN}) foreach(path ${CMAKE_MODULE_PATH}) if(EXISTS ${path}/LLVMInstallSymlink.cmake) set(INSTALL_SYMLINK ${path}/LLVMInstallSymlink.cmake) @@ -2088,7 +2088,11 @@ function(llvm_install_library_symlink name dest type) endif() set(full_name ${CMAKE_${type}_LIBRARY_PREFIX}${name}${CMAKE_${type}_LIBRARY_SUFFIX}) - set(full_dest ${CMAKE_${type}_LIBRARY_PREFIX}${dest}${CMAKE_${type}_LIBRARY_SUFFIX}) + if (ARG_SOVERSION) + set(full_dest 
${CMAKE_${type}_LIBRARY_PREFIX}${dest}${CMAKE_${type}_LIBRARY_SUFFIX}.${ARG_SOVERSION}) + else() + set(full_dest ${CMAKE_${type}_LIBRARY_PREFIX}${dest}${CMAKE_${type}_LIBRARY_SUFFIX}) + endif() if(LLVM_USE_SYMLINKS) set(LLVM_LINK_OR_COPY create_symlink) diff --git a/llvm/tools/llvm-shlib/CMakeLists.txt b/llvm/tools/llvm-shlib/CMakeLists.txt index 09c15e304614c..eba1672faee7f 100644 --- a/llvm/tools/llvm-shlib/CMakeLists.txt +++ b/llvm/tools/llvm-shlib/CMakeLists.txt @@ -35,8 +35,8 @@ if(LLVM_BUILD_LLVM_DYLIB) endif() add_llvm_library(LLVM SHARED DISABLE_LLVM_LINK_LLVM_DYLIB OUTPUT_NAME LLVM ${INSTALL_WITH_TOOLCHAIN} ${SOURCES}) # Add symlink for backwards compatibility with old library name - get_target_property(LLVM_DYLIB_FILENAME LLVM OUTPUT_NAME) - llvm_install_library_symlink(LLVM-${LLVM_VERSION_MAJOR}${LLVM_VERSION_SUFFIX} ${LLVM_DYLIB_FILENAME} SHARED COMPONENT LLVM) + get_target_property(LLVM_DYLIB_SOVERSION LLVM SOVERSION) + llvm_install_library_symlink(LLVM-${LLVM_VERSION_MAJOR}${LLVM_VERSION_SUFFIX} LLVM SHARED COMPONENT LLVM SOVERSION ${LLVM_DYLIB_SOVERSION}) list(REMOVE_DUPLICATES LIB_NAMES) if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin") From c862e612068c9c33995a2e2d289ced44b8eba810 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Fri, 23 Feb 2024 16:33:44 -0800 Subject: [PATCH 219/546] Revert "[RemoveDIs] Enable DPLabels conversion [3b/3] (#82639)" This reverts commit 71d47a0b00e9f48dc740556d7f452ffadf308731 because it causes clang to crash in some cases. 
See repro posted at https://github.com/llvm/llvm-project/commit/71d47a0b00e9f48dc740556d7f452ffadf308731 --- .../include/llvm/IR/DebugProgramInstruction.h | 10 -------- llvm/lib/IR/BasicBlock.cpp | 18 ++++++------- llvm/lib/IR/DebugProgramInstruction.cpp | 25 ------------------- 3 files changed, 8 insertions(+), 45 deletions(-) diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h index 97089098ee535..84b0f743d3c9b 100644 --- a/llvm/include/llvm/IR/DebugProgramInstruction.h +++ b/llvm/include/llvm/IR/DebugProgramInstruction.h @@ -62,8 +62,6 @@ class BasicBlock; class MDNode; class Module; class DbgVariableIntrinsic; -class DbgInfoIntrinsic; -class DbgLabelInst; class DIAssignID; class DPMarker; class DPValue; @@ -82,7 +80,6 @@ class raw_ostream; /// clone /// isIdenticalToWhenDefined /// both print methods -/// createDebugIntrinsic class DbgRecord : public ilist_node { public: /// Marker that this DbgRecord is linked into. @@ -106,11 +103,6 @@ class DbgRecord : public ilist_node { void print(raw_ostream &O, bool IsForDebug = false) const; void print(raw_ostream &O, ModuleSlotTracker &MST, bool IsForDebug) const; bool isIdenticalToWhenDefined(const DbgRecord &R) const; - /// Convert this DbgRecord back into an appropriate llvm.dbg.* intrinsic. - /// \p InsertBefore Optional position to insert this intrinsic. - /// \returns A new llvm.dbg.* intrinsic representiung this DbgRecord. - DbgInfoIntrinsic *createDebugIntrinsic(Module *M, - Instruction *InsertBefore) const; ///@} /// Same as isIdenticalToWhenDefined but checks DebugLoc too. 
@@ -185,8 +177,6 @@ class DPLabel : public DbgRecord { DPLabel *clone() const; void print(raw_ostream &O, bool IsForDebug = false) const; void print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const; - DbgLabelInst *createDebugIntrinsic(Module *M, - Instruction *InsertBefore) const; void setLabel(DILabel *NewLabel) { Label = NewLabel; } DILabel *getLabel() const { return Label; } diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 6ea876fde5ec6..0680754444f17 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -81,12 +81,6 @@ void BasicBlock::convertToNewDbgValues() { continue; } - if (DbgLabelInst *DLI = dyn_cast(&I)) { - DPVals.push_back(new DPLabel(DLI->getLabel(), DLI->getDebugLoc())); - DLI->eraseFromParent(); - continue; - } - if (DPVals.empty()) continue; @@ -113,12 +107,16 @@ void BasicBlock::convertFromNewDbgValues() { continue; DPMarker &Marker = *Inst.DbgMarker; - for (DbgRecord &DR : Marker.getDbgValueRange()) - InstList.insert(Inst.getIterator(), - DR.createDebugIntrinsic(getModule(), nullptr)); + for (DbgRecord &DR : Marker.getDbgValueRange()) { + if (auto *DPV = dyn_cast(&DR)) + InstList.insert(Inst.getIterator(), + DPV->createDebugIntrinsic(getModule(), nullptr)); + else + llvm_unreachable("unsupported DbgRecord kind"); + } Marker.eraseFromParent(); - } + }; // Assume no trailing DPValues: we could technically create them at the end // of the block, after a terminator, but this would be non-cannonical and diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp index 389bac4de6a1c..2ca4533afa96c 100644 --- a/llvm/lib/IR/DebugProgramInstruction.cpp +++ b/llvm/lib/IR/DebugProgramInstruction.cpp @@ -112,17 +112,6 @@ bool DbgRecord::isEquivalentTo(const DbgRecord &R) const { return getDebugLoc() == R.getDebugLoc() && isIdenticalToWhenDefined(R); } -DbgInfoIntrinsic * -DbgRecord::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const { - switch 
(RecordKind) { - case ValueKind: - return cast(this)->createDebugIntrinsic(M, InsertBefore); - case LabelKind: - return cast(this)->createDebugIntrinsic(M, InsertBefore); - }; - llvm_unreachable("unsupported DbgRecord kind"); -} - DPValue *DPValue::createDPValue(Value *Location, DILocalVariable *DV, DIExpression *Expr, const DILocation *DI) { return new DPValue(ValueAsMetadata::get(Location), DV, Expr, DI, @@ -388,20 +377,6 @@ DPValue::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const { return DVI; } -DbgLabelInst *DPLabel::createDebugIntrinsic(Module *M, - Instruction *InsertBefore) const { - auto *LabelFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_label); - Value *Args[] = { - MetadataAsValue::get(getDebugLoc()->getContext(), getLabel())}; - DbgLabelInst *DbgLabel = cast( - CallInst::Create(LabelFn->getFunctionType(), LabelFn, Args)); - DbgLabel->setTailCall(); - DbgLabel->setDebugLoc(getDebugLoc()); - if (InsertBefore) - DbgLabel->insertBefore(InsertBefore); - return DbgLabel; -} - Value *DPValue::getAddress() const { auto *MD = getRawAddress(); if (auto *V = dyn_cast(MD)) From 8f2bd8ae68883592a333f4bdbed9798d66e68630 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 23 Feb 2024 17:13:20 -0800 Subject: [PATCH 220/546] [AMDGPU] Introduce iglp_opt(2): Generalized exp/mfma interleaving for select kernels (#81342) This implements the basic pipelining structure of exp/mfma interleaving for better extensibility. While it does have improved extensibility, there are controls which only enable it for DAGs with certain characteristics (matching the DAGs it has been designed against). 
--- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 1079 ++++++++- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 10 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 9 +- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 11 +- .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 2055 +++++++++++++++++ .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 901 ++++++++ 6 files changed, 4008 insertions(+), 57 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index 4462cd8a31f13..e3f7248507953 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -140,8 +140,6 @@ class SchedGroup { // Count of the number of created SchedGroups, used to initialize SGID. static unsigned NumSchedGroups; - const SIInstrInfo *TII; - // Try to add and edge from SU A to SU B. bool tryAddEdge(SUnit *A, SUnit *B); @@ -154,6 +152,7 @@ class SchedGroup { SmallVector Collection; ScheduleDAGInstrs *DAG; + const SIInstrInfo *TII; // Returns true if SU can be added to this SchedGroup. 
bool canAddSU(SUnit &SU) const; @@ -234,13 +233,13 @@ class SchedGroup { SchedGroup(SchedGroupMask SGMask, std::optional MaxSize, ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) - : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) { + : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) { SGID = NumSchedGroups++; } SchedGroup(SchedGroupMask SGMask, std::optional MaxSize, int SyncID, ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) - : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) { + : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) { SGID = NumSchedGroups++; } }; @@ -838,6 +837,7 @@ void PipelineSolver::solve() { enum IGLPStrategyID : int { MFMASmallGemmOptID = 0, MFMASmallGemmSingleWaveOptID = 1, + MFMAExpInterleave = 2 }; // Implement a IGLP scheduling strategy. @@ -849,13 +849,14 @@ class IGLPStrategy { public: /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy. - virtual void applyIGLPStrategy( + virtual bool applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, - bool IsReentry) = 0; + AMDGPU::SchedulingPhase Phase) = 0; // Returns true if this strategy should be applied to a ScheduleDAG. 
- virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0; + virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, + AMDGPU::SchedulingPhase Phase) = 0; bool IsBottomUp = 1; @@ -868,12 +869,15 @@ class IGLPStrategy { class MFMASmallGemmOpt final : public IGLPStrategy { private: public: - void applyIGLPStrategy( + bool applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, - bool IsReentry) override; + AMDGPU::SchedulingPhase Phase) override; - bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } + bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, + AMDGPU::SchedulingPhase Phase) override { + return true; + } MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) : IGLPStrategy(DAG, TII) { @@ -881,10 +885,10 @@ class MFMASmallGemmOpt final : public IGLPStrategy { } }; -void MFMASmallGemmOpt::applyIGLPStrategy( +bool MFMASmallGemmOpt::applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, - bool IsReentry) { + AMDGPU::SchedulingPhase Phase) { // Count the number of MFMA instructions. 
unsigned MFMACount = 0; for (const MachineInstr &I : *DAG) @@ -902,6 +906,955 @@ void MFMASmallGemmOpt::applyIGLPStrategy( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); } + + return true; +} + +class MFMAExpInterleaveOpt final : public IGLPStrategy { +private: + // The count of TRANS SUs involved in the interleaved pipeline + static unsigned TransPipeCount; + // The count of MFMA SUs involved in the interleaved pipeline + static unsigned MFMAPipeCount; + // The count of Add SUs involved in the interleaved pipeline + static unsigned AddPipeCount; + // The number of transitive MFMA successors for each TRANS SU + static unsigned MFMAEnablement; + // The number of transitive TRANS predecessors for each MFMA SU + static unsigned ExpRequirement; + // The count of independent "chains" of MFMA instructions in the pipeline + static unsigned MFMAChains; + // The length of each independent "chain" of MFMA instructions + static unsigned MFMAChainLength; + // Whether or not the pipeline has V_CVT instructions + static bool HasCvt; + // Whether or not there are instructions between the TRANS instruction and + // V_CVT + static bool HasChainBetweenCvt; + // The first occuring DS_READ which feeds an MFMA chain + static std::optional FirstPipeDSR; + // The MFMAPipe SUs with no MFMA predecessors + SmallVector MFMAChainSeeds; + // Compute the heuristics for the pipeline, returning whether or not the DAG + // is well formatted for the mutation + bool analyzeDAG(const SIInstrInfo *TII); + + /// Whether or not the instruction is a transitive predecessor of an MFMA + /// instruction + class IsPipeExp final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + + auto DAG = SyncPipe[0].DAG; + + if (Cache->empty()) { + auto I = DAG->SUnits.rbegin(); + auto E = DAG->SUnits.rend(); + for (; I != E; I++) { + if (TII->isMFMAorWMMA(*I->getInstr())) + 
Cache->push_back(&*I); + } + if (Cache->empty()) + return false; + } + + auto Reaches = (std::any_of( + Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) { + return DAG->IsReachable(TargetSU, const_cast(SU)); + })); + + return Reaches; + } + IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + + /// Whether or not the instruction is a transitive predecessor of the + /// \p Number th MFMA of the MFMAs occuring after a TRANS instruction + class EnablesNthMFMA final : public InstructionRule { + private: + unsigned Number = 1; + + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + bool FoundTrans = false; + unsigned Counter = 1; + auto DAG = SyncPipe[0].DAG; + + if (Cache->empty()) { + SmallVector Worklist; + + auto I = DAG->SUnits.begin(); + auto E = DAG->SUnits.end(); + for (; I != E; I++) { + if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) { + if (Counter == Number) { + Cache->push_back(&*I); + break; + } + ++Counter; + } + if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode())) + FoundTrans = true; + } + if (Cache->empty()) + return false; + } + + return DAG->IsReachable((*Cache)[0], const_cast(SU)); + } + + EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID, + bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache), Number(Number) {} + }; + + /// Whether or not the instruction enables the exact MFMA that is the \p + /// Number th MFMA in the chain starting with \p ChainSeed + class EnablesNthMFMAInChain final : public InstructionRule { + private: + unsigned Number = 1; + SUnit *ChainSeed; + + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + auto DAG = SyncPipe[0].DAG; + + if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr())) + return false; + + if (Cache->empty()) { + auto TempSU = ChainSeed; + auto Depth = Number; + while (Depth 
> 0) { + --Depth; + bool Found = false; + for (auto &Succ : TempSU->Succs) { + if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) { + TempSU = Succ.getSUnit(); + Found = true; + break; + } + } + if (!Found) + return false; + } + + Cache->push_back(TempSU); + } + // If we failed to find the instruction to be placed into the cache, we + // would have already exited. + assert(!Cache->empty()); + + return DAG->IsReachable((*Cache)[0], const_cast(SU)); + } + + EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed, + const SIInstrInfo *TII, unsigned SGID, + bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache), Number(Number), + ChainSeed(ChainSeed) {} + }; + + /// Whether or not the instruction has less than \p Size immediate successors. + /// If \p HasIntermediary is true, this tests also whether all successors of + /// the SUnit have less than \p Size successors. + class LessThanNSuccs final : public InstructionRule { + private: + unsigned Size = 1; + bool HasIntermediary = false; + + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + if (!SyncPipe.size()) + return false; + + auto SuccSize = std::count_if( + SU->Succs.begin(), SU->Succs.end(), + [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); + if (SuccSize >= Size) + return false; + + if (HasIntermediary) { + for (auto Succ : SU->Succs) { + auto SuccSize = std::count_if( + Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(), + [](const SDep &SuccSucc) { + return SuccSucc.getKind() == SDep::Data; + }); + if (SuccSize >= Size) + return false; + } + } + + return true; + } + LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID, + bool HasIntermediary = false, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache), Size(Size), + HasIntermediary(HasIntermediary) {} + }; + + /// Whether or not the instruction has greater than or equal to \p Size + /// immediate successors. 
If \p HasIntermediary is true, this tests also + /// whether all successors of the SUnit have greater than or equal to \p Size + /// successors. + class GreaterThanOrEqualToNSuccs final : public InstructionRule { + private: + unsigned Size = 1; + bool HasIntermediary = false; + + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + if (!SyncPipe.size()) + return false; + + auto SuccSize = std::count_if( + SU->Succs.begin(), SU->Succs.end(), + [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); + if (SuccSize >= Size) + return true; + + if (HasIntermediary) { + for (auto Succ : SU->Succs) { + auto SuccSize = std::count_if( + Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(), + [](const SDep &SuccSucc) { + return SuccSucc.getKind() == SDep::Data; + }); + if (SuccSize >= Size) + return true; + } + } + + return false; + } + GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII, + unsigned SGID, bool HasIntermediary = false, + bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache), Size(Size), + HasIntermediary(HasIntermediary) {} + }; + + // Whether or not the instruction is a relevant V_CVT instruction. + class IsCvt final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + auto Opc = SU->getInstr()->getOpcode(); + return Opc == AMDGPU::V_CVT_F16_F32_e32 || + Opc == AMDGPU::V_CVT_I32_F32_e32; + } + IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + + // Whether or not the instruction is FMA_F32. 
+ class IsFMA final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 || + SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32; + } + IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + + // Whether or not the instruction is a V_ADD_F32 instruction. + class IsPipeAdd final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32; + } + IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + + /// Whether or not the instruction is an immediate RAW successor + /// of the SchedGroup \p Distance steps before. + class IsSuccOfPrevNthGroup final : public InstructionRule { + private: + unsigned Distance = 1; + + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + SchedGroup *OtherGroup = nullptr; + if (!SyncPipe.size()) + return false; + + for (auto &PipeSG : SyncPipe) { + if ((unsigned)PipeSG.getSGID() == SGID - Distance) + OtherGroup = &PipeSG; + } + + if (!OtherGroup) + return false; + if (!OtherGroup->Collection.size()) + return true; + + for (auto &OtherEle : OtherGroup->Collection) { + for (auto &Succ : OtherEle->Succs) { + if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data) + return true; + } + } + + return false; + } + IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, + unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} + }; + + /// Whether or not the instruction is a transitive successor of any + /// instruction the the SchedGroup \p Distance steps before. 
+ class IsReachableFromPrevNthGroup final : public InstructionRule { + private: + unsigned Distance = 1; + + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + SchedGroup *OtherGroup = nullptr; + if (!SyncPipe.size()) + return false; + + for (auto &PipeSG : SyncPipe) { + if ((unsigned)PipeSG.getSGID() == SGID - Distance) + OtherGroup = &PipeSG; + } + + if (!OtherGroup) + return false; + if (!OtherGroup->Collection.size()) + return true; + + auto DAG = SyncPipe[0].DAG; + + for (auto &OtherEle : OtherGroup->Collection) + if (DAG->IsReachable(const_cast(SU), OtherEle)) + return true; + + return false; + } + IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, + unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} + }; + + /// Whether or not the instruction occurs after the SU with NodeNUm \p Number + class OccursAtOrAfterNode final : public InstructionRule { + private: + unsigned Number = 1; + + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + + return SU->NodeNum >= Number; + } + OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID, + bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache), Number(Number) {} + }; + + /// Whether or not the SU is exactly the \p Number th MFMA in the chain + /// starting with \p ChainSeed + class IsExactMFMA final : public InstructionRule { + private: + unsigned Number = 1; + SUnit *ChainSeed; + + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr())) + return false; + + if (Cache->empty()) { + auto TempSU = ChainSeed; + auto Depth = Number; + while (Depth > 0) { + --Depth; + bool Found = false; + for (auto &Succ : TempSU->Succs) { + if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) { + TempSU = Succ.getSUnit(); 
+ Found = true; + break; + } + } + if (!Found) { + return false; + } + } + Cache->push_back(TempSU); + } + // If we failed to find the instruction to be placed into the cache, we + // would have already exited. + assert(!Cache->empty()); + + return (*Cache)[0] == SU; + } + + IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII, + unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache), Number(Number), + ChainSeed(ChainSeed) {} + }; + + // Whether the instruction occurs after the first TRANS instruction. This + // implies the instruction can not be a predecessor of the first TRANS + // insruction + class OccursAfterExp final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef Collection, + SmallVectorImpl &SyncPipe) override { + + SmallVector Worklist; + auto DAG = SyncPipe[0].DAG; + if (Cache->empty()) { + for (auto &SU : DAG->SUnits) + if (TII->isTRANS(SU.getInstr()->getOpcode())) { + Cache->push_back(&SU); + break; + } + if (Cache->empty()) + return false; + } + + return SU->NodeNum > (*Cache)[0]->NodeNum; + } + + OccursAfterExp(const SIInstrInfo *TII, unsigned SGID, + bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + +public: + bool applyIGLPStrategy( + DenseMap &SyncedInstrs, + DenseMap> &SyncedSchedGroups, + AMDGPU::SchedulingPhase Phase) override; + + bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, + AMDGPU::SchedulingPhase Phase) override; + + MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) + : IGLPStrategy(DAG, TII) { + IsBottomUp = 0; + } +}; + +unsigned MFMAExpInterleaveOpt::TransPipeCount = 0; +unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0; +unsigned MFMAExpInterleaveOpt::AddPipeCount = 0; +unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0; +unsigned MFMAExpInterleaveOpt::ExpRequirement = 0; +unsigned MFMAExpInterleaveOpt::MFMAChains = 0; +unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0; +bool MFMAExpInterleaveOpt::HasCvt = 
false; +bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false; +std::optional MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt; + +bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { + SmallVector ExpPipeCands; + SmallVector MFMAPipeCands; + SmallVector MFMAPipeSUs; + SmallVector PackSUs; + SmallVector CvtSUs; + + auto isBitPack = [](unsigned Opc) { + return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64; + }; + + auto isCvt = [](unsigned Opc) { + return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32; + }; + + auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; }; + + AddPipeCount = 0; + for (SUnit &SU : DAG->SUnits) { + auto Opc = SU.getInstr()->getOpcode(); + if (TII->isTRANS(Opc)) { + // Avoid counting a potential bonus V_EXP which all the MFMA depend on + if (SU.Succs.size() >= 7) + continue; + for (auto &Succ : SU.Succs) { + if (Succ.getSUnit()->Succs.size() >= 7) + continue; + } + ExpPipeCands.push_back(&SU); + } + + if (TII->isMFMAorWMMA(*SU.getInstr())) + MFMAPipeCands.push_back(&SU); + + if (isBitPack(Opc)) + PackSUs.push_back(&SU); + + if (isCvt(Opc)) + CvtSUs.push_back(&SU); + + if (isAdd(Opc)) + ++AddPipeCount; + } + + if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size())) + return false; + + TransPipeCount = 0; + + std::optional TempMFMA; + std::optional TempExp; + // Count the number of EXPs that reach an MFMA + for (auto &PredSU : ExpPipeCands) { + for (auto &SuccSU : MFMAPipeCands) { + if (DAG->IsReachable(SuccSU, PredSU)) { + if (!TempExp) { + TempExp = PredSU; + TempMFMA = SuccSU; + } + MFMAPipeSUs.push_back(SuccSU); + ++TransPipeCount; + break; + } + } + } + + if (!(TempExp && TempMFMA)) + return false; + + HasChainBetweenCvt = + std::find_if((*TempExp)->Succs.begin(), (*TempExp)->Succs.end(), + [&isCvt](SDep &Succ) { + return isCvt(Succ.getSUnit()->getInstr()->getOpcode()); + }) == (*TempExp)->Succs.end(); + + // Count the number of MFMAs that are reached by 
an EXP + for (auto &SuccSU : MFMAPipeCands) { + if (MFMAPipeSUs.size() && + std::find_if(MFMAPipeSUs.begin(), MFMAPipeSUs.end(), + [&SuccSU](SUnit *PotentialMatch) { + return PotentialMatch->NodeNum == SuccSU->NodeNum; + }) != MFMAPipeSUs.end()) + continue; + + for (auto &PredSU : ExpPipeCands) { + if (DAG->IsReachable(SuccSU, PredSU)) { + MFMAPipeSUs.push_back(SuccSU); + break; + } + } + } + + MFMAPipeCount = MFMAPipeSUs.size(); + + assert(TempExp && TempMFMA); + assert(MFMAPipeCount > 0); + + std::optional TempCvt; + for (auto &SuccSU : CvtSUs) { + if (DAG->IsReachable(SuccSU, *TempExp)) { + TempCvt = SuccSU; + break; + } + } + + HasCvt = false; + if (TempCvt.has_value()) { + for (auto &SuccSU : MFMAPipeSUs) { + if (DAG->IsReachable(SuccSU, *TempCvt)) { + HasCvt = true; + break; + } + } + } + + MFMAChains = 0; + for (auto &MFMAPipeSU : MFMAPipeSUs) { + if (MFMAChainSeeds.size() && + std::find(MFMAChainSeeds.begin(), MFMAChainSeeds.end(), MFMAPipeSU) != + MFMAChainSeeds.end()) + continue; + if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(), + [&TII](SDep &Succ) { + return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr()); + })) { + MFMAChainSeeds.push_back(MFMAPipeSU); + ++MFMAChains; + } + } + + if (!MFMAChains) + return false; + + for (auto Pred : MFMAChainSeeds[0]->Preds) { + if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) && + Pred.getSUnit()->getInstr()->mayLoad()) + FirstPipeDSR = Pred.getSUnit()->NodeNum; + } + + MFMAChainLength = MFMAPipeCount / MFMAChains; + + // The number of bit pack operations that depend on a single V_EXP + unsigned PackSuccCount = std::count_if( + PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) { + return DAG->IsReachable(VPack, *TempExp); + }); + + // The number of bit pack operations an MFMA depends on + unsigned PackPredCount = + std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(), + [&isBitPack](SDep &Pred) { + auto Opc = Pred.getSUnit()->getInstr()->getOpcode(); + return 
isBitPack(Opc); + }); + + auto PackPred = + std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(), + [&isBitPack](SDep &Pred) { + auto Opc = Pred.getSUnit()->getInstr()->getOpcode(); + return isBitPack(Opc); + }); + + if (PackPred == (*TempMFMA)->Preds.end()) + return false; + + MFMAEnablement = 0; + ExpRequirement = 0; + // How many MFMAs depend on a single bit pack operation + MFMAEnablement = + std::count_if(PackPred->getSUnit()->Succs.begin(), + PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) { + return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr()); + }); + + // The number of MFMAs that depend on a single V_EXP + MFMAEnablement *= PackSuccCount; + + // The number of V_EXPs required to resolve all dependencies for an MFMA + ExpRequirement = + std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(), + [this, &PackPred](SUnit *ExpBase) { + return DAG->IsReachable(PackPred->getSUnit(), ExpBase); + }); + + ExpRequirement *= PackPredCount; + return true; +} + +bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG, + AMDGPU::SchedulingPhase Phase) { + const GCNSubtarget &ST = DAG->MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + if (Phase != AMDGPU::SchedulingPhase::PostRA) + MFMAChainSeeds.clear(); + if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII)) + return false; + + return true; +} + +bool MFMAExpInterleaveOpt::applyIGLPStrategy( + DenseMap &SyncedInstrs, + DenseMap> &SyncedSchedGroups, + AMDGPU::SchedulingPhase Phase) { + + bool IsSmallKernelType = + MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32; + bool IsLargeKernelType = + MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64; + + if (!(IsSmallKernelType || IsLargeKernelType)) + return false; + + const GCNSubtarget &ST = DAG->MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + unsigned PipelineSyncID = 0; + SchedGroup *SG = nullptr; + + unsigned MFMAChain = 0; + unsigned PositionInChain = 0; 
+ unsigned CurrMFMAForTransPosition = 0; + + auto incrementTransPosition = [&MFMAChain, &PositionInChain, + &CurrMFMAForTransPosition]() { + CurrMFMAForTransPosition += MFMAEnablement; + PositionInChain = (CurrMFMAForTransPosition / MFMAChains); + MFMAChain = CurrMFMAForTransPosition % MFMAChains; + }; + + auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() { + auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; + return (TempMFMAForTrans / MFMAChains); + }; + + auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() { + auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; + return TempMFMAForTrans % MFMAChains; + }; + + unsigned CurrMFMAPosition = 0; + unsigned MFMAChainForMFMA = 0; + unsigned PositionInChainForMFMA = 0; + + auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA, + &PositionInChainForMFMA]() { + ++CurrMFMAPosition; + MFMAChainForMFMA = CurrMFMAPosition % MFMAChains; + PositionInChainForMFMA = CurrMFMAPosition / MFMAChains; + }; + + bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA; + assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains); + + bool UsesFMA = IsSmallKernelType || !IsPostRA; + bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR; + bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA); + bool UsesVALU = IsSmallKernelType; + + // PHASE 1: "Prefetch" + if (UsesFMA) { + // First Round FMA + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII); + if (!IsPostRA && MFMAChains) { + SG->addRule(std::make_shared( + PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), + true)); + } else + SG->addRule( + std::make_shared(1, TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + // Second Round FMA + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, 
DAG, TII); + if (!IsPostRA && MFMAChains) { + SG->addRule(std::make_shared( + getNextTransPositionInChain(), + MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true)); + } else + SG->addRule(std::make_shared(MFMAEnablement + 1, TII, + SG->getSGID(), true)); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + if (UsesDSRead) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(*FirstPipeDSR, TII, + SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + // First Round EXP + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII); + if (!IsPostRA && MFMAChains) + SG->addRule(std::make_shared( + PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true)); + else + SG->addRule(std::make_shared(1, TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(8, TII, SG->getSGID(), + HasChainBetweenCvt)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + incrementTransPosition(); + + // First Round CVT, Third Round FMA, Second Round EXP; interleaved + for (unsigned I = 0; I < ExpRequirement; I++) { + // First Round CVT + if (UsesCvt) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID())); + if (HasChainBetweenCvt) + SG->addRule(std::make_shared( + 1 + (2 + UsesFMA) * I, TII, SG->getSGID())); + else + SG->addRule(std::make_shared( + 1 + (2 + UsesFMA) * I, TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + // Third Round FMA + if (UsesFMA) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + if (!IsPostRA && MFMAChains) { + SG->addRule(std::make_shared( 
+ getNextTransPositionInChain(), + MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true)); + } else + SG->addRule(std::make_shared(2 * MFMAEnablement + 1, + TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + // Second Round EXP + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); + if (!IsPostRA && MFMAChains) + SG->addRule(std::make_shared( + PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), + true)); + else + SG->addRule(std::make_shared(MFMAEnablement + 1, TII, + SG->getSGID(), true)); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(8, TII, SG->getSGID(), + HasChainBetweenCvt)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + // The "extra" EXP which enables all MFMA + // TODO: UsesExtraExp + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared( + 8, TII, SG->getSGID(), HasChainBetweenCvt)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + // PHASE 2: Main Interleave Loop + + // The number of MFMAs per iteration + unsigned MFMARatio = + MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1; + // The number of Exps per iteration + unsigned ExpRatio = + MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement; + // The reamaining Exps + unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement) + ? TransPipeCount - (2 * ExpRequirement) + : 0; + unsigned ExpLoopCount = RemainingExp / ExpRatio; + // In loop MFMAs + unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2) + ? MFMAPipeCount - (MFMAEnablement * 2) + : 0; + unsigned MFMALoopCount = MFMAInLoop / MFMARatio; + unsigned VALUOps = + AddPipeCount < MFMAPipeCount ? 
1 : AddPipeCount / MFMAPipeCount; + unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount); + + for (unsigned I = 0; I < LoopSize; I++) { + if (!(I * ExpRatio % ExpRequirement)) + incrementTransPosition(); + + // Round N MFMA + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII); + if (!IsPostRA && MFMAChains) + SG->addRule(std::make_shared( + PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII, + SG->getSGID(), true)); + else + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + incrementMFMAPosition(); + + if (UsesVALU) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + if (UsesDSRead && !(I % 4)) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(*FirstPipeDSR, TII, + SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + // CVT, EXP, FMA Interleaving + for (unsigned J = 0; J < ExpRatio; J++) { + auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1); + auto MaxMFMAOffset = + (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio; + + // Round N + 1 CVT + if (UsesCvt) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID())); + auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1; + auto DSROffset = I / 4 + 1; + auto MaxDSROffset = MaxMFMAOffset / 4; + // TODO: UsesExtraExp + auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 
0 : 1; + auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) + + std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff + + ExpOffset; + if (HasChainBetweenCvt) + SG->addRule(std::make_shared( + CurrentOffset, TII, SG->getSGID())); + else + SG->addRule(std::make_shared(CurrentOffset, TII, + SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + // Round N + 3 FMA + if (UsesFMA) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); + if (!IsPostRA && MFMAChains) + SG->addRule(std::make_shared( + getNextTransPositionInChain(), + MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), + true)); + else + SG->addRule(std::make_shared( + (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1, + TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(TII, SG->getSGID())); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + // Round N + 2 Exp + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); + if (!IsPostRA && MFMAChains) + SG->addRule(std::make_shared( + PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), + true)); + else + SG->addRule(std::make_shared( + (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1, + TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->addRule(std::make_shared(8, TII, SG->getSGID(), + HasChainBetweenCvt)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + } + + // PHASE 3: Remaining MFMAs + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + return true; } class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { @@ -1098,12 +2051,15 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { }; public: - 
void applyIGLPStrategy( + bool applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, - bool IsReentry) override; + AMDGPU::SchedulingPhase Phase) override; - bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } + bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, + AMDGPU::SchedulingPhase Phase) override { + return true; + } MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) : IGLPStrategy(DAG, TII) { @@ -1115,15 +2071,17 @@ static unsigned DSWCount = 0; static unsigned DSWWithPermCount = 0; static unsigned DSWWithSharedVMEMCount = 0; -void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( +bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( DenseMap &SyncedInstrs, DenseMap> &SyncedSchedGroups, - bool IsReentry) { + AMDGPU::SchedulingPhase Phase) { unsigned MFMACount = 0; unsigned DSRCount = 0; - assert((IsReentry || (DSWCount == 0 && DSWWithPermCount == 0 && - DSWWithSharedVMEMCount == 0)) && + bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial; + + assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 && + DSWWithSharedVMEMCount == 0)) && "DSWCounters should be zero in pre-RA scheduling!"); SmallVector DSWithPerms; for (auto &SU : DAG->SUnits) { @@ -1133,7 +2091,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( else if (TII->isDS(*I)) { if (I->mayLoad()) ++DSRCount; - else if (I->mayStore() && !IsReentry) { + else if (I->mayStore() && IsInitial) { ++DSWCount; for (auto Pred : SU.Preds) { if (Pred.getSUnit()->getInstr()->getOpcode() == @@ -1146,7 +2104,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( } } - if (!IsReentry) { + if (IsInitial) { DSWWithPermCount = DSWithPerms.size(); auto I = DSWithPerms.begin(); auto E = DSWithPerms.end(); @@ -1254,14 +2212,14 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); - SG->addRule(std::make_shared(TII, SG->getSGID(), 
false)); + SG->addRule(std::make_shared(TII, SG->getSGID())); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared( 1, TII, SG->getSGID(), true)); - SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->addRule(std::make_shared(TII, SG->getSGID())); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( @@ -1272,7 +2230,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared( 3, TII, SG->getSGID(), true)); - SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->addRule(std::make_shared(TII, SG->getSGID())); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( @@ -1290,7 +2248,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); - SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->addRule(std::make_shared(TII, SG->getSGID())); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( @@ -1311,7 +2269,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); - SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->addRule(std::make_shared(TII, SG->getSGID())); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( @@ -1325,7 +2283,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); - SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->addRule(std::make_shared(TII, 
SG->getSGID())); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( @@ -1336,7 +2294,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared( 2, TII, SG->getSGID(), true)); - SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->addRule(std::make_shared(TII, SG->getSGID())); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( @@ -1347,13 +2305,15 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); SG->addRule(std::make_shared( 4, TII, SG->getSGID(), true)); - SG->addRule(std::make_shared(TII, SG->getSGID(), false)); + SG->addRule(std::make_shared(TII, SG->getSGID())); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); } + + return true; } static std::unique_ptr @@ -1364,6 +2324,8 @@ createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, return std::make_unique(DAG, TII); case MFMASmallGemmSingleWaveOptID: return std::make_unique(DAG, TII); + case MFMAExpInterleave: + return std::make_unique(DAG, TII); } llvm_unreachable("Unknown IGLPStrategyID"); @@ -1375,6 +2337,8 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation { ScheduleDAGMI *DAG; + std::vector> *SavedMutations; + // Organize lists of SchedGroups by their SyncID. SchedGroups / // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added // between then. 
@@ -1401,7 +2365,7 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation { void initSchedGroupBarrierPipelineStage( std::vector::reverse_iterator RIter); - void initIGLPOpt(SUnit &SU); + bool initIGLPOpt(SUnit &SU); public: void apply(ScheduleDAGInstrs *DAGInstrs) override; @@ -1413,11 +2377,14 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation { // first created SchedGroup first. bool IsBottomUp = 1; - // Whether or not this is a reentry into the IGroupLPDAGMutation. - bool IsReentry = false; + // The scheduling phase this application of IGLP corresponds with. + AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial; IGroupLPDAGMutation() = default; - IGroupLPDAGMutation(bool IsReentry) : IsReentry(IsReentry) {} + IGroupLPDAGMutation( + AMDGPU::SchedulingPhase Phase, + std::vector> *SavedMutations) + : SavedMutations(SavedMutations), Phase(Phase) {} }; unsigned SchedGroup::NumSchedGroups = 0; @@ -1591,7 +2558,6 @@ void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) { auto &SU = *I; if (isFull()) break; - if (canAddSU(SU)) SyncedInstrs[&SU].push_back(SGID); } @@ -1608,31 +2574,41 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { DAG = static_cast(DAGInstrs); SyncedSchedGroups.clear(); SyncedInstrs.clear(); - bool foundSB = false; - bool foundIGLP = false; + bool FoundSB = false; + bool FoundIGLP = false; + bool ShouldApplyIGLP = false; for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) { unsigned Opc = R->getInstr()->getOpcode(); // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive. 
if (Opc == AMDGPU::SCHED_BARRIER) { addSchedBarrierEdges(*R); - foundSB = true; + FoundSB = true; } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) { initSchedGroupBarrierPipelineStage(R); - foundSB = true; + FoundSB = true; } else if (Opc == AMDGPU::IGLP_OPT) { resetEdges(*R, DAG); - if (!foundSB && !foundIGLP) - initIGLPOpt(*R); - foundIGLP = true; + if (!FoundSB && !FoundIGLP) { + FoundIGLP = true; + ShouldApplyIGLP = initIGLPOpt(*R); + } } } - if (foundSB || foundIGLP) { + if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) { PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp); // PipelineSolver performs the mutation by adding the edges it // determined as the best PS.solve(); + return; } + + if (!SavedMutations) + return; + + // We did not apply a mutation, fall back to SavedMutations + for (auto &m : *SavedMutations) + m->apply(DAG); } void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { @@ -1711,27 +2687,30 @@ void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage( SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]); } -void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { +bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { IGLPStrategyID StrategyID = (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm(); auto S = createIGLPStrategy(StrategyID, DAG, TII); - if (S->shouldApplyStrategy(DAG)) { - IsBottomUp = S->IsBottomUp; - S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsReentry); - } + if (!S->shouldApplyStrategy(DAG, Phase)) + return false; + + IsBottomUp = S->IsBottomUp; + return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase); } } // namespace namespace llvm { -/// \p IsReentry specifes whether or not this is a reentry into the +/// \p Phase specifes whether or not this is a reentry into the /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the /// same scheduling region (e.g. 
pre and post-RA scheduling / multiple /// scheduling "phases"), we can reenter this mutation framework more than once /// for a given region. -std::unique_ptr createIGroupLPDAGMutation(bool IsReentry) { - return std::make_unique(IsReentry); +std::unique_ptr createIGroupLPDAGMutation( + AMDGPU::SchedulingPhase Phase, + std::vector> *SavedMutations) { + return std::make_unique(Phase, SavedMutations); } } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h index 3ec8be4f88920..46ef4d702d002 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -11,10 +11,18 @@ #include "llvm/CodeGen/ScheduleDAGMutation.h" #include +#include namespace llvm { -std::unique_ptr createIGroupLPDAGMutation(bool IsReentry); +namespace AMDGPU { +// The current phase of instruction scheduling +enum class SchedulingPhase { Initial, PreRAReentry, PostRA }; +} // namespace AMDGPU + +std::unique_ptr createIGroupLPDAGMutation( + AMDGPU::SchedulingPhase Phase, + std::vector> *SavedMutations); } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e26b4cf820a52..a0c6e3a1b49b7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -461,7 +461,8 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false)); + DAG->addMutation( + createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; @@ -471,7 +472,8 @@ static ScheduleDAGInstrs * 
createGCNMaxILPMachineScheduler(MachineSchedContext *C) { ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(C, std::make_unique(C)); - DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false)); + DAG->addMutation( + createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr)); return DAG; } @@ -934,7 +936,8 @@ class GCNPassConfig final : public AMDGPUPassConfig { if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); - DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true)); + DAG->addMutation( + createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA, nullptr)); if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) DAG->addMutation(createVOPDPairingMutation()); return DAG; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 4081115aa68ca..3f3550d3029aa 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -713,7 +713,8 @@ bool UnclusteredHighRPStage::initGCNSchedStage() { return false; SavedMutations.swap(DAG.Mutations); - DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false)); + DAG.addMutation(createIGroupLPDAGMutation( + AMDGPU::SchedulingPhase::PreRAReentry, nullptr)); InitialOccupancy = DAG.MinOccupancy; // Aggressivly try to reduce register pressure in the unclustered high RP @@ -855,7 +856,10 @@ bool GCNSchedStage::initGCNRegion() { SavedMutations.swap(DAG.Mutations); bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule || StageID == GCNSchedStageID::ILPInitialSchedule; - DAG.addMutation(createIGroupLPDAGMutation(/*IsReentry=*/!IsInitialStage)); + DAG.addMutation(createIGroupLPDAGMutation( + IsInitialStage ? 
AMDGPU::SchedulingPhase::Initial + : AMDGPU::SchedulingPhase::PreRAReentry, + &SavedMutations)); } return true; @@ -1569,7 +1573,8 @@ void GCNPostScheduleDAGMILive::schedule() { if (HasIGLPInstrs) { SavedMutations.clear(); SavedMutations.swap(Mutations); - addMutation(createIGroupLPDAGMutation(/*IsReentry=*/true)); + addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA, + &SavedMutations)); } ScheduleDAGMI::schedule(); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir new file mode 100644 index 0000000000000..da1d9972e42dc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -0,0 +1,2055 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s + +--- | + define amdgpu_kernel void @largeInterleave() #0 { ret void } + ; GCN-LABEL: largeInterleave: + ; GCN: ; %bb.0: + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: ; implicit-def: $vgpr0 + ; GCN-NEXT: ; implicit-def: $vgpr2 + ; GCN-NEXT: ; implicit-def: $vgpr1 + ; GCN-NEXT: ; implicit-def: $vgpr8 + ; GCN-NEXT: ; implicit-def: $vgpr94 + ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN-NEXT: ; implicit-def: $vgpr106 + ; GCN-NEXT: ; implicit-def: $vgpr132 + ; GCN-NEXT: ; implicit-def: $vgpr133 + ; GCN-NEXT: ; implicit-def: $vgpr139 + ; GCN-NEXT: ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127 + ; GCN-NEXT: ; iglp_opt mask(0x00000002) + ; GCN-NEXT: ; implicit-def: $sgpr0 + ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + ; GCN-NEXT: v_readfirstlane_b32 s7, v0 + ; GCN-NEXT: ; implicit-def: 
$sgpr8_sgpr9_sgpr10_sgpr11 + ; GCN-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10_sgpr11 + ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_lshl_add_u32 v0, s7, 4, v2 + ; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 + ; GCN-NEXT: v_add_lshl_u32 v92, v0, v1, 1 + ; GCN-NEXT: v_add_u32_e32 v93, s0, v92 + ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v92, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v93, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: s_lshl_b32 s0, s7, 7 + ; GCN-NEXT: v_add_lshl_u32 v95, v8, s0, 1 + ; GCN-NEXT: v_add_u32_e32 v8, 64, v93 + ; GCN-NEXT: ; kill: killed $vgpr8 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: ; kill: killed $vgpr92 + ; GCN-NEXT: ; implicit-def: $sgpr6 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b128 v95, v[0:3] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b128 v95, v[4:7] offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:64 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v8, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], 0 + ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: 
buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] + ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] + ; GCN-NEXT: ds_read_b128 v[84:87], v106 offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] + ; GCN-NEXT: v_add_u32_e32 v72, 0x80, v93 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:128 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ; kill: killed $vgpr72 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN-NEXT: 
ds_read_b128 v[72:75], v94 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] + ; GCN-NEXT: ds_read_b128 v[80:83], v94 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] + ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN-NEXT: ds_read_b128 v[88:91], v94 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[80:81], v[76:77], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[76:77], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[88:89], v[76:77], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[82:83], v[78:79], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[78:79], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[90:91], v[78:79], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: 
buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] + ; GCN-NEXT: v_add_u32_e32 v72, 0xc0, v93 + ; GCN-NEXT: ; implicit-def: $vgpr73 + ; GCN-NEXT: v_add_u32_e32 v76, v132, v64 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; kill: killed $vgpr72 + ; GCN-NEXT: v_add_u32_e32 v72, v132, v73 + ; GCN-NEXT: buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr74 + ; GCN-NEXT: v_add_u32_e32 v72, v132, v74 + ; GCN-NEXT: ; implicit-def: $vgpr75 + ; GCN-NEXT: buffer_load_dwordx2 
v[100:101], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v72, v132, v75 + ; GCN-NEXT: buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ds_read_b128 v[72:75], v94 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; kill: killed $vgpr76 + ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN-NEXT: ; implicit-def: $sgpr8 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] + ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] + ; GCN-NEXT: ds_read_b128 v[72:75], v94 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; 
GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[78:79], v[32:47] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[72:73], v[76:77], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[74:75], v[78:79], v[16:31] + ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b128 v95, v[64:67] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b128 v95, v[68:71] offset:1024 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[64:67], v94 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[90:93], v94 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr68_vgpr69_vgpr70_vgpr71 + ; GCN-NEXT: ds_read_b128 v[84:87], v94 offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[64:65], v[68:69], v[48:63] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15] + ; GCN-NEXT: ds_read_b128 v[76:79], v94 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[94:97], v106 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[66:67], v[70:71], v[48:63] + ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67 + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[68:69], v[32:47] + ; GCN-NEXT: ds_read_b128 v[88:91], v106 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[80:83], v106 offset:1024 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[72:75], v106 offset:1536 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[94:95], v[64:65], v[48:63] + ; GCN-NEXT: v_perm_b32 v94, v102, v98, s5 + ; GCN-NEXT: v_perm_b32 v98, v102, v98, s8 + ; GCN-NEXT: v_perm_b32 v102, v103, v99, s5 + ; GCN-NEXT: v_perm_b32 v95, v104, v100, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[92:93], v[70:71], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[84:85], v[68:69], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[96:97], v[66:67], v[48:63] + ; GCN-NEXT: v_perm_b32 v96, v103, v99, s8 + ; GCN-NEXT: v_perm_b32 v99, v104, v100, s8 + ; GCN-NEXT: v_perm_b32 v103, v105, v101, s5 + ; GCN-NEXT: v_perm_b32 v97, v105, v101, s8 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[88:89], v[64:65], v[32:47] + ; GCN-NEXT: s_nop 5 + ; GCN-NEXT: v_mul_f32_e32 v100, s4, v48 + ; GCN-NEXT: v_mul_f32_e32 v101, s4, v49 + ; GCN-NEXT: v_max3_f32 v92, v100, s6, v101 + ; GCN-NEXT: v_mul_f32_e32 v93, s4, v50 + ; GCN-NEXT: v_mul_f32_e32 v100, s4, v51 + ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100 + ; GCN-NEXT: v_mul_f32_e32 v93, s4, v52 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[86:87], v[70:71], v[16:31] + ; GCN-NEXT: v_mul_f32_e32 v100, s4, v53 + ; GCN-NEXT: v_max3_f32 v92, v92, v93, v100 + ; GCN-NEXT: v_mul_f32_e32 v84, s4, v54 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v55 + ; GCN-NEXT: v_max3_f32 v84, v92, v84, v85 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v56 + ; GCN-NEXT: v_mul_f32_e32 v92, s4, v57 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[76:77], 
v[68:69], v[0:15] + ; GCN-NEXT: v_max3_f32 v84, v84, v85, v92 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v58 + ; GCN-NEXT: v_mul_f32_e32 v88, s4, v59 + ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v60 + ; GCN-NEXT: v_mul_f32_e32 v88, s4, v61 + ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[90:91], v[66:67], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v62 + ; GCN-NEXT: v_mul_f32_e32 v88, s4, v63 + ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 + ; GCN-NEXT: ; implicit-def: $sgpr6 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[80:81], v[64:65], v[16:31] + ; GCN-NEXT: s_nop 6 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v32 + ; GCN-NEXT: v_mul_f32_e32 v88, s4, v33 + ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v34 + ; GCN-NEXT: v_mul_f32_e32 v88, s4, v35 + ; GCN-NEXT: v_max3_f32 v84, v84, v85, v88 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v36 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[78:79], v[70:71], v[0:15] + ; GCN-NEXT: v_mul_f32_e32 v86, s4, v37 + ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v38 + ; GCN-NEXT: v_mul_f32_e32 v86, s4, v39 + ; GCN-NEXT: v_max3_f32 v84, v84, v85, v86 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v40 + ; GCN-NEXT: v_mul_f32_e32 v80, s4, v41 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[82:83], v[66:67], v[16:31] + ; GCN-NEXT: v_max3_f32 v80, v84, v85, v80 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v42 + ; GCN-NEXT: v_mul_f32_e32 v84, s4, v43 + ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v44 + ; GCN-NEXT: v_mul_f32_e32 v84, s4, v45 + ; GCN-NEXT: v_max3_f32 v80, v80, v81, v84 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[64:65], v[0:15] + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v46 + ; GCN-NEXT: v_mul_f32_e32 v82, s4, v47 + ; GCN-NEXT: v_max3_f32 v80, v80, v81, v82 + ; GCN-NEXT: v_mul_f32_e32 v81, s4, v16 + ; GCN-NEXT: v_mul_f32_e32 v82, s4, v17 + ; GCN-NEXT: v_max3_f32 v80, v80, 
v81, v82 + ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[66:67], v[0:15] + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 + ; GCN-NEXT: v_max3_f32 v68, v80, v68, v69 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v20 + ; GCN-NEXT: v_mul_f32_e32 v76, s4, v21 + ; GCN-NEXT: v_max3_f32 v68, v68, v69, v76 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v22 + ; GCN-NEXT: v_mul_f32_e32 v70, s4, v23 + ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v24 + ; GCN-NEXT: v_mul_f32_e32 v70, s4, v25 + ; GCN-NEXT: v_max3_f32 v68, v68, v69, v70 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v26 + ; GCN-NEXT: v_mul_f32_e32 v70, s4, v27 + ; GCN-NEXT: v_max3_f32 v64, v68, v69, v70 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v28 + ; GCN-NEXT: v_mul_f32_e32 v68, s4, v29 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v30 + ; GCN-NEXT: v_mul_f32_e32 v68, s4, v31 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v0 + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v1 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v2 + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v3 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v4 + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v5 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v6 + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v7 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v8 + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v9 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v10 + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v11 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v12 + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v13 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v14 + ; GCN-NEXT: v_mul_f32_e32 v66, s4, v15 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v66 + ; GCN-NEXT: ; implicit-def: 
$vgpr65 + ; GCN-NEXT: ; implicit-def: $vgpr66 + ; GCN-NEXT: ; implicit-def: $vgpr68 + ; GCN-NEXT: ; implicit-def: $vgpr67 + ; GCN-NEXT: v_add_u32_e32 v65, s7, v65 + ; GCN-NEXT: v_and_b32_e32 v65, 0x1fffffff, v65 + ; GCN-NEXT: v_mul_lo_u32 v65, v65, s6 + ; GCN-NEXT: v_add_lshl_u32 v135, v66, v65, 1 + ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64 + ; GCN-NEXT: ; implicit-def: $vgpr66 + ; GCN-NEXT: v_lshl_add_u32 v136, v66, 1, v135 + ; GCN-NEXT: ; implicit-def: $vgpr66 + ; GCN-NEXT: v_lshl_add_u32 v137, v66, 1, v136 + ; GCN-NEXT: ; implicit-def: $vgpr66 + ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 + ; GCN-NEXT: v_lshl_add_u32 v138, v66, 1, v137 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v135, v[94:95] + ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 + ; GCN-NEXT: v_max_f32_e32 v64, v64, v65 + ; GCN-NEXT: ds_bpermute_b32 v65, v133, v64 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v136, v[98:99] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v137, v[102:103] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v138, v[96:97] + ; GCN-NEXT: v_add_u32_e32 v68, v132, v68 + ; GCN-NEXT: v_cndmask_b32_e64 v64, v65, v64, s[6:7] + ; GCN-NEXT: v_max_f32_e32 v64, v64, v64 + ; GCN-NEXT: ; implicit-def: $vgpr65 + ; GCN-NEXT: v_max_f32_e32 v66, v65, v65 + ; GCN-NEXT: v_max_f32_e32 v134, v66, v64 + ; GCN-NEXT: ; implicit-def: $vgpr64 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v64 + ; GCN-NEXT: buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; implicit-def: $vgpr66 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v66 + ; GCN-NEXT: buffer_load_dwordx2 
v[128:129], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v64, v132, v67 + ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134 + ; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134 + ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134 + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 + ; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134 + ; GCN-NEXT: v_exp_f32_e32 v163, v57 + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96 + ; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134 + ; GCN-NEXT: v_exp_f32_e32 v164, v57 + ; GCN-NEXT: v_exp_f32_e32 v49, v48 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64 + ; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134 + ; GCN-NEXT: v_exp_f32_e32 v50, v48 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v66 + ; GCN-NEXT: v_fma_f32 v68, s4, v52, -v134 + ; GCN-NEXT: v_exp_f32_e32 v51, v48 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_fma_f32 v69, s4, v53, -v134 + ; GCN-NEXT: v_exp_f32_e32 v52, v48 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v68 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_fma_f32 v70, s4, v54, -v134 + ; GCN-NEXT: v_exp_f32_e32 v53, v48 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v69 + ; GCN-NEXT: v_fma_f32 v71, s4, v55, -v134 + ; GCN-NEXT: ds_read_b128 v[140:143], v139 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v54, v48 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70 + ; GCN-NEXT: v_exp_f32_e32 v55, v48 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71 + ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134 + ; GCN-NEXT: v_exp_f32_e32 v56, v48 + ; GCN-NEXT: v_sub_f32_e32 v48, v65, v134 + 
; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49 + ; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50 + ; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51 + ; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52 + ; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48 + ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v48, v48 + ; GCN-NEXT: v_pack_b32_f16 v161, v68, v58 + ; GCN-NEXT: v_pack_b32_f16 v160, v64, v67 + ; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66 + ; GCN-NEXT: ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79 + ; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55 + ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56 + ; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[70:71], v[70:71], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[72:73], v[72:73], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95 + ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134 + ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79] + ; GCN-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[86:87], 
v[86:87], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111 + ; GCN-NEXT: v_exp_f32_e32 v58, v58 + ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95] + ; GCN-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57 + ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59 + ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53 + ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54 + ; GCN-NEXT: v_exp_f32_e32 v59, v57 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111] + ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134 + ; GCN-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0] + ; 
GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134 + ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127] + ; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134 + ; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60 + ; GCN-NEXT: ; implicit-def: $vgpr57 + ; GCN-NEXT: ds_read_b128 v[60:63], v57 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v160, v149 + ; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134 + ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148 + ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79] + ; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134 + ; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134 + ; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134 + ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134 + ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134 + ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95] + ; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162 + ; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163 + ; GCN-NEXT: v_exp_f32_e32 v162, v146 + ; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164 + ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134 + ; GCN-NEXT: v_pack_b32_f16 v148, v153, v147 + ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111] + ; GCN-NEXT: v_exp_f32_e32 v151, v33 + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59 + ; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134 + ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134 + ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134 + ; GCN-NEXT: v_pack_b32_f16 v149, v146, v33 + ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152 + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127] + ; GCN-NEXT: v_fma_f32 v152, s4, v35, -v134 + ; GCN-NEXT: v_exp_f32_e32 v153, v33 + ; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134 + ; GCN-NEXT: v_perm_b32 v36, v158, v156, s5 + ; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32 + ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[144:147], v57 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v61, 0x3fb8aa3b, v161 + ; GCN-NEXT: v_exp_f32_e32 v165, v60 + ; GCN-NEXT: v_perm_b32 v60, v158, v156, s8 + ; GCN-NEXT: v_fma_f32 v158, s4, v37, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v161, v61 + ; GCN-NEXT: v_perm_b32 v140, v159, v157, s8 + ; GCN-NEXT: v_perm_b32 v37, v130, v128, s5 + ; GCN-NEXT: v_perm_b32 v61, v130, v128, s8 + ; GCN-NEXT: v_perm_b32 v141, v131, v129, s8 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b64 v135, v[36:37] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111] + ; GCN-NEXT: v_perm_b32 v32, v159, v157, s5 + ; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v150 + ; GCN-NEXT: v_cvt_f16_f32_e32 v150, v151 + ; GCN-NEXT: v_fma_f32 v157, s4, v38, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v38, v153 + ; GCN-NEXT: v_exp_f32_e32 v159, v33 + ; GCN-NEXT: v_perm_b32 v33, v131, v129, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v129, v150, v38 + ; GCN-NEXT: v_mul_f32_e32 v38, 0x3fb8aa3b, v152 + ; GCN-NEXT: v_exp_f32_e32 v152, v38 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 
v136, v[60:61] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v137, v[32:33] + ; GCN-NEXT: ; implicit-def: $vgpr33 + ; GCN-NEXT: ; implicit-def: $vgpr38 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v138, v[140:141] + ; GCN-NEXT: v_add_u32_e32 v38, v132, v38 + ; GCN-NEXT: v_add_u32_e32 v33, v132, v33 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; implicit-def: $vgpr36 + ; GCN-NEXT: v_add_u32_e32 v33, v132, v36 + ; GCN-NEXT: ; implicit-def: $vgpr37 + ; GCN-NEXT: buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v33, v132, v37 + ; GCN-NEXT: buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v156, v162 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v155 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v165 + ; GCN-NEXT: v_pack_b32_f16 v128, v154, v156 + ; GCN-NEXT: v_fma_f32 v150, s4, v39, -v134 + ; GCN-NEXT: ds_read_b128 v[36:39], v139 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79] + ; GCN-NEXT: v_exp_f32_e32 v154, v32 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v158 + ; GCN-NEXT: ds_read_b128 v[60:63], v139 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v156, s4, v42, -v134 + ; GCN-NEXT: v_perm_b32 v20, v140, v130, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 
v[80:95], v[142:143], v[128:129], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v155, v32 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v157 + ; GCN-NEXT: v_cvt_f16_f32_e32 v142, v161 + ; GCN-NEXT: v_fma_f32 v143, s4, v41, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111] + ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v159 + ; GCN-NEXT: v_exp_f32_e32 v157, v32 + ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v152 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v129, v34, v32 + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v150 + ; GCN-NEXT: v_pack_b32_f16 v128, v33, v142 + ; GCN-NEXT: v_exp_f32_e32 v146, v32 + ; GCN-NEXT: ds_read_b128 v[32:35], v139 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v142, s4, v43, -v134 + ; GCN-NEXT: v_fma_f32 v150, s4, v46, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v40 + ; GCN-NEXT: ds_read_b128 v[40:43], v139 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v147, v36 + ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v143 + ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v154 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v143, v36 + ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v155 + ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v142 + ; GCN-NEXT: v_fma_f32 v61, s4, v45, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111] + ; GCN-NEXT: v_mul_f32_e32 v32, 0x3fb8aa3b, v156 + ; GCN-NEXT: v_cvt_f16_f32_e32 v33, v157 + ; GCN-NEXT: v_exp_f32_e32 v156, v32 + ; GCN-NEXT: v_cvt_f16_f32_e32 v32, v146 + ; GCN-NEXT: v_pack_b32_f16 v33, v33, v32 + ; GCN-NEXT: v_pack_b32_f16 v32, v37, v60 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v129, v36 + ; GCN-NEXT: 
v_mul_f32_e32 v40, 0x3fb8aa3b, v44 + ; GCN-NEXT: v_cvt_f16_f32_e32 v60, v147 + ; GCN-NEXT: v_fma_f32 v128, s4, v47, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] + ; GCN-NEXT: ds_read_b128 v[36:39], v57 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v142, v40 + ; GCN-NEXT: v_mul_f32_e32 v40, 0x3fb8aa3b, v61 + ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v143 + ; GCN-NEXT: ds_read_b128 v[44:47], v57 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95] + ; GCN-NEXT: v_fma_f32 v62, s4, v17, -v134 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 + ; GCN-NEXT: v_exp_f32_e32 v63, v40 + ; GCN-NEXT: v_pack_b32_f16 v40, v60, v61 + ; GCN-NEXT: v_fma_f32 v150, s4, v18, -v134 + ; GCN-NEXT: v_fma_f32 v60, s4, v19, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v142 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111] + ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v156 + ; GCN-NEXT: v_exp_f32_e32 v158, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v129 + ; GCN-NEXT: v_pack_b32_f16 v41, v34, v17 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v128 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v128, v17 + ; GCN-NEXT: v_perm_b32 v42, v141, v131, s8 + ; GCN-NEXT: v_perm_b32 v43, v149, v145, s8 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v36, 0x3fb8aa3b, v16 + ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v37, 0x3fb8aa3b, v62 + ; GCN-NEXT: v_exp_f32_e32 v167, v36 + ; GCN-NEXT: v_perm_b32 v36, v140, v130, s8 + ; GCN-NEXT: v_fma_f32 v62, s4, v21, -v134 + ; GCN-NEXT: 
v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v130, v37 + ; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158 + ; GCN-NEXT: v_perm_b32 v21, v148, v144, s5 + ; GCN-NEXT: v_perm_b32 v37, v148, v144, s8 + ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b64 v135, v[20:21] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111] + ; GCN-NEXT: v_perm_b32 v16, v141, v131, s5 + ; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v128 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v150 + ; GCN-NEXT: v_exp_f32_e32 v140, v17 + ; GCN-NEXT: v_perm_b32 v17, v149, v145, s5 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v136, v[36:37] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v33, v45, v22 + ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60 + ; GCN-NEXT: v_exp_f32_e32 v144, v22 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v137, v[16:17] + ; GCN-NEXT: ; implicit-def: $vgpr17 + ; GCN-NEXT: ; implicit-def: $vgpr22 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v138, v[42:43] + ; GCN-NEXT: v_add_u32_e32 v22, v132, v22 + ; GCN-NEXT: v_add_u32_e32 v17, v132, v17 + ; GCN-NEXT: ; implicit-def: $vgpr20 + ; GCN-NEXT: ; implicit-def: $vgpr21 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_u32_e32 v20, v132, v20 + ; GCN-NEXT: v_add_u32_e32 v21, v132, v21 + ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44 + 
; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v166 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79] + ; GCN-NEXT: v_exp_f32_e32 v132, v16 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v62 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v167 + ; GCN-NEXT: v_fma_f32 v141, s4, v23, -v134 + ; GCN-NEXT: ds_read_b128 v[20:23], v139 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[36:39], v139 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v62, v16 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_cvt_f16_f32_e32 v46, v130 + ; GCN-NEXT: v_fma_f32 v47, s4, v25, -v134 + ; GCN-NEXT: v_fma_f32 v131, s4, v26, -v134 + ; GCN-NEXT: v_fma_f32 v149, s4, v4, -v134 + ; GCN-NEXT: ; implicit-def: $sgpr0 + ; GCN-NEXT: v_perm_b32 v4, v42, v40, s5 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111] + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v140 + ; GCN-NEXT: v_exp_f32_e32 v145, v16 + ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v144 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v33, v18, v16 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v141 + ; GCN-NEXT: v_pack_b32_f16 v32, v17, v46 + ; GCN-NEXT: v_exp_f32_e32 v35, v16 + ; GCN-NEXT: ds_read_b128 v[16:19], v139 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v34, s4, v27, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[32:33], 
v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v24 + ; GCN-NEXT: ds_read_b128 v[24:27], v139 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v46, v20 + ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v47 + ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v132 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[36:37], v[32:33], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v47, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v36, v62 + ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v34 + ; GCN-NEXT: v_fma_f32 v37, s4, v29, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v34, v46 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111] + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v145 + ; GCN-NEXT: v_exp_f32_e32 v141, v16 + ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v35 + ; GCN-NEXT: v_fma_f32 v131, s4, v30, -v134 + ; GCN-NEXT: v_pack_b32_f16 v17, v17, v16 + ; GCN-NEXT: v_pack_b32_f16 v16, v21, v36 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v33, v20 + ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v28 + ; GCN-NEXT: v_fma_f32 v32, s4, v31, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] + ; GCN-NEXT: ds_read_b128 v[20:23], v57 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v36, v24 + ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v37 + ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v47 + ; GCN-NEXT: ds_read_b128 v[28:31], v57 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95] + ; GCN-NEXT: v_fma_f32 v38, s4, v1, -v134 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_exp_f32_e32 v39, v24 + ; GCN-NEXT: v_pack_b32_f16 v24, v34, v37 + ; GCN-NEXT: v_fma_f32 v131, s4, v2, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v37, v36 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], 
v[18:19], v[16:17], v[96:111] + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v141 + ; GCN-NEXT: v_exp_f32_e32 v148, v1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33 + ; GCN-NEXT: v_pack_b32_f16 v25, v18, v1 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v32 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[26:27], v[16:17], v[112:127] + ; GCN-NEXT: v_fma_f32 v32, s4, v3, -v134 + ; GCN-NEXT: v_exp_f32_e32 v34, v1 + ; GCN-NEXT: v_perm_b32 v26, v43, v41, s8 + ; GCN-NEXT: v_perm_b32 v27, v61, v45, s8 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v0 + ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[16:19], v57 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v38 + ; GCN-NEXT: v_exp_f32_e32 v150, v20 + ; GCN-NEXT: v_perm_b32 v20, v42, v40, s8 + ; GCN-NEXT: v_cvt_f16_f32_e32 v40, v148 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v38, v21 + ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v39 + ; GCN-NEXT: v_fma_f32 v29, s4, v5, -v134 + ; GCN-NEXT: v_perm_b32 v5, v60, v44, s5 + ; GCN-NEXT: v_perm_b32 v21, v60, v44, s8 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b64 v135, v[4:5] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[24:25], v[96:111] + ; GCN-NEXT: v_perm_b32 v0, v43, v41, s5 + ; GCN-NEXT: v_fma_f32 v41, s4, v6, -v134 + ; GCN-NEXT: v_cvt_f16_f32_e32 v6, v34 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v131 + ; GCN-NEXT: v_exp_f32_e32 v42, v1 + ; GCN-NEXT: v_perm_b32 v1, v61, v45, s5 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v136, v[20:21] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v137, 
v[0:1] + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b64 v138, v[26:27] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v17, v40, v6 + ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_pack_b32_f16 v16, v37, v28 + ; GCN-NEXT: v_fma_f32 v24, s4, v7, -v134 + ; GCN-NEXT: v_exp_f32_e32 v25, v6 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[4:7], v139 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149 + ; GCN-NEXT: v_exp_f32_e32 v26, v0 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150 + ; GCN-NEXT: v_cvt_f16_f32_e32 v27, v38 + ; GCN-NEXT: ds_read_b128 v[20:23], v139 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_fma_f32 v28, s4, v9, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[30:31], v[16:17], v[80:95] + ; GCN-NEXT: v_exp_f32_e32 v29, v0 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v41 + ; GCN-NEXT: v_fma_f32 v30, s4, v10, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[16:17], v[96:111] + ; GCN-NEXT: v_cvt_f16_f32_e32 v2, v42 + ; GCN-NEXT: v_exp_f32_e32 v31, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127] + ; GCN-NEXT: v_pack_b32_f16 v17, v2, v0 + ; GCN-NEXT: v_pack_b32_f16 v16, v1, v27 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24 + ; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134 + ; GCN-NEXT: v_exp_f32_e32 v19, v0 + ; GCN-NEXT: ds_read_b128 v[0:3], v139 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79] + ; GCN-NEXT: v_mul_f32_e32 
v4, 0x3fb8aa3b, v8 + ; GCN-NEXT: ds_read_b128 v[8:11], v139 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v24, v4 + ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v28 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v26 + ; GCN-NEXT: v_exp_f32_e32 v27, v4 + ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95] + ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29 + ; GCN-NEXT: v_fma_f32 v21, s4, v13, -v134 + ; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111] + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v30 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31 + ; GCN-NEXT: v_exp_f32_e32 v30, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 + ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127] + ; GCN-NEXT: v_exp_f32_e32 v16, v4 + ; GCN-NEXT: v_pack_b32_f16 v0, v5, v20 + ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v12 + ; GCN-NEXT: v_exp_f32_e32 v18, v9 + ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21 + ; GCN-NEXT: v_exp_f32_e32 v21, v9 + ; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] + ; GCN-NEXT: ds_read_b128 v[4:7], v57 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24 + ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95] + ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21 + ; GCN-NEXT: v_cvt_f16_f32_e32 v23, v18 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111] + ; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30 + ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28 + ; GCN-NEXT: v_exp_f32_e32 v2, v2 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], 
v[112:127] + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v8 + ; GCN-NEXT: v_exp_f32_e32 v10, v1 + ; GCN-NEXT: v_pack_b32_f16 v8, v17, v20 + ; GCN-NEXT: v_pack_b32_f16 v9, v3, v0 + ; GCN-NEXT: v_add_f32_e32 v3, 0, v49 + ; GCN-NEXT: v_add_f32_e32 v3, v50, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v51, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v52, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v53, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v54, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v55, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v56, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v58, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v163, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v164, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v59, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v160, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v162, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v151, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v153, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v165, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v161, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v159, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v152, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v154, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v155, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v157, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v146, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v147, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v143, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v156, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v129, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v142, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v63, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v158, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v128, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v167, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v130, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v140, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v144, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v132, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v62, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v145, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v35, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v46, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v47, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v141, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v33, 
v3 + ; GCN-NEXT: v_add_f32_e32 v3, v36, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v39, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v148, v3 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95] + ; GCN-NEXT: v_add_f32_e32 v3, v34, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v150, v3 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10 + ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2 + ; GCN-NEXT: v_add_f32_e32 v3, v38, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v42, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v25, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v26, v3 + ; GCN-NEXT: v_pack_b32_f16 v1, v11, v1 + ; GCN-NEXT: v_pack_b32_f16 v0, v23, v22 + ; GCN-NEXT: v_add_f32_e32 v3, v29, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v31, v3 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95] + ; GCN-NEXT: v_add_f32_e32 v3, v19, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v24, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v27, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v30, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v16, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v18, v3 + ; GCN-NEXT: v_add_f32_e32 v3, v21, v3 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79] + ; GCN-NEXT: v_add_f32_e32 v0, v2, v3 + ; GCN-NEXT: v_add_f32_e32 v4, v10, v0 + ; GCN-NEXT: ds_bpermute_b32 v5, v133, v4 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1152 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_add_f32_e32 v2, v4, v5 + ; GCN-NEXT: ds_bpermute_b32 v3, v133, v2 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[6:7] + ; GCN-NEXT: ; implicit-def: $vgpr4 + ; GCN-NEXT: v_fmac_f32_e32 v0, v4, v48 + ; GCN-NEXT: ds_read_b128 v[0:3], v57 offset:1728 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: 
s_endpgm + attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} + !0 = !{i64 2862105} + +... + +--- +name: largeInterleave +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + occupancy: 7 +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2, $sgpr3, $sgpr4 + %11:vgpr_32 = IMPLICIT_DEF + %1:sgpr_512 = IMPLICIT_DEF + %16:vgpr_32 = IMPLICIT_DEF + %443:sgpr_128 = IMPLICIT_DEF + %18:sreg_32 = IMPLICIT_DEF + %25:vgpr_32 = IMPLICIT_DEF + %23:vgpr_32 = IMPLICIT_DEF + %391:vreg_128_align2 = IMPLICIT_DEF + %24:vgpr_32 = IMPLICIT_DEF + %392:vreg_128_align2 = IMPLICIT_DEF + %401:vreg_128_align2 = IMPLICIT_DEF + %406:vreg_128_align2 = IMPLICIT_DEF + %48:vgpr_32 = IMPLICIT_DEF + %473:sgpr_128 = IMPLICIT_DEF + %411:vreg_128_align2 = IMPLICIT_DEF + %416:vreg_128_align2 = IMPLICIT_DEF + %421:vreg_128_align2 = IMPLICIT_DEF + %426:vreg_128_align2 = IMPLICIT_DEF + %1114:sgpr_32 = IMPLICIT_DEF + %39:vgpr_32 = IMPLICIT_DEF + %484:sreg_64_xexec = IMPLICIT_DEF + %3346:vgpr_32 = IMPLICIT_DEF + %1422:sreg_32 = IMPLICIT_DEF + %1424:sreg_32 = IMPLICIT_DEF + %15:vgpr_32 = IMPLICIT_DEF + %494:sreg_32 = IMPLICIT_DEF + %47:vgpr_32 = IMPLICIT_DEF + %41:vgpr_32 = IMPLICIT_DEF + %42:vgpr_32 = IMPLICIT_DEF + %43:vgpr_32 = IMPLICIT_DEF + %44:vgpr_32 = IMPLICIT_DEF + %45:vgpr_32 = IMPLICIT_DEF + %50:sreg_32 = IMPLICIT_DEF + %3347:vgpr_32 = IMPLICIT_DEF + %3329:vgpr_32 = IMPLICIT_DEF + %3330:vgpr_32 = IMPLICIT_DEF + %3331:vgpr_32 = IMPLICIT_DEF + %3332:vgpr_32 = IMPLICIT_DEF + %3333:vgpr_32 = IMPLICIT_DEF + %2986:vreg_512_align2 = IMPLICIT_DEF + %3038:vreg_512_align2 = IMPLICIT_DEF + %2980:vreg_512_align2 = IMPLICIT_DEF + %3003:vreg_512_align2 = IMPLICIT_DEF + %3334:vgpr_32 = IMPLICIT_DEF + %3335:vgpr_32 = IMPLICIT_DEF + %3336:vgpr_32 = IMPLICIT_DEF + %3337:vgpr_32 = IMPLICIT_DEF + %3338:vgpr_32 = IMPLICIT_DEF + %3339:vgpr_32 = IMPLICIT_DEF + %3345:vgpr_32 = IMPLICIT_DEF + %3340:vgpr_32 = IMPLICIT_DEF + %3341:vgpr_32 = IMPLICIT_DEF + %3342:vgpr_32 = IMPLICIT_DEF + 
%3343:vgpr_32 = IMPLICIT_DEF + %3344:vgpr_32 = IMPLICIT_DEF + %84:vgpr_32 = COPY %3347 + %86:vgpr_32 = COPY %3347:vgpr_32 + IGLP_OPT 2 + %593:sreg_32 = V_READFIRSTLANE_B32 %11:vgpr_32, implicit $exec + %595:vgpr_32 = V_LSHL_ADD_U32_e64 %593:sreg_32, 4, %3329:vgpr_32, implicit $exec + %597:vgpr_32 = nsw V_MUL_LO_U32_e64 %595:vgpr_32, %1.sub6:sgpr_512, implicit $exec + %599:vgpr_32 = V_ADD_LSHL_U32_e64 %597:vgpr_32, %16:vgpr_32, 1, implicit $exec + %601:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec + %602:vgpr_32 = V_ADD_U32_e32 %18:sreg_32, %599:vgpr_32, implicit $exec + %603:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %602:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec + %605:sreg_32 = S_LSHL_B32 %593:sreg_32, 7, implicit-def dead $scc + %606:vgpr_32 = V_ADD_LSHL_U32_e64 %25:vgpr_32, %605:sreg_32, 1, implicit $exec + DS_WRITE_B128_gfx9 %606:vgpr_32, %601:vreg_128_align2, 0, 0, implicit $exec + DS_WRITE_B128_gfx9 %606:vgpr_32, %603:vreg_128_align2, 1024, 0, implicit $exec + %608:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 64, 0, 0, implicit $exec + %610:vgpr_32 = V_ADD_U32_e32 64, %602:vgpr_32, implicit $exec + %611:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %610:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %612:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec + early-clobber %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %612.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %612.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %626:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec + early-clobber 
%679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %626.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %626.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %638:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec + early-clobber %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %638.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %638.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %650:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec + early-clobber %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %650.sub0_sub1:vreg_128_align2, %391.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %650.sub2_sub3:vreg_128_align2, %391.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %662:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %662.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %662.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %673:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %673.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, 
%679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %673.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %684:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %684.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %684.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %695:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %695.sub0_sub1:vreg_128_align2, %392.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %695.sub2_sub3:vreg_128_align2, %392.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + DS_WRITE_B128_gfx9 %606:vgpr_32, %608:vreg_128_align2, 0, 0, implicit $exec + DS_WRITE_B128_gfx9 %606:vgpr_32, %611:vreg_128_align2, 1024, 0, implicit $exec + %706:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 128, 0, 0, implicit $exec + %708:vgpr_32 = V_ADD_U32_e32 128, %602:vgpr_32, implicit $exec + %709:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %708:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %710:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec + %668:vreg_512_align2 = contract 
V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %710.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %710.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %721:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %721.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %721.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %732:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %732.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %732.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %743:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %743.sub0_sub1:vreg_128_align2, %401.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %743.sub2_sub3:vreg_128_align2, %401.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %754:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %754.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, 
%668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %754.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %765:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %765.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %765.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %776:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %776.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %776.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %787:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %787.sub0_sub1:vreg_128_align2, %406.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %787.sub2_sub3:vreg_128_align2, %406.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + DS_WRITE_B128_gfx9 %606:vgpr_32, %706:vreg_128_align2, 0, 0, implicit $exec + DS_WRITE_B128_gfx9 %606:vgpr_32, %709:vreg_128_align2, 1024, 0, implicit $exec + %798:vreg_128_align2 = 
BUFFER_LOAD_DWORDX4_OFFEN %599:vgpr_32, %443:sgpr_128, 0, 192, 0, 0, implicit $exec + %800:vgpr_32 = V_ADD_U32_e32 192, %602:vgpr_32, implicit $exec + %801:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %800:vgpr_32, %443:sgpr_128, 0, 0, 0, 0, implicit $exec + %802:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3330:vgpr_32, implicit $exec + %803:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %802:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %804:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3331:vgpr_32, implicit $exec + %805:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %804:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %806:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3332:vgpr_32, implicit $exec + %807:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %806:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %808:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3333:vgpr_32, implicit $exec + %809:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %808:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %810:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %810.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %810.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %821:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %821.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %821.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 
0, 0, 0, implicit $mode, implicit $exec + %832:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %832.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %832.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %843:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %843.sub0_sub1:vreg_128_align2, %411.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %843.sub2_sub3:vreg_128_align2, %411.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %854:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %854.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %854.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %865:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %865.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %865.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %876:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, 
implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %876.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %876.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %887:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %887.sub0_sub1:vreg_128_align2, %416.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %887.sub2_sub3:vreg_128_align2, %416.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + DS_WRITE_B128_gfx9 %606:vgpr_32, %798:vreg_128_align2, 0, 0, implicit $exec + DS_WRITE_B128_gfx9 %606:vgpr_32, %801:vreg_128_align2, 1024, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %898:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 0, 0, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %898.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %898.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %909:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 512, 0, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %909.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, 
implicit $mode, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %909.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %920:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1024, 0, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %920.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %920.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %931:vreg_128_align2 = DS_READ_B128_gfx9 %23:vgpr_32, 1536, 0, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %931.sub0_sub1:vreg_128_align2, %421.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %931.sub2_sub3:vreg_128_align2, %421.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %942:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %942.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %668:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %942.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %668:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %969:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 512, 0, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %969.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %679:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 
%969.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %679:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %996:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1024, 0, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %996.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %690:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %996.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %690:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %1023:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 1536, 0, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1023.sub0_sub1:vreg_128_align2, %426.sub0_sub1:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %701:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1023.sub2_sub3:vreg_128_align2, %426.sub2_sub3:vreg_128_align2, %701:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %1050:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub0:vreg_512_align2, implicit $mode, implicit $exec + %1051:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub1:vreg_512_align2, implicit $mode, implicit $exec + %1052:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub2:vreg_512_align2, implicit $mode, implicit $exec + %1053:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub3:vreg_512_align2, implicit $mode, implicit $exec + %1054:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub4:vreg_512_align2, implicit $mode, implicit $exec + %1055:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub5:vreg_512_align2, implicit $mode, implicit $exec + %1056:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub6:vreg_512_align2, implicit $mode, implicit $exec + %1057:vgpr_32 = contract 
nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub7:vreg_512_align2, implicit $mode, implicit $exec + %1058:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub8:vreg_512_align2, implicit $mode, implicit $exec + %1059:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub9:vreg_512_align2, implicit $mode, implicit $exec + %1060:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub10:vreg_512_align2, implicit $mode, implicit $exec + %1061:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub11:vreg_512_align2, implicit $mode, implicit $exec + %1062:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub12:vreg_512_align2, implicit $mode, implicit $exec + %1063:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub13:vreg_512_align2, implicit $mode, implicit $exec + %1064:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub14:vreg_512_align2, implicit $mode, implicit $exec + %1065:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %668.sub15:vreg_512_align2, implicit $mode, implicit $exec + %1066:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub0:vreg_512_align2, implicit $mode, implicit $exec + %1067:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub1:vreg_512_align2, implicit $mode, implicit $exec + %1068:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub2:vreg_512_align2, implicit $mode, implicit $exec + %1069:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub3:vreg_512_align2, implicit $mode, implicit $exec + %1070:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub4:vreg_512_align2, implicit $mode, implicit $exec + %1071:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub5:vreg_512_align2, implicit $mode, implicit $exec + %1072:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub6:vreg_512_align2, 
implicit $mode, implicit $exec + %1073:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub7:vreg_512_align2, implicit $mode, implicit $exec + %1074:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub8:vreg_512_align2, implicit $mode, implicit $exec + %1075:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub9:vreg_512_align2, implicit $mode, implicit $exec + %1076:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub10:vreg_512_align2, implicit $mode, implicit $exec + %1077:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub11:vreg_512_align2, implicit $mode, implicit $exec + %1078:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub12:vreg_512_align2, implicit $mode, implicit $exec + %1079:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub13:vreg_512_align2, implicit $mode, implicit $exec + %1080:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub14:vreg_512_align2, implicit $mode, implicit $exec + %1081:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %679.sub15:vreg_512_align2, implicit $mode, implicit $exec + %1082:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub0:vreg_512_align2, implicit $mode, implicit $exec + %1083:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub1:vreg_512_align2, implicit $mode, implicit $exec + %1084:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub2:vreg_512_align2, implicit $mode, implicit $exec + %1085:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub3:vreg_512_align2, implicit $mode, implicit $exec + %1086:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub4:vreg_512_align2, implicit $mode, implicit $exec + %1087:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub5:vreg_512_align2, implicit $mode, implicit $exec + %1088:vgpr_32 = contract nofpexcept 
V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub6:vreg_512_align2, implicit $mode, implicit $exec + %1089:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub7:vreg_512_align2, implicit $mode, implicit $exec + %1090:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub8:vreg_512_align2, implicit $mode, implicit $exec + %1091:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub9:vreg_512_align2, implicit $mode, implicit $exec + %1092:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub10:vreg_512_align2, implicit $mode, implicit $exec + %1093:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub11:vreg_512_align2, implicit $mode, implicit $exec + %1094:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub12:vreg_512_align2, implicit $mode, implicit $exec + %1095:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub13:vreg_512_align2, implicit $mode, implicit $exec + %1096:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub14:vreg_512_align2, implicit $mode, implicit $exec + %1097:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %690.sub15:vreg_512_align2, implicit $mode, implicit $exec + %1098:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub0:vreg_512_align2, implicit $mode, implicit $exec + %1099:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub1:vreg_512_align2, implicit $mode, implicit $exec + %1100:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub2:vreg_512_align2, implicit $mode, implicit $exec + %1101:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub3:vreg_512_align2, implicit $mode, implicit $exec + %1102:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub4:vreg_512_align2, implicit $mode, implicit $exec + %1103:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub5:vreg_512_align2, implicit 
$mode, implicit $exec + %1104:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub6:vreg_512_align2, implicit $mode, implicit $exec + %1105:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub7:vreg_512_align2, implicit $mode, implicit $exec + %1106:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub8:vreg_512_align2, implicit $mode, implicit $exec + %1107:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub9:vreg_512_align2, implicit $mode, implicit $exec + %1108:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub10:vreg_512_align2, implicit $mode, implicit $exec + %1109:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub11:vreg_512_align2, implicit $mode, implicit $exec + %1110:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub12:vreg_512_align2, implicit $mode, implicit $exec + %1111:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub13:vreg_512_align2, implicit $mode, implicit $exec + %1112:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub14:vreg_512_align2, implicit $mode, implicit $exec + %1113:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %1.sub4:sgpr_512, %701.sub15:vreg_512_align2, implicit $mode, implicit $exec + %1115:vgpr_32 = V_MAX3_F32_e64 0, %1050:vgpr_32, 0, %1114:sgpr_32, 0, %1051:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1116:vgpr_32 = V_MAX3_F32_e64 0, %1115:vgpr_32, 0, %1052:vgpr_32, 0, %1053:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1117:vgpr_32 = V_MAX3_F32_e64 0, %1116:vgpr_32, 0, %1054:vgpr_32, 0, %1055:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1118:vgpr_32 = V_MAX3_F32_e64 0, %1117:vgpr_32, 0, %1056:vgpr_32, 0, %1057:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1119:vgpr_32 = V_MAX3_F32_e64 0, %1118:vgpr_32, 0, %1058:vgpr_32, 0, %1059:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1120:vgpr_32 = V_MAX3_F32_e64 0, %1119:vgpr_32, 0, %1060:vgpr_32, 0, 
%1061:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1121:vgpr_32 = V_MAX3_F32_e64 0, %1120:vgpr_32, 0, %1062:vgpr_32, 0, %1063:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1122:vgpr_32 = V_MAX3_F32_e64 0, %1121:vgpr_32, 0, %1064:vgpr_32, 0, %1065:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1123:vgpr_32 = V_MAX3_F32_e64 0, %1122:vgpr_32, 0, %1066:vgpr_32, 0, %1067:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1124:vgpr_32 = V_MAX3_F32_e64 0, %1123:vgpr_32, 0, %1068:vgpr_32, 0, %1069:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1125:vgpr_32 = V_MAX3_F32_e64 0, %1124:vgpr_32, 0, %1070:vgpr_32, 0, %1071:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1126:vgpr_32 = V_MAX3_F32_e64 0, %1125:vgpr_32, 0, %1072:vgpr_32, 0, %1073:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1127:vgpr_32 = V_MAX3_F32_e64 0, %1126:vgpr_32, 0, %1074:vgpr_32, 0, %1075:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1128:vgpr_32 = V_MAX3_F32_e64 0, %1127:vgpr_32, 0, %1076:vgpr_32, 0, %1077:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1129:vgpr_32 = V_MAX3_F32_e64 0, %1128:vgpr_32, 0, %1078:vgpr_32, 0, %1079:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1130:vgpr_32 = V_MAX3_F32_e64 0, %1129:vgpr_32, 0, %1080:vgpr_32, 0, %1081:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1131:vgpr_32 = V_MAX3_F32_e64 0, %1130:vgpr_32, 0, %1082:vgpr_32, 0, %1083:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1132:vgpr_32 = V_MAX3_F32_e64 0, %1131:vgpr_32, 0, %1084:vgpr_32, 0, %1085:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1133:vgpr_32 = V_MAX3_F32_e64 0, %1132:vgpr_32, 0, %1086:vgpr_32, 0, %1087:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1134:vgpr_32 = V_MAX3_F32_e64 0, %1133:vgpr_32, 0, %1088:vgpr_32, 0, %1089:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1135:vgpr_32 = V_MAX3_F32_e64 0, %1134:vgpr_32, 0, %1090:vgpr_32, 0, %1091:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1136:vgpr_32 = V_MAX3_F32_e64 0, %1135:vgpr_32, 0, %1092:vgpr_32, 0, %1093:vgpr_32, 
0, 0, implicit $mode, implicit $exec + %1137:vgpr_32 = V_MAX3_F32_e64 0, %1136:vgpr_32, 0, %1094:vgpr_32, 0, %1095:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1138:vgpr_32 = V_MAX3_F32_e64 0, %1137:vgpr_32, 0, %1096:vgpr_32, 0, %1097:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1139:vgpr_32 = V_MAX3_F32_e64 0, %1138:vgpr_32, 0, %1098:vgpr_32, 0, %1099:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1140:vgpr_32 = V_MAX3_F32_e64 0, %1139:vgpr_32, 0, %1100:vgpr_32, 0, %1101:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1141:vgpr_32 = V_MAX3_F32_e64 0, %1140:vgpr_32, 0, %1102:vgpr_32, 0, %1103:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1142:vgpr_32 = V_MAX3_F32_e64 0, %1141:vgpr_32, 0, %1104:vgpr_32, 0, %1105:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1143:vgpr_32 = V_MAX3_F32_e64 0, %1142:vgpr_32, 0, %1106:vgpr_32, 0, %1107:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1144:vgpr_32 = V_MAX3_F32_e64 0, %1143:vgpr_32, 0, %1108:vgpr_32, 0, %1109:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1145:vgpr_32 = V_MAX3_F32_e64 0, %1144:vgpr_32, 0, %1110:vgpr_32, 0, %1111:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1146:vgpr_32 = V_MAX3_F32_e64 0, %1145:vgpr_32, 0, %1112:vgpr_32, 0, %1113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1147:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1146:vgpr_32, 0, implicit $exec + %1148:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1147:vgpr_32, %1147:vgpr_32, implicit $mode, implicit $exec + %1149:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1146:vgpr_32, %1148:vgpr_32, implicit $mode, implicit $exec + %1150:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1149:vgpr_32, 0, implicit $exec + %1151:vgpr_32 = V_CNDMASK_B32_e64 0, %1150:vgpr_32, 0, %1149:vgpr_32, %484:sreg_64_xexec, implicit $exec + %1153:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1151:vgpr_32, %1151:vgpr_32, implicit $mode, implicit $exec + %1154:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %3346:vgpr_32, %3346:vgpr_32, implicit $mode, implicit $exec + 
%151:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %1154:vgpr_32, %1153:vgpr_32, implicit $mode, implicit $exec + %1155:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1157:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1155:vgpr_32, implicit $mode, implicit $exec + %1158:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1157:vgpr_32, implicit $mode, implicit $exec + %1159:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1160:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1159:vgpr_32, implicit $mode, implicit $exec + %1161:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1160:vgpr_32, implicit $mode, implicit $exec + %1162:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1163:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1162:vgpr_32, implicit $mode, implicit $exec + %1164:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1163:vgpr_32, implicit $mode, implicit $exec + %1165:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1166:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1165:vgpr_32, implicit $mode, implicit $exec + %1167:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1166:vgpr_32, implicit $mode, implicit $exec + %1168:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1169:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1168:vgpr_32, implicit $mode, implicit $exec + %1170:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1169:vgpr_32, implicit $mode, implicit $exec + %1171:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub5:vreg_512_align2, 1, 
%151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1172:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1171:vgpr_32, implicit $mode, implicit $exec + %1173:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1172:vgpr_32, implicit $mode, implicit $exec + %1174:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1175:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1174:vgpr_32, implicit $mode, implicit $exec + %1176:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1175:vgpr_32, implicit $mode, implicit $exec + %1177:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1178:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1177:vgpr_32, implicit $mode, implicit $exec + %1179:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1178:vgpr_32, implicit $mode, implicit $exec + %1180:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1181:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1180:vgpr_32, implicit $mode, implicit $exec + %1182:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1181:vgpr_32, implicit $mode, implicit $exec + %1183:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1184:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1183:vgpr_32, implicit $mode, implicit $exec + %1185:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1184:vgpr_32, implicit $mode, implicit $exec + %1186:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1187:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1186:vgpr_32, implicit $mode, implicit $exec + %1188:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1187:vgpr_32, 
implicit $mode, implicit $exec + %1189:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1190:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1189:vgpr_32, implicit $mode, implicit $exec + %1191:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1190:vgpr_32, implicit $mode, implicit $exec + %1192:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1193:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1192:vgpr_32, implicit $mode, implicit $exec + %1194:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1193:vgpr_32, implicit $mode, implicit $exec + %1195:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1196:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1195:vgpr_32, implicit $mode, implicit $exec + %1197:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1196:vgpr_32, implicit $mode, implicit $exec + %1198:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1199:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1198:vgpr_32, implicit $mode, implicit $exec + %1200:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1199:vgpr_32, implicit $mode, implicit $exec + %1201:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %668.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1202:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1201:vgpr_32, implicit $mode, implicit $exec + %1203:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1202:vgpr_32, implicit $mode, implicit $exec + %1204:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1205:vgpr_32 = afn 
nofpexcept V_MUL_F32_e32 1069066811, %1204:vgpr_32, implicit $mode, implicit $exec + %1206:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1205:vgpr_32, implicit $mode, implicit $exec + %1207:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1208:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1207:vgpr_32, implicit $mode, implicit $exec + %1209:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1208:vgpr_32, implicit $mode, implicit $exec + %1210:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1211:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1210:vgpr_32, implicit $mode, implicit $exec + %1212:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1211:vgpr_32, implicit $mode, implicit $exec + %1213:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1214:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1213:vgpr_32, implicit $mode, implicit $exec + %1215:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1214:vgpr_32, implicit $mode, implicit $exec + %1216:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1217:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1216:vgpr_32, implicit $mode, implicit $exec + %1218:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1217:vgpr_32, implicit $mode, implicit $exec + %1219:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1220:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1219:vgpr_32, implicit $mode, implicit $exec + %1221:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1220:vgpr_32, implicit $mode, implicit $exec + %1222:vgpr_32 = contract nofpexcept 
V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1223:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1222:vgpr_32, implicit $mode, implicit $exec + %1224:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1223:vgpr_32, implicit $mode, implicit $exec + %1225:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1226:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1225:vgpr_32, implicit $mode, implicit $exec + %1227:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1226:vgpr_32, implicit $mode, implicit $exec + %1228:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1229:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1228:vgpr_32, implicit $mode, implicit $exec + %1230:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1229:vgpr_32, implicit $mode, implicit $exec + %1231:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1232:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1231:vgpr_32, implicit $mode, implicit $exec + %1233:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1232:vgpr_32, implicit $mode, implicit $exec + %1234:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1235:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1234:vgpr_32, implicit $mode, implicit $exec + %1236:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1235:vgpr_32, implicit $mode, implicit $exec + %1237:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1238:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1237:vgpr_32, implicit $mode, implicit 
$exec + %1239:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1238:vgpr_32, implicit $mode, implicit $exec + %1240:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1241:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1240:vgpr_32, implicit $mode, implicit $exec + %1242:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1241:vgpr_32, implicit $mode, implicit $exec + %1243:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1244:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1243:vgpr_32, implicit $mode, implicit $exec + %1245:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1244:vgpr_32, implicit $mode, implicit $exec + %1246:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1247:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1246:vgpr_32, implicit $mode, implicit $exec + %1248:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1247:vgpr_32, implicit $mode, implicit $exec + %1249:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %679.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1250:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1249:vgpr_32, implicit $mode, implicit $exec + %1251:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1250:vgpr_32, implicit $mode, implicit $exec + %1252:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1253:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1252:vgpr_32, implicit $mode, implicit $exec + %1254:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1253:vgpr_32, implicit $mode, implicit $exec + %1255:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub1:vreg_512_align2, 1, %151:vgpr_32, 
0, 0, implicit $mode, implicit $exec + %1256:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1255:vgpr_32, implicit $mode, implicit $exec + %1257:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1256:vgpr_32, implicit $mode, implicit $exec + %1258:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1259:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1258:vgpr_32, implicit $mode, implicit $exec + %1260:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1259:vgpr_32, implicit $mode, implicit $exec + %1261:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1262:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1261:vgpr_32, implicit $mode, implicit $exec + %1263:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1262:vgpr_32, implicit $mode, implicit $exec + %1264:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1265:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1264:vgpr_32, implicit $mode, implicit $exec + %1266:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1265:vgpr_32, implicit $mode, implicit $exec + %1267:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1268:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1267:vgpr_32, implicit $mode, implicit $exec + %1269:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1268:vgpr_32, implicit $mode, implicit $exec + %1270:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1271:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1270:vgpr_32, implicit $mode, implicit $exec + %1272:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1271:vgpr_32, implicit $mode, 
implicit $exec + %1273:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1274:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1273:vgpr_32, implicit $mode, implicit $exec + %1275:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1274:vgpr_32, implicit $mode, implicit $exec + %1276:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1277:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1276:vgpr_32, implicit $mode, implicit $exec + %1278:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1277:vgpr_32, implicit $mode, implicit $exec + %1279:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1280:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1279:vgpr_32, implicit $mode, implicit $exec + %1281:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1280:vgpr_32, implicit $mode, implicit $exec + %1282:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1283:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1282:vgpr_32, implicit $mode, implicit $exec + %1284:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1283:vgpr_32, implicit $mode, implicit $exec + %1285:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1286:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1285:vgpr_32, implicit $mode, implicit $exec + %1287:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1286:vgpr_32, implicit $mode, implicit $exec + %1288:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1289:vgpr_32 = afn nofpexcept V_MUL_F32_e32 
1069066811, %1288:vgpr_32, implicit $mode, implicit $exec + %1290:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1289:vgpr_32, implicit $mode, implicit $exec + %1291:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1292:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1291:vgpr_32, implicit $mode, implicit $exec + %1293:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1292:vgpr_32, implicit $mode, implicit $exec + %1294:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1295:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1294:vgpr_32, implicit $mode, implicit $exec + %1296:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1295:vgpr_32, implicit $mode, implicit $exec + %1297:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %690.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1298:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1297:vgpr_32, implicit $mode, implicit $exec + %1299:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1298:vgpr_32, implicit $mode, implicit $exec + %1300:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub0:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1301:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1300:vgpr_32, implicit $mode, implicit $exec + %1302:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1301:vgpr_32, implicit $mode, implicit $exec + %1303:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub1:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1304:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1303:vgpr_32, implicit $mode, implicit $exec + %1305:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1304:vgpr_32, implicit $mode, implicit $exec + %1306:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, 
%1.sub4:sgpr_512, 0, %701.sub2:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1307:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1306:vgpr_32, implicit $mode, implicit $exec + %1308:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1307:vgpr_32, implicit $mode, implicit $exec + %1309:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub3:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1310:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1309:vgpr_32, implicit $mode, implicit $exec + %1311:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1310:vgpr_32, implicit $mode, implicit $exec + %1312:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub4:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1313:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1312:vgpr_32, implicit $mode, implicit $exec + %1314:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1313:vgpr_32, implicit $mode, implicit $exec + %1315:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub5:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1316:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1315:vgpr_32, implicit $mode, implicit $exec + %1317:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1316:vgpr_32, implicit $mode, implicit $exec + %1318:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub6:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1319:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1318:vgpr_32, implicit $mode, implicit $exec + %1320:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1319:vgpr_32, implicit $mode, implicit $exec + %1321:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub7:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1322:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1321:vgpr_32, implicit $mode, implicit $exec + %1323:vgpr_32 = 
afn nofpexcept V_EXP_F32_e32 %1322:vgpr_32, implicit $mode, implicit $exec + %1324:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub8:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1325:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1324:vgpr_32, implicit $mode, implicit $exec + %1326:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1325:vgpr_32, implicit $mode, implicit $exec + %1327:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub9:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1328:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1327:vgpr_32, implicit $mode, implicit $exec + %1329:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1328:vgpr_32, implicit $mode, implicit $exec + %1330:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub10:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1331:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1330:vgpr_32, implicit $mode, implicit $exec + %1332:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1331:vgpr_32, implicit $mode, implicit $exec + %1333:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub11:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1334:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1333:vgpr_32, implicit $mode, implicit $exec + %1335:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1334:vgpr_32, implicit $mode, implicit $exec + %1336:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub12:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1337:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1336:vgpr_32, implicit $mode, implicit $exec + %1338:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1337:vgpr_32, implicit $mode, implicit $exec + %1339:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub13:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, 
implicit $exec + %1340:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1339:vgpr_32, implicit $mode, implicit $exec + %1341:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1340:vgpr_32, implicit $mode, implicit $exec + %1342:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub14:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1343:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1342:vgpr_32, implicit $mode, implicit $exec + %1344:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1343:vgpr_32, implicit $mode, implicit $exec + %1345:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %1.sub4:sgpr_512, 0, %701.sub15:vreg_512_align2, 1, %151:vgpr_32, 0, 0, implicit $mode, implicit $exec + %1346:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1345:vgpr_32, implicit $mode, implicit $exec + %1347:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %1346:vgpr_32, implicit $mode, implicit $exec + %1348:vgpr_32 = contract nofpexcept V_ADD_F32_e32 0, %1158:vgpr_32, implicit $mode, implicit $exec + %1349:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1161:vgpr_32, %1348:vgpr_32, implicit $mode, implicit $exec + %1350:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1164:vgpr_32, %1349:vgpr_32, implicit $mode, implicit $exec + %1351:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1167:vgpr_32, %1350:vgpr_32, implicit $mode, implicit $exec + %1352:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1170:vgpr_32, %1351:vgpr_32, implicit $mode, implicit $exec + %1353:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1173:vgpr_32, %1352:vgpr_32, implicit $mode, implicit $exec + %1354:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1176:vgpr_32, %1353:vgpr_32, implicit $mode, implicit $exec + %1355:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1179:vgpr_32, %1354:vgpr_32, implicit $mode, implicit $exec + %1356:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1182:vgpr_32, %1355:vgpr_32, implicit $mode, implicit $exec + %1357:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1185:vgpr_32, 
%1356:vgpr_32, implicit $mode, implicit $exec + %1358:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1188:vgpr_32, %1357:vgpr_32, implicit $mode, implicit $exec + %1359:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1191:vgpr_32, %1358:vgpr_32, implicit $mode, implicit $exec + %1360:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1194:vgpr_32, %1359:vgpr_32, implicit $mode, implicit $exec + %1361:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1197:vgpr_32, %1360:vgpr_32, implicit $mode, implicit $exec + %1362:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1200:vgpr_32, %1361:vgpr_32, implicit $mode, implicit $exec + %1363:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1203:vgpr_32, %1362:vgpr_32, implicit $mode, implicit $exec + %1364:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1206:vgpr_32, %1363:vgpr_32, implicit $mode, implicit $exec + %1365:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1209:vgpr_32, %1364:vgpr_32, implicit $mode, implicit $exec + %1366:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1212:vgpr_32, %1365:vgpr_32, implicit $mode, implicit $exec + %1367:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1215:vgpr_32, %1366:vgpr_32, implicit $mode, implicit $exec + %1368:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1218:vgpr_32, %1367:vgpr_32, implicit $mode, implicit $exec + %1369:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1221:vgpr_32, %1368:vgpr_32, implicit $mode, implicit $exec + %1370:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1224:vgpr_32, %1369:vgpr_32, implicit $mode, implicit $exec + %1371:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1227:vgpr_32, %1370:vgpr_32, implicit $mode, implicit $exec + %1372:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1230:vgpr_32, %1371:vgpr_32, implicit $mode, implicit $exec + %1373:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1233:vgpr_32, %1372:vgpr_32, implicit $mode, implicit $exec + %1374:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1236:vgpr_32, %1373:vgpr_32, implicit $mode, implicit $exec + %1375:vgpr_32 = contract 
nofpexcept V_ADD_F32_e32 %1239:vgpr_32, %1374:vgpr_32, implicit $mode, implicit $exec + %1376:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1242:vgpr_32, %1375:vgpr_32, implicit $mode, implicit $exec + %1377:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1245:vgpr_32, %1376:vgpr_32, implicit $mode, implicit $exec + %1378:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1248:vgpr_32, %1377:vgpr_32, implicit $mode, implicit $exec + %1379:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1251:vgpr_32, %1378:vgpr_32, implicit $mode, implicit $exec + %1380:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1254:vgpr_32, %1379:vgpr_32, implicit $mode, implicit $exec + %1381:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1257:vgpr_32, %1380:vgpr_32, implicit $mode, implicit $exec + %1382:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1260:vgpr_32, %1381:vgpr_32, implicit $mode, implicit $exec + %1383:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1263:vgpr_32, %1382:vgpr_32, implicit $mode, implicit $exec + %1384:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1266:vgpr_32, %1383:vgpr_32, implicit $mode, implicit $exec + %1385:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1269:vgpr_32, %1384:vgpr_32, implicit $mode, implicit $exec + %1386:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1272:vgpr_32, %1385:vgpr_32, implicit $mode, implicit $exec + %1387:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1275:vgpr_32, %1386:vgpr_32, implicit $mode, implicit $exec + %1388:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1278:vgpr_32, %1387:vgpr_32, implicit $mode, implicit $exec + %1389:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1281:vgpr_32, %1388:vgpr_32, implicit $mode, implicit $exec + %1390:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1284:vgpr_32, %1389:vgpr_32, implicit $mode, implicit $exec + %1391:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1287:vgpr_32, %1390:vgpr_32, implicit $mode, implicit $exec + %1392:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1290:vgpr_32, %1391:vgpr_32, implicit $mode, 
implicit $exec + %1393:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1293:vgpr_32, %1392:vgpr_32, implicit $mode, implicit $exec + %1394:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1296:vgpr_32, %1393:vgpr_32, implicit $mode, implicit $exec + %1395:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1299:vgpr_32, %1394:vgpr_32, implicit $mode, implicit $exec + %1396:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1302:vgpr_32, %1395:vgpr_32, implicit $mode, implicit $exec + %1397:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1305:vgpr_32, %1396:vgpr_32, implicit $mode, implicit $exec + %1398:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1308:vgpr_32, %1397:vgpr_32, implicit $mode, implicit $exec + %1399:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1311:vgpr_32, %1398:vgpr_32, implicit $mode, implicit $exec + %1400:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1314:vgpr_32, %1399:vgpr_32, implicit $mode, implicit $exec + %1401:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1317:vgpr_32, %1400:vgpr_32, implicit $mode, implicit $exec + %1402:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1320:vgpr_32, %1401:vgpr_32, implicit $mode, implicit $exec + %1403:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1323:vgpr_32, %1402:vgpr_32, implicit $mode, implicit $exec + %1404:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1326:vgpr_32, %1403:vgpr_32, implicit $mode, implicit $exec + %1405:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1329:vgpr_32, %1404:vgpr_32, implicit $mode, implicit $exec + %1406:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1332:vgpr_32, %1405:vgpr_32, implicit $mode, implicit $exec + %1407:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1335:vgpr_32, %1406:vgpr_32, implicit $mode, implicit $exec + %1408:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1338:vgpr_32, %1407:vgpr_32, implicit $mode, implicit $exec + %1409:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1341:vgpr_32, %1408:vgpr_32, implicit $mode, implicit $exec + %1410:vgpr_32 = contract nofpexcept V_ADD_F32_e32 
%1344:vgpr_32, %1409:vgpr_32, implicit $mode, implicit $exec + %1411:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1347:vgpr_32, %1410:vgpr_32, implicit $mode, implicit $exec + %1412:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1411:vgpr_32, 0, implicit $exec + %1413:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %1411:vgpr_32, %1412:vgpr_32, implicit $mode, implicit $exec + %1414:vgpr_32 = DS_BPERMUTE_B32 %39:vgpr_32, %1413:vgpr_32, 0, implicit $exec + %3347:vgpr_32 = V_CNDMASK_B32_e64 0, %1414:vgpr_32, 0, %1413:vgpr_32, %484:sreg_64_xexec, implicit $exec + %1417:vgpr_32 = contract nofpexcept V_SUB_F32_e32 %3346:vgpr_32, %151:vgpr_32, implicit $mode, implicit $exec + %1418:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %1417:vgpr_32, implicit $mode, implicit $exec + undef %1455.sub0:vreg_64_align2 = afn nofpexcept V_EXP_F32_e32 %1418:vgpr_32, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + undef %3037.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub0:vreg_64_align2, %803.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec + undef %3021.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub0:vreg_64_align2, %803.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec + %3037.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub0:vreg_64_align2, %807.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec + %3021.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub0:vreg_64_align2, %807.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec + undef %3005.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub1:vreg_64_align2, %803.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec + undef %2978.sub0:vreg_64_align2 = V_PERM_B32_e64 %805.sub1:vreg_64_align2, %803.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec + %3005.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub1:vreg_64_align2, %807.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec + %2978.sub1:vreg_64_align2 = V_PERM_B32_e64 %809.sub1:vreg_64_align2, 
%807.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec + %1442:vgpr_32 = V_ADD_U32_e32 %593:sreg_32, %15:vgpr_32, implicit $exec + %1444:vgpr_32 = V_AND_B32_e32 536870911, %1442:vgpr_32, implicit $exec + %1446:vgpr_32 = nsw V_MUL_LO_U32_e64 %1444:vgpr_32, %494:sreg_32, implicit $exec + %1447:vgpr_32 = V_ADD_LSHL_U32_e64 %47:vgpr_32, %1446:vgpr_32, 1, implicit $exec + DS_WRITE_B64_gfx9 %1447:vgpr_32, %3037:vreg_64_align2, 0, 0, implicit $exec + %1449:vgpr_32 = V_LSHL_ADD_U32_e64 %41:vgpr_32, 1, %1447:vgpr_32, implicit $exec + DS_WRITE_B64_gfx9 %1449:vgpr_32, %3021:vreg_64_align2, 0, 0, implicit $exec + %1451:vgpr_32 = V_LSHL_ADD_U32_e64 %42:vgpr_32, 1, %1449:vgpr_32, implicit $exec + DS_WRITE_B64_gfx9 %1451:vgpr_32, %3005:vreg_64_align2, 0, 0, implicit $exec + %1453:vgpr_32 = V_LSHL_ADD_U32_e64 %43:vgpr_32, 1, %1451:vgpr_32, implicit $exec + DS_WRITE_B64_gfx9 %1453:vgpr_32, %2978:vreg_64_align2, 0, 0, implicit $exec + %3347:vgpr_32 = contract nofpexcept V_FMAC_F32_e32 %86:vgpr_32, %1455.sub0:vreg_64_align2, %3347:vgpr_32, implicit $mode, implicit $exec + %2986.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2986.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2986.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2986.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2986.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2986.sub10_sub11:vreg_512_align2 = contract 
nofpexcept V_PK_MUL_F32 8, %2986.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2986.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2986.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2986.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3038.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3038.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3038.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3038.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3038.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3038.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3038.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3038.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3038.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2980.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, 
%2980.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2980.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2980.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2980.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub6_sub7:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2980.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2980.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2980.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %2980.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %2980.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3003.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub0_sub1:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3003.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub2_sub3:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3003.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub4_sub5:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3003.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub6_sub7:vreg_512_align2, 0, 
%1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3003.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub8_sub9:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3003.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub10_sub11:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3003.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub12_sub13:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %3003.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %3003.sub14_sub15:vreg_512_align2, 0, %1455:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %1554:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1158:vgpr_32, implicit $mode, implicit $exec + %1555:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1161:vgpr_32, implicit $mode, implicit $exec + %1556:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1164:vgpr_32, implicit $mode, implicit $exec + %1557:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1170:vgpr_32, implicit $mode, implicit $exec + %1558:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1173:vgpr_32, implicit $mode, implicit $exec + %1559:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1176:vgpr_32, implicit $mode, implicit $exec + %1560:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1182:vgpr_32, implicit $mode, implicit $exec + %1561:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1185:vgpr_32, implicit $mode, implicit $exec + %1562:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1188:vgpr_32, implicit $mode, implicit $exec + %1563:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1194:vgpr_32, implicit $mode, implicit $exec + %1564:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1197:vgpr_32, implicit $mode, implicit $exec + %1565:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1200:vgpr_32, implicit $mode, implicit $exec + %1566:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1206:vgpr_32, implicit $mode, implicit $exec + 
%1567:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1209:vgpr_32, implicit $mode, implicit $exec + %1568:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1212:vgpr_32, implicit $mode, implicit $exec + %1569:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1218:vgpr_32, implicit $mode, implicit $exec + %1570:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1221:vgpr_32, implicit $mode, implicit $exec + %1571:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1224:vgpr_32, implicit $mode, implicit $exec + %1572:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1230:vgpr_32, implicit $mode, implicit $exec + %1573:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1233:vgpr_32, implicit $mode, implicit $exec + %1574:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1236:vgpr_32, implicit $mode, implicit $exec + %1575:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1242:vgpr_32, implicit $mode, implicit $exec + %1576:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1245:vgpr_32, implicit $mode, implicit $exec + %1577:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1248:vgpr_32, implicit $mode, implicit $exec + %1578:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1254:vgpr_32, implicit $mode, implicit $exec + %1579:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1257:vgpr_32, implicit $mode, implicit $exec + %1580:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1260:vgpr_32, implicit $mode, implicit $exec + %1581:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1266:vgpr_32, implicit $mode, implicit $exec + %1582:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1269:vgpr_32, implicit $mode, implicit $exec + %1583:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1272:vgpr_32, implicit $mode, implicit $exec + %1584:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1278:vgpr_32, implicit $mode, implicit $exec + %1585:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1281:vgpr_32, implicit $mode, implicit $exec + %1586:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1284:vgpr_32, implicit $mode, implicit $exec + %1587:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1290:vgpr_32, implicit $mode, implicit $exec + %1588:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 
%1293:vgpr_32, implicit $mode, implicit $exec + %1589:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1296:vgpr_32, implicit $mode, implicit $exec + %1590:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3345:vgpr_32, implicit $exec + %1591:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1590:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1592:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3334:vgpr_32, implicit $exec + %1593:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1592:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1594:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3335:vgpr_32, implicit $exec + %1595:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1594:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1596:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3336:vgpr_32, implicit $exec + %1597:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1596:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %1598:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec + %1605:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec + %1612:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec + %1619:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec + %1626:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec + %1633:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec + %1640:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec + %1647:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + undef %3161.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub0:vreg_64_align2, %1591.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec + undef %3145.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub0:vreg_64_align2, %1591.sub0:vreg_64_align2, 
%1424:sreg_32, implicit $exec + %3161.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub0:vreg_64_align2, %1595.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec + %3145.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub0:vreg_64_align2, %1595.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec + undef %3129.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub1:vreg_64_align2, %1591.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec + undef %3113.sub0:vreg_64_align2 = V_PERM_B32_e64 %1593.sub1:vreg_64_align2, %1591.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec + %3129.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub1:vreg_64_align2, %1595.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec + %3113.sub1:vreg_64_align2 = V_PERM_B32_e64 %1597.sub1:vreg_64_align2, %1595.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec + DS_WRITE_B64_gfx9 %1447:vgpr_32, %3161:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1449:vgpr_32, %3145:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1451:vgpr_32, %3129:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1453:vgpr_32, %3113:vreg_64_align2, 0, 0, implicit $exec + %1678:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3344:vgpr_32, implicit $exec + %1679:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1678:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1680:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3337:vgpr_32, implicit $exec + %1681:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1680:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1682:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3338:vgpr_32, implicit $exec + %1683:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1682:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1684:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3339:vgpr_32, implicit $exec + %1685:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1684:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + 
%1686:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec + %1693:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec + %1700:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec + %1707:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec + %1714:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec + %1721:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec + %1728:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec + %1735:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + undef %3062.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub0:vreg_64_align2, %1679.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec + undef %3046.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub0:vreg_64_align2, %1679.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec + %3062.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub0:vreg_64_align2, %1683.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec + %3046.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub0:vreg_64_align2, %1683.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec + undef %3029.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub1:vreg_64_align2, %1679.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec + undef %3013.sub0:vreg_64_align2 = V_PERM_B32_e64 %1681.sub1:vreg_64_align2, %1679.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec + %3029.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub1:vreg_64_align2, %1683.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec + %3013.sub1:vreg_64_align2 = V_PERM_B32_e64 %1685.sub1:vreg_64_align2, %1683.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec + DS_WRITE_B64_gfx9 %1447:vgpr_32, %3062:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1449:vgpr_32, %3046:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 
%1451:vgpr_32, %3029:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1453:vgpr_32, %3013:vreg_64_align2, 0, 0, implicit $exec + %1766:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3343:vgpr_32, implicit $exec + %1767:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1766:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1768:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3340:vgpr_32, implicit $exec + %1769:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1768:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1770:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3341:vgpr_32, implicit $exec + %1771:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1770:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + %1772:vgpr_32 = V_ADD_U32_e32 %48:vgpr_32, %3342:vgpr_32, implicit $exec + %1773:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %1772:vgpr_32, %473:sgpr_128, 0, 0, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %1774:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec + %1781:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec + %1788:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec + %1795:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec + %1802:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec + %1809:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec + %1816:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec + %1823:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + undef %3185.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub0:vreg_64_align2, %1767.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec + undef %3169.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub0:vreg_64_align2, %1767.sub0:vreg_64_align2, %1424:sreg_32, 
implicit $exec + %3185.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub0:vreg_64_align2, %1771.sub0:vreg_64_align2, %1422:sreg_32, implicit $exec + %3169.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub0:vreg_64_align2, %1771.sub0:vreg_64_align2, %1424:sreg_32, implicit $exec + undef %3153.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub1:vreg_64_align2, %1767.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec + undef %3137.sub0:vreg_64_align2 = V_PERM_B32_e64 %1769.sub1:vreg_64_align2, %1767.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec + %3153.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub1:vreg_64_align2, %1771.sub1:vreg_64_align2, %1422:sreg_32, implicit $exec + %3137.sub1:vreg_64_align2 = V_PERM_B32_e64 %1773.sub1:vreg_64_align2, %1771.sub1:vreg_64_align2, %1424:sreg_32, implicit $exec + DS_WRITE_B64_gfx9 %1447:vgpr_32, %3185:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1449:vgpr_32, %3169:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1451:vgpr_32, %3153:vreg_64_align2, 0, 0, implicit $exec + DS_WRITE_B64_gfx9 %1453:vgpr_32, %3137:vreg_64_align2, 0, 0, implicit $exec + %1854:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1167:vgpr_32, implicit $mode, implicit $exec + %1855:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1179:vgpr_32, implicit $mode, implicit $exec + %1856:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1191:vgpr_32, implicit $mode, implicit $exec + %1857:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1203:vgpr_32, implicit $mode, implicit $exec + %1858:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1215:vgpr_32, implicit $mode, implicit $exec + %1859:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1227:vgpr_32, implicit $mode, implicit $exec + %1860:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1239:vgpr_32, implicit $mode, implicit $exec + %1861:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1251:vgpr_32, implicit $mode, implicit $exec + %1862:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1263:vgpr_32, implicit $mode, implicit $exec + %1863:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 
%1275:vgpr_32, implicit $mode, implicit $exec + %1864:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1287:vgpr_32, implicit $mode, implicit $exec + %1865:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1299:vgpr_32, implicit $mode, implicit $exec + undef %3121.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1556:vgpr_32, 0, %1854:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3121.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1554:vgpr_32, 0, %1555:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3105.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1559:vgpr_32, 0, %1855:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3105.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1557:vgpr_32, 0, %1558:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3089.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1562:vgpr_32, 0, %1856:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3089.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1560:vgpr_32, 0, %1561:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3073.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1565:vgpr_32, 0, %1857:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3073.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1563:vgpr_32, 0, %1564:vgpr_32, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1598.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1598.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1605.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1605.sub2_sub3:vreg_128_align2, 
%3105:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1612.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1612.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1619.sub0_sub1:vreg_128_align2, %3121:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1619.sub2_sub3:vreg_128_align2, %3105:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1626.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1626.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1633.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1633.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1640.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1640.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + 
%3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1647.sub0_sub1:vreg_128_align2, %3089:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1647.sub2_sub3:vreg_128_align2, %3073:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + undef %2993.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1568:vgpr_32, 0, %1858:vgpr_32, 0, 0, implicit $mode, implicit $exec + %2993.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1566:vgpr_32, 0, %1567:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3195.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1571:vgpr_32, 0, %1859:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3195.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1569:vgpr_32, 0, %1570:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3178.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1574:vgpr_32, 0, %1860:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3178.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1572:vgpr_32, 0, %1573:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3162.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1577:vgpr_32, 0, %1861:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3162.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1575:vgpr_32, 0, %1576:vgpr_32, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1686.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1686.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1693.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %3038:vreg_512_align2, 0, 
0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1693.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1700.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1700.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1707.sub0_sub1:vreg_128_align2, %2993:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1707.sub2_sub3:vreg_128_align2, %3195:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1714.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1714.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1721.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1721.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1728.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract 
V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1728.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1735.sub0_sub1:vreg_128_align2, %3178:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1735.sub2_sub3:vreg_128_align2, %3162:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + undef %3146.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1580:vgpr_32, 0, %1862:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3146.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1578:vgpr_32, 0, %1579:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3130.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1583:vgpr_32, 0, %1863:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3130.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1581:vgpr_32, 0, %1582:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3114.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1586:vgpr_32, 0, %1864:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3114.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1584:vgpr_32, 0, %1585:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3098.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1589:vgpr_32, 0, %1865:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3098.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %1587:vgpr_32, 0, %1588:vgpr_32, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1774.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1774.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit 
$exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1781.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1781.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1788.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1788.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1795.sub0_sub1:vreg_128_align2, %3146:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1795.sub2_sub3:vreg_128_align2, %3130:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1802.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1802.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1809.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1809.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 
%1816.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1816.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1823.sub0_sub1:vreg_128_align2, %3114:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1823.sub2_sub3:vreg_128_align2, %3098:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2054:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1347:vgpr_32, implicit $mode, implicit $exec + %2055:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1341:vgpr_32, implicit $mode, implicit $exec + %2056:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1335:vgpr_32, implicit $mode, implicit $exec + %2057:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1329:vgpr_32, implicit $mode, implicit $exec + %2058:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1323:vgpr_32, implicit $mode, implicit $exec + %2059:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1317:vgpr_32, implicit $mode, implicit $exec + %2060:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1311:vgpr_32, implicit $mode, implicit $exec + %2061:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1305:vgpr_32, implicit $mode, implicit $exec + %2062:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1344:vgpr_32, implicit $mode, implicit $exec + %2063:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1338:vgpr_32, implicit $mode, implicit $exec + %2064:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1332:vgpr_32, implicit $mode, implicit $exec + %2065:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1326:vgpr_32, implicit $mode, implicit $exec + %2066:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1320:vgpr_32, implicit $mode, implicit $exec + %2067:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1314:vgpr_32, implicit $mode, implicit $exec + 
%2068:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1308:vgpr_32, implicit $mode, implicit $exec + %2069:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %1302:vgpr_32, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + undef %3082.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2068:vgpr_32, 0, %2060:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3082.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2069:vgpr_32, 0, %2061:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3066.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2066:vgpr_32, 0, %2058:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3066.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2067:vgpr_32, 0, %2059:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3050.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2064:vgpr_32, 0, %2056:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3050.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2065:vgpr_32, 0, %2057:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %3033.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2062:vgpr_32, 0, %2054:vgpr_32, 0, 0, implicit $mode, implicit $exec + %3033.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %2063:vgpr_32, 0, %2055:vgpr_32, 0, 0, implicit $mode, implicit $exec + %2082:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 0, 0, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2082.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2082.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2095:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 576, 0, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 
%2095.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2095.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2108:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1152, 0, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2108.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2108.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2121:vreg_128_align2 = DS_READ_B128_gfx9 %44:vgpr_32, 1728, 0, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2121.sub0_sub1:vreg_128_align2, %3082:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2121.sub2_sub3:vreg_128_align2, %3066:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2134:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 0, 0, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2134.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2986:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2134.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %2986:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2146:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 576, 0, implicit $exec + %3038:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2146.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3038:vreg_512_align2 = contract 
V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2146.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %3038:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2158:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1152, 0, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2158.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2980:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2158.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %2980:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %2170:vreg_128_align2 = DS_READ_B128_gfx9 %45:vgpr_32, 1728, 0, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2170.sub0_sub1:vreg_128_align2, %3050:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %3003:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %2170.sub2_sub3:vreg_128_align2, %3033:vreg_64_align2, %3003:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %3345:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3345:vgpr_32, implicit $exec + %3344:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3344:vgpr_32, implicit $exec + %3343:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3343:vgpr_32, implicit $exec + %3342:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3342:vgpr_32, implicit $exec + %3341:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3341:vgpr_32, implicit $exec + %3340:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3340:vgpr_32, implicit $exec + %3339:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3339:vgpr_32, implicit $exec + %3338:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3338:vgpr_32, implicit $exec + %3337:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3337:vgpr_32, implicit $exec + %3336:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3336:vgpr_32, implicit $exec + %3335:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, 
%3335:vgpr_32, implicit $exec + %3334:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3334:vgpr_32, implicit $exec + %3333:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3333:vgpr_32, implicit $exec + %3332:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3332:vgpr_32, implicit $exec + %3331:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3331:vgpr_32, implicit $exec + %3330:vgpr_32 = V_ADD_U32_e32 %50:sreg_32, %3330:vgpr_32, implicit $exec + %3329:vgpr_32 = nuw V_ADD_U32_e32 128, %3329:vgpr_32, implicit $exec + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir new file mode 100644 index 0000000000000..0473e017f193c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir @@ -0,0 +1,901 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s + +--- | + define amdgpu_kernel void @smallInterleave() #0 { ret void } + ; GCN-LABEL: smallInterleave: + ; GCN: ; %bb.0: + ; GCN-NEXT: ; implicit-def: $vgpr2 + ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + ; GCN-NEXT: v_readfirstlane_b32 s20, v2 + ; GCN-NEXT: ; implicit-def: $sgpr4 + ; GCN-NEXT: ; implicit-def: $vgpr3 + ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: ; implicit-def: $vgpr50 + ; GCN-NEXT: ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19 + ; GCN-NEXT: ; implicit-def: $vgpr49 + ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 + ; GCN-NEXT: ; implicit-def: $vgpr51 + ; GCN-NEXT: ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65 + ; GCN-NEXT: ; implicit-def: $vgpr76 + ; GCN-NEXT: ; implicit-def: $vgpr77 + ; GCN-NEXT: ; implicit-def: $vgpr78 + ; GCN-NEXT: ; implicit-def: $vgpr79 + ; GCN-NEXT: ; implicit-def: $vgpr80 + ; GCN-NEXT: ; implicit-def: $vgpr91 + ; GCN-NEXT: ;;#ASMSTART + 
; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19 + ; GCN-NEXT: ; iglp_opt mask(0x00000002) + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_lshl_add_u32 v2, s20, 4, v3 + ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1] + ; GCN-NEXT: buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: s_lshl_b32 s4, s20, 7 + ; GCN-NEXT: ; implicit-def: $vgpr5 + ; GCN-NEXT: v_add_lshl_u32 v48, v5, s4, 1 + ; GCN-NEXT: v_add_u32_e32 v76, s20, v76 + ; GCN-NEXT: v_and_b32_e32 v76, 0x1fffffff, v76 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b128 v48, v[0:3] + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx4 v[32:35], v4, s[0:3], 0 offen offset:64 sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; implicit-def: $vgpr0 + ; GCN-NEXT: ; implicit-def: $vgpr1 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: ; implicit-def: $sgpr6 + ; GCN-NEXT: v_add_u32_e32 v0, v0, v50 + ; GCN-NEXT: v_add_u32_e32 v1, v1, v50 + ; GCN-NEXT: buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[74:75], v1, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: ds_read_b128 v[36:39], v49 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[44:47], v49 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0 + ; GCN-NEXT: ; kill: killed $vgpr1 + ; GCN-NEXT: ; kill: killed $vgpr0 + ; GCN-NEXT: v_mul_lo_u32 v76, v76, s6 + ; GCN-NEXT: 
v_add_lshl_u32 v76, v77, v76, 1 + ; GCN-NEXT: v_lshl_add_u32 v77, v78, 1, v76 + ; GCN-NEXT: ; implicit-def: $sgpr5 + ; GCN-NEXT: v_lshl_add_u32 v78, v79, 1, v77 + ; GCN-NEXT: ; implicit-def: $sgpr2 + ; GCN-NEXT: ; implicit-def: $sgpr3 + ; GCN-NEXT: v_lshl_add_u32 v79, v80, 1, v78 + ; GCN-NEXT: ; implicit-def: $sgpr0_sgpr1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] + ; GCN-NEXT: ds_read_b128 v[36:39], v51 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15] + ; GCN-NEXT: ds_read_b128 v[44:47], v51 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b128 v48, v[32:35] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], v[16:31] + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[32:35], v49 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31] + ; GCN-NEXT: ; implicit-def: $vgpr36_vgpr37_vgpr38_vgpr39 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15] + ; GCN-NEXT: ds_read_b128 v[40:43], v49 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[68:71], v51 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31] + ; GCN-NEXT: ; implicit-def: $vgpr32 + ; GCN-NEXT: ; implicit-def: $vgpr33 + ; GCN-NEXT: v_add_u32_e32 v82, v32, v50 + 
; GCN-NEXT: v_add_u32_e32 v83, v33, v50 + ; GCN-NEXT: ; kill: killed $vgpr82 + ; GCN-NEXT: ; kill: killed $vgpr83 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31] + ; GCN-NEXT: ds_read_b128 v[66:69], v51 offset:512 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[42:43], v[38:39], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15] + ; GCN-NEXT: ; implicit-def: $vgpr66 + ; GCN-NEXT: ; implicit-def: $vgpr67 + ; GCN-NEXT: v_max_f32_e32 v81, v67, v67 + ; GCN-NEXT: ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31] + ; GCN-NEXT: v_perm_b32 v70, v74, v72, s2 + ; GCN-NEXT: v_perm_b32 v71, v74, v72, s3 + ; GCN-NEXT: v_perm_b32 v72, v75, v73, s2 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b32 v76, v70 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b32 v77, v71 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b32 v78, v72 + ; GCN-NEXT: v_mul_f32_e32 v74, s4, v20 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15] + ; GCN-NEXT: v_mul_f32_e32 v64, s4, v16 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v17 + ; GCN-NEXT: v_mul_f32_e32 v68, s4, v18 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v19 + ; GCN-NEXT: v_max3_f32 v64, v64, s5, v65 + ; GCN-NEXT: v_mul_f32_e32 v80, s4, v21 + ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 + ; 
GCN-NEXT: v_mul_f32_e32 v84, s4, v22 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v23 + ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 + ; GCN-NEXT: v_mul_f32_e32 v86, s4, v24 + ; GCN-NEXT: v_mul_f32_e32 v87, s4, v25 + ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v26 + ; GCN-NEXT: v_mul_f32_e32 v68, s4, v27 + ; GCN-NEXT: v_max3_f32 v64, v64, v86, v87 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v28 + ; GCN-NEXT: v_mul_f32_e32 v74, s4, v29 + ; GCN-NEXT: v_max3_f32 v64, v64, v65, v68 + ; GCN-NEXT: v_mul_f32_e32 v80, s4, v30 + ; GCN-NEXT: v_mul_f32_e32 v84, s4, v31 + ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v0 + ; GCN-NEXT: v_mul_f32_e32 v86, s4, v1 + ; GCN-NEXT: v_max3_f32 v64, v64, v80, v84 + ; GCN-NEXT: v_mul_f32_e32 v87, s4, v2 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v3 + ; GCN-NEXT: v_max3_f32 v64, v64, v85, v86 + ; GCN-NEXT: v_mul_f32_e32 v68, s4, v4 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v5 + ; GCN-NEXT: v_max3_f32 v64, v64, v87, v65 + ; GCN-NEXT: v_mul_f32_e32 v74, s4, v6 + ; GCN-NEXT: v_mul_f32_e32 v80, s4, v7 + ; GCN-NEXT: v_max3_f32 v64, v64, v68, v69 + ; GCN-NEXT: v_mul_f32_e32 v84, s4, v8 + ; GCN-NEXT: v_mul_f32_e32 v85, s4, v9 + ; GCN-NEXT: v_max3_f32 v64, v64, v74, v80 + ; GCN-NEXT: v_mul_f32_e32 v86, s4, v10 + ; GCN-NEXT: v_mul_f32_e32 v65, s4, v11 + ; GCN-NEXT: v_max3_f32 v64, v64, v84, v85 + ; GCN-NEXT: v_mul_f32_e32 v87, s4, v12 + ; GCN-NEXT: v_mul_f32_e32 v68, s4, v13 + ; GCN-NEXT: v_max3_f32 v64, v64, v86, v65 + ; GCN-NEXT: v_mul_f32_e32 v69, s4, v14 + ; GCN-NEXT: v_mul_f32_e32 v74, s4, v15 + ; GCN-NEXT: v_max3_f32 v64, v64, v87, v68 + ; GCN-NEXT: v_max3_f32 v64, v64, v69, v74 + ; GCN-NEXT: ds_bpermute_b32 v65, v66, v64 + ; GCN-NEXT: v_perm_b32 v68, v75, v73, s3 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b32 v79, v68 + ; GCN-NEXT: ; implicit-def: $vgpr84 + ; GCN-NEXT: v_max_f32_e32 v65, v65, v65 + ; GCN-NEXT: v_max_f32_e32 v70, v64, v65 + ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1 + ; GCN-NEXT: s_waitcnt vmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_bpermute_b32 v71, v66, v70 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v70, v71, v70, s[0:1] + ; GCN-NEXT: v_max_f32_e32 v70, v70, v70 + ; GCN-NEXT: v_max_f32_e32 v72, v81, v70 + ; GCN-NEXT: v_fma_f32 v16, s4, v16, -v72 + ; GCN-NEXT: v_fma_f32 v18, s4, v18, -v72 + ; GCN-NEXT: v_fma_f32 v19, s4, v19, -v72 + ; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16 + ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18 + ; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19 + ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72 + ; GCN-NEXT: v_fma_f32 v20, s4, v20, -v72 + ; GCN-NEXT: v_fma_f32 v21, s4, v21, -v72 + ; GCN-NEXT: v_fma_f32 v22, s4, v22, -v72 + ; GCN-NEXT: v_fma_f32 v23, s4, v23, -v72 + ; GCN-NEXT: v_exp_f32_e32 v73, v16 + ; GCN-NEXT: v_exp_f32_e32 v74, v18 + ; GCN-NEXT: v_exp_f32_e32 v75, v19 + ; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20 + ; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21 + ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22 + ; GCN-NEXT: v_exp_f32_e32 v80, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v16, v73 + ; GCN-NEXT: v_fma_f32 v18, s4, v24, -v72 + ; GCN-NEXT: v_exp_f32_e32 v81, v21 + ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v74 + ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72 + ; GCN-NEXT: v_exp_f32_e32 v82, v22 + ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v75 + ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17 + ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23 + ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72 + ; GCN-NEXT: v_pack_b32_f16 v71, v21, v22 + ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18 + ; GCN-NEXT: v_sub_f32_e32 v24, v67, v72 + ; GCN-NEXT: v_exp_f32_e32 v83, v23 + ; GCN-NEXT: 
v_fma_f32 v67, s4, v27, -v72 + ; GCN-NEXT: v_exp_f32_e32 v85, v22 + ; GCN-NEXT: v_exp_f32_e32 v17, v17 + ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 + ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v20 + ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17 + ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72 + ; GCN-NEXT: v_exp_f32_e32 v88, v23 + ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72 + ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19 + ; GCN-NEXT: ds_read_b128 v[18:21], v84 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_exp_f32_e32 v16, v24 + ; GCN-NEXT: ds_read_b128 v[22:25], v84 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v18, 0, v73 + ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83 + ; GCN-NEXT: 
v_fma_f32 v73, s4, v28, -v72 + ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v80 + ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v72 + ; GCN-NEXT: v_perm_b32 v90, v69, v65, s2 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v17, v17, v18 + ; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v26 + ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81 + ; GCN-NEXT: v_fma_f32 v23, s4, v30, -v72 + ; GCN-NEXT: v_exp_f32_e32 v30, v18 + ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v82 + ; GCN-NEXT: v_fma_f32 v18, s4, v31, -v72 + ; GCN-NEXT: v_perm_b32 v31, v68, v64, s2 + ; GCN-NEXT: v_perm_b32 v64, v68, v64, s3 + ; GCN-NEXT: v_perm_b32 v65, v69, v65, s3 + ; GCN-NEXT: ds_read_b128 v[26:29], v91 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ds_read_b128 v[68:71], v91 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: ds_write_b32 v76, v31 + ; GCN-NEXT: v_mul_f32_e32 v31, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_exp_f32_e32 v31, v31 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v18 + ; GCN-NEXT: v_pack_b32_f16 v18, v19, v86 + ; GCN-NEXT: v_pack_b32_f16 v19, v22, v89 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b32 v77, v64 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b32 v78, v90 + ; GCN-NEXT: buffer_wbl2 sc0 sc1 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_write_b32 v79, v65 + ; GCN-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v73 + ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v87 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v17, v74, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v85 + ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72 + ; GCN-NEXT: v_exp_f32_e32 v22, v64 + ; GCN-NEXT: v_cvt_f16_f32_e32 v21, v88 + ; GCN-NEXT: v_exp_f32_e32 v64, v65 + ; GCN-NEXT: 
v_mul_f32_e32 v23, 0x3fb8aa3b, v23 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v17, v75, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30 + ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v72 + ; GCN-NEXT: v_exp_f32_e32 v23, v23 + ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v31 + ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v0 + ; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v1 + ; GCN-NEXT: v_pack_b32_f16 v0, v20, v21 + ; GCN-NEXT: v_pack_b32_f16 v1, v18, v19 + ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v72 + ; GCN-NEXT: v_exp_f32_e32 v25, v67 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v17, v80, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 + ; GCN-NEXT: v_fma_f32 v26, s4, v4, -v72 + ; GCN-NEXT: v_exp_f32_e32 v27, v3 + ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v64 + ; GCN-NEXT: v_fma_f32 v67, s4, v5, -v72 + ; GCN-NEXT: v_exp_f32_e32 v65, v65 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47] + ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2 + ; GCN-NEXT: v_add_f32_e32 v17, v81, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v23 + ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72 + ; GCN-NEXT: v_exp_f32_e32 v68, v2 + ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v25 + ; GCN-NEXT: ;;#ASMSTART + ; GCN-NEXT: s_waitcnt vmcnt(8) + ; GCN-NEXT: ;;#ASMEND + ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: ds_read_b128 v[0:3], v84 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_pack_b32_f16 v4, v18, v4 + ; GCN-NEXT: v_pack_b32_f16 v5, v5, v19 + ; GCN-NEXT: v_exp_f32_e32 v24, v24 + ; GCN-NEXT: ds_read_b128 v[18:21], v84 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mul_f32_e32 v26, 0x3fb8aa3b, v26 + ; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v17, v82, v17 + ; GCN-NEXT: 
v_cvt_f16_f32_e32 v28, v27 + ; GCN-NEXT: v_exp_f32_e32 v26, v26 + ; GCN-NEXT: v_cvt_f16_f32_e32 v29, v65 + ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72 + ; GCN-NEXT: v_exp_f32_e32 v67, v67 + ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v17, v83, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v68 + ; GCN-NEXT: v_exp_f32_e32 v6, v6 + ; GCN-NEXT: v_cvt_f16_f32_e32 v69, v24 + ; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v7 + ; GCN-NEXT: v_exp_f32_e32 v7, v7 + ; GCN-NEXT: v_pack_b32_f16 v4, v28, v29 + ; GCN-NEXT: v_pack_b32_f16 v5, v5, v69 + ; GCN-NEXT: ; implicit-def: $sgpr2 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v0, v85, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v26 + ; GCN-NEXT: v_cvt_f16_f32_e32 v28, v67 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v4, v88, v0 + ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v10 + ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6 + ; GCN-NEXT: v_exp_f32_e32 v10, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7 + ; GCN-NEXT: v_pack_b32_f16 v1, v1, v0 + ; GCN-NEXT: v_pack_b32_f16 v0, v17, v28 + ; GCN-NEXT: s_nop 1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v2, v30, v4 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47] + ; GCN-NEXT: v_add_f32_e32 v0, v31, v2 + ; GCN-NEXT: v_add_f32_e32 v0, v22, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v64, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v23, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v25, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v27, v0 + ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v72 + ; GCN-NEXT: v_add_f32_e32 v0, v65, v0 + ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72 + ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8 + ; GCN-NEXT: v_add_f32_e32 v0, v68, v0 + ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72 + ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9 + ; 
GCN-NEXT: v_fma_f32 v12, s4, v12, -v72 + ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v72 + ; GCN-NEXT: v_exp_f32_e32 v8, v8 + ; GCN-NEXT: v_add_f32_e32 v0, v24, v0 + ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72 + ; GCN-NEXT: v_exp_f32_e32 v9, v9 + ; GCN-NEXT: v_add_f32_e32 v0, v26, v0 + ; GCN-NEXT: v_add_f32_e32 v0, v67, v0 + ; GCN-NEXT: v_fma_f32 v14, s4, v15, -v72 + ; GCN-NEXT: v_mul_f32_e32 v11, 0x3fb8aa3b, v11 + ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v5 + ; GCN-NEXT: v_add_f32_e32 v0, v6, v0 + ; GCN-NEXT: v_exp_f32_e32 v11, v11 + ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 + ; GCN-NEXT: v_exp_f32_e32 v12, v3 + ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13 + ; GCN-NEXT: v_exp_f32_e32 v17, v1 + ; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v14 + ; GCN-NEXT: v_add_f32_e32 v0, v7, v0 + ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v9 + ; GCN-NEXT: v_exp_f32_e32 v15, v3 + ; GCN-NEXT: v_exp_f32_e32 v18, v1 + ; GCN-NEXT: v_add_f32_e32 v6, v8, v0 + ; GCN-NEXT: ds_read_b128 v[0:3], v91 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v10 + ; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11 + ; GCN-NEXT: v_add_f32_e32 v6, v9, v6 + ; GCN-NEXT: v_pack_b32_f16 v8, v4, v13 + ; GCN-NEXT: v_add_f32_e32 v6, v10, v6 + ; GCN-NEXT: v_pack_b32_f16 v9, v5, v14 + ; GCN-NEXT: v_cvt_f16_f32_e32 v7, v18 + ; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63] + ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 + ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v12 + ; GCN-NEXT: v_add_f32_e32 v6, v11, v6 + ; GCN-NEXT: v_add_f32_e32 v6, v12, v6 + ; GCN-NEXT: v_add_f32_e32 v1, v15, v6 + ; GCN-NEXT: v_add_f32_e32 v11, v17, v1 + ; GCN-NEXT: v_pack_b32_f16 v1, v0, v7 + ; GCN-NEXT: v_pack_b32_f16 v0, v4, v10 + ; GCN-NEXT: ds_read_b128 v[4:7], v91 offset:576 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: buffer_inv sc0 sc1 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47] + ; 
GCN-NEXT: v_mov_b32_e32 v4, 0 + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47] + ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63] + ; GCN-NEXT: v_add_f32_e32 v2, v18, v11 + ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_add_f32_e32 v2, v2, v3 + ; GCN-NEXT: ds_bpermute_b32 v3, v66, v2 + ; GCN-NEXT: s_waitcnt lgkmcnt(0) + ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] + ; GCN-NEXT: v_fmac_f32_e32 v2, v4, v16 + ; GCN-NEXT: s_endpgm + attributes #0 = {"amdgpu-flat-work-group-size"="256,256"} + + !0 = !{i64 2862105} + +... + +--- +name: smallInterleave +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2, $sgpr3, $sgpr4 + %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %1:vgpr_32 = COPY %0:vgpr_32 + %2:vgpr_32 = IMPLICIT_DEF + %3:sreg_32 = IMPLICIT_DEF + %4:vreg_64_align2 = IMPLICIT_DEF + %5:sgpr_128 = IMPLICIT_DEF + %6:vgpr_32 = IMPLICIT_DEF + %7:vgpr_32 = IMPLICIT_DEF + %8:sgpr_128 = IMPLICIT_DEF + %9:vgpr_32 = IMPLICIT_DEF + %10:sgpr_512 = IMPLICIT_DEF + %11:sgpr_32 = IMPLICIT_DEF + %12:sreg_64_xexec = IMPLICIT_DEF + %13:vgpr_32 = IMPLICIT_DEF + %14:sreg_32 = IMPLICIT_DEF + %15:sreg_32 = IMPLICIT_DEF + %16:vgpr_32 = IMPLICIT_DEF + %17:sreg_32 = IMPLICIT_DEF + %18:vgpr_32 = IMPLICIT_DEF + %19:vgpr_32 = IMPLICIT_DEF + %20:vgpr_32 = IMPLICIT_DEF + %21:vgpr_32 = IMPLICIT_DEF + %22:vgpr_32 = IMPLICIT_DEF + %23:vgpr_32 = IMPLICIT_DEF + %24:vgpr_32 = IMPLICIT_DEF + %25:vgpr_32 = IMPLICIT_DEF + %26:sreg_32 = IMPLICIT_DEF + %42:vgpr_32 = IMPLICIT_DEF + %44:vreg_128_align2 = IMPLICIT_DEF + %48:vgpr_32 = IMPLICIT_DEF + %49:vreg_128_align2 = IMPLICIT_DEF + %52:vreg_128_align2 = IMPLICIT_DEF + %55:vreg_128_align2 = IMPLICIT_DEF + %106:vgpr_32 = IMPLICIT_DEF + %29:vgpr_32 = IMPLICIT_DEF + %37:vgpr_32 = IMPLICIT_DEF + %259:vreg_512_align2 = IMPLICIT_DEF + %260:vreg_512_align2 = IMPLICIT_DEF + IGLP_OPT 2 + 
%27:sreg_32 = V_READFIRSTLANE_B32 %2:vgpr_32, implicit $exec + %28:vgpr_32 = V_LSHL_ADD_U32_e64 %27:sreg_32, 4, %29:vgpr_32, implicit $exec + %30:vreg_64_align2, dead %31:sreg_64 = V_MAD_U64_U32_e64 %3:sreg_32, %28:vgpr_32, %4:vreg_64_align2, 0, implicit $exec + %32:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %30.sub0:vreg_64_align2, %5:sgpr_128, 0, 0, 0, 0, implicit $exec + %33:sreg_32 = S_LSHL_B32 %27:sreg_32, 7, implicit-def dead $scc + %34:vgpr_32 = V_ADD_LSHL_U32_e64 %6:vgpr_32, %33:sreg_32, 1, implicit $exec + DS_WRITE_B128_gfx9 %34:vgpr_32, %32:vreg_128_align2, 0, 0, implicit $exec + %35:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %30.sub0:vreg_64_align2, %5:sgpr_128, 0, 64, 0, 0, implicit $exec + %36:vgpr_32 = V_ADD_U32_e32 %7:vgpr_32, %37:vgpr_32, implicit $exec + %38:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %36:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec + %39:vgpr_32 = V_ADD_U32_e32 %9:vgpr_32, %37:vgpr_32, implicit $exec + %40:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %39:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %41:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 0, 0, implicit $exec + early-clobber %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %41.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec + %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %41.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %45:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 512, 0, implicit $exec + early-clobber %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_vgprcd_e64 %45.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, 0, 0, 0, 0, implicit $mode, implicit $exec + %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %45.sub2_sub3:vreg_128_align2, 
%44.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %47:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 0, 0, implicit $exec + %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %47.sub0_sub1:vreg_128_align2, %49.sub0_sub1:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %47.sub2_sub3:vreg_128_align2, %49.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %50:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 512, 0, implicit $exec + %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %50.sub0_sub1:vreg_128_align2, %49.sub0_sub1:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %50.sub2_sub3:vreg_128_align2, %49.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + DS_WRITE_B128_gfx9 %34:vgpr_32, %35:vreg_128_align2, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %51:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 0, 0, implicit $exec + %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %51.sub0_sub1:vreg_128_align2, %52.sub0_sub1:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %51.sub2_sub3:vreg_128_align2, %52.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %53:vreg_128_align2 = DS_READ_B128_gfx9 %42:vgpr_32, 512, 0, implicit $exec + %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %53.sub0_sub1:vreg_128_align2, 
%52.sub0_sub1:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %53.sub2_sub3:vreg_128_align2, %52.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %54:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 0, 0, implicit $exec + %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %55.sub0_sub1:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %43:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %55.sub2_sub3:vreg_128_align2, %43:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %56:vreg_128_align2 = DS_READ_B128_gfx9 %48:vgpr_32, 512, 0, implicit $exec + %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %56.sub0_sub1:vreg_128_align2, %55.sub0_sub1:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %46:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %56.sub2_sub3:vreg_128_align2, %55.sub2_sub3:vreg_128_align2, %46:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %57:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub0:vreg_512_align2, implicit $mode, implicit $exec + %58:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub1:vreg_512_align2, implicit $mode, implicit $exec + %59:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub2:vreg_512_align2, implicit $mode, implicit $exec + %60:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub3:vreg_512_align2, implicit $mode, implicit $exec + %61:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub4:vreg_512_align2, implicit $mode, implicit $exec + %62:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub5:vreg_512_align2, implicit $mode, implicit $exec + %63:vgpr_32 = contract 
nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub6:vreg_512_align2, implicit $mode, implicit $exec + %64:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub7:vreg_512_align2, implicit $mode, implicit $exec + %65:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub8:vreg_512_align2, implicit $mode, implicit $exec + %66:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub9:vreg_512_align2, implicit $mode, implicit $exec + %67:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub10:vreg_512_align2, implicit $mode, implicit $exec + %68:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub11:vreg_512_align2, implicit $mode, implicit $exec + %69:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub12:vreg_512_align2, implicit $mode, implicit $exec + %70:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub13:vreg_512_align2, implicit $mode, implicit $exec + %71:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub14:vreg_512_align2, implicit $mode, implicit $exec + %72:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %43.sub15:vreg_512_align2, implicit $mode, implicit $exec + %73:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub0:vreg_512_align2, implicit $mode, implicit $exec + %74:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub1:vreg_512_align2, implicit $mode, implicit $exec + %75:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub2:vreg_512_align2, implicit $mode, implicit $exec + %76:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub3:vreg_512_align2, implicit $mode, implicit $exec + %77:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub4:vreg_512_align2, implicit $mode, implicit $exec + %78:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub5:vreg_512_align2, implicit $mode, implicit $exec + 
%79:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub6:vreg_512_align2, implicit $mode, implicit $exec + %80:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub7:vreg_512_align2, implicit $mode, implicit $exec + %81:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub8:vreg_512_align2, implicit $mode, implicit $exec + %82:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub9:vreg_512_align2, implicit $mode, implicit $exec + %83:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub10:vreg_512_align2, implicit $mode, implicit $exec + %84:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub11:vreg_512_align2, implicit $mode, implicit $exec + %85:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub12:vreg_512_align2, implicit $mode, implicit $exec + %86:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub13:vreg_512_align2, implicit $mode, implicit $exec + %87:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub14:vreg_512_align2, implicit $mode, implicit $exec + %88:vgpr_32 = contract nofpexcept V_MUL_F32_e32 %10.sub4:sgpr_512, %46.sub15:vreg_512_align2, implicit $mode, implicit $exec + %89:vgpr_32 = V_MAX3_F32_e64 0, %57:vgpr_32, 0, %11:sgpr_32, 0, %58:vgpr_32, 0, 0, implicit $mode, implicit $exec + %90:vgpr_32 = V_MAX3_F32_e64 0, %89:vgpr_32, 0, %59:vgpr_32, 0, %60:vgpr_32, 0, 0, implicit $mode, implicit $exec + %91:vgpr_32 = V_MAX3_F32_e64 0, %90:vgpr_32, 0, %61:vgpr_32, 0, %62:vgpr_32, 0, 0, implicit $mode, implicit $exec + %92:vgpr_32 = V_MAX3_F32_e64 0, %91:vgpr_32, 0, %63:vgpr_32, 0, %64:vgpr_32, 0, 0, implicit $mode, implicit $exec + %93:vgpr_32 = V_MAX3_F32_e64 0, %92:vgpr_32, 0, %65:vgpr_32, 0, %66:vgpr_32, 0, 0, implicit $mode, implicit $exec + %94:vgpr_32 = V_MAX3_F32_e64 0, %93:vgpr_32, 0, %67:vgpr_32, 0, %68:vgpr_32, 0, 0, implicit $mode, implicit $exec + %95:vgpr_32 = V_MAX3_F32_e64 0, 
%94:vgpr_32, 0, %69:vgpr_32, 0, %70:vgpr_32, 0, 0, implicit $mode, implicit $exec + %96:vgpr_32 = V_MAX3_F32_e64 0, %95:vgpr_32, 0, %71:vgpr_32, 0, %72:vgpr_32, 0, 0, implicit $mode, implicit $exec + %97:vgpr_32 = V_MAX3_F32_e64 0, %96:vgpr_32, 0, %73:vgpr_32, 0, %74:vgpr_32, 0, 0, implicit $mode, implicit $exec + %98:vgpr_32 = V_MAX3_F32_e64 0, %97:vgpr_32, 0, %75:vgpr_32, 0, %76:vgpr_32, 0, 0, implicit $mode, implicit $exec + %99:vgpr_32 = V_MAX3_F32_e64 0, %98:vgpr_32, 0, %77:vgpr_32, 0, %78:vgpr_32, 0, 0, implicit $mode, implicit $exec + %100:vgpr_32 = V_MAX3_F32_e64 0, %99:vgpr_32, 0, %79:vgpr_32, 0, %80:vgpr_32, 0, 0, implicit $mode, implicit $exec + %101:vgpr_32 = V_MAX3_F32_e64 0, %100:vgpr_32, 0, %81:vgpr_32, 0, %82:vgpr_32, 0, 0, implicit $mode, implicit $exec + %102:vgpr_32 = V_MAX3_F32_e64 0, %101:vgpr_32, 0, %83:vgpr_32, 0, %84:vgpr_32, 0, 0, implicit $mode, implicit $exec + %103:vgpr_32 = V_MAX3_F32_e64 0, %102:vgpr_32, 0, %85:vgpr_32, 0, %86:vgpr_32, 0, 0, implicit $mode, implicit $exec + %104:vgpr_32 = V_MAX3_F32_e64 0, %103:vgpr_32, 0, %87:vgpr_32, 0, %88:vgpr_32, 0, 0, implicit $mode, implicit $exec + %105:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %104:vgpr_32, 0, implicit $exec + %107:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %105:vgpr_32, %105:vgpr_32, implicit $mode, implicit $exec + %108:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %104:vgpr_32, %107:vgpr_32, implicit $mode, implicit $exec + %109:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %108:vgpr_32, 0, implicit $exec + %110:vgpr_32 = V_CNDMASK_B32_e64 0, %109:vgpr_32, 0, %108:vgpr_32, %12:sreg_64_xexec, implicit $exec + %111:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %110:vgpr_32, %110:vgpr_32, implicit $mode, implicit $exec + %112:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %13:vgpr_32, %13:vgpr_32, implicit $mode, implicit $exec + %113:vgpr_32 = contract nofpexcept V_MAX_F32_e32 %112:vgpr_32, %111:vgpr_32, implicit $mode, implicit $exec + %114:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, 
%10.sub4:sgpr_512, 0, %43.sub0:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %115:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %114:vgpr_32, implicit $mode, implicit $exec + %116:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %115:vgpr_32, implicit $mode, implicit $exec + %117:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub1:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %118:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %117:vgpr_32, implicit $mode, implicit $exec + %119:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %118:vgpr_32, implicit $mode, implicit $exec + %120:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub2:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %121:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %120:vgpr_32, implicit $mode, implicit $exec + %122:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %121:vgpr_32, implicit $mode, implicit $exec + %123:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub3:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %124:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %123:vgpr_32, implicit $mode, implicit $exec + %125:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %124:vgpr_32, implicit $mode, implicit $exec + %126:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub4:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %127:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %126:vgpr_32, implicit $mode, implicit $exec + %128:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %127:vgpr_32, implicit $mode, implicit $exec + %129:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub5:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %130:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %129:vgpr_32, implicit $mode, implicit $exec + %131:vgpr_32 = afn nofpexcept 
V_EXP_F32_e32 %130:vgpr_32, implicit $mode, implicit $exec + %132:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub6:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %133:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %132:vgpr_32, implicit $mode, implicit $exec + %134:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %133:vgpr_32, implicit $mode, implicit $exec + %135:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub7:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %136:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %135:vgpr_32, implicit $mode, implicit $exec + %137:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %136:vgpr_32, implicit $mode, implicit $exec + %138:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub8:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %139:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %138:vgpr_32, implicit $mode, implicit $exec + %140:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %139:vgpr_32, implicit $mode, implicit $exec + %141:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub9:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %142:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %141:vgpr_32, implicit $mode, implicit $exec + %143:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %142:vgpr_32, implicit $mode, implicit $exec + %144:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub10:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %145:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %144:vgpr_32, implicit $mode, implicit $exec + %146:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %145:vgpr_32, implicit $mode, implicit $exec + %147:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub11:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %148:vgpr_32 = afn nofpexcept 
V_MUL_F32_e32 1069066811, %147:vgpr_32, implicit $mode, implicit $exec + %149:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %148:vgpr_32, implicit $mode, implicit $exec + %150:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub12:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %151:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %150:vgpr_32, implicit $mode, implicit $exec + %152:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %151:vgpr_32, implicit $mode, implicit $exec + %153:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub13:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %154:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %153:vgpr_32, implicit $mode, implicit $exec + %155:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %154:vgpr_32, implicit $mode, implicit $exec + %156:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub14:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %157:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %156:vgpr_32, implicit $mode, implicit $exec + %158:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %157:vgpr_32, implicit $mode, implicit $exec + %159:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %43.sub15:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %160:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %159:vgpr_32, implicit $mode, implicit $exec + %161:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %160:vgpr_32, implicit $mode, implicit $exec + %162:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub0:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %163:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %162:vgpr_32, implicit $mode, implicit $exec + %164:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %163:vgpr_32, implicit $mode, implicit $exec + %165:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, 
%46.sub1:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %166:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %165:vgpr_32, implicit $mode, implicit $exec + %167:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %166:vgpr_32, implicit $mode, implicit $exec + %168:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub2:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %169:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %168:vgpr_32, implicit $mode, implicit $exec + %170:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %169:vgpr_32, implicit $mode, implicit $exec + %171:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub3:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %172:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %171:vgpr_32, implicit $mode, implicit $exec + %173:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %172:vgpr_32, implicit $mode, implicit $exec + %174:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub4:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %175:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %174:vgpr_32, implicit $mode, implicit $exec + %176:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %175:vgpr_32, implicit $mode, implicit $exec + %177:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub5:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %178:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %177:vgpr_32, implicit $mode, implicit $exec + %179:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %178:vgpr_32, implicit $mode, implicit $exec + %180:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub6:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %181:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %180:vgpr_32, implicit $mode, implicit $exec + %182:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %181:vgpr_32, 
implicit $mode, implicit $exec + %183:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub7:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %184:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %183:vgpr_32, implicit $mode, implicit $exec + %185:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %184:vgpr_32, implicit $mode, implicit $exec + %186:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub8:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %187:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %186:vgpr_32, implicit $mode, implicit $exec + %188:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %187:vgpr_32, implicit $mode, implicit $exec + %189:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub9:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %190:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %189:vgpr_32, implicit $mode, implicit $exec + %191:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %190:vgpr_32, implicit $mode, implicit $exec + %192:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub10:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %193:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %192:vgpr_32, implicit $mode, implicit $exec + %194:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %193:vgpr_32, implicit $mode, implicit $exec + %195:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub11:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %196:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %195:vgpr_32, implicit $mode, implicit $exec + %197:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %196:vgpr_32, implicit $mode, implicit $exec + %198:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub12:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %199:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, 
%198:vgpr_32, implicit $mode, implicit $exec + %200:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %199:vgpr_32, implicit $mode, implicit $exec + %201:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub13:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %202:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %201:vgpr_32, implicit $mode, implicit $exec + %203:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %202:vgpr_32, implicit $mode, implicit $exec + %204:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub14:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %205:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %204:vgpr_32, implicit $mode, implicit $exec + %206:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %205:vgpr_32, implicit $mode, implicit $exec + %207:vgpr_32 = contract nofpexcept V_FMA_F32_e64 0, %10.sub4:sgpr_512, 0, %46.sub15:vreg_512_align2, 1, %113:vgpr_32, 0, 0, implicit $mode, implicit $exec + %208:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %207:vgpr_32, implicit $mode, implicit $exec + %209:vgpr_32 = afn nofpexcept V_EXP_F32_e32 %208:vgpr_32, implicit $mode, implicit $exec + %210:vgpr_32 = contract nofpexcept V_ADD_F32_e32 0, %116:vgpr_32, implicit $mode, implicit $exec + %211:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %119:vgpr_32, %210:vgpr_32, implicit $mode, implicit $exec + %212:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %122:vgpr_32, %211:vgpr_32, implicit $mode, implicit $exec + %213:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %125:vgpr_32, %212:vgpr_32, implicit $mode, implicit $exec + %214:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %128:vgpr_32, %213:vgpr_32, implicit $mode, implicit $exec + %215:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %131:vgpr_32, %214:vgpr_32, implicit $mode, implicit $exec + %216:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %134:vgpr_32, %215:vgpr_32, implicit $mode, implicit $exec + %217:vgpr_32 = contract nofpexcept V_ADD_F32_e32 
%137:vgpr_32, %216:vgpr_32, implicit $mode, implicit $exec + %218:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %140:vgpr_32, %217:vgpr_32, implicit $mode, implicit $exec + %219:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %143:vgpr_32, %218:vgpr_32, implicit $mode, implicit $exec + %220:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %146:vgpr_32, %219:vgpr_32, implicit $mode, implicit $exec + %221:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %149:vgpr_32, %220:vgpr_32, implicit $mode, implicit $exec + %222:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %152:vgpr_32, %221:vgpr_32, implicit $mode, implicit $exec + %223:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %155:vgpr_32, %222:vgpr_32, implicit $mode, implicit $exec + %224:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %158:vgpr_32, %223:vgpr_32, implicit $mode, implicit $exec + %225:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %161:vgpr_32, %224:vgpr_32, implicit $mode, implicit $exec + %226:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %164:vgpr_32, %225:vgpr_32, implicit $mode, implicit $exec + %227:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %167:vgpr_32, %226:vgpr_32, implicit $mode, implicit $exec + %228:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %170:vgpr_32, %227:vgpr_32, implicit $mode, implicit $exec + %229:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %173:vgpr_32, %228:vgpr_32, implicit $mode, implicit $exec + %230:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %176:vgpr_32, %229:vgpr_32, implicit $mode, implicit $exec + %231:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %179:vgpr_32, %230:vgpr_32, implicit $mode, implicit $exec + %232:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %182:vgpr_32, %231:vgpr_32, implicit $mode, implicit $exec + %233:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %185:vgpr_32, %232:vgpr_32, implicit $mode, implicit $exec + %234:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %188:vgpr_32, %233:vgpr_32, implicit $mode, implicit $exec + %235:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %191:vgpr_32, 
%234:vgpr_32, implicit $mode, implicit $exec + %236:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %194:vgpr_32, %235:vgpr_32, implicit $mode, implicit $exec + %237:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %197:vgpr_32, %236:vgpr_32, implicit $mode, implicit $exec + %238:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %200:vgpr_32, %237:vgpr_32, implicit $mode, implicit $exec + %239:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %203:vgpr_32, %238:vgpr_32, implicit $mode, implicit $exec + %240:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %206:vgpr_32, %239:vgpr_32, implicit $mode, implicit $exec + %241:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %209:vgpr_32, %240:vgpr_32, implicit $mode, implicit $exec + %242:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %241:vgpr_32, 0, implicit $exec + %243:vgpr_32 = contract nofpexcept V_ADD_F32_e32 %241:vgpr_32, %242:vgpr_32, implicit $mode, implicit $exec + %244:vgpr_32 = DS_BPERMUTE_B32 %106:vgpr_32, %243:vgpr_32, 0, implicit $exec + %0:vgpr_32 = V_CNDMASK_B32_e64 0, %244:vgpr_32, 0, %243:vgpr_32, %12:sreg_64_xexec, implicit $exec + %245:vgpr_32 = contract nofpexcept V_SUB_F32_e32 %13:vgpr_32, %113:vgpr_32, implicit $mode, implicit $exec + %246:vgpr_32 = afn nofpexcept V_MUL_F32_e32 1069066811, %245:vgpr_32, implicit $mode, implicit $exec + undef %247.sub0:vreg_64_align2 = afn nofpexcept V_EXP_F32_e32 %246:vgpr_32, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %248:vgpr_32 = V_PERM_B32_e64 %40.sub0:vreg_64_align2, %38.sub0:vreg_64_align2, %14:sreg_32, implicit $exec + %249:vgpr_32 = V_PERM_B32_e64 %40.sub0:vreg_64_align2, %38.sub0:vreg_64_align2, %15:sreg_32, implicit $exec + %250:vgpr_32 = V_PERM_B32_e64 %40.sub1:vreg_64_align2, %38.sub1:vreg_64_align2, %14:sreg_32, implicit $exec + %251:vgpr_32 = V_PERM_B32_e64 %40.sub1:vreg_64_align2, %38.sub1:vreg_64_align2, %15:sreg_32, implicit $exec + %252:vgpr_32 = V_ADD_U32_e32 %27:sreg_32, 
%16:vgpr_32, implicit $exec + %253:vgpr_32 = V_AND_B32_e32 536870911, %252:vgpr_32, implicit $exec + %254:vgpr_32 = nsw V_MUL_LO_U32_e64 %253:vgpr_32, %17:sreg_32, implicit $exec + %255:vgpr_32 = V_ADD_LSHL_U32_e64 %18:vgpr_32, %254:vgpr_32, 1, implicit $exec + DS_WRITE_B32_gfx9 %255:vgpr_32, %248:vgpr_32, 0, 0, implicit $exec + %256:vgpr_32 = V_LSHL_ADD_U32_e64 %19:vgpr_32, 1, %255:vgpr_32, implicit $exec + DS_WRITE_B32_gfx9 %256:vgpr_32, %249:vgpr_32, 0, 0, implicit $exec + %257:vgpr_32 = V_LSHL_ADD_U32_e64 %20:vgpr_32, 1, %256:vgpr_32, implicit $exec + DS_WRITE_B32_gfx9 %257:vgpr_32, %250:vgpr_32, 0, 0, implicit $exec + %258:vgpr_32 = V_LSHL_ADD_U32_e64 %21:vgpr_32, 1, %257:vgpr_32, implicit $exec + DS_WRITE_B32_gfx9 %258:vgpr_32, %251:vgpr_32, 0, 0, implicit $exec + %0:vgpr_32 = contract nofpexcept V_FMAC_F32_e32 %1:vgpr_32, %247.sub0:vreg_64_align2, %0:vgpr_32, implicit $mode, implicit $exec + %259.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub0_sub1:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %259.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub2_sub3:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %259.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub4_sub5:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %259.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub6_sub7:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %259.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub8_sub9:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %259.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub10_sub11:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %259.sub12_sub13:vreg_512_align2 = contract 
nofpexcept V_PK_MUL_F32 8, %259.sub12_sub13:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %259.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %259.sub14_sub15:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %260.sub0_sub1:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub0_sub1:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %260.sub2_sub3:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub2_sub3:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %260.sub4_sub5:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub4_sub5:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %260.sub6_sub7:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub6_sub7:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %260.sub8_sub9:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub8_sub9:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %260.sub10_sub11:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub10_sub11:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %260.sub12_sub13:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub12_sub13:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %260.sub14_sub15:vreg_512_align2 = contract nofpexcept V_PK_MUL_F32 8, %260.sub14_sub15:vreg_512_align2, 0, %247:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %261:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %116:vgpr_32, implicit $mode, implicit $exec + %262:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %119:vgpr_32, implicit $mode, implicit $exec + %263:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %122:vgpr_32, implicit $mode, implicit $exec + %264:vgpr_32 = 
nofpexcept V_CVT_F16_F32_e32 %128:vgpr_32, implicit $mode, implicit $exec + %265:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %131:vgpr_32, implicit $mode, implicit $exec + %266:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %134:vgpr_32, implicit $mode, implicit $exec + %267:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %140:vgpr_32, implicit $mode, implicit $exec + %268:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %143:vgpr_32, implicit $mode, implicit $exec + %269:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %146:vgpr_32, implicit $mode, implicit $exec + %270:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %152:vgpr_32, implicit $mode, implicit $exec + %271:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %155:vgpr_32, implicit $mode, implicit $exec + %272:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %158:vgpr_32, implicit $mode, implicit $exec + %273:vgpr_32 = V_ADD_U32_e32 %22:vgpr_32, %37:vgpr_32, implicit $exec + %274:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %273:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec + %275:vgpr_32 = V_ADD_U32_e32 %23:vgpr_32, %37:vgpr_32, implicit $exec + %276:vreg_64_align2 = BUFFER_LOAD_DWORDX2_OFFEN %275:vgpr_32, %8:sgpr_128, 0, 0, 0, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %277:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec + %278:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 576, 0, implicit $exec + %279:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 0, 0, implicit $exec + %280:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 576, 0, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %281:vgpr_32 = V_PERM_B32_e64 %276.sub0:vreg_64_align2, %274.sub0:vreg_64_align2, %14:sreg_32, implicit $exec + %282:vgpr_32 = V_PERM_B32_e64 %276.sub0:vreg_64_align2, %274.sub0:vreg_64_align2, %15:sreg_32, implicit $exec + %283:vgpr_32 = V_PERM_B32_e64 %276.sub1:vreg_64_align2, 
%274.sub1:vreg_64_align2, %14:sreg_32, implicit $exec + %284:vgpr_32 = V_PERM_B32_e64 %276.sub1:vreg_64_align2, %274.sub1:vreg_64_align2, %15:sreg_32, implicit $exec + DS_WRITE_B32_gfx9 %255:vgpr_32, %281:vgpr_32, 0, 0, implicit $exec + DS_WRITE_B32_gfx9 %256:vgpr_32, %282:vgpr_32, 0, 0, implicit $exec + DS_WRITE_B32_gfx9 %257:vgpr_32, %283:vgpr_32, 0, 0, implicit $exec + DS_WRITE_B32_gfx9 %258:vgpr_32, %284:vgpr_32, 0, 0, implicit $exec + %285:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %125:vgpr_32, implicit $mode, implicit $exec + %286:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %137:vgpr_32, implicit $mode, implicit $exec + %287:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %149:vgpr_32, implicit $mode, implicit $exec + %288:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %161:vgpr_32, implicit $mode, implicit $exec + undef %289.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %263:vgpr_32, 0, %285:vgpr_32, 0, 0, implicit $mode, implicit $exec + %289.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %261:vgpr_32, 0, %262:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %290.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %266:vgpr_32, 0, %286:vgpr_32, 0, 0, implicit $mode, implicit $exec + %290.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %264:vgpr_32, 0, %265:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %291.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %269:vgpr_32, 0, %287:vgpr_32, 0, 0, implicit $mode, implicit $exec + %291.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %267:vgpr_32, 0, %268:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %292.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %272:vgpr_32, 0, %288:vgpr_32, 0, 0, implicit $mode, implicit $exec + %292.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %270:vgpr_32, 0, %271:vgpr_32, 0, 0, implicit $mode, implicit $exec + %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %277.sub0_sub1:vreg_128_align2, %289:vreg_64_align2, 
%259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %277.sub2_sub3:vreg_128_align2, %290:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %278.sub0_sub1:vreg_128_align2, %289:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %278.sub2_sub3:vreg_128_align2, %290:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %279.sub0_sub1:vreg_128_align2, %291:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %279.sub2_sub3:vreg_128_align2, %292:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %280.sub0_sub1:vreg_128_align2, %291:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %280.sub2_sub3:vreg_128_align2, %292:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %293:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %209:vgpr_32, implicit $mode, implicit $exec + %294:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %203:vgpr_32, implicit $mode, implicit $exec + %295:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %197:vgpr_32, implicit $mode, implicit $exec + %296:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %191:vgpr_32, implicit $mode, implicit $exec + %297:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %185:vgpr_32, implicit $mode, implicit $exec + %298:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %179:vgpr_32, implicit $mode, implicit $exec + %299:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %173:vgpr_32, implicit $mode, implicit 
$exec + %300:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %167:vgpr_32, implicit $mode, implicit $exec + %301:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %206:vgpr_32, implicit $mode, implicit $exec + %302:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %200:vgpr_32, implicit $mode, implicit $exec + %303:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %194:vgpr_32, implicit $mode, implicit $exec + %304:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %188:vgpr_32, implicit $mode, implicit $exec + %305:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %182:vgpr_32, implicit $mode, implicit $exec + %306:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %176:vgpr_32, implicit $mode, implicit $exec + %307:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %170:vgpr_32, implicit $mode, implicit $exec + %308:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %164:vgpr_32, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + undef %309.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %307:vgpr_32, 0, %299:vgpr_32, 0, 0, implicit $mode, implicit $exec + %309.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %308:vgpr_32, 0, %300:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %310.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %305:vgpr_32, 0, %297:vgpr_32, 0, 0, implicit $mode, implicit $exec + %310.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %306:vgpr_32, 0, %298:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %311.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %303:vgpr_32, 0, %295:vgpr_32, 0, 0, implicit $mode, implicit $exec + %311.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %304:vgpr_32, 0, %296:vgpr_32, 0, 0, implicit $mode, implicit $exec + undef %312.sub1:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %301:vgpr_32, 0, %293:vgpr_32, 0, 0, implicit $mode, implicit $exec + %312.sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, %302:vgpr_32, 0, %294:vgpr_32, 0, 0, implicit $mode, 
implicit $exec + %313:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 0, 0, implicit $exec + %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %313.sub0_sub1:vreg_128_align2, %309:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %313.sub2_sub3:vreg_128_align2, %310:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %314:vreg_128_align2 = DS_READ_B128_gfx9 %24:vgpr_32, 576, 0, implicit $exec + %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %314.sub0_sub1:vreg_128_align2, %309:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %314.sub2_sub3:vreg_128_align2, %310:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %315:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 0, 0, implicit $exec + %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %315.sub0_sub1:vreg_128_align2, %311:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %259:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %315.sub2_sub3:vreg_128_align2, %312:vreg_64_align2, %259:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %316:vreg_128_align2 = DS_READ_B128_gfx9 %25:vgpr_32, 576, 0, implicit $exec + %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %316.sub0_sub1:vreg_128_align2, %311:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + %260:vreg_512_align2 = contract V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %316.sub2_sub3:vreg_128_align2, %312:vreg_64_align2, %260:vreg_512_align2, 0, 0, 0, implicit $mode, implicit $exec + INLINEASM &"s_waitcnt vmcnt($0)", 57 /* sideeffect mayload maystore isconvergent attdialect */, 13 /* imm */, 8, !0 + %37:vgpr_32 = V_ADD_U32_e32 %26:sreg_32, %37:vgpr_32, implicit 
$exec + %29:vgpr_32 = nuw V_ADD_U32_e32 64, %29:vgpr_32, implicit $exec + S_ENDPGM 0 +... + + From d42de86eb37b08b3007a67650b3ca73b9ae174b1 Mon Sep 17 00:00:00 2001 From: Michael Spencer Date: Fri, 23 Feb 2024 17:44:32 -0800 Subject: [PATCH 221/546] reland: [clang][ScanDeps] Canonicalize -D and -U flags (#82568) Canonicalize `-D` and `-U` flags by sorting them and only keeping the last instance of a given name. This optimization will only fire if all `-D` and `-U` flags start with a simple identifier that we can guarantee a simple analysis of can determine if two flags refer to the same identifier or not. See the comment on `getSimpleMacroName()` for details of what the issues are. Previous version of this had issues with sed differences between macOS, Linux, and Windows. This test doesn't check paths, so just don't run sed. Other tests should use `sed -E 's:\\\\?:/:g'` to get portable behavior. Windows has different command line parsing behavior than Linux for compilation databases, so the test has been adjusted to ignore that difference. --- .../DependencyScanningService.h | 5 +- .../DependencyScanningWorker.cpp | 74 ++++++++++++++++ .../optimize-canonicalize-macros.m | 87 +++++++++++++++++++ clang/tools/clang-scan-deps/ClangScanDeps.cpp | 1 + 4 files changed, 166 insertions(+), 1 deletion(-) create mode 100644 clang/test/ClangScanDeps/optimize-canonicalize-macros.m diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h index 4f9867262a275..557f0e547ab4a 100644 --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h @@ -60,7 +60,10 @@ enum class ScanningOptimizations { /// Remove unused -ivfsoverlay arguments. VFS = 4, - DSS_LAST_BITMASK_ENUM(VFS), + /// Canonicalize -D and -U options. 
+ Macros = 8, + + DSS_LAST_BITMASK_ENUM(Macros), Default = All }; diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp index 3cf3ad8a4e490..7477b930188b4 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp @@ -179,6 +179,78 @@ static void sanitizeDiagOpts(DiagnosticOptions &DiagOpts) { DiagOpts.IgnoreWarnings = true; } +// Clang implements -D and -U by splatting text into a predefines buffer. This +// allows constructs such as `-DFඞ=3 "-D F\u{0D9E} 4 3 2”` to be accepted and +// define the same macro, or adding C++ style comments before the macro name. +// +// This function checks that the first non-space characters in the macro +// obviously form an identifier that can be uniqued on without lexing. Failing +// to do this could lead to changing the final definition of a macro. +// +// We could set up a preprocessor and actually lex the name, but that's very +// heavyweight for a situation that will almost never happen in practice. 
+static std::optional getSimpleMacroName(StringRef Macro) { + StringRef Name = Macro.split("=").first.ltrim(" \t"); + std::size_t I = 0; + + auto FinishName = [&]() -> std::optional { + StringRef SimpleName = Name.slice(0, I); + if (SimpleName.empty()) + return std::nullopt; + return SimpleName; + }; + + for (; I != Name.size(); ++I) { + switch (Name[I]) { + case '(': // Start of macro parameter list + case ' ': // End of macro name + case '\t': + return FinishName(); + case '_': + continue; + default: + if (llvm::isAlnum(Name[I])) + continue; + return std::nullopt; + } + } + return FinishName(); +} + +static void canonicalizeDefines(PreprocessorOptions &PPOpts) { + using MacroOpt = std::pair; + std::vector SimpleNames; + SimpleNames.reserve(PPOpts.Macros.size()); + std::size_t Index = 0; + for (const auto &M : PPOpts.Macros) { + auto SName = getSimpleMacroName(M.first); + // Skip optimizing if we can't guarantee we can preserve relative order. + if (!SName) + return; + SimpleNames.emplace_back(*SName, Index); + ++Index; + } + + llvm::stable_sort(SimpleNames, [](const MacroOpt &A, const MacroOpt &B) { + return A.first < B.first; + }); + // Keep the last instance of each macro name by going in reverse + auto NewEnd = std::unique( + SimpleNames.rbegin(), SimpleNames.rend(), + [](const MacroOpt &A, const MacroOpt &B) { return A.first == B.first; }); + SimpleNames.erase(SimpleNames.begin(), NewEnd.base()); + + // Apply permutation. + decltype(PPOpts.Macros) NewMacros; + NewMacros.reserve(SimpleNames.size()); + for (std::size_t I = 0, E = SimpleNames.size(); I != E; ++I) { + std::size_t OriginalIndex = SimpleNames[I].second; + // We still emit undefines here as they may be undefining a predefined macro + NewMacros.push_back(std::move(PPOpts.Macros[OriginalIndex])); + } + std::swap(PPOpts.Macros, NewMacros); +} + /// A clang tool that runs the preprocessor in a mode that's optimized for /// dependency scanning for the given compiler invocation. 
class DependencyScanningAction : public tooling::ToolAction { @@ -203,6 +275,8 @@ class DependencyScanningAction : public tooling::ToolAction { CompilerInvocation OriginalInvocation(*Invocation); // Restore the value of DisableFree, which may be modified by Tooling. OriginalInvocation.getFrontendOpts().DisableFree = DisableFree; + if (any(OptimizeArgs & ScanningOptimizations::Macros)) + canonicalizeDefines(OriginalInvocation.getPreprocessorOpts()); if (Scanned) { // Scanning runs once for the first -cc1 invocation in a chain of driver diff --git a/clang/test/ClangScanDeps/optimize-canonicalize-macros.m b/clang/test/ClangScanDeps/optimize-canonicalize-macros.m new file mode 100644 index 0000000000000..b1b351ba56afc --- /dev/null +++ b/clang/test/ClangScanDeps/optimize-canonicalize-macros.m @@ -0,0 +1,87 @@ +// This test verifies that command lines with equivalent -D and -U arguments +// are canonicalized to the same module variant. + +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: sed -e "s|DIR|%/t|g" %t/build/compile-commands.json.in > %t/build/compile-commands.json +// RUN: clang-scan-deps -compilation-database %t/build/compile-commands.json \ +// RUN: -j 1 -format experimental-full -optimize-args=canonicalize-macros > %t/deps.db +// RUN: cat %t/deps.db | FileCheck %s -DPREFIX=%/t + +// Verify that there are only two variants and that the expected merges have +// happened. 
+ +// CHECK: { +// CHECK-NEXT: "modules": [ +// CHECK-NEXT: { +// CHECK-NEXT: "clang-module-deps": [], +// CHECK-NEXT: "clang-modulemap-file": +// CHECK-NEXT: "command-line": [ +// CHECK-NOT: "J=1" +// CHECK-NOT: "J" +// CHECK-NOT: "K" +// CHECK: ], +// CHECK-NEXT: "context-hash": "{{.*}}", +// CHECK-NEXT: "file-deps": [ +// CHECK: ], +// CHECK-NEXT: "name": "A" +// CHECK-NEXT: }, +// CHECK-NEXT: { +// CHECK-NEXT: "clang-module-deps": [], +// CHECK-NEXT: "clang-modulemap-file": +// CHECK-NEXT: "command-line": [ +// CHECK: "Fඞ" +// CHECK: 0D9E +// CHECK: "K" +// CHECK: "K" +// CHECK: ], +// CHECK-NEXT: "context-hash": "{{.*}}", +// CHECK-NEXT: "file-deps": [ +// CHECK: ], +// CHECK-NEXT: "name": "A" +// CHECK-NEXT: } +// CHECK-NEXT: ], +// CHECK-NEXT: "translation-units": [ +// CHECK: ] +// CHECK: } + + +//--- build/compile-commands.json.in + +[ +{ + "directory": "DIR", + "command": "clang -c DIR/tu0.m -DJ=1 -UJ -DJ=2 -DI -DK(x)=x -I modules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-module-maps", + "file": "DIR/tu0.m" +}, +{ + "directory": "DIR", + "command": "clang -c DIR/tu1.m -DK -DK(x)=x -DI -D \"J=2\" -I modules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-module-maps", + "file": "DIR/tu1.m" +}, +{ + "directory": "DIR", + "command": "clang -c DIR/tu2.m -I modules/A -DFඞ \"-DF\\\\u{0D9E}\" -DK -DK -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-module-maps", + "file": "DIR/tu2.m" +} +] + +//--- modules/A/module.modulemap + +module A { + umbrella header "A.h" +} + +//--- modules/A/A.h + +//--- tu0.m + +#include + +//--- tu1.m + +#include + +//--- tu2.m + +#include diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index 0458a4b3ecec3..9811d2a875335 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -157,6 +157,7 @@ static void ParseArgs(int argc, char **argv) { .Case("header-search", 
ScanningOptimizations::HeaderSearch) .Case("system-warnings", ScanningOptimizations::SystemWarnings) .Case("vfs", ScanningOptimizations::VFS) + .Case("canonicalize-macros", ScanningOptimizations::Macros) .Case("all", ScanningOptimizations::All) .Default(std::nullopt); if (!Optimization) { From de3b2c293b8bf336f8e1380148cf16b54a794c0c Mon Sep 17 00:00:00 2001 From: Michael Spencer Date: Fri, 23 Feb 2024 17:48:58 -0800 Subject: [PATCH 222/546] [clang][ScanDeps] Allow PCHs to have different VFS overlays (#82294) It turns out it's not that uncommon for real code to pass a different set of VFSs while building a PCH than while using the PCH. This can cause problems as seen in `test/ClangScanDeps/optimize-vfs-pch.m`. If you scan `compile-commands-tu-no-vfs-error.json` without -Werror and run the resulting commands, Clang will emit a fatal error while trying to emit a note saying that it can't find a remapped header. This also adds textual tracking of VFSs for prebuilt modules that are part of an included PCH, as the same issue can occur in a module we are building if we drop VFSs. This has to be textual because we have no guarantee the PCH had the same list of VFSs as the current TU. This uses the `PrebuiltModuleListener` to collect `VFSOverlayFiles` instead of trying to extract it out of a `serialization::ModuleFile` each time it's needed. There's not a great way to just store a pointer to the list of strings in the serialized AST. 
--- .../Basic/DiagnosticSerializationKinds.td | 4 +- .../DependencyScanning/ModuleDepCollector.h | 5 + .../DependencyScanningWorker.cpp | 58 +++++++-- .../DependencyScanning/ModuleDepCollector.cpp | 34 ++++-- clang/test/ClangScanDeps/optimize-vfs-pch.m | 114 ++++++++++++++++-- llvm/include/llvm/ADT/StringSet.h | 4 + 6 files changed, 190 insertions(+), 29 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSerializationKinds.td b/clang/include/clang/Basic/DiagnosticSerializationKinds.td index 4c4659ed517e0..eb27de5921d6a 100644 --- a/clang/include/clang/Basic/DiagnosticSerializationKinds.td +++ b/clang/include/clang/Basic/DiagnosticSerializationKinds.td @@ -44,7 +44,9 @@ def err_pch_diagopt_mismatch : Error<"%0 is currently enabled, but was not in " "the PCH file">; def err_pch_modulecache_mismatch : Error<"PCH was compiled with module cache " "path '%0', but the path is currently '%1'">; -def err_pch_vfsoverlay_mismatch : Error<"PCH was compiled with different VFS overlay files than are currently in use">; +def warn_pch_vfsoverlay_mismatch : Warning< + "PCH was compiled with different VFS overlay files than are currently in use">, + InGroup>; def note_pch_vfsoverlay_files : Note<"%select{PCH|current translation unit}0 has the following VFS overlays:\n%1">; def note_pch_vfsoverlay_empty : Note<"%select{PCH|current translation unit}0 has no VFS overlays">; diff --git a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h index 13ad253086492..081899cc2c850 100644 --- a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h +++ b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h @@ -149,6 +149,8 @@ struct ModuleDeps { BuildInfo; }; +using PrebuiltModuleVFSMapT = llvm::StringMap>; + class ModuleDepCollector; /// Callback that records textual includes and direct modular includes/imports @@ -214,6 +216,7 @@ class ModuleDepCollector final : public 
DependencyCollector { CompilerInstance &ScanInstance, DependencyConsumer &C, DependencyActionController &Controller, CompilerInvocation OriginalCI, + PrebuiltModuleVFSMapT PrebuiltModuleVFSMap, ScanningOptimizations OptimizeArgs, bool EagerLoadModules, bool IsStdModuleP1689Format); @@ -233,6 +236,8 @@ class ModuleDepCollector final : public DependencyCollector { DependencyConsumer &Consumer; /// Callbacks for computing dependency information. DependencyActionController &Controller; + /// Mapping from prebuilt AST files to their sorted list of VFS overlay files. + PrebuiltModuleVFSMapT PrebuiltModuleVFSMap; /// Path to the main source file. std::string MainFile; /// Hash identifying the compilation conditions of the current TU. diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp index 7477b930188b4..2b882f8a5e079 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp @@ -24,6 +24,7 @@ #include "clang/Tooling/DependencyScanning/DependencyScanningService.h" #include "clang/Tooling/DependencyScanning/ModuleDepCollector.h" #include "clang/Tooling/Tooling.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Error.h" #include "llvm/TargetParser/Host.h" @@ -67,7 +68,7 @@ static bool checkHeaderSearchPaths(const HeaderSearchOptions &HSOpts, if (LangOpts.Modules) { if (HSOpts.VFSOverlayFiles != ExistingHSOpts.VFSOverlayFiles) { if (Diags) { - Diags->Report(diag::err_pch_vfsoverlay_mismatch); + Diags->Report(diag::warn_pch_vfsoverlay_mismatch); auto VFSNote = [&](int Type, ArrayRef VFSOverlays) { if (VFSOverlays.empty()) { Diags->Report(diag::note_pch_vfsoverlay_empty) << Type; @@ -79,7 +80,6 @@ static bool checkHeaderSearchPaths(const HeaderSearchOptions &HSOpts, VFSNote(0, HSOpts.VFSOverlayFiles); VFSNote(1, ExistingHSOpts.VFSOverlayFiles); } - 
return true; } } return false; @@ -93,10 +93,12 @@ class PrebuiltModuleListener : public ASTReaderListener { public: PrebuiltModuleListener(PrebuiltModuleFilesT &PrebuiltModuleFiles, llvm::SmallVector &NewModuleFiles, + PrebuiltModuleVFSMapT &PrebuiltModuleVFSMap, const HeaderSearchOptions &HSOpts, const LangOptions &LangOpts, DiagnosticsEngine &Diags) : PrebuiltModuleFiles(PrebuiltModuleFiles), - NewModuleFiles(NewModuleFiles), ExistingHSOpts(HSOpts), + NewModuleFiles(NewModuleFiles), + PrebuiltModuleVFSMap(PrebuiltModuleVFSMap), ExistingHSOpts(HSOpts), ExistingLangOpts(LangOpts), Diags(Diags) {} bool needsImportVisitation() const override { return true; } @@ -106,8 +108,16 @@ class PrebuiltModuleListener : public ASTReaderListener { NewModuleFiles.push_back(Filename.str()); } + void visitModuleFile(StringRef Filename, + serialization::ModuleKind Kind) override { + CurrentFile = Filename; + } + bool ReadHeaderSearchPaths(const HeaderSearchOptions &HSOpts, bool Complain) override { + std::vector VFSOverlayFiles = HSOpts.VFSOverlayFiles; + PrebuiltModuleVFSMap.insert( + {CurrentFile, llvm::StringSet<>(VFSOverlayFiles)}); return checkHeaderSearchPaths( HSOpts, ExistingHSOpts, Complain ? &Diags : nullptr, ExistingLangOpts); } @@ -115,9 +125,11 @@ class PrebuiltModuleListener : public ASTReaderListener { private: PrebuiltModuleFilesT &PrebuiltModuleFiles; llvm::SmallVector &NewModuleFiles; + PrebuiltModuleVFSMapT &PrebuiltModuleVFSMap; const HeaderSearchOptions &ExistingHSOpts; const LangOptions &ExistingLangOpts; DiagnosticsEngine &Diags; + std::string CurrentFile; }; /// Visit the given prebuilt module and collect all of the modules it @@ -125,12 +137,16 @@ class PrebuiltModuleListener : public ASTReaderListener { static bool visitPrebuiltModule(StringRef PrebuiltModuleFilename, CompilerInstance &CI, PrebuiltModuleFilesT &ModuleFiles, + PrebuiltModuleVFSMapT &PrebuiltModuleVFSMap, DiagnosticsEngine &Diags) { // List of module files to be processed. 
llvm::SmallVector Worklist; - PrebuiltModuleListener Listener( - ModuleFiles, Worklist, CI.getHeaderSearchOpts(), CI.getLangOpts(), Diags); + PrebuiltModuleListener Listener(ModuleFiles, Worklist, PrebuiltModuleVFSMap, + CI.getHeaderSearchOpts(), CI.getLangOpts(), + Diags); + Listener.visitModuleFile(PrebuiltModuleFilename, + serialization::MK_ExplicitModule); if (ASTReader::readASTFileControlBlock( PrebuiltModuleFilename, CI.getFileManager(), CI.getModuleCache(), CI.getPCHContainerReader(), @@ -139,6 +155,7 @@ static bool visitPrebuiltModule(StringRef PrebuiltModuleFilename, return true; while (!Worklist.empty()) { + Listener.visitModuleFile(Worklist.back(), serialization::MK_ExplicitModule); if (ASTReader::readASTFileControlBlock( Worklist.pop_back_val(), CI.getFileManager(), CI.getModuleCache(), CI.getPCHContainerReader(), @@ -175,8 +192,19 @@ static void sanitizeDiagOpts(DiagnosticOptions &DiagOpts) { DiagOpts.ShowCarets = false; // Don't write out diagnostic file. DiagOpts.DiagnosticSerializationFile.clear(); - // Don't emit warnings as errors (and all other warnings too). - DiagOpts.IgnoreWarnings = true; + // Don't emit warnings except for scanning specific warnings. + // TODO: It would be useful to add a more principled way to ignore all + // warnings that come from source code. The issue is that we need to + // ignore warnings that could be surpressed by + // `#pragma clang diagnostic`, while still allowing some scanning + // warnings for things we're not ready to turn into errors yet. + // See `test/ClangScanDeps/diagnostic-pragmas.c` for an example. + llvm::erase_if(DiagOpts.Warnings, [](StringRef Warning) { + return llvm::StringSwitch(Warning) + .Cases("pch-vfs-diff", "error=pch-vfs-diff", false) + .StartsWith("no-error=", false) + .Default(true); + }); } // Clang implements -D and -U by splatting text into a predefines buffer. 
This @@ -300,6 +328,10 @@ class DependencyScanningAction : public tooling::ToolAction { if (!ScanInstance.hasDiagnostics()) return false; + // Some DiagnosticConsumers require that finish() is called. + auto DiagConsumerFinisher = + llvm::make_scope_exit([DiagConsumer]() { DiagConsumer->finish(); }); + ScanInstance.getPreprocessorOpts().AllowPCHWithDifferentModulesCachePath = true; @@ -307,7 +339,8 @@ class DependencyScanningAction : public tooling::ToolAction { ScanInstance.getFrontendOpts().UseGlobalModuleIndex = false; ScanInstance.getFrontendOpts().ModulesShareFileManager = false; ScanInstance.getHeaderSearchOpts().ModuleFormat = "raw"; - ScanInstance.getHeaderSearchOpts().ModulesIncludeVFSUsage = true; + ScanInstance.getHeaderSearchOpts().ModulesIncludeVFSUsage = + any(OptimizeArgs & ScanningOptimizations::VFS); ScanInstance.setFileManager(FileMgr); // Support for virtual file system overlays. @@ -320,12 +353,13 @@ class DependencyScanningAction : public tooling::ToolAction { // Store the list of prebuilt module files into header search options. This // will prevent the implicit build to create duplicate modules and will // force reuse of the existing prebuilt module files instead. + PrebuiltModuleVFSMapT PrebuiltModuleVFSMap; if (!ScanInstance.getPreprocessorOpts().ImplicitPCHInclude.empty()) if (visitPrebuiltModule( ScanInstance.getPreprocessorOpts().ImplicitPCHInclude, ScanInstance, ScanInstance.getHeaderSearchOpts().PrebuiltModuleFiles, - ScanInstance.getDiagnostics())) + PrebuiltModuleVFSMap, ScanInstance.getDiagnostics())) return false; // Use the dependency scanning optimized file system if requested to do so. 
@@ -369,8 +403,8 @@ class DependencyScanningAction : public tooling::ToolAction { case ScanningOutputFormat::Full: MDC = std::make_shared( std::move(Opts), ScanInstance, Consumer, Controller, - OriginalInvocation, OptimizeArgs, EagerLoadModules, - Format == ScanningOutputFormat::P1689); + OriginalInvocation, std::move(PrebuiltModuleVFSMap), OptimizeArgs, + EagerLoadModules, Format == ScanningOutputFormat::P1689); ScanInstance.addDependencyCollector(MDC); break; } @@ -399,6 +433,8 @@ class DependencyScanningAction : public tooling::ToolAction { if (ScanInstance.getDiagnostics().hasErrorOccurred()) return false; + // Each action is responsible for calling finish. + DiagConsumerFinisher.release(); const bool Result = ScanInstance.ExecuteAction(*Action); if (Result) diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp index 5a9e563c2d5b2..eb5c50c35428f 100644 --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp @@ -29,10 +29,11 @@ const std::vector &ModuleDeps::getBuildArguments() { return std::get>(BuildInfo); } -static void optimizeHeaderSearchOpts(HeaderSearchOptions &Opts, - ASTReader &Reader, - const serialization::ModuleFile &MF, - ScanningOptimizations OptimizeArgs) { +static void +optimizeHeaderSearchOpts(HeaderSearchOptions &Opts, ASTReader &Reader, + const serialization::ModuleFile &MF, + const PrebuiltModuleVFSMapT &PrebuiltModuleVFSMap, + ScanningOptimizations OptimizeArgs) { if (any(OptimizeArgs & ScanningOptimizations::HeaderSearch)) { // Only preserve search paths that were used during the dependency scan. 
std::vector Entries; @@ -65,11 +66,25 @@ static void optimizeHeaderSearchOpts(HeaderSearchOptions &Opts, llvm::DenseSet Visited; std::function VisitMF = [&](const serialization::ModuleFile *MF) { - VFSUsage |= MF->VFSUsage; Visited.insert(MF); - for (const serialization::ModuleFile *Import : MF->Imports) - if (!Visited.contains(Import)) - VisitMF(Import); + if (MF->Kind == serialization::MK_ImplicitModule) { + VFSUsage |= MF->VFSUsage; + // We only need to recurse into implicit modules. Other module types + // will have the correct set of VFSs for anything they depend on. + for (const serialization::ModuleFile *Import : MF->Imports) + if (!Visited.contains(Import)) + VisitMF(Import); + } else { + // This is not an implicitly built module, so it may have different + // VFS options. Fall back to a string comparison instead. + auto VFSMap = PrebuiltModuleVFSMap.find(MF->FileName); + if (VFSMap == PrebuiltModuleVFSMap.end()) + return; + for (std::size_t I = 0, E = VFSOverlayFiles.size(); I != E; ++I) { + if (VFSMap->second.contains(VFSOverlayFiles[I])) + VFSUsage[I] = true; + } + } }; VisitMF(&MF); @@ -596,6 +611,7 @@ ModuleDepCollectorPP::handleTopLevelModule(const Module *M) { ScanningOptimizations::VFS))) optimizeHeaderSearchOpts(BuildInvocation.getMutHeaderSearchOpts(), *MDC.ScanInstance.getASTReader(), *MF, + MDC.PrebuiltModuleVFSMap, MDC.OptimizeArgs); if (any(MDC.OptimizeArgs & ScanningOptimizations::SystemWarnings)) optimizeDiagnosticOpts( @@ -697,9 +713,11 @@ ModuleDepCollector::ModuleDepCollector( std::unique_ptr Opts, CompilerInstance &ScanInstance, DependencyConsumer &C, DependencyActionController &Controller, CompilerInvocation OriginalCI, + PrebuiltModuleVFSMapT PrebuiltModuleVFSMap, ScanningOptimizations OptimizeArgs, bool EagerLoadModules, bool IsStdModuleP1689Format) : ScanInstance(ScanInstance), Consumer(C), Controller(Controller), + PrebuiltModuleVFSMap(std::move(PrebuiltModuleVFSMap)), Opts(std::move(Opts)), CommonInvocation( 
makeCommonInvocationForModuleBuild(std::move(OriginalCI))), diff --git a/clang/test/ClangScanDeps/optimize-vfs-pch.m b/clang/test/ClangScanDeps/optimize-vfs-pch.m index e6acb73e1dd34..0b5cb08d365ee 100644 --- a/clang/test/ClangScanDeps/optimize-vfs-pch.m +++ b/clang/test/ClangScanDeps/optimize-vfs-pch.m @@ -4,7 +4,8 @@ // RUN: split-file %s %t // RUN: sed -e "s|DIR|%/t|g" %t/build/compile-commands-pch.json.in > %t/build/compile-commands-pch.json // RUN: sed -e "s|DIR|%/t|g" %t/build/compile-commands-tu.json.in > %t/build/compile-commands-tu.json -// RUN: sed -e "s|DIR|%/t|g" %t/build/compile-commands-tu-no-vfs.json.in > %t/build/compile-commands-tu-no-vfs.json +// RUN: sed -e "s|DIR|%/t|g" %t/build/compile-commands-tu-no-vfs-error.json.in > %t/build/compile-commands-tu-no-vfs-error.json +// RUN: sed -e "s|DIR|%/t|g" %t/build/compile-commands-tu1.json.in > %t/build/compile-commands-tu1.json // RUN: sed -e "s|DIR|%/t|g" %t/build/pch-overlay.yaml.in > %t/build/pch-overlay.yaml // RUN: clang-scan-deps -compilation-database %t/build/compile-commands-pch.json \ @@ -23,11 +24,66 @@ // RUN: %clang @%t/C.rsp // RUN: %clang @%t/tu.rsp -// RUN: not clang-scan-deps -compilation-database %t/build/compile-commands-tu-no-vfs.json \ -// RUN: -j 1 -format experimental-full --optimize-args=vfs,header-search 2>&1 | FileCheck %s - -// CHECK: error: PCH was compiled with different VFS overlay files than are currently in use -// CHECK: note: current translation unit has no VFS overlays +// RUN: not clang-scan-deps -compilation-database %t/build/compile-commands-tu-no-vfs-error.json \ +// RUN: -j 1 -format experimental-full --optimize-args=vfs,header-search 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s + +// CHECK-ERROR: error: PCH was compiled with different VFS overlay files than are currently in use +// CHECK-ERROR: note: current translation unit has no VFS overlays + +// Next test is to verify that a module that doesn't use the VFS, that depends +// on the PCH's A, which does use 
the VFS, still records that it needs the VFS. +// This avoids a fatal error when emitting diagnostics. + +// RUN: clang-scan-deps -compilation-database %t/build/compile-commands-tu1.json \ +// RUN: -j 1 -format experimental-full --optimize-args=vfs,header-search > %t/tu1-deps.db +// RUN: %deps-to-rsp %t/tu1-deps.db --tu-index=0 > %t/tu1.rsp +// Reuse existing B +// RUN: %deps-to-rsp %t/tu1-deps.db --module-name=E > %t/E.rsp +// RUN: %deps-to-rsp %t/tu1-deps.db --module-name=D > %t/D.rsp +// The build of D depends on B which depend on the prebuilt A. D will only build +// if it has A's VFS, as it needs to emit a diagnostic showing the content of A. +// RUN: %clang @%t/E.rsp +// RUN: %clang @%t/D.rsp -verify +// RUN: %clang @%t/tu1.rsp +// RUN: cat %t/tu1-deps.db | sed 's:\\\\\?:/:g' | FileCheck %s -DPREFIX=%/t + +// Check that D has the overlay, but E doesn't. +// CHECK: { +// CHECK-NEXT: "modules": [ +// CHECK-NEXT: { +// CHECK-NEXT: "clang-module-deps": [ +// CHECK-NEXT: { +// CHECK-NEXT: "context-hash": "{{.*}}", +// CHECK-NEXT: "module-name": "E" +// CHECK-NEXT: } +// CHECK-NEXT: ], +// CHECK-NEXT: "clang-modulemap-file": "[[PREFIX]]/modules/D/module.modulemap", +// CHECK-NEXT: "command-line": [ +// CHECK: "-ivfsoverlay" +// CHECK-NEXT: "[[PREFIX]]/build/pch-overlay.yaml" +// CHECK: ], +// CHECK-NEXT: "context-hash": "{{.*}}", +// CHECK-NEXT: "file-deps": [ +// CHECK-NEXT: "{{.*}}" +// CHECK-NEXT: "{{.*}}" +// CHECK-NEXT: "{{.*}}" +// CHECK-NEXT: "{{.*}}" +// CHECK-NEXT: ], +// CHECK: "name": "D" +// CHECK-NEXT: }, +// CHECK-NEXT: { +// CHECK-NEXT: "clang-module-deps": [], +// CHECK-NEXT: "clang-modulemap-file": "[[PREFIX]]/modules/E/module.modulemap", +// CHECK-NEXT: "command-line": [ +// CHECK-NOT: "-ivfsoverlay" +// CHECK: ], +// CHECK-NEXT: "context-hash": "{{.*}}", +// CHECK-NEXT: "file-deps": [ +// CHECK-NEXT: "{{.*}}" +// CHECK-NEXT: "{{.*}}" +// CHECK-NEXT: ], +// CHECK: "name": "E" +// CHECK-NEXT: } //--- build/compile-commands-pch.json.in @@ -49,16 
+105,26 @@ } ] -//--- build/compile-commands-tu-no-vfs.json.in +//--- build/compile-commands-tu-no-vfs-error.json.in [ { "directory": "DIR", - "command": "clang -fsyntax-only DIR/tu.m -I DIR/modules/A -I DIR/modules/B -I DIR/modules/C -fmodules -fimplicit-module-maps -fmodules-cache-path=DIR/cache -include DIR/pch.h -o DIR/tu.o", + "command": "clang -Wpch-vfs-diff -Werror=pch-vfs-diff -fsyntax-only DIR/tu.m -I DIR/modules/A -I DIR/modules/B -I DIR/modules/C -fmodules -fimplicit-module-maps -fmodules-cache-path=DIR/cache -include DIR/pch.h -o DIR/tu.o", "file": "DIR/tu.m" } ] +//--- build/compile-commands-tu1.json.in + +[ +{ + "directory": "DIR", + "command": "clang -fsyntax-only DIR/tu1.m -I DIR/modules/B -I DIR/modules/D -I DIR/modules/E -fmodules -fimplicit-module-maps -fmodules-cache-path=DIR/cache -include DIR/pch.h -o DIR/tu1.o -ivfsoverlay DIR/build/pch-overlay.yaml", + "file": "DIR/tu1.m" +} +] + //--- build/pch-overlay.yaml.in { @@ -95,7 +161,7 @@ //--- build/A.h -typedef int A_t; +typedef int A_t __attribute__((deprecated("yep, it's depr"))); //--- modules/B/module.modulemap @@ -127,3 +193,33 @@ A_t a = 0; B_t b = 0; C_t c = 0; + +//--- modules/D/module.modulemap + +module D { + umbrella header "D.h" + export * +} + +//--- modules/D/D.h +#include +#include + +typedef A_t D_t; // expected-warning{{'A_t' is deprecated}} +// expected-note@*:* {{marked deprecated here}} + +//--- modules/E/module.modulemap + +module E { + umbrella header "E.h" +} + +//--- modules/E/E.h +typedef int E_t; + +//--- tu1.m + +#include + +D_t d = 0; +E_t e = 0; diff --git a/llvm/include/llvm/ADT/StringSet.h b/llvm/include/llvm/ADT/StringSet.h index d7b63bc9c9685..bf2f04f424d13 100644 --- a/llvm/include/llvm/ADT/StringSet.h +++ b/llvm/include/llvm/ADT/StringSet.h @@ -29,6 +29,10 @@ class StringSet : public StringMap { for (StringRef str : initializer) insert(str); } + template explicit StringSet(Container &&C) { + for (auto &&Str : C) + insert(Str); + } explicit StringSet(AllocatorTy 
a) : Base(a) {} std::pair insert(StringRef key) { From bfcf7a0707592ccc7fd9e805aeb36c4da3f315a6 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 23 Feb 2024 21:14:38 -0500 Subject: [PATCH 223/546] [AMDGPU] Remove `hasAtomicFaddRtnForTy` as it is not used anywhere (#82841) --- llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp | 9 --------- llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h | 2 -- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 12 ------------ llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 - 4 files changed, 24 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp index 60b7813f24033..a98d4488bf77f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp @@ -69,12 +69,3 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, return std::pair(Reg, 0); } - -bool AMDGPU::hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget, - const LLT &Ty) { - if (Ty == LLT::scalar(32)) - return Subtarget.hasAtomicFaddRtnInsts(); - if (Ty == LLT::fixed_vector(2, 16) || Ty == LLT::scalar(64)) - return Subtarget.hasGFX90AInsts(); - return false; -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h index 5ee888d9db001..5972552b9a4fe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h @@ -26,8 +26,6 @@ std::pair getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits = nullptr, bool CheckNUW = false); - -bool hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget, const LLT &Ty); } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d8f528d866118..84ef9679ab956 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5310,18 +5310,6 @@ MachineBasicBlock 
*SITargetLowering::EmitInstrWithCustomInserter( } } -bool SITargetLowering::hasAtomicFaddRtnForTy(SDValue &Op) const { - switch (Op.getValue(0).getSimpleValueType().SimpleTy) { - case MVT::f32: - return Subtarget->hasAtomicFaddRtnInsts(); - case MVT::v2f16: - case MVT::f64: - return Subtarget->hasGFX90AInsts(); - default: - return false; - } -} - bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { // This currently forces unfolding various combinations of fsub into fma with // free fneg'd operands. As long as we have fast FMA (controlled by diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index e436c23af5bca..f6e1d198f40ae 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -436,7 +436,6 @@ class SITargetLowering final : public AMDGPUTargetLowering { EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; - bool hasAtomicFaddRtnForTy(SDValue &Op) const; bool enableAggressiveFMAFusion(EVT VT) const override; bool enableAggressiveFMAFusion(LLT Ty) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, From 31ab2c4f616d686c06e9b573c8f1a4ae7ad2d8c3 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Fri, 23 Feb 2024 19:15:32 -0800 Subject: [PATCH 224/546] [flang] Ensure USE-associated objects can be in NAMELIST (#82846) The name resolution for NAMELIST objects didn't allow for symbols that are not ObjectEntityDetails symbols. Fixes https://github.com/llvm/llvm-project/issues/82574. 
--- flang/lib/Semantics/resolve-names.cpp | 2 +- flang/test/Semantics/namelist01.f90 | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 36deab969456d..0cbe0b492fa44 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -6061,7 +6061,7 @@ void DeclarationVisitor::FinishNamelists() { if (!symbol) { symbol = &MakeSymbol(name, ObjectEntityDetails{}); ApplyImplicitRules(*symbol); - } else if (!ConvertToObjectEntity(*symbol)) { + } else if (!ConvertToObjectEntity(symbol->GetUltimate())) { SayWithDecl(name, *symbol, "'%s' is not a variable"_err_en_US); context().SetError(*groupSymbol); } diff --git a/flang/test/Semantics/namelist01.f90 b/flang/test/Semantics/namelist01.f90 index b14bf04ac91f9..b93d4b25550e3 100644 --- a/flang/test/Semantics/namelist01.f90 +++ b/flang/test/Semantics/namelist01.f90 @@ -11,6 +11,7 @@ subroutine C8103a(x) integer :: x !ERROR: 'dupname' is already declared in this scoping unit namelist /dupName/ x, x + namelist /nl/ uniquename ! ok end subroutine C8103a subroutine C8103b(y) From b0d2a52c87b36afab4734e1810fb9266aec1128f Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Fri, 23 Feb 2024 20:03:13 -0800 Subject: [PATCH 225/546] [clang-format][NFC] Enable RemoveSemicolon for clang-format style (#82735) Also insert separators for decimal integers longer than 4 digits. 
--- clang/lib/Format/Format.cpp | 5 ++++- clang/lib/Format/FormatToken.cpp | 2 +- clang/lib/Format/UnwrappedLineFormatter.cpp | 4 ++-- clang/unittests/Format/FormatTest.cpp | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 10ab406a15c6e..2f6b52510099a 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -1591,7 +1591,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.PenaltyBreakScopeResolution = 500; LLVMStyle.PenaltyBreakString = 1000; LLVMStyle.PenaltyBreakTemplateDeclaration = prec::Relational; - LLVMStyle.PenaltyExcessCharacter = 1000000; + LLVMStyle.PenaltyExcessCharacter = 1'000'000; LLVMStyle.PenaltyIndentedWhitespace = 0; LLVMStyle.PenaltyReturnTypeOnItsOwnLine = 60; @@ -1914,9 +1914,12 @@ FormatStyle getClangFormatStyle() { FormatStyle Style = getLLVMStyle(); Style.InsertBraces = true; Style.InsertNewlineAtEOF = true; + Style.IntegerLiteralSeparator.Decimal = 3; + Style.IntegerLiteralSeparator.DecimalMinDigits = 5; Style.LineEnding = FormatStyle::LE_LF; Style.RemoveBracesLLVM = true; Style.RemoveParentheses = FormatStyle::RPS_ReturnStatement; + Style.RemoveSemicolon = true; return Style; } diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp index b791c5a26bbe3..56a7b2d638776 100644 --- a/clang/lib/Format/FormatToken.cpp +++ b/clang/lib/Format/FormatToken.cpp @@ -137,7 +137,7 @@ unsigned CommaSeparatedList::formatAfterToken(LineState &State, // bin-packed. Add a severe penalty to this so that column layouts are // preferred if possible. if (!Format) - return 10000; + return 10'000; // Format the entire list. 
unsigned Penalty = 0; diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index adeb072434873..fb31980ab9f49 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -1221,7 +1221,7 @@ class OptimizingLineFormatter : public LineFormatter { // While not empty, take first element and follow edges. while (!Queue.empty()) { // Quit if we still haven't found a solution by now. - if (Count > 25000000) + if (Count > 25'000'000) return 0; Penalty = Queue.top().first.first; @@ -1235,7 +1235,7 @@ class OptimizingLineFormatter : public LineFormatter { // Cut off the analysis of certain solutions if the analysis gets too // complex. See description of IgnoreStackForComparison. - if (Count > 50000) + if (Count > 50'000) Node->State.IgnoreStackForComparison = true; if (!Seen.insert(&Node->State).second) { diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index b8dc01f55b4fa..d9752c73e34e7 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -21669,7 +21669,7 @@ TEST_F(FormatTest, BreakPenaltyAfterCastLParen) { " int)aaaaaaaaaaaaaaaaaaaaaaaa);", Style); - Style.PenaltyBreakOpenParenthesis = 100000; + Style.PenaltyBreakOpenParenthesis = 100'000; verifyFormat("foo((int)\n" " aaaaaaaaaaaaaaaaaaaaaaaa);", "foo((\n" From 330af6ed6194ca5365bc576517c247f545aee1f4 Mon Sep 17 00:00:00 2001 From: MalaySanghiIntel <148750629+MalaySanghiIntel@users.noreply.github.com> Date: Sat, 24 Feb 2024 10:58:50 +0530 Subject: [PATCH 226/546] Convert argument to reference. 
(#82741) Avoid copy of large object --- llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index 432c63fb65f49..c67cc57ba675d 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -175,7 +175,7 @@ static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, // make sure that the operands of the vector function obtained via VFABI match // the operands of the original vector instruction. if (CI) { - for (auto VFParam : OptInfo->Shape.Parameters) { + for (auto &VFParam : OptInfo->Shape.Parameters) { if (VFParam.ParamKind == VFParamKind::GlobalPredicate) continue; From 96abee5eef31274415681018553e1d4a16dc16c9 Mon Sep 17 00:00:00 2001 From: yingopq <115543042+yingopq@users.noreply.github.com> Date: Sat, 24 Feb 2024 15:13:43 +0800 Subject: [PATCH 227/546] =?UTF-8?q?[Mips]=20Fix=20unable=20to=20handle=20i?= =?UTF-8?q?nline=20assembly=20ends=20with=20compat-branch=20o=E2=80=A6=20(?= =?UTF-8?q?#77291)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …n MIPS Modify: Add a global variable 'CurForbiddenSlotAttr' to save current instruction's forbidden slot and whether set reorder. This is the judgment condition for whether to add nop. We would add a couple of '.set noreorder' and '.set reorder' to wrap the current instruction and the next instruction. Then we can get previous instruction`s forbidden slot attribute and whether set reorder by 'CurForbiddenSlotAttr'. If previous instruction has forbidden slot and .set reorder is active and current instruction is CTI. Then emit a NOP after it. Fix https://github.com/llvm/llvm-project/issues/61045. Because https://reviews.llvm.org/D158589 was 'Needs Review' state, not ending, so we commit pull request again. 
--- lld/test/ELF/mips-pc-relocs.s | 18 +++-- .../Target/Mips/AsmParser/MipsAsmParser.cpp | 78 ++++++++++++++++++- .../CodeGen/Mips/llvm-ir/forbidden-slot-ir.ll | 71 +++++++++++++++++ llvm/test/MC/Mips/forbidden-slot.s | 18 +++++ llvm/test/MC/Mips/mips32r6/relocations.s | 22 +++--- llvm/test/MC/Mips/mips64r6/relocations.s | 26 +++---- llvm/test/MC/Mips/relocation.s | 4 +- 7 files changed, 202 insertions(+), 35 deletions(-) create mode 100644 llvm/test/CodeGen/Mips/llvm-ir/forbidden-slot-ir.ll create mode 100644 llvm/test/MC/Mips/forbidden-slot.s diff --git a/lld/test/ELF/mips-pc-relocs.s b/lld/test/ELF/mips-pc-relocs.s index 5e7dbed94ca7c..7d23f9d7469a4 100644 --- a/lld/test/ELF/mips-pc-relocs.s +++ b/lld/test/ELF/mips-pc-relocs.s @@ -40,11 +40,13 @@ __start: # ^-- (0x20020-0x20000)>>2 # CHECK-NEXT: 20004: beqc $5, $6, 0x20020 # ^-- (0x20020-4-0x20004)>>2 -# CHECK-NEXT: 20008: beqzc $9, 0x20020 -# ^-- (0x20020-4-0x20008)>>2 -# CHECK-NEXT: 2000c: bc 0x20020 -# ^-- (0x20020-4-0x2000c)>>2 -# CHECK-NEXT: 20010: aluipc $2, 0 -# ^-- %hi(0x20020-0x20010) -# CHECK-NEXT: 20014: addiu $2, $2, 12 -# ^-- %lo(0x20020-0x20014) +# CHECK-NEXT: 20008: nop +# CHECK-NEXT: 2000c: beqzc $9, 0x20020 +# ^-- (0x20020-4-0x2000c)>>2 +# CHECK-NEXT: 20010: nop +# CHECK-NEXT: 20014: bc 0x20020 +# ^-- (0x20020-4-0x200014)>>2 +# CHECK-NEXT: 20018: aluipc $2, 0 +# ^-- %hi(0x20020-0x20018) +# CHECK-NEXT: 2001c: addiu $2, $2, 4 +# ^-- %lo(0x20020-0x2001c) diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 36aab383da68d..9d6e8dc573a8d 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -150,6 +150,7 @@ class MipsAsmParser : public MCTargetAsmParser { bool IsLittleEndian; bool IsPicEnabled; bool IsCpRestoreSet; + bool CurForbiddenSlotAttr; int CpRestoreOffset; unsigned GPReg; unsigned CpSaveLocation; @@ -552,6 +553,7 @@ class MipsAsmParser : public MCTargetAsmParser { 
CurrentFn = nullptr; + CurForbiddenSlotAttr = false; IsPicEnabled = getContext().getObjectFileInfo()->isPositionIndependent(); IsCpRestoreSet = false; @@ -723,6 +725,16 @@ class MipsAsmParser : public MCTargetAsmParser { return getSTI().hasFeature(Mips::FeatureGINV); } + bool hasForbiddenSlot(const MCInstrDesc &MCID) const { + return !inMicroMipsMode() && (MCID.TSFlags & MipsII::HasForbiddenSlot); + } + + bool SafeInForbiddenSlot(const MCInstrDesc &MCID) const { + return !(MCID.TSFlags & MipsII::IsCTI); + } + + void onEndOfFile() override; + /// Warn if RegIndex is the same as the current AT. void warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc); @@ -2307,7 +2319,41 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, bool FillDelaySlot = MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder(); - if (FillDelaySlot) + + // Get previous instruction`s forbidden slot attribute and + // whether set reorder. + bool PrevForbiddenSlotAttr = CurForbiddenSlotAttr; + + // Flag represents we set reorder after nop. + bool SetReorderAfterNop = false; + + // If previous instruction has forbidden slot and .set reorder + // is active and current instruction is CTI. + // Then emit a NOP after it. + if (PrevForbiddenSlotAttr && !SafeInForbiddenSlot(MCID)) { + TOut.emitEmptyDelaySlot(false, IDLoc, STI); + // When 'FillDelaySlot' is true, the existing logic will add + // noreorder before instruction and reorder after it. So there + // need exclude this case avoiding two '.set reorder'. + // The format of the first case is: + // .set noreorder + // bnezc + // nop + // .set reorder + if (AssemblerOptions.back()->isReorder() && !FillDelaySlot) { + SetReorderAfterNop = true; + TOut.emitDirectiveSetReorder(); + } + } + + // Save current instruction`s forbidden slot and whether set reorder. + // This is the judgment condition for whether to add nop. 
+ // We would add a couple of '.set noreorder' and '.set reorder' to + // wrap the current instruction and the next instruction. + CurForbiddenSlotAttr = + hasForbiddenSlot(MCID) && AssemblerOptions.back()->isReorder(); + + if (FillDelaySlot || CurForbiddenSlotAttr) TOut.emitDirectiveSetNoReorder(); MacroExpanderResultTy ExpandResult = @@ -2322,6 +2368,17 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, return true; } + // When current instruction was not CTI, recover reorder state. + // The format of the second case is: + // .set noreoder + // bnezc + // add + // .set reorder + if (PrevForbiddenSlotAttr && !SetReorderAfterNop && !FillDelaySlot && + AssemblerOptions.back()->isReorder()) { + TOut.emitDirectiveSetReorder(); + } + // We know we emitted an instruction on the MER_NotAMacro or MER_Success path. // If we're in microMIPS mode then we must also set EF_MIPS_MICROMIPS. if (inMicroMipsMode()) { @@ -2331,6 +2388,14 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, // If this instruction has a delay slot and .set reorder is active, // emit a NOP after it. + // The format of the third case is: + // .set noreorder + // bnezc + // nop + // .set noreorder + // j + // nop + // .set reorder if (FillDelaySlot) { TOut.emitEmptyDelaySlot(hasShortDelaySlot(Inst), IDLoc, STI); TOut.emitDirectiveSetReorder(); @@ -2356,6 +2421,17 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, return false; } +void MipsAsmParser::onEndOfFile() { + MipsTargetStreamer &TOut = getTargetStreamer(); + SMLoc IDLoc = SMLoc(); + // If has pending forbidden slot, fill nop and recover reorder. 
+ if (CurForbiddenSlotAttr) { + TOut.emitEmptyDelaySlot(false, IDLoc, STI); + if (AssemblerOptions.back()->isReorder()) + TOut.emitDirectiveSetReorder(); + } +} + MipsAsmParser::MacroExpanderResultTy MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { diff --git a/llvm/test/CodeGen/Mips/llvm-ir/forbidden-slot-ir.ll b/llvm/test/CodeGen/Mips/llvm-ir/forbidden-slot-ir.ll new file mode 100644 index 0000000000000..3e6826f0cd1d1 --- /dev/null +++ b/llvm/test/CodeGen/Mips/llvm-ir/forbidden-slot-ir.ll @@ -0,0 +1,71 @@ +target triple = "mipsisa32r6el-unknown-linux-gnu" + +; RUN: llc -filetype=asm %s -o - | FileCheck %s --check-prefix=MIPSELR6 +; Function Attrs: noinline nounwind optnone uwtable +define i1 @foo0() nounwind { +; MIPSELR6: bnezc $1, $BB0_2 +; MIPSELR6-NEXT: nop +; MIPSELR6: jr $ra +entry: + %0 = icmp eq i32 0, 1 + br i1 %0, label %2, label %3 + ret i1 %0 +2: + ret i1 %0 +3: + ret i1 %0 +} + +define i32 @foo1() nounwind { +; MIPSELR6: addiu $2, $2, 1 +; MIPSELR6-NEXT: .set noreorder +; MIPSELR6-NEXT: beqzc $2, $tmp0 +; MIPSELR6-NEXT: nop +; MIPSELR6-NEXT: .set reorder +; MIPSELR6: jrc $ra +entry: + %0 = tail call i32 asm "1: addiu $0, $0, 1; beqzc $0, 1b", "=r"() nounwind + ret i32 %0 +} + +define i32 @foo2() nounwind { +; MIPSELR6: .set push +; MIPSELR6-NEXT: .set at +; MIPSELR6-NEXT: .set macro +; MIPSELR6-NEXT: .set reorder +; MIPSELR6: .set noreorder +; MIPSELR6-NEXT: beqzc $9, End +; MIPSELR6-NEXT: nop +; MIPSELR6-NEXT: .set reorder +; MIPSELR6: addiu $9, $9, 1 +entry: + %0 = tail call i32 asm "beqzc $$t1, End", "=r"() nounwind + %1 = tail call i32 asm "addiu $$t1, $$t1, 1", "=r"() nounwind + %2 = add nsw i32 %1, %0 + ret i32 %2 +} + +define i32 @foo3() nounwind { +; MIPSELR6: addiu $2, $2, 1 +; MIPSELR6-NEXT: .set noreorder +; MIPSELR6-NEXT: beqzc $2, $tmp1 +; MIPSELR6-NEXT: nop +; MIPSELR6-NEXT: .set noreorder +; MIPSELR6-NEXT: j End +; MIPSELR6-NEXT: nop +; MIPSELR6-NEXT: .set reorder +entry: + 
%0 = tail call i32 asm "1: addiu $0, $0, 1; beqzc $0, 1b; j End", "=r"() nounwind + ret i32 %0 +} + +define i32 @foo4() nounwind { +; MIPSELR6: addiu $2, $2, 1 +; MIPSELR6-NEXT: .set noreorder +; MIPSELR6-NEXT: beqzc $2, $tmp2 +; MIPSELR6-NEXT: addiu $2, $2, 1 +; MIPSELR6-NEXT: .set reorder +entry: + %0 = tail call i32 asm "1: addiu $0, $0, 1; beqzc $0, 1b; addiu $0, $0, 1", "=r"() nounwind + ret i32 %0 +} diff --git a/llvm/test/MC/Mips/forbidden-slot.s b/llvm/test/MC/Mips/forbidden-slot.s new file mode 100644 index 0000000000000..da98e70561695 --- /dev/null +++ b/llvm/test/MC/Mips/forbidden-slot.s @@ -0,0 +1,18 @@ +# RUN: llvm-mc -assemble -mcpu=mips64r6 -arch=mips64el -filetype=obj %s -o tmp.o +# RUN: llvm-objdump -d tmp.o | FileCheck %s --check-prefix=MIPSELR6 + +# MIPSELR6: 0000000000000000 : +# MIPSELR6-NEXT: beqzc $13, 0x0 +# MIPSELR6-NEXT: b 0x0 +# MIPSELR6: 0000000000000008 : +# MIPSELR6-NEXT: beqzc $13, 0x8 +# MIPSELR6-NEXT: nop +# MIPSELR6: b 0x8 + .set noreorder +aaa: + beqzc $t1, aaa + b aaa + .set reorder +bbb: + beqzc $t1, bbb + b bbb diff --git a/llvm/test/MC/Mips/mips32r6/relocations.s b/llvm/test/MC/Mips/mips32r6/relocations.s index dfd75e633bc2f..8d4464bbbed77 100644 --- a/llvm/test/MC/Mips/mips32r6/relocations.s +++ b/llvm/test/MC/Mips/mips32r6/relocations.s @@ -52,17 +52,17 @@ # CHECK-ELF: Relocations [ # CHECK-ELF: 0x0 R_MIPS_PC19_S2 bar # CHECK-ELF: 0x4 R_MIPS_PC16 bar -# CHECK-ELF: 0x8 R_MIPS_PC16 bar -# CHECK-ELF: 0xC R_MIPS_PC21_S2 bar -# CHECK-ELF: 0x10 R_MIPS_PC21_S2 bar -# CHECK-ELF: 0x14 R_MIPS_PC26_S2 bar -# CHECK-ELF: 0x18 R_MIPS_PC26_S2 bar -# CHECK-ELF: 0x1C R_MIPS_PCHI16 bar -# CHECK-ELF: 0x20 R_MIPS_PCLO16 bar -# CHECK-ELF: 0x24 R_MIPS_PC19_S2 bar -# CHECK-ELF: 0x28 R_MIPS_PC19_S2 bar -# CHECK-ELF: 0x2C R_MIPS_LO16 bar -# CHECK-ELF: 0x30 R_MIPS_LO16 bar +# CHECK-ELF: 0xC R_MIPS_PC16 bar +# CHECK-ELF: 0x14 R_MIPS_PC21_S2 bar +# CHECK-ELF: 0x1C R_MIPS_PC21_S2 bar +# CHECK-ELF: 0x24 R_MIPS_PC26_S2 bar +# CHECK-ELF: 0x28 
R_MIPS_PC26_S2 bar +# CHECK-ELF: 0x2C R_MIPS_PCHI16 bar +# CHECK-ELF: 0x30 R_MIPS_PCLO16 bar +# CHECK-ELF: 0x34 R_MIPS_PC19_S2 bar +# CHECK-ELF: 0x38 R_MIPS_PC19_S2 bar +# CHECK-ELF: 0x3C R_MIPS_LO16 bar +# CHECK-ELF: 0x40 R_MIPS_LO16 bar # CHECK-ELF: ] addiupc $2,bar diff --git a/llvm/test/MC/Mips/mips64r6/relocations.s b/llvm/test/MC/Mips/mips64r6/relocations.s index 8353ec019a3ca..8b02be37284aa 100644 --- a/llvm/test/MC/Mips/mips64r6/relocations.s +++ b/llvm/test/MC/Mips/mips64r6/relocations.s @@ -59,19 +59,19 @@ # CHECK-ELF: Relocations [ # CHECK-ELF: 0x0 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 # CHECK-ELF: 0x4 R_MIPS_PC16/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0x8 R_MIPS_PC16/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0xC R_MIPS_PC21_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0x10 R_MIPS_PC21_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0x14 R_MIPS_PC26_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0x18 R_MIPS_PC26_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC -# CHECK-ELF: 0x1C R_MIPS_PCHI16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x20 R_MIPS_PCLO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x24 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x28 R_MIPS_PC18_S3/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x2C R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x30 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x34 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 -# CHECK-ELF: 0x38 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0xC R_MIPS_PC16/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC +# CHECK-ELF: 0x14 R_MIPS_PC21_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC +# CHECK-ELF: 0x1C R_MIPS_PC21_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC +# CHECK-ELF: 0x24 R_MIPS_PC26_S2/R_MIPS_NONE/R_MIPS_NONE bar 0xFFFFFFFFFFFFFFFC +# CHECK-ELF: 0x28 R_MIPS_PC26_S2/R_MIPS_NONE/R_MIPS_NONE bar 
0xFFFFFFFFFFFFFFFC +# CHECK-ELF: 0x2C R_MIPS_PCHI16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x30 R_MIPS_PCLO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x34 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x38 R_MIPS_PC18_S3/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x3C R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x40 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x44 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x48 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 # CHECK-ELF: ] addiupc $2,bar diff --git a/llvm/test/MC/Mips/relocation.s b/llvm/test/MC/Mips/relocation.s index 9c8bb657ea68c..a92c62744fcaa 100644 --- a/llvm/test/MC/Mips/relocation.s +++ b/llvm/test/MC/Mips/relocation.s @@ -237,7 +237,7 @@ baz: .long foo // RELOC: R_MIPS_32 foo // ENCLE: addiu $2, $3, %tprel_lo(foo) # encoding: [A,A,0x62,0x24] // FIXUP: # fixup A - offset: 0, value: %tprel_lo(foo), kind: fixup_Mips_TPREL_LO -// DATA-NEXT: 00C0: D85FFFFF CBFFFFFF EC580000 EC480000 +// DATA-NEXT: 00C0: D85FFFFF 00000000 CBFFFFFF EC580000 // ?????: R_MIPS_GLOB_DAT foo .set mips32r6 beqzc $2, foo // RELOC: R_MIPS_PC21_S2 foo @@ -262,7 +262,7 @@ baz: .long foo // RELOC: R_MIPS_32 foo // ENCLE: lwpc $2, foo # encoding: [A,A,0b01001AAA,0xec] // FIXUP: # fixup A - offset: 0, value: foo, kind: fixup_MIPS_PC19_S2 -// DATA-NEXT: 00D0: 24620000 24620000 00000000 +// DATA-NEXT: 00D0: EC480000 24620000 24620000 00000000 addiu $2, $3, %pcrel_hi(foo) // RELOC: R_MIPS_PCHI16 foo // ENCBE: addiu $2, $3, %pcrel_hi(foo) # encoding: [0x24,0x62,A,A] // ENCLE: addiu $2, $3, %pcrel_hi(foo) # encoding: [A,A,0x62,0x24] From 91d5653e3ae9742f7fb847f809b534ee128501b0 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Sat, 24 Feb 2024 09:10:07 +0100 Subject: [PATCH 228/546] [mlir] Use `OpBuilder::createBlock` in op builders and patterns (#82770) When creating a new block in (conversion) rewrite patterns, `OpBuilder::createBlock` must be used. 
Otherwise, no `notifyBlockInserted` notification is sent to the listener. Note: The dialect conversion relies on listener notifications to keep track of IR modifications. Creating blocks without the builder API can lead to memory leaks during rollback. --- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 2 +- .../Dialect/SPIRV/IR/SPIRVControlFlowOps.td | 4 ++-- .../mlir/Interfaces/FunctionInterfaces.td | 4 ++-- .../Conversion/AsyncToLLVM/AsyncToLLVM.cpp | 2 +- .../ControlFlowToSCF/ControlFlowToSCF.cpp | 4 +--- mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp | 4 ++-- .../Conversion/MemRefToLLVM/MemRefToLLVM.cpp | 4 +--- mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp | 11 ++++----- mlir/lib/Dialect/Affine/IR/AffineOps.cpp | 24 +++++++++++-------- mlir/lib/Dialect/Async/IR/Async.cpp | 17 +++++-------- mlir/lib/Dialect/EmitC/IR/EmitC.cpp | 10 ++++---- mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 7 +++--- .../Linalg/Transforms/DropUnitDims.cpp | 6 ++--- .../Linalg/Transforms/ElementwiseOpFusion.cpp | 6 ++--- mlir/lib/Dialect/Linalg/Utils/Utils.cpp | 9 ++++--- mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 3 ++- mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp | 18 +++++++------- mlir/lib/Dialect/Shape/IR/Shape.cpp | 16 ++++++------- .../Transforms/SparseTensorRewriting.cpp | 4 +--- .../SPIRV/Deserialization/Deserializer.cpp | 4 ++-- 20 files changed, 71 insertions(+), 88 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index 3da5deeb4ec7e..b523374f6c06b 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -1456,7 +1456,7 @@ def LLVM_LLVMFuncOp : LLVM_Op<"func", [ let extraClassDeclaration = [{ // Add an entry block to an empty function, and set up the block arguments // to match the signature of the function. 
- Block *addEntryBlock(); + Block *addEntryBlock(OpBuilder &builder); bool isVarArg() { return getFunctionType().isVarArg(); } diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVControlFlowOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVControlFlowOps.td index 36ad6755cab25..991e753d1b359 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVControlFlowOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVControlFlowOps.td @@ -285,7 +285,7 @@ def SPIRV_LoopOp : SPIRV_Op<"mlir.loop", [InFunctionScope]> { // Adds an empty entry block and loop merge block containing one // spirv.mlir.merge op. - void addEntryAndMergeBlock(); + void addEntryAndMergeBlock(OpBuilder &builder); }]; let hasOpcode = 0; @@ -427,7 +427,7 @@ def SPIRV_SelectionOp : SPIRV_Op<"mlir.selection", [InFunctionScope]> { Block *getMergeBlock(); /// Adds a selection merge block containing one spirv.mlir.merge op. - void addMergeBlock(); + void addMergeBlock(OpBuilder &builder); /// Creates a spirv.mlir.selection op for `if () then { }` /// with `builder`. `builder`'s insertion point will remain at after the diff --git a/mlir/include/mlir/Interfaces/FunctionInterfaces.td b/mlir/include/mlir/Interfaces/FunctionInterfaces.td index 970a781c998b9..873853eba0b17 100644 --- a/mlir/include/mlir/Interfaces/FunctionInterfaces.td +++ b/mlir/include/mlir/Interfaces/FunctionInterfaces.td @@ -131,6 +131,7 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ static void buildWithEntryBlock( OpBuilder &builder, OperationState &state, StringRef name, Type type, ArrayRef attrs, TypeRange inputTypes) { + OpBuilder::InsertionGuard g(builder); state.addAttribute(SymbolTable::getSymbolAttrName(), builder.getStringAttr(name)); state.addAttribute(ConcreteOp::getFunctionTypeAttrName(state.name), @@ -139,8 +140,7 @@ def FunctionOpInterface : OpInterface<"FunctionOpInterface", [ // Add the function body. 
Region *bodyRegion = state.addRegion(); - Block *body = new Block(); - bodyRegion->push_back(body); + Block *body = builder.createBlock(bodyRegion); for (Type input : inputTypes) body->addArgument(input, state.location); } diff --git a/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp b/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp index 0ab53ce7e3327..7760373913761 100644 --- a/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp +++ b/mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp @@ -259,7 +259,7 @@ static void addResumeFunction(ModuleOp module) { kResume, LLVM::LLVMFunctionType::get(voidTy, {ptrType})); resumeOp.setPrivate(); - auto *block = resumeOp.addEntryBlock(); + auto *block = resumeOp.addEntryBlock(moduleBuilder); auto blockBuilder = ImplicitLocOpBuilder::atBlockEnd(loc, block); blockBuilder.create(resumeOp.getArgument(0)); diff --git a/mlir/lib/Conversion/ControlFlowToSCF/ControlFlowToSCF.cpp b/mlir/lib/Conversion/ControlFlowToSCF/ControlFlowToSCF.cpp index 363e5f9b8cefe..d3ee89743da9d 100644 --- a/mlir/lib/Conversion/ControlFlowToSCF/ControlFlowToSCF.cpp +++ b/mlir/lib/Conversion/ControlFlowToSCF/ControlFlowToSCF.cpp @@ -98,12 +98,10 @@ ControlFlowToSCFTransformation::createStructuredDoWhileLoopOp( loc, builder.create(loc, builder.getI1Type(), condition), loopVariablesNextIter); - auto *afterBlock = new Block; - whileOp.getAfter().push_back(afterBlock); + Block *afterBlock = builder.createBlock(&whileOp.getAfter()); afterBlock->addArguments( loopVariablesInit.getTypes(), SmallVector(loopVariablesInit.size(), loc)); - builder.setInsertionPointToEnd(afterBlock); builder.create(loc, afterBlock->getArguments()); return whileOp.getOperation(); diff --git a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp index bd50c67fb8795..53b44aa3241bb 100644 --- a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp +++ b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp @@ -135,7 +135,7 @@ static void wrapForExternalCallers(OpBuilder 
&rewriter, Location loc, propagateArgResAttrs(rewriter, !!resultStructType, funcOp, wrapperFuncOp); OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(wrapperFuncOp.addEntryBlock()); + rewriter.setInsertionPointToStart(wrapperFuncOp.addEntryBlock(rewriter)); SmallVector args; size_t argOffset = resultStructType ? 1 : 0; @@ -203,7 +203,7 @@ static void wrapExternalFunction(OpBuilder &builder, Location loc, // The wrapper that we synthetize here should only be visible in this module. newFuncOp.setLinkage(LLVM::Linkage::Private); - builder.setInsertionPointToStart(newFuncOp.addEntryBlock()); + builder.setInsertionPointToStart(newFuncOp.addEntryBlock(builder)); // Get a ValueRange containing arguments. FunctionType type = cast(funcOp.getFunctionType()); diff --git a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp index 2bfca303b5fd4..2dc42f0a85e66 100644 --- a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp +++ b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp @@ -520,9 +520,7 @@ struct GlobalMemrefOpLowering global, arrayTy, global.getConstant(), linkage, global.getSymName(), initialValue, alignment, *addressSpace); if (!global.isExternal() && global.isUninitialized()) { - Block *blk = new Block(); - newGlobal.getInitializerRegion().push_back(blk); - rewriter.setInsertionPointToStart(blk); + rewriter.createBlock(&newGlobal.getInitializerRegion()); Value undef[] = { rewriter.create(global.getLoc(), arrayTy)}; rewriter.create(global.getLoc(), undef); diff --git a/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp b/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp index febfe97f6c0a9..d90cf931385fc 100644 --- a/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp +++ b/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp @@ -138,14 +138,13 @@ struct ForOpConversion final : SCFToSPIRVPattern { // from header to merge. 
auto loc = forOp.getLoc(); auto loopOp = rewriter.create(loc, spirv::LoopControl::None); - loopOp.addEntryAndMergeBlock(); + loopOp.addEntryAndMergeBlock(rewriter); OpBuilder::InsertionGuard guard(rewriter); // Create the block for the header. - auto *header = new Block(); - // Insert the header. - loopOp.getBody().getBlocks().insert(getBlockIt(loopOp.getBody(), 1), - header); + Block *header = rewriter.createBlock(&loopOp.getBody(), + getBlockIt(loopOp.getBody(), 1)); + rewriter.setInsertionPointAfter(loopOp); // Create the new induction variable to use. Value adapLowerBound = adaptor.getLowerBound(); @@ -342,7 +341,7 @@ struct WhileOpConversion final : SCFToSPIRVPattern { ConversionPatternRewriter &rewriter) const override { auto loc = whileOp.getLoc(); auto loopOp = rewriter.create(loc, spirv::LoopControl::None); - loopOp.addEntryAndMergeBlock(); + loopOp.addEntryAndMergeBlock(rewriter); Region &beforeRegion = whileOp.getBefore(); Region &afterRegion = whileOp.getAfter(); diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index c4b13193f4e77..a4df863ab0834 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -1813,6 +1813,8 @@ void AffineForOp::build(OpBuilder &builder, OperationState &result, "upper bound operand count does not match the affine map"); assert(step > 0 && "step has to be a positive integer constant"); + OpBuilder::InsertionGuard guard(builder); + // Set variadic segment sizes. result.addAttribute( getOperandSegmentSizeAttr(), @@ -1841,12 +1843,11 @@ void AffineForOp::build(OpBuilder &builder, OperationState &result, // Create a region and a block for the body. The argument of the region is // the loop induction variable. 
Region *bodyRegion = result.addRegion(); - bodyRegion->push_back(new Block); - Block &bodyBlock = bodyRegion->front(); + Block *bodyBlock = builder.createBlock(bodyRegion); Value inductionVar = - bodyBlock.addArgument(builder.getIndexType(), result.location); + bodyBlock->addArgument(builder.getIndexType(), result.location); for (Value val : iterArgs) - bodyBlock.addArgument(val.getType(), val.getLoc()); + bodyBlock->addArgument(val.getType(), val.getLoc()); // Create the default terminator if the builder is not provided and if the // iteration arguments are not provided. Otherwise, leave this to the caller @@ -1855,9 +1856,9 @@ void AffineForOp::build(OpBuilder &builder, OperationState &result, ensureTerminator(*bodyRegion, builder, result.location); } else if (bodyBuilder) { OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(&bodyBlock); + builder.setInsertionPointToStart(bodyBlock); bodyBuilder(builder, result.location, inductionVar, - bodyBlock.getArguments().drop_front()); + bodyBlock->getArguments().drop_front()); } } @@ -2895,18 +2896,20 @@ void AffineIfOp::build(OpBuilder &builder, OperationState &result, TypeRange resultTypes, IntegerSet set, ValueRange args, bool withElseRegion) { assert(resultTypes.empty() || withElseRegion); + OpBuilder::InsertionGuard guard(builder); + result.addTypes(resultTypes); result.addOperands(args); result.addAttribute(getConditionAttrStrName(), IntegerSetAttr::get(set)); Region *thenRegion = result.addRegion(); - thenRegion->push_back(new Block()); + builder.createBlock(thenRegion); if (resultTypes.empty()) AffineIfOp::ensureTerminator(*thenRegion, builder, result.location); Region *elseRegion = result.addRegion(); if (withElseRegion) { - elseRegion->push_back(new Block()); + builder.createBlock(elseRegion); if (resultTypes.empty()) AffineIfOp::ensureTerminator(*elseRegion, builder, result.location); } @@ -3693,6 +3696,7 @@ void AffineParallelOp::build(OpBuilder &builder, OperationState &result, 
"expected upper bound maps to have as many inputs as upper bound " "operands"); + OpBuilder::InsertionGuard guard(builder); result.addTypes(resultTypes); // Convert the reductions to integer attributes. @@ -3738,11 +3742,11 @@ void AffineParallelOp::build(OpBuilder &builder, OperationState &result, // Create a region and a block for the body. auto *bodyRegion = result.addRegion(); - auto *body = new Block(); + Block *body = builder.createBlock(bodyRegion); + // Add all the block arguments. for (unsigned i = 0, e = steps.size(); i < e; ++i) body->addArgument(IndexType::get(builder.getContext()), result.location); - bodyRegion->push_back(body); if (resultTypes.empty()) ensureTerminator(*bodyRegion, builder, result.location); } diff --git a/mlir/lib/Dialect/Async/IR/Async.cpp b/mlir/lib/Dialect/Async/IR/Async.cpp index 5f583f36cd2cb..a3e3f80954efc 100644 --- a/mlir/lib/Dialect/Async/IR/Async.cpp +++ b/mlir/lib/Dialect/Async/IR/Async.cpp @@ -68,7 +68,7 @@ void ExecuteOp::getSuccessorRegions(RegionBranchPoint point, void ExecuteOp::build(OpBuilder &builder, OperationState &result, TypeRange resultTypes, ValueRange dependencies, ValueRange operands, BodyBuilderFn bodyBuilder) { - + OpBuilder::InsertionGuard guard(builder); result.addOperands(dependencies); result.addOperands(operands); @@ -87,26 +87,21 @@ void ExecuteOp::build(OpBuilder &builder, OperationState &result, // Add a body region with block arguments as unwrapped async value operands. Region *bodyRegion = result.addRegion(); - bodyRegion->push_back(new Block); - Block &bodyBlock = bodyRegion->front(); + Block *bodyBlock = builder.createBlock(bodyRegion); for (Value operand : operands) { auto valueType = llvm::dyn_cast(operand.getType()); - bodyBlock.addArgument(valueType ? valueType.getValueType() - : operand.getType(), - operand.getLoc()); + bodyBlock->addArgument(valueType ? 
valueType.getValueType() + : operand.getType(), + operand.getLoc()); } // Create the default terminator if the builder is not provided and if the // expected result is empty. Otherwise, leave this to the caller // because we don't know which values to return from the execute op. if (resultTypes.empty() && !bodyBuilder) { - OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(&bodyBlock); builder.create(result.location, ValueRange()); } else if (bodyBuilder) { - OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(&bodyBlock); - bodyBuilder(builder, result.location, bodyBlock.getArguments()); + bodyBuilder(builder, result.location, bodyBlock->getArguments()); } } diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp index 0fe2c0dcfc7c5..4df8149b94c95 100644 --- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp +++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp @@ -261,20 +261,20 @@ LogicalResult ExpressionOp::verify() { void ForOp::build(OpBuilder &builder, OperationState &result, Value lb, Value ub, Value step, BodyBuilderFn bodyBuilder) { + OpBuilder::InsertionGuard g(builder); result.addOperands({lb, ub, step}); Type t = lb.getType(); Region *bodyRegion = result.addRegion(); - bodyRegion->push_back(new Block); - Block &bodyBlock = bodyRegion->front(); - bodyBlock.addArgument(t, result.location); + Block *bodyBlock = builder.createBlock(bodyRegion); + bodyBlock->addArgument(t, result.location); // Create the default terminator if the builder is not provided. 
if (!bodyBuilder) { ForOp::ensureTerminator(*bodyRegion, builder, result.location); } else { OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(&bodyBlock); - bodyBuilder(builder, result.location, bodyBlock.getArgument(0)); + builder.setInsertionPointToStart(bodyBlock); + bodyBuilder(builder, result.location, bodyBlock->getArgument(0)); } } diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index f4042a60541a6..3ba6ac6ccc814 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -2165,11 +2165,10 @@ LogicalResult ShuffleVectorOp::verify() { //===----------------------------------------------------------------------===// // Add the entry block to the function. -Block *LLVMFuncOp::addEntryBlock() { +Block *LLVMFuncOp::addEntryBlock(OpBuilder &builder) { assert(empty() && "function already has an entry block"); - - auto *entry = new Block; - push_back(entry); + OpBuilder::InsertionGuard g(builder); + Block *entry = builder.createBlock(&getBody()); // FIXME: Allow passing in proper locations for the entry arguments. 
LLVMFunctionType type = getFunctionType(); diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp index 559ffda4494d2..c46e3694b70ec 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @@ -132,12 +132,10 @@ struct MoveInitOperandsToInput : public OpRewritePattern { newIndexingMaps, genericOp.getIteratorTypesArray(), /*bodyBuild=*/nullptr, linalg::getPrunedAttributeList(genericOp)); + OpBuilder::InsertionGuard guard(rewriter); Region ®ion = newOp.getRegion(); - Block *block = new Block(); - region.push_back(block); + Block *block = rewriter.createBlock(®ion); IRMapping mapper; - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(block); for (auto bbarg : genericOp.getRegionInputArgs()) mapper.map(bbarg, block->addArgument(bbarg.getType(), loc)); diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp index 286b07669a47f..0d8d670904f2a 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @@ -178,11 +178,9 @@ static void generateFusedElementwiseOpRegion( // Build the region of the fused op. Block &producerBlock = producer->getRegion(0).front(); Block &consumerBlock = consumer->getRegion(0).front(); - Block *fusedBlock = new Block(); - fusedOp.getRegion().push_back(fusedBlock); - IRMapping mapper; OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(fusedBlock); + Block *fusedBlock = rewriter.createBlock(&fusedOp.getRegion()); + IRMapping mapper; // 2. Add an index operation for every fused loop dimension and use the // `consumerToProducerLoopsMap` to map the producer indices. 
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index 5d220c6cdd7e5..43c408a97687c 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -275,14 +275,13 @@ GenericOp makeTransposeOp(OpBuilder &b, Location loc, Value inputTensor, auto transposeOp = b.create(loc, resultTensorType, inputTensor, outputTensor, indexingMaps, iteratorTypes); - Region &body = transposeOp.getRegion(); - body.push_back(new Block()); - body.front().addArguments({elementType, elementType}, {loc, loc}); // Create the body of the transpose operation. OpBuilder::InsertionGuard g(b); - b.setInsertionPointToEnd(&body.front()); - b.create(loc, transposeOp.getRegion().front().getArgument(0)); + Region &body = transposeOp.getRegion(); + Block *bodyBlock = b.createBlock(&body, /*insertPt=*/{}, + {elementType, elementType}, {loc, loc}); + b.create(loc, bodyBlock->getArgument(0)); return transposeOp; } diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index beb7e721ca53b..248193481acfc 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -1420,6 +1420,7 @@ OpFoldResult ExtractStridedMetadataOp::getConstifiedMixedOffset() { void GenericAtomicRMWOp::build(OpBuilder &builder, OperationState &result, Value memref, ValueRange ivs) { + OpBuilder::InsertionGuard g(builder); result.addOperands(memref); result.addOperands(ivs); @@ -1428,7 +1429,7 @@ void GenericAtomicRMWOp::build(OpBuilder &builder, OperationState &result, result.addTypes(elementType); Region *bodyRegion = result.addRegion(); - bodyRegion->push_back(new Block()); + builder.createBlock(bodyRegion); bodyRegion->addArgument(elementType, memref.getLoc()); } } diff --git a/mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp b/mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp index 580782043c81b..7170a899069ee 100644 --- a/mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp +++ 
b/mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp @@ -365,12 +365,11 @@ Block *LoopOp::getMergeBlock() { return &getBody().back(); } -void LoopOp::addEntryAndMergeBlock() { +void LoopOp::addEntryAndMergeBlock(OpBuilder &builder) { assert(getBody().empty() && "entry and merge block already exist"); - getBody().push_back(new Block()); - auto *mergeBlock = new Block(); - getBody().push_back(mergeBlock); - OpBuilder builder = OpBuilder::atBlockEnd(mergeBlock); + OpBuilder::InsertionGuard g(builder); + builder.createBlock(&getBody()); + builder.createBlock(&getBody()); // Add a spirv.mlir.merge op into the merge block. builder.create(getLoc()); @@ -525,11 +524,10 @@ Block *SelectionOp::getMergeBlock() { return &getBody().back(); } -void SelectionOp::addMergeBlock() { +void SelectionOp::addMergeBlock(OpBuilder &builder) { assert(getBody().empty() && "entry and merge block already exist"); - auto *mergeBlock = new Block(); - getBody().push_back(mergeBlock); - OpBuilder builder = OpBuilder::atBlockEnd(mergeBlock); + OpBuilder::InsertionGuard guard(builder); + builder.createBlock(&getBody()); // Add a spirv.mlir.merge op into the merge block. 
builder.create(getLoc()); @@ -542,7 +540,7 @@ SelectionOp::createIfThen(Location loc, Value condition, auto selectionOp = builder.create(loc, spirv::SelectionControl::None); - selectionOp.addMergeBlock(); + selectionOp.addMergeBlock(builder); Block *mergeBlock = selectionOp.getMergeBlock(); Block *thenBlock = nullptr; diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp index 4f829db1305c8..d9ee39a4e8dd3 100644 --- a/mlir/lib/Dialect/Shape/IR/Shape.cpp +++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp @@ -375,15 +375,13 @@ void AssumingOp::inlineRegionIntoParent(AssumingOp &op, void AssumingOp::build( OpBuilder &builder, OperationState &result, Value witness, function_ref(OpBuilder &, Location)> bodyBuilder) { + OpBuilder::InsertionGuard g(builder); result.addOperands(witness); Region *bodyRegion = result.addRegion(); - bodyRegion->push_back(new Block); - Block &bodyBlock = bodyRegion->front(); + builder.createBlock(bodyRegion); // Build body. - OpBuilder::InsertionGuard guard(builder); - builder.setInsertionPointToStart(&bodyBlock); SmallVector yieldValues = bodyBuilder(builder, result.location); builder.create(result.location, yieldValues); @@ -1904,23 +1902,23 @@ bool ToExtentTensorOp::areCastCompatible(TypeRange inputs, TypeRange outputs) { void ReduceOp::build(OpBuilder &builder, OperationState &result, Value shape, ValueRange initVals) { + OpBuilder::InsertionGuard g(builder); result.addOperands(shape); result.addOperands(initVals); Region *bodyRegion = result.addRegion(); - bodyRegion->push_back(new Block); - Block &bodyBlock = bodyRegion->front(); - bodyBlock.addArgument(builder.getIndexType(), result.location); + Block *bodyBlock = builder.createBlock( + bodyRegion, /*insertPt=*/{}, builder.getIndexType(), result.location); Type elementType; if (auto tensorType = llvm::dyn_cast(shape.getType())) elementType = tensorType.getElementType(); else elementType = SizeType::get(builder.getContext()); - bodyBlock.addArgument(elementType, 
shape.getLoc()); + bodyBlock->addArgument(elementType, shape.getLoc()); for (Value initVal : initVals) { - bodyBlock.addArgument(initVal.getType(), initVal.getLoc()); + bodyBlock->addArgument(initVal.getType(), initVal.getLoc()); result.addTypes(initVal.getType()); } } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp index 2ccb2361b5efe..1bcc131781d34 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp @@ -299,8 +299,7 @@ struct FuseSparseMultiplyOverAdd : public OpRewritePattern { Block &prodBlock = prod.getRegion().front(); Block &consBlock = op.getRegion().front(); IRMapping mapper; - Block *fusedBlock = new Block(); - fusedOp.getRegion().push_back(fusedBlock); + Block *fusedBlock = rewriter.createBlock(&fusedOp.getRegion()); unsigned num = prodBlock.getNumArguments(); for (unsigned i = 0; i < num - 1; i++) addArg(mapper, fusedBlock, prodBlock.getArgument(i)); @@ -309,7 +308,6 @@ struct FuseSparseMultiplyOverAdd : public OpRewritePattern { // Clone bodies of the producer and consumer in new evaluation order. 
auto *acc = prodBlock.getTerminator()->getOperand(0).getDefiningOp(); auto *sampler = consBlock.getTerminator()->getOperand(0).getDefiningOp(); - rewriter.setInsertionPointToStart(fusedBlock); Value last; for (auto &op : prodBlock.without_terminator()) if (&op != acc) { diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp index 83ef01b4e3a46..bfd289eb350b5 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp @@ -1829,7 +1829,7 @@ ControlFlowStructurizer::createSelectionOp(uint32_t selectionControl) { auto control = static_cast(selectionControl); auto selectionOp = builder.create(location, control); - selectionOp.addMergeBlock(); + selectionOp.addMergeBlock(builder); return selectionOp; } @@ -1841,7 +1841,7 @@ spirv::LoopOp ControlFlowStructurizer::createLoopOp(uint32_t loopControl) { auto control = static_cast(loopControl); auto loopOp = builder.create(location, control); - loopOp.addEntryAndMergeBlock(); + loopOp.addEntryAndMergeBlock(builder); return loopOp; } From 60a904b2ad9842b93cc5fa0ad5bda5e22c550b7e Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Sat, 24 Feb 2024 08:32:03 +0000 Subject: [PATCH 229/546] [llvm-exegesis] Fix typos in README --- llvm/tools/llvm-exegesis/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/tools/llvm-exegesis/README.md b/llvm/tools/llvm-exegesis/README.md index bbc24688a2a2c..f606fbf8e0eb6 100644 --- a/llvm/tools/llvm-exegesis/README.md +++ b/llvm/tools/llvm-exegesis/README.md @@ -1,7 +1,7 @@ # llvm-exegesis -`llvm-exegesis` is a benchmarking tool that accepts or assembles a snippet and -can measure characteristics of that snippet by executing it while keeping track +`llvm-exegesis` is a benchmarking tool that accepts or generates snippets and +can measure characteristics of those snippets by executing it while keeping track of performance 
counters. ### Currently Supported Platforms @@ -32,7 +32,7 @@ architectures: * MIPS * PowerPC (PowerPC64LE only) -Note that not benchmarking functionality is guaranteed to work on all platforms. +Note that not all benchmarking functionality is guaranteed to work on all platforms. Memory annotations are currently only supported on 64-bit X86. There is no inherent limitations for porting memory annotations to other architectures, but From 00c0638b5613912a7d1b65c8789bbb8ad1003115 Mon Sep 17 00:00:00 2001 From: Serge Pavlov Date: Sat, 24 Feb 2024 20:25:21 +0700 Subject: [PATCH 230/546] [AArch64] Intrinsics aarch64_{get,set}_fpsr (#81867) Two new intrinsics are introduced to read/write FPSR. They are similar to the existing intrinsics aarch64_{get,set}_fpcr. --- llvm/include/llvm/IR/IntrinsicsAArch64.td | 12 ++--- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 13 +++++- .../lib/Target/AArch64/AArch64RegisterInfo.td | 3 ++ llvm/test/CodeGen/AArch64/arm64-fpenv.ll | 45 +++++++++++++++++++ llvm/test/CodeGen/AArch64/preserve.ll | 4 +- 5 files changed, 69 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/arm64-fpenv.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 6b045e412cd51..5a9a7c4b43a1f 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -703,17 +703,19 @@ def int_aarch64_neon_tbx3 : AdvSIMD_Tbx3_Intrinsic; def int_aarch64_neon_tbx4 : AdvSIMD_Tbx4_Intrinsic; let TargetPrefix = "aarch64" in { - class FPCR_Get_Intrinsic + class FPENV_Get_Intrinsic : DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects]>; - class FPCR_Set_Intrinsic + class FPENV_Set_Intrinsic : DefaultAttrsIntrinsic<[], [llvm_i64_ty], [IntrNoMem, IntrHasSideEffects]>; class RNDR_Intrinsic : DefaultAttrsIntrinsic<[llvm_i64_ty, llvm_i1_ty], [], [IntrNoMem, IntrHasSideEffects]>; } -// FPCR -def int_aarch64_get_fpcr : FPCR_Get_Intrinsic; -def 
int_aarch64_set_fpcr : FPCR_Set_Intrinsic; +// FP environment registers. +def int_aarch64_get_fpcr : FPENV_Get_Intrinsic; +def int_aarch64_set_fpcr : FPENV_Set_Intrinsic; +def int_aarch64_get_fpsr : FPENV_Get_Intrinsic; +def int_aarch64_set_fpsr : FPENV_Set_Intrinsic; // Armv8.5-A Random number generation intrinsics def int_aarch64_rndr : RNDR_Intrinsic; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 8e73f57ced42b..e73bc0d89e4c9 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1805,7 +1805,7 @@ def HWASAN_CHECK_MEMACCESS_SHORTGRANULES : Pseudo< // The virtual cycle counter register is CNTVCT_EL0. def : Pat<(readcyclecounter), (MRS 0xdf02)>; -// FPCR register +// FPCR and FPSR registers. let Uses = [FPCR] in def MRS_FPCR : Pseudo<(outs GPR64:$dst), (ins), [(set GPR64:$dst, (int_aarch64_get_fpcr))]>, @@ -1817,6 +1817,17 @@ def MSR_FPCR : Pseudo<(outs), (ins GPR64:$val), PseudoInstExpansion<(MSR 0xda20, GPR64:$val)>, Sched<[WriteSys]>; +let Uses = [FPSR] in +def MRS_FPSR : Pseudo<(outs GPR64:$dst), (ins), + [(set GPR64:$dst, (int_aarch64_get_fpsr))]>, + PseudoInstExpansion<(MRS GPR64:$dst, 0xda21)>, + Sched<[WriteSys]>; +let Defs = [FPSR] in +def MSR_FPSR : Pseudo<(outs), (ins GPR64:$val), + [(int_aarch64_set_fpsr i64:$val)]>, + PseudoInstExpansion<(MSR 0xda21, GPR64:$val)>, + Sched<[WriteSys]>; + // Generic system instructions def SYSxt : SystemXtI<0, "sys">; def SYSLxt : SystemLXtI<1, "sysl">; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 569944e0e660b..fef1748021b07 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -147,6 +147,9 @@ def VG : AArch64Reg<0, "vg">, DwarfRegNum<[46]>; // Floating-point control register def FPCR : AArch64Reg<0, "fpcr">; +// Floating-point status register. 
+def FPSR : AArch64Reg<0, "fpsr">; + // GPR register classes with the intersections of GPR32/GPR32sp and // GPR64/GPR64sp for use by the coalescer. def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> { diff --git a/llvm/test/CodeGen/AArch64/arm64-fpenv.ll b/llvm/test/CodeGen/AArch64/arm64-fpenv.ll new file mode 100644 index 0000000000000..030809caee339 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-fpenv.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64 < %s | FileCheck %s + +define i64 @get_fpcr() #0 { +; CHECK-LABEL: get_fpcr: +; CHECK: // %bb.0: +; CHECK-NEXT: mrs x0, FPCR +; CHECK-NEXT: ret + %1 = tail call i64 @llvm.aarch64.get.fpcr() + ret i64 %1 +} + +define void @set_fpcr(i64 %cr) { +; CHECK-LABEL: set_fpcr: +; CHECK: // %bb.0: +; CHECK-NEXT: msr FPCR, x0 +; CHECK-NEXT: ret + call void @llvm.aarch64.set.fpcr(i64 %cr) + ret void +} + +define i64 @get_fpsr() { +; CHECK-LABEL: get_fpsr: +; CHECK: // %bb.0: +; CHECK-NEXT: mrs x0, FPSR +; CHECK-NEXT: ret + %1 = tail call i64 @llvm.aarch64.get.fpsr() + ret i64 %1 +} + +define void @set_fpsr(i64 %sr) { +; CHECK-LABEL: set_fpsr: +; CHECK: // %bb.0: +; CHECK-NEXT: msr FPSR, x0 +; CHECK-NEXT: ret + call void @llvm.aarch64.set.fpsr(i64 %sr) + ret void +} + +declare i64 @llvm.aarch64.get.fpcr() +declare void @llvm.aarch64.set.fpcr(i64) +declare i64 @llvm.aarch64.get.fpsr() +declare void @llvm.aarch64.set.fpsr(i64) + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AArch64/preserve.ll b/llvm/test/CodeGen/AArch64/preserve.ll index f95de60fbb24c..d11a45144a904 100644 --- a/llvm/test/CodeGen/AArch64/preserve.ll +++ b/llvm/test/CodeGen/AArch64/preserve.ll @@ -4,13 +4,13 @@ target triple = "aarch64-unknown-unknown" declare void @bar1() define preserve_mostcc void @baz() #0 { -; CHECK: baz Clobbered Registers: $ffr $fpcr $nzcv $sp $vg $wsp $za $b0 $b1 $b2 $b3 $b4 $b5 $b6 $b7 $b16 $b17 
$b18 $b19 $b20 $b21 $b22 $b23 $b24 $b25 $b26 $b27 $b28 $b29 $b30 $b31 $d0 $d1 $d2 $d3 $d4 $d5 $d6 $d7 $d16 $d17 $d18 $d19 $d20 $d21 $d22 $d23 $d24 $d25 $d26 $d27 $d28 $d29 $d30 $d31 $h0 $h1 $h2 $h3 $h4 $h5 $h6 $h7 $h16 $h17 $h18 $h19 $h20 $h21 $h22 $h23 $h24 $h25 $h26 $h27 $h28 $h29 $h30 $h31 $p0 $p1 $p2 $p3 $p4 $p5 $p6 $p7 $p8 $p9 $p10 $p11 $p12 $p13 $p14 $p15 $pn0 $pn1 $pn2 $pn3 $pn4 $pn5 $pn6 $pn7 $pn8 $pn9 $pn10 $pn11 $pn12 $pn13 $pn14 $pn15 $q0 $q1 $q2 $q3 $q4 $q5 $q6 $q7 $q8 $q9 $q10 $q11 $q12 $q13 $q14 $q15 $q16 $q17 $q18 $q19 $q20 $q21 $q22 $q23 $q24 $q25 $q26 $q27 $q28 $q29 $q30 $q31 $s0 $s1 $s2 $s3 $s4 $s5 $s6 $s7 $s16 $s17 $s18 $s19 $s20 $s21 $s22 $s23 $s24 $s25 $s26 $s27 $s28 $s29 $s30 $s31 $w0 $w1 $w2 $w3 $w4 $w5 $w6 $w7 $w8 $w16 $w17 $w18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x8 $x16 $x17 $x18 $z0 $z1 $z2 $z3 $z4 $z5 $z6 $z7 $z8 $z9 $z10 $z11 $z12 $z13 $z14 $z15 $z16 $z17 $z18 $z19 $z20 $z21 $z22 $z23 $z24 $z25 $z26 $z27 $z28 $z29 $z30 $z31 $zab0 $zad0 $zad1 $zad2 $zad3 $zad4 $zad5 $zad6 $zad7 $zah0 $zah1 $zaq0 $zaq1 $zaq2 $zaq3 $zaq4 $zaq5 $zaq6 $zaq7 $zaq8 $zaq9 $zaq10 $zaq11 $zaq12 $zaq13 $zaq14 $zaq15 $zas0 $zas1 $zas2 $zas3 $zt0 $d0_d1 $d1_d2 $d2_d3 $d3_d4 $d4_d5 $d5_d6 $d6_d7 $d7_d8 $d15_d16 $d16_d17 $d17_d18 $d18_d19 $d19_d20 $d20_d21 $d21_d22 $d22_d23 $d23_d24 $d24_d25 $d25_d26 $d26_d27 $d27_d28 $d28_d29 $d29_d30 $d30_d31 $d31_d0 $d0_d1_d2_d3 $d1_d2_d3_d4 $d2_d3_d4_d5 $d3_d4_d5_d6 $d4_d5_d6_d7 $d5_d6_d7_d8 $d6_d7_d8_d9 $d7_d8_d9_d10 $d13_d14_d15_d16 $d14_d15_d16_d17 $d15_d16_d17_d18 $d16_d17_d18_d19 $d17_d18_d19_d20 $d18_d19_d20_d21 $d19_d20_d21_d22 $d20_d21_d22_d23 $d21_d22_d23_d24 $d22_d23_d24_d25 $d23_d24_d25_d26 $d24_d25_d26_d27 $d25_d26_d27_d28 $d26_d27_d28_d29 $d27_d28_d29_d30 $d28_d29_d30_d31 $d29_d30_d31_d0 $d30_d31_d0_d1 $d31_d0_d1_d2 $d0_d1_d2 $d1_d2_d3 $d2_d3_d4 $d3_d4_d5 $d4_d5_d6 $d5_d6_d7 $d6_d7_d8 $d7_d8_d9 $d14_d15_d16 $d15_d16_d17 $d16_d17_d18 $d17_d18_d19 $d18_d19_d20 $d19_d20_d21 $d20_d21_d22 $d21_d22_d23 $d22_d23_d24 
$d23_d24_d25 $d24_d25_d26 $d25_d26_d27 $d26_d27_d28 $d27_d28_d29 $d28_d29_d30 $d29_d30_d31 $d30_d31_d0 $d31_d0_d1 $p0_p1 $p1_p2 $p2_p3 $p3_p4 $p4_p5 $p5_p6 $p6_p7 $p7_p8 $p8_p9 $p9_p10 $p10_p11 $p11_p12 $p12_p13 $p13_p14 $p14_p15 $p15_p0 $q0_q1 $q1_q2 $q2_q3 $q3_q4 $q4_q5 $q5_q6 $q6_q7 $q7_q8 $q8_q9 $q9_q10 $q10_q11 $q11_q12 $q12_q13 $q13_q14 $q14_q15 $q15_q16 $q16_q17 $q17_q18 $q18_q19 $q19_q20 $q20_q21 $q21_q22 $q22_q23 $q23_q24 $q24_q25 $q25_q26 $q26_q27 $q27_q28 $q28_q29 $q29_q30 $q30_q31 $q31_q0 $q0_q1_q2_q3 $q1_q2_q3_q4 $q2_q3_q4_q5 $q3_q4_q5_q6 $q4_q5_q6_q7 $q5_q6_q7_q8 $q6_q7_q8_q9 $q7_q8_q9_q10 $q8_q9_q10_q11 $q9_q10_q11_q12 $q10_q11_q12_q13 $q11_q12_q13_q14 $q12_q13_q14_q15 $q13_q14_q15_q16 $q14_q15_q16_q17 $q15_q16_q17_q18 $q16_q17_q18_q19 $q17_q18_q19_q20 $q18_q19_q20_q21 $q19_q20_q21_q22 $q20_q21_q22_q23 $q21_q22_q23_q24 $q22_q23_q24_q25 $q23_q24_q25_q26 $q24_q25_q26_q27 $q25_q26_q27_q28 $q26_q27_q28_q29 $q27_q28_q29_q30 $q28_q29_q30_q31 $q29_q30_q31_q0 $q30_q31_q0_q1 $q31_q0_q1_q2 $q0_q1_q2 $q1_q2_q3 $q2_q3_q4 $q3_q4_q5 $q4_q5_q6 $q5_q6_q7 $q6_q7_q8 $q7_q8_q9 $q8_q9_q10 $q9_q10_q11 $q10_q11_q12 $q11_q12_q13 $q12_q13_q14 $q13_q14_q15 $q14_q15_q16 $q15_q16_q17 $q16_q17_q18 $q17_q18_q19 $q18_q19_q20 $q19_q20_q21 $q20_q21_q22 $q21_q22_q23 $q22_q23_q24 $q23_q24_q25 $q24_q25_q26 $q25_q26_q27 $q26_q27_q28 $q27_q28_q29 $q28_q29_q30 $q29_q30_q31 $q30_q31_q0 $q31_q0_q1 $x0_x1_x2_x3_x4_x5_x6_x7 $x2_x3_x4_x5_x6_x7_x8_x9 $x4_x5_x6_x7_x8_x9_x10_x11 $x6_x7_x8_x9_x10_x11_x12_x13 $x8_x9_x10_x11_x12_x13_x14_x15 $x10_x11_x12_x13_x14_x15_x16_x17 $x12_x13_x14_x15_x16_x17_x18_x19 $x14_x15_x16_x17_x18_x19_x20_x21 $x16_x17_x18_x19_x20_x21_x22_x23 $x18_x19_x20_x21_x22_x23_x24_x25 $w30_wzr $w0_w1 $w2_w3 $w4_w5 $w6_w7 $w8_w9 $w10_w11 $w12_w13 $w14_w15 $w16_w17 $w18_w19 $lr_xzr $x0_x1 $x2_x3 $x4_x5 $x6_x7 $x8_x9 $x10_x11 $x12_x13 $x14_x15 $x16_x17 $x18_x19 $z0_z1 $z1_z2 $z2_z3 $z3_z4 $z4_z5 $z5_z6 $z6_z7 $z7_z8 $z8_z9 $z9_z10 $z10_z11 $z11_z12 $z12_z13 $z13_z14 $z14_z15 $z15_z16 
$z16_z17 $z17_z18 $z18_z19 $z19_z20 $z20_z21 $z21_z22 $z22_z23 $z23_z24 $z24_z25 $z25_z26 $z26_z27 $z27_z28 $z28_z29 $z29_z30 $z30_z31 $z31_z0 $z0_z1_z2_z3 $z1_z2_z3_z4 $z2_z3_z4_z5 $z3_z4_z5_z6 $z4_z5_z6_z7 $z5_z6_z7_z8 $z6_z7_z8_z9 $z7_z8_z9_z10 $z8_z9_z10_z11 $z9_z10_z11_z12 $z10_z11_z12_z13 $z11_z12_z13_z14 $z12_z13_z14_z15 $z13_z14_z15_z16 $z14_z15_z16_z17 $z15_z16_z17_z18 $z16_z17_z18_z19 $z17_z18_z19_z20 $z18_z19_z20_z21 $z19_z20_z21_z22 $z20_z21_z22_z23 $z21_z22_z23_z24 $z22_z23_z24_z25 $z23_z24_z25_z26 $z24_z25_z26_z27 $z25_z26_z27_z28 $z26_z27_z28_z29 $z27_z28_z29_z30 $z28_z29_z30_z31 $z29_z30_z31_z0 $z30_z31_z0_z1 $z31_z0_z1_z2 $z0_z1_z2 $z1_z2_z3 $z2_z3_z4 $z3_z4_z5 $z4_z5_z6 $z5_z6_z7 $z6_z7_z8 $z7_z8_z9 $z8_z9_z10 $z9_z10_z11 $z10_z11_z12 $z11_z12_z13 $z12_z13_z14 $z13_z14_z15 $z14_z15_z16 $z15_z16_z17 $z16_z17_z18 $z17_z18_z19 $z18_z19_z20 $z19_z20_z21 $z20_z21_z22 $z21_z22_z23 $z22_z23_z24 $z23_z24_z25 $z24_z25_z26 $z25_z26_z27 $z26_z27_z28 $z27_z28_z29 $z28_z29_z30 $z29_z30_z31 $z30_z31_z0 $z31_z0_z1 $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 $z16_z20_z24_z28 $z17_z21_z25_z29 $z18_z22_z26_z30 $z19_z23_z27_z31 $z0_z4_z8_z12 $z1_z5_z9_z13 $z2_z6_z10_z14 $z3_z7_z11_z15 +; CHECK: baz Clobbered Registers: $ffr $fpcr $fpsr $nzcv $sp $vg $wsp $za $b0 $b1 $b2 $b3 $b4 $b5 $b6 $b7 $b16 $b17 $b18 $b19 $b20 $b21 $b22 $b23 $b24 $b25 $b26 $b27 $b28 $b29 $b30 $b31 $d0 $d1 $d2 $d3 $d4 $d5 $d6 $d7 $d16 $d17 $d18 $d19 $d20 $d21 $d22 $d23 $d24 $d25 $d26 $d27 $d28 $d29 $d30 $d31 $h0 $h1 $h2 $h3 $h4 $h5 $h6 $h7 $h16 $h17 $h18 $h19 $h20 $h21 $h22 $h23 $h24 $h25 $h26 $h27 $h28 $h29 $h30 $h31 $p0 $p1 $p2 $p3 $p4 $p5 $p6 $p7 $p8 $p9 $p10 $p11 $p12 $p13 $p14 $p15 $pn0 $pn1 $pn2 $pn3 $pn4 $pn5 $pn6 $pn7 $pn8 $pn9 $pn10 $pn11 $pn12 $pn13 $pn14 $pn15 $q0 $q1 $q2 $q3 $q4 $q5 $q6 $q7 $q8 $q9 $q10 $q11 $q12 $q13 $q14 $q15 $q16 $q17 $q18 $q19 $q20 $q21 $q22 $q23 $q24 $q25 $q26 $q27 $q28 $q29 
$q30 $q31 $s0 $s1 $s2 $s3 $s4 $s5 $s6 $s7 $s16 $s17 $s18 $s19 $s20 $s21 $s22 $s23 $s24 $s25 $s26 $s27 $s28 $s29 $s30 $s31 $w0 $w1 $w2 $w3 $w4 $w5 $w6 $w7 $w8 $w16 $w17 $w18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x8 $x16 $x17 $x18 $z0 $z1 $z2 $z3 $z4 $z5 $z6 $z7 $z8 $z9 $z10 $z11 $z12 $z13 $z14 $z15 $z16 $z17 $z18 $z19 $z20 $z21 $z22 $z23 $z24 $z25 $z26 $z27 $z28 $z29 $z30 $z31 $zab0 $zad0 $zad1 $zad2 $zad3 $zad4 $zad5 $zad6 $zad7 $zah0 $zah1 $zaq0 $zaq1 $zaq2 $zaq3 $zaq4 $zaq5 $zaq6 $zaq7 $zaq8 $zaq9 $zaq10 $zaq11 $zaq12 $zaq13 $zaq14 $zaq15 $zas0 $zas1 $zas2 $zas3 $zt0 $d0_d1 $d1_d2 $d2_d3 $d3_d4 $d4_d5 $d5_d6 $d6_d7 $d7_d8 $d15_d16 $d16_d17 $d17_d18 $d18_d19 $d19_d20 $d20_d21 $d21_d22 $d22_d23 $d23_d24 $d24_d25 $d25_d26 $d26_d27 $d27_d28 $d28_d29 $d29_d30 $d30_d31 $d31_d0 $d0_d1_d2_d3 $d1_d2_d3_d4 $d2_d3_d4_d5 $d3_d4_d5_d6 $d4_d5_d6_d7 $d5_d6_d7_d8 $d6_d7_d8_d9 $d7_d8_d9_d10 $d13_d14_d15_d16 $d14_d15_d16_d17 $d15_d16_d17_d18 $d16_d17_d18_d19 $d17_d18_d19_d20 $d18_d19_d20_d21 $d19_d20_d21_d22 $d20_d21_d22_d23 $d21_d22_d23_d24 $d22_d23_d24_d25 $d23_d24_d25_d26 $d24_d25_d26_d27 $d25_d26_d27_d28 $d26_d27_d28_d29 $d27_d28_d29_d30 $d28_d29_d30_d31 $d29_d30_d31_d0 $d30_d31_d0_d1 $d31_d0_d1_d2 $d0_d1_d2 $d1_d2_d3 $d2_d3_d4 $d3_d4_d5 $d4_d5_d6 $d5_d6_d7 $d6_d7_d8 $d7_d8_d9 $d14_d15_d16 $d15_d16_d17 $d16_d17_d18 $d17_d18_d19 $d18_d19_d20 $d19_d20_d21 $d20_d21_d22 $d21_d22_d23 $d22_d23_d24 $d23_d24_d25 $d24_d25_d26 $d25_d26_d27 $d26_d27_d28 $d27_d28_d29 $d28_d29_d30 $d29_d30_d31 $d30_d31_d0 $d31_d0_d1 $p0_p1 $p1_p2 $p2_p3 $p3_p4 $p4_p5 $p5_p6 $p6_p7 $p7_p8 $p8_p9 $p9_p10 $p10_p11 $p11_p12 $p12_p13 $p13_p14 $p14_p15 $p15_p0 $q0_q1 $q1_q2 $q2_q3 $q3_q4 $q4_q5 $q5_q6 $q6_q7 $q7_q8 $q8_q9 $q9_q10 $q10_q11 $q11_q12 $q12_q13 $q13_q14 $q14_q15 $q15_q16 $q16_q17 $q17_q18 $q18_q19 $q19_q20 $q20_q21 $q21_q22 $q22_q23 $q23_q24 $q24_q25 $q25_q26 $q26_q27 $q27_q28 $q28_q29 $q29_q30 $q30_q31 $q31_q0 $q0_q1_q2_q3 $q1_q2_q3_q4 $q2_q3_q4_q5 $q3_q4_q5_q6 $q4_q5_q6_q7 $q5_q6_q7_q8 $q6_q7_q8_q9 
$q7_q8_q9_q10 $q8_q9_q10_q11 $q9_q10_q11_q12 $q10_q11_q12_q13 $q11_q12_q13_q14 $q12_q13_q14_q15 $q13_q14_q15_q16 $q14_q15_q16_q17 $q15_q16_q17_q18 $q16_q17_q18_q19 $q17_q18_q19_q20 $q18_q19_q20_q21 $q19_q20_q21_q22 $q20_q21_q22_q23 $q21_q22_q23_q24 $q22_q23_q24_q25 $q23_q24_q25_q26 $q24_q25_q26_q27 $q25_q26_q27_q28 $q26_q27_q28_q29 $q27_q28_q29_q30 $q28_q29_q30_q31 $q29_q30_q31_q0 $q30_q31_q0_q1 $q31_q0_q1_q2 $q0_q1_q2 $q1_q2_q3 $q2_q3_q4 $q3_q4_q5 $q4_q5_q6 $q5_q6_q7 $q6_q7_q8 $q7_q8_q9 $q8_q9_q10 $q9_q10_q11 $q10_q11_q12 $q11_q12_q13 $q12_q13_q14 $q13_q14_q15 $q14_q15_q16 $q15_q16_q17 $q16_q17_q18 $q17_q18_q19 $q18_q19_q20 $q19_q20_q21 $q20_q21_q22 $q21_q22_q23 $q22_q23_q24 $q23_q24_q25 $q24_q25_q26 $q25_q26_q27 $q26_q27_q28 $q27_q28_q29 $q28_q29_q30 $q29_q30_q31 $q30_q31_q0 $q31_q0_q1 $x0_x1_x2_x3_x4_x5_x6_x7 $x2_x3_x4_x5_x6_x7_x8_x9 $x4_x5_x6_x7_x8_x9_x10_x11 $x6_x7_x8_x9_x10_x11_x12_x13 $x8_x9_x10_x11_x12_x13_x14_x15 $x10_x11_x12_x13_x14_x15_x16_x17 $x12_x13_x14_x15_x16_x17_x18_x19 $x14_x15_x16_x17_x18_x19_x20_x21 $x16_x17_x18_x19_x20_x21_x22_x23 $x18_x19_x20_x21_x22_x23_x24_x25 $w30_wzr $w0_w1 $w2_w3 $w4_w5 $w6_w7 $w8_w9 $w10_w11 $w12_w13 $w14_w15 $w16_w17 $w18_w19 $lr_xzr $x0_x1 $x2_x3 $x4_x5 $x6_x7 $x8_x9 $x10_x11 $x12_x13 $x14_x15 $x16_x17 $x18_x19 $z0_z1 $z1_z2 $z2_z3 $z3_z4 $z4_z5 $z5_z6 $z6_z7 $z7_z8 $z8_z9 $z9_z10 $z10_z11 $z11_z12 $z12_z13 $z13_z14 $z14_z15 $z15_z16 $z16_z17 $z17_z18 $z18_z19 $z19_z20 $z20_z21 $z21_z22 $z22_z23 $z23_z24 $z24_z25 $z25_z26 $z26_z27 $z27_z28 $z28_z29 $z29_z30 $z30_z31 $z31_z0 $z0_z1_z2_z3 $z1_z2_z3_z4 $z2_z3_z4_z5 $z3_z4_z5_z6 $z4_z5_z6_z7 $z5_z6_z7_z8 $z6_z7_z8_z9 $z7_z8_z9_z10 $z8_z9_z10_z11 $z9_z10_z11_z12 $z10_z11_z12_z13 $z11_z12_z13_z14 $z12_z13_z14_z15 $z13_z14_z15_z16 $z14_z15_z16_z17 $z15_z16_z17_z18 $z16_z17_z18_z19 $z17_z18_z19_z20 $z18_z19_z20_z21 $z19_z20_z21_z22 $z20_z21_z22_z23 $z21_z22_z23_z24 $z22_z23_z24_z25 $z23_z24_z25_z26 $z24_z25_z26_z27 $z25_z26_z27_z28 $z26_z27_z28_z29 $z27_z28_z29_z30 
$z28_z29_z30_z31 $z29_z30_z31_z0 $z30_z31_z0_z1 $z31_z0_z1_z2 $z0_z1_z2 $z1_z2_z3 $z2_z3_z4 $z3_z4_z5 $z4_z5_z6 $z5_z6_z7 $z6_z7_z8 $z7_z8_z9 $z8_z9_z10 $z9_z10_z11 $z10_z11_z12 $z11_z12_z13 $z12_z13_z14 $z13_z14_z15 $z14_z15_z16 $z15_z16_z17 $z16_z17_z18 $z17_z18_z19 $z18_z19_z20 $z19_z20_z21 $z20_z21_z22 $z21_z22_z23 $z22_z23_z24 $z23_z24_z25 $z24_z25_z26 $z25_z26_z27 $z26_z27_z28 $z27_z28_z29 $z28_z29_z30 $z29_z30_z31 $z30_z31_z0 $z31_z0_z1 $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 $z16_z20_z24_z28 $z17_z21_z25_z29 $z18_z22_z26_z30 $z19_z23_z27_z31 $z0_z4_z8_z12 $z1_z5_z9_z13 $z2_z6_z10_z14 $z3_z7_z11_z15 call void @bar1() call void @bar2() ret void } define preserve_allcc void @foo() #0 { -; CHECK: foo Clobbered Registers: $ffr $fpcr $nzcv $sp $vg $wsp $za $b0 $b1 $b2 $b3 $b4 $b5 $b6 $b7 $d0 $d1 $d2 $d3 $d4 $d5 $d6 $d7 $h0 $h1 $h2 $h3 $h4 $h5 $h6 $h7 $p0 $p1 $p2 $p3 $p4 $p5 $p6 $p7 $p8 $p9 $p10 $p11 $p12 $p13 $p14 $p15 $pn0 $pn1 $pn2 $pn3 $pn4 $pn5 $pn6 $pn7 $pn8 $pn9 $pn10 $pn11 $pn12 $pn13 $pn14 $pn15 $q0 $q1 $q2 $q3 $q4 $q5 $q6 $q7 $s0 $s1 $s2 $s3 $s4 $s5 $s6 $s7 $w0 $w1 $w2 $w3 $w4 $w5 $w6 $w7 $w8 $w16 $w17 $w18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x8 $x16 $x17 $x18 $z0 $z1 $z2 $z3 $z4 $z5 $z6 $z7 $z8 $z9 $z10 $z11 $z12 $z13 $z14 $z15 $z16 $z17 $z18 $z19 $z20 $z21 $z22 $z23 $z24 $z25 $z26 $z27 $z28 $z29 $z30 $z31 $zab0 $zad0 $zad1 $zad2 $zad3 $zad4 $zad5 $zad6 $zad7 $zah0 $zah1 $zaq0 $zaq1 $zaq2 $zaq3 $zaq4 $zaq5 $zaq6 $zaq7 $zaq8 $zaq9 $zaq10 $zaq11 $zaq12 $zaq13 $zaq14 $zaq15 $zas0 $zas1 $zas2 $zas3 $zt0 $d0_d1 $d1_d2 $d2_d3 $d3_d4 $d4_d5 $d5_d6 $d6_d7 $d7_d8 $d15_d16 $d16_d17 $d17_d18 $d18_d19 $d19_d20 $d20_d21 $d21_d22 $d22_d23 $d23_d24 $d24_d25 $d25_d26 $d26_d27 $d27_d28 $d28_d29 $d29_d30 $d30_d31 $d31_d0 $d0_d1_d2_d3 $d1_d2_d3_d4 $d2_d3_d4_d5 $d3_d4_d5_d6 $d4_d5_d6_d7 $d5_d6_d7_d8 $d6_d7_d8_d9 $d7_d8_d9_d10 $d13_d14_d15_d16 $d14_d15_d16_d17 $d15_d16_d17_d18 
$d16_d17_d18_d19 $d17_d18_d19_d20 $d18_d19_d20_d21 $d19_d20_d21_d22 $d20_d21_d22_d23 $d21_d22_d23_d24 $d22_d23_d24_d25 $d23_d24_d25_d26 $d24_d25_d26_d27 $d25_d26_d27_d28 $d26_d27_d28_d29 $d27_d28_d29_d30 $d28_d29_d30_d31 $d29_d30_d31_d0 $d30_d31_d0_d1 $d31_d0_d1_d2 $d0_d1_d2 $d1_d2_d3 $d2_d3_d4 $d3_d4_d5 $d4_d5_d6 $d5_d6_d7 $d6_d7_d8 $d7_d8_d9 $d14_d15_d16 $d15_d16_d17 $d16_d17_d18 $d17_d18_d19 $d18_d19_d20 $d19_d20_d21 $d20_d21_d22 $d21_d22_d23 $d22_d23_d24 $d23_d24_d25 $d24_d25_d26 $d25_d26_d27 $d26_d27_d28 $d27_d28_d29 $d28_d29_d30 $d29_d30_d31 $d30_d31_d0 $d31_d0_d1 $p0_p1 $p1_p2 $p2_p3 $p3_p4 $p4_p5 $p5_p6 $p6_p7 $p7_p8 $p8_p9 $p9_p10 $p10_p11 $p11_p12 $p12_p13 $p13_p14 $p14_p15 $p15_p0 $q0_q1 $q1_q2 $q2_q3 $q3_q4 $q4_q5 $q5_q6 $q6_q7 $q7_q8 $q8_q9 $q9_q10 $q10_q11 $q11_q12 $q12_q13 $q13_q14 $q14_q15 $q15_q16 $q16_q17 $q17_q18 $q18_q19 $q19_q20 $q20_q21 $q21_q22 $q22_q23 $q23_q24 $q24_q25 $q25_q26 $q26_q27 $q27_q28 $q28_q29 $q29_q30 $q30_q31 $q31_q0 $q0_q1_q2_q3 $q1_q2_q3_q4 $q2_q3_q4_q5 $q3_q4_q5_q6 $q4_q5_q6_q7 $q5_q6_q7_q8 $q6_q7_q8_q9 $q7_q8_q9_q10 $q8_q9_q10_q11 $q9_q10_q11_q12 $q10_q11_q12_q13 $q11_q12_q13_q14 $q12_q13_q14_q15 $q13_q14_q15_q16 $q14_q15_q16_q17 $q15_q16_q17_q18 $q16_q17_q18_q19 $q17_q18_q19_q20 $q18_q19_q20_q21 $q19_q20_q21_q22 $q20_q21_q22_q23 $q21_q22_q23_q24 $q22_q23_q24_q25 $q23_q24_q25_q26 $q24_q25_q26_q27 $q25_q26_q27_q28 $q26_q27_q28_q29 $q27_q28_q29_q30 $q28_q29_q30_q31 $q29_q30_q31_q0 $q30_q31_q0_q1 $q31_q0_q1_q2 $q0_q1_q2 $q1_q2_q3 $q2_q3_q4 $q3_q4_q5 $q4_q5_q6 $q5_q6_q7 $q6_q7_q8 $q7_q8_q9 $q8_q9_q10 $q9_q10_q11 $q10_q11_q12 $q11_q12_q13 $q12_q13_q14 $q13_q14_q15 $q14_q15_q16 $q15_q16_q17 $q16_q17_q18 $q17_q18_q19 $q18_q19_q20 $q19_q20_q21 $q20_q21_q22 $q21_q22_q23 $q22_q23_q24 $q23_q24_q25 $q24_q25_q26 $q25_q26_q27 $q26_q27_q28 $q27_q28_q29 $q28_q29_q30 $q29_q30_q31 $q30_q31_q0 $q31_q0_q1 $x0_x1_x2_x3_x4_x5_x6_x7 $x2_x3_x4_x5_x6_x7_x8_x9 $x4_x5_x6_x7_x8_x9_x10_x11 $x6_x7_x8_x9_x10_x11_x12_x13 $x8_x9_x10_x11_x12_x13_x14_x15 
$x10_x11_x12_x13_x14_x15_x16_x17 $x12_x13_x14_x15_x16_x17_x18_x19 $x14_x15_x16_x17_x18_x19_x20_x21 $x16_x17_x18_x19_x20_x21_x22_x23 $x18_x19_x20_x21_x22_x23_x24_x25 $w30_wzr $w0_w1 $w2_w3 $w4_w5 $w6_w7 $w8_w9 $w10_w11 $w12_w13 $w14_w15 $w16_w17 $w18_w19 $lr_xzr $x0_x1 $x2_x3 $x4_x5 $x6_x7 $x8_x9 $x10_x11 $x12_x13 $x14_x15 $x16_x17 $x18_x19 $z0_z1 $z1_z2 $z2_z3 $z3_z4 $z4_z5 $z5_z6 $z6_z7 $z7_z8 $z8_z9 $z9_z10 $z10_z11 $z11_z12 $z12_z13 $z13_z14 $z14_z15 $z15_z16 $z16_z17 $z17_z18 $z18_z19 $z19_z20 $z20_z21 $z21_z22 $z22_z23 $z23_z24 $z24_z25 $z25_z26 $z26_z27 $z27_z28 $z28_z29 $z29_z30 $z30_z31 $z31_z0 $z0_z1_z2_z3 $z1_z2_z3_z4 $z2_z3_z4_z5 $z3_z4_z5_z6 $z4_z5_z6_z7 $z5_z6_z7_z8 $z6_z7_z8_z9 $z7_z8_z9_z10 $z8_z9_z10_z11 $z9_z10_z11_z12 $z10_z11_z12_z13 $z11_z12_z13_z14 $z12_z13_z14_z15 $z13_z14_z15_z16 $z14_z15_z16_z17 $z15_z16_z17_z18 $z16_z17_z18_z19 $z17_z18_z19_z20 $z18_z19_z20_z21 $z19_z20_z21_z22 $z20_z21_z22_z23 $z21_z22_z23_z24 $z22_z23_z24_z25 $z23_z24_z25_z26 $z24_z25_z26_z27 $z25_z26_z27_z28 $z26_z27_z28_z29 $z27_z28_z29_z30 $z28_z29_z30_z31 $z29_z30_z31_z0 $z30_z31_z0_z1 $z31_z0_z1_z2 $z0_z1_z2 $z1_z2_z3 $z2_z3_z4 $z3_z4_z5 $z4_z5_z6 $z5_z6_z7 $z6_z7_z8 $z7_z8_z9 $z8_z9_z10 $z9_z10_z11 $z10_z11_z12 $z11_z12_z13 $z12_z13_z14 $z13_z14_z15 $z14_z15_z16 $z15_z16_z17 $z16_z17_z18 $z17_z18_z19 $z18_z19_z20 $z19_z20_z21 $z20_z21_z22 $z21_z22_z23 $z22_z23_z24 $z23_z24_z25 $z24_z25_z26 $z25_z26_z27 $z26_z27_z28 $z27_z28_z29 $z28_z29_z30 $z29_z30_z31 $z30_z31_z0 $z31_z0_z1 $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 $z16_z20_z24_z28 $z17_z21_z25_z29 $z18_z22_z26_z30 $z19_z23_z27_z31 $z0_z4_z8_z12 $z1_z5_z9_z13 $z2_z6_z10_z14 $z3_z7_z11_z15 +; CHECK: foo Clobbered Registers: $ffr $fpcr $fpsr $nzcv $sp $vg $wsp $za $b0 $b1 $b2 $b3 $b4 $b5 $b6 $b7 $d0 $d1 $d2 $d3 $d4 $d5 $d6 $d7 $h0 $h1 $h2 $h3 $h4 $h5 $h6 $h7 $p0 $p1 $p2 $p3 $p4 $p5 $p6 $p7 $p8 $p9 $p10 $p11 $p12 $p13 $p14 
$p15 $pn0 $pn1 $pn2 $pn3 $pn4 $pn5 $pn6 $pn7 $pn8 $pn9 $pn10 $pn11 $pn12 $pn13 $pn14 $pn15 $q0 $q1 $q2 $q3 $q4 $q5 $q6 $q7 $s0 $s1 $s2 $s3 $s4 $s5 $s6 $s7 $w0 $w1 $w2 $w3 $w4 $w5 $w6 $w7 $w8 $w16 $w17 $w18 $x0 $x1 $x2 $x3 $x4 $x5 $x6 $x7 $x8 $x16 $x17 $x18 $z0 $z1 $z2 $z3 $z4 $z5 $z6 $z7 $z8 $z9 $z10 $z11 $z12 $z13 $z14 $z15 $z16 $z17 $z18 $z19 $z20 $z21 $z22 $z23 $z24 $z25 $z26 $z27 $z28 $z29 $z30 $z31 $zab0 $zad0 $zad1 $zad2 $zad3 $zad4 $zad5 $zad6 $zad7 $zah0 $zah1 $zaq0 $zaq1 $zaq2 $zaq3 $zaq4 $zaq5 $zaq6 $zaq7 $zaq8 $zaq9 $zaq10 $zaq11 $zaq12 $zaq13 $zaq14 $zaq15 $zas0 $zas1 $zas2 $zas3 $zt0 $d0_d1 $d1_d2 $d2_d3 $d3_d4 $d4_d5 $d5_d6 $d6_d7 $d7_d8 $d15_d16 $d16_d17 $d17_d18 $d18_d19 $d19_d20 $d20_d21 $d21_d22 $d22_d23 $d23_d24 $d24_d25 $d25_d26 $d26_d27 $d27_d28 $d28_d29 $d29_d30 $d30_d31 $d31_d0 $d0_d1_d2_d3 $d1_d2_d3_d4 $d2_d3_d4_d5 $d3_d4_d5_d6 $d4_d5_d6_d7 $d5_d6_d7_d8 $d6_d7_d8_d9 $d7_d8_d9_d10 $d13_d14_d15_d16 $d14_d15_d16_d17 $d15_d16_d17_d18 $d16_d17_d18_d19 $d17_d18_d19_d20 $d18_d19_d20_d21 $d19_d20_d21_d22 $d20_d21_d22_d23 $d21_d22_d23_d24 $d22_d23_d24_d25 $d23_d24_d25_d26 $d24_d25_d26_d27 $d25_d26_d27_d28 $d26_d27_d28_d29 $d27_d28_d29_d30 $d28_d29_d30_d31 $d29_d30_d31_d0 $d30_d31_d0_d1 $d31_d0_d1_d2 $d0_d1_d2 $d1_d2_d3 $d2_d3_d4 $d3_d4_d5 $d4_d5_d6 $d5_d6_d7 $d6_d7_d8 $d7_d8_d9 $d14_d15_d16 $d15_d16_d17 $d16_d17_d18 $d17_d18_d19 $d18_d19_d20 $d19_d20_d21 $d20_d21_d22 $d21_d22_d23 $d22_d23_d24 $d23_d24_d25 $d24_d25_d26 $d25_d26_d27 $d26_d27_d28 $d27_d28_d29 $d28_d29_d30 $d29_d30_d31 $d30_d31_d0 $d31_d0_d1 $p0_p1 $p1_p2 $p2_p3 $p3_p4 $p4_p5 $p5_p6 $p6_p7 $p7_p8 $p8_p9 $p9_p10 $p10_p11 $p11_p12 $p12_p13 $p13_p14 $p14_p15 $p15_p0 $q0_q1 $q1_q2 $q2_q3 $q3_q4 $q4_q5 $q5_q6 $q6_q7 $q7_q8 $q8_q9 $q9_q10 $q10_q11 $q11_q12 $q12_q13 $q13_q14 $q14_q15 $q15_q16 $q16_q17 $q17_q18 $q18_q19 $q19_q20 $q20_q21 $q21_q22 $q22_q23 $q23_q24 $q24_q25 $q25_q26 $q26_q27 $q27_q28 $q28_q29 $q29_q30 $q30_q31 $q31_q0 $q0_q1_q2_q3 $q1_q2_q3_q4 $q2_q3_q4_q5 $q3_q4_q5_q6 
$q4_q5_q6_q7 $q5_q6_q7_q8 $q6_q7_q8_q9 $q7_q8_q9_q10 $q8_q9_q10_q11 $q9_q10_q11_q12 $q10_q11_q12_q13 $q11_q12_q13_q14 $q12_q13_q14_q15 $q13_q14_q15_q16 $q14_q15_q16_q17 $q15_q16_q17_q18 $q16_q17_q18_q19 $q17_q18_q19_q20 $q18_q19_q20_q21 $q19_q20_q21_q22 $q20_q21_q22_q23 $q21_q22_q23_q24 $q22_q23_q24_q25 $q23_q24_q25_q26 $q24_q25_q26_q27 $q25_q26_q27_q28 $q26_q27_q28_q29 $q27_q28_q29_q30 $q28_q29_q30_q31 $q29_q30_q31_q0 $q30_q31_q0_q1 $q31_q0_q1_q2 $q0_q1_q2 $q1_q2_q3 $q2_q3_q4 $q3_q4_q5 $q4_q5_q6 $q5_q6_q7 $q6_q7_q8 $q7_q8_q9 $q8_q9_q10 $q9_q10_q11 $q10_q11_q12 $q11_q12_q13 $q12_q13_q14 $q13_q14_q15 $q14_q15_q16 $q15_q16_q17 $q16_q17_q18 $q17_q18_q19 $q18_q19_q20 $q19_q20_q21 $q20_q21_q22 $q21_q22_q23 $q22_q23_q24 $q23_q24_q25 $q24_q25_q26 $q25_q26_q27 $q26_q27_q28 $q27_q28_q29 $q28_q29_q30 $q29_q30_q31 $q30_q31_q0 $q31_q0_q1 $x0_x1_x2_x3_x4_x5_x6_x7 $x2_x3_x4_x5_x6_x7_x8_x9 $x4_x5_x6_x7_x8_x9_x10_x11 $x6_x7_x8_x9_x10_x11_x12_x13 $x8_x9_x10_x11_x12_x13_x14_x15 $x10_x11_x12_x13_x14_x15_x16_x17 $x12_x13_x14_x15_x16_x17_x18_x19 $x14_x15_x16_x17_x18_x19_x20_x21 $x16_x17_x18_x19_x20_x21_x22_x23 $x18_x19_x20_x21_x22_x23_x24_x25 $w30_wzr $w0_w1 $w2_w3 $w4_w5 $w6_w7 $w8_w9 $w10_w11 $w12_w13 $w14_w15 $w16_w17 $w18_w19 $lr_xzr $x0_x1 $x2_x3 $x4_x5 $x6_x7 $x8_x9 $x10_x11 $x12_x13 $x14_x15 $x16_x17 $x18_x19 $z0_z1 $z1_z2 $z2_z3 $z3_z4 $z4_z5 $z5_z6 $z6_z7 $z7_z8 $z8_z9 $z9_z10 $z10_z11 $z11_z12 $z12_z13 $z13_z14 $z14_z15 $z15_z16 $z16_z17 $z17_z18 $z18_z19 $z19_z20 $z20_z21 $z21_z22 $z22_z23 $z23_z24 $z24_z25 $z25_z26 $z26_z27 $z27_z28 $z28_z29 $z29_z30 $z30_z31 $z31_z0 $z0_z1_z2_z3 $z1_z2_z3_z4 $z2_z3_z4_z5 $z3_z4_z5_z6 $z4_z5_z6_z7 $z5_z6_z7_z8 $z6_z7_z8_z9 $z7_z8_z9_z10 $z8_z9_z10_z11 $z9_z10_z11_z12 $z10_z11_z12_z13 $z11_z12_z13_z14 $z12_z13_z14_z15 $z13_z14_z15_z16 $z14_z15_z16_z17 $z15_z16_z17_z18 $z16_z17_z18_z19 $z17_z18_z19_z20 $z18_z19_z20_z21 $z19_z20_z21_z22 $z20_z21_z22_z23 $z21_z22_z23_z24 $z22_z23_z24_z25 $z23_z24_z25_z26 $z24_z25_z26_z27 $z25_z26_z27_z28 
$z26_z27_z28_z29 $z27_z28_z29_z30 $z28_z29_z30_z31 $z29_z30_z31_z0 $z30_z31_z0_z1 $z31_z0_z1_z2 $z0_z1_z2 $z1_z2_z3 $z2_z3_z4 $z3_z4_z5 $z4_z5_z6 $z5_z6_z7 $z6_z7_z8 $z7_z8_z9 $z8_z9_z10 $z9_z10_z11 $z10_z11_z12 $z11_z12_z13 $z12_z13_z14 $z13_z14_z15 $z14_z15_z16 $z15_z16_z17 $z16_z17_z18 $z17_z18_z19 $z18_z19_z20 $z19_z20_z21 $z20_z21_z22 $z21_z22_z23 $z22_z23_z24 $z23_z24_z25 $z24_z25_z26 $z25_z26_z27 $z26_z27_z28 $z27_z28_z29 $z28_z29_z30 $z29_z30_z31 $z30_z31_z0 $z31_z0_z1 $z16_z24 $z17_z25 $z18_z26 $z19_z27 $z20_z28 $z21_z29 $z22_z30 $z23_z31 $z0_z8 $z1_z9 $z2_z10 $z3_z11 $z4_z12 $z5_z13 $z6_z14 $z7_z15 $z16_z20_z24_z28 $z17_z21_z25_z29 $z18_z22_z26_z30 $z19_z23_z27_z31 $z0_z4_z8_z12 $z1_z5_z9_z13 $z2_z6_z10_z14 $z3_z7_z11_z15 call void @bar1() call void @bar2() ret void From 1901f442ca6374787e6810adb573d138f80893dd Mon Sep 17 00:00:00 2001 From: Artem Tyurin Date: Sat, 24 Feb 2024 14:35:35 +0100 Subject: [PATCH 231/546] [InstCombine] Handle more even/odd math functions (#81324) At the moment this PR adds support only for `erf` function. Fixes #77220. 
--- .../llvm/Analysis/TargetLibraryInfo.def | 15 +++ .../llvm/Transforms/Utils/SimplifyLibCalls.h | 1 + llvm/lib/Analysis/TargetLibraryInfo.cpp | 3 + llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 3 + .../lib/Transforms/Utils/SimplifyLibCalls.cpp | 102 ++++++++++-------- .../InstCombine/math-odd-even-parity.ll | 50 +++++++++ .../tools/llvm-tli-checker/ps4-tli-check.yaml | 4 +- .../Analysis/TargetLibraryInfoTest.cpp | 3 + 8 files changed, 135 insertions(+), 46 deletions(-) create mode 100644 llvm/test/Transforms/InstCombine/math-odd-even-parity.ll diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def index 6bd922eed89e1..37221eb9e4711 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def @@ -1069,6 +1069,21 @@ TLI_DEFINE_ENUM_INTERNAL(ctermid) TLI_DEFINE_STRING_INTERNAL("ctermid") TLI_DEFINE_SIG_INTERNAL(Ptr, Ptr) +/// double erf(double x); +TLI_DEFINE_ENUM_INTERNAL(erf) +TLI_DEFINE_STRING_INTERNAL("erf") +TLI_DEFINE_SIG_INTERNAL(Dbl, Dbl) + +/// float erff(float x); +TLI_DEFINE_ENUM_INTERNAL(erff) +TLI_DEFINE_STRING_INTERNAL("erff") +TLI_DEFINE_SIG_INTERNAL(Flt, Flt) + +/// long double erfl(long double x); +TLI_DEFINE_ENUM_INTERNAL(erfl) +TLI_DEFINE_STRING_INTERNAL("erfl") +TLI_DEFINE_SIG_INTERNAL(LDbl, LDbl) + /// int execl(const char *path, const char *arg, ...); TLI_DEFINE_ENUM_INTERNAL(execl) TLI_DEFINE_STRING_INTERNAL("execl") diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h index 1b6b525b19cae..e2682b429e9db 100644 --- a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -204,6 +204,7 @@ class LibCallSimplifier { Value *mergeSqrtToExp(CallInst *CI, IRBuilderBase &B); Value *optimizeSinCosPi(CallInst *CI, bool IsSin, IRBuilderBase &B); Value *optimizeTrigInversionPairs(CallInst *CI, IRBuilderBase &B); + 
Value *optimizeSymmetric(CallInst *CI, LibFunc Func, IRBuilderBase &B); // Wrapper for all floating point library call optimizations Value *optimizeFloatingPointLibCall(CallInst *CI, LibFunc Func, IRBuilderBase &B); diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 710762a6c0ad1..835268bb2d852 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -808,6 +808,9 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setUnavailable(LibFunc_cabs); TLI.setUnavailable(LibFunc_cabsf); TLI.setUnavailable(LibFunc_cabsl); + TLI.setUnavailable(LibFunc_erf); + TLI.setUnavailable(LibFunc_erff); + TLI.setUnavailable(LibFunc_erfl); TLI.setUnavailable(LibFunc_ffs); TLI.setUnavailable(LibFunc_flockfile); TLI.setUnavailable(LibFunc_fseeko); diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 12741dc5af5a1..ed0ed345435c4 100644 --- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -1137,6 +1137,9 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F, case LibFunc_cosl: case LibFunc_cospi: case LibFunc_cospif: + case LibFunc_erf: + case LibFunc_erff: + case LibFunc_erfl: case LibFunc_exp: case LibFunc_expf: case LibFunc_expl: diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 26a34aa99e1b8..2e68a9c01898c 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1908,49 +1908,6 @@ Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilderBase &B) { *CI, B.CreateCall(FSqrt, B.CreateFAdd(RealReal, ImagImag), "cabs")); } -static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func, - IRBuilderBase &B) { - if (!isa(Call)) - return nullptr; - - IRBuilderBase::FastMathFlagGuard Guard(B); - B.setFastMathFlags(Call->getFastMathFlags()); - 
- // TODO: Can this be shared to also handle LLVM intrinsics? - Value *X; - switch (Func) { - case LibFunc_sin: - case LibFunc_sinf: - case LibFunc_sinl: - case LibFunc_tan: - case LibFunc_tanf: - case LibFunc_tanl: - // sin(-X) --> -sin(X) - // tan(-X) --> -tan(X) - if (match(Call->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) - return B.CreateFNeg( - copyFlags(*Call, B.CreateCall(Call->getCalledFunction(), X))); - break; - case LibFunc_cos: - case LibFunc_cosf: - case LibFunc_cosl: { - // cos(-x) --> cos(x) - // cos(fabs(x)) --> cos(x) - // cos(copysign(x, y)) --> cos(x) - Value *Sign; - Value *Src = Call->getArgOperand(0); - if (match(Src, m_FNeg(m_Value(X))) || match(Src, m_FAbs(m_Value(X))) || - match(Src, m_CopySign(m_Value(X), m_Value(Sign)))) - return copyFlags(*Call, - B.CreateCall(Call->getCalledFunction(), X, "cos")); - break; - } - default: - break; - } - return nullptr; -} - // Return a properly extended integer (DstWidth bits wide) if the operation is // an itofp. static Value *getIntToFPVal(Value *I2F, IRBuilderBase &B, unsigned DstWidth) { @@ -2797,6 +2754,63 @@ static bool insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg, return true; } +static Value *optimizeSymmetricCall(CallInst *CI, bool IsEven, + IRBuilderBase &B) { + Value *X; + Value *Src = CI->getArgOperand(0); + + if (match(Src, m_OneUse(m_FNeg(m_Value(X))))) { + IRBuilderBase::FastMathFlagGuard Guard(B); + B.setFastMathFlags(CI->getFastMathFlags()); + + auto *CallInst = copyFlags(*CI, B.CreateCall(CI->getCalledFunction(), {X})); + if (IsEven) { + // Even function: f(-x) = f(x) + return CallInst; + } + // Odd function: f(-x) = -f(x) + return B.CreateFNeg(CallInst); + } + + // Even function: f(abs(x)) = f(x), f(copysign(x, y)) = f(x) + if (IsEven && (match(Src, m_FAbs(m_Value(X))) || + match(Src, m_CopySign(m_Value(X), m_Value())))) { + IRBuilderBase::FastMathFlagGuard Guard(B); + B.setFastMathFlags(CI->getFastMathFlags()); + + auto *CallInst = copyFlags(*CI, 
B.CreateCall(CI->getCalledFunction(), {X})); + return CallInst; + } + + return nullptr; +} + +Value *LibCallSimplifier::optimizeSymmetric(CallInst *CI, LibFunc Func, + IRBuilderBase &B) { + switch (Func) { + case LibFunc_cos: + case LibFunc_cosf: + case LibFunc_cosl: + return optimizeSymmetricCall(CI, /*IsEven*/ true, B); + + case LibFunc_sin: + case LibFunc_sinf: + case LibFunc_sinl: + + case LibFunc_tan: + case LibFunc_tanf: + case LibFunc_tanl: + + case LibFunc_erf: + case LibFunc_erff: + case LibFunc_erfl: + return optimizeSymmetricCall(CI, /*IsEven*/ false, B); + + default: + return nullptr; + } +} + Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, bool IsSin, IRBuilderBase &B) { // Make sure the prototype is as expected, otherwise the rest of the // function is probably invalid and likely to abort. @@ -3678,7 +3692,7 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, if (CI->isStrictFP()) return nullptr; - if (Value *V = optimizeTrigReflections(CI, Func, Builder)) + if (Value *V = optimizeSymmetric(CI, Func, Builder)) return V; switch (Func) { diff --git a/llvm/test/Transforms/InstCombine/math-odd-even-parity.ll b/llvm/test/Transforms/InstCombine/math-odd-even-parity.ll new file mode 100644 index 0000000000000..4c8cf7a99d895 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/math-odd-even-parity.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +declare double @erf(double) +declare double @cos(double) +declare double @fabs(double) + +declare void @use(double) nounwind + +; Check odd parity: -erf(-x) == erf(x) +define double @test_erf(double %x) { +; CHECK-LABEL: define double @test_erf( +; CHECK-SAME: double [[X:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = tail call reassoc double @erf(double [[X]]) +; CHECK-NEXT: ret double [[RES]] +; + %neg_x = fneg double %x + %res = tail call reassoc double @erf(double %neg_x) 
+ %neg_res = fneg double %res + ret double %neg_res +} + +; Check even parity: cos(fabs(x)) == cos(x) +define double @test_cos_fabs(double %x) { +; CHECK-LABEL: define double @test_cos_fabs( +; CHECK-SAME: double [[X:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = tail call reassoc double @cos(double [[X]]) +; CHECK-NEXT: ret double [[RES]] +; + %fabs_res = call double @fabs(double %x) + %res = tail call reassoc double @cos(double %fabs_res) + ret double %res +} + +; Do nothing in case of multi-use +define double @test_erf_multi_use(double %x) { +; CHECK-LABEL: define double @test_erf_multi_use( +; CHECK-SAME: double [[X:%.*]]) { +; CHECK-NEXT: [[NEG_X:%.*]] = fneg double [[X]] +; CHECK-NEXT: call void @use(double [[NEG_X]]) +; CHECK-NEXT: [[RES:%.*]] = call double @erf(double [[NEG_X]]) +; CHECK-NEXT: [[NEG_RES:%.*]] = fneg double [[RES]] +; CHECK-NEXT: ret double [[NEG_RES]] +; + %neg_x = fneg double %x + call void @use(double %neg_x) + %res = call double @erf(double %neg_x) + %neg_res = fneg double %res + ret double %neg_res +} diff --git a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml index 23d3482fb89a7..61caf727b0f96 100644 --- a/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml +++ b/llvm/test/tools/llvm-tli-checker/ps4-tli-check.yaml @@ -47,10 +47,10 @@ ## the exact count first; the two directives should add up to that. ## Yes, this means additions to TLI will fail this test, but the argument ## to -COUNT can't be an expression. -# AVAIL: TLI knows 476 symbols, 243 available +# AVAIL: TLI knows 479 symbols, 243 available # AVAIL-COUNT-243: {{^}} available # AVAIL-NOT: {{^}} available -# UNAVAIL-COUNT-233: not available +# UNAVAIL-COUNT-236: not available # UNAVAIL-NOT: not available ## This is a large file so it's worth telling lit to stop here. 
diff --git a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp index 34b06fe480f36..8e3fe3b44a84a 100644 --- a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp +++ b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp @@ -264,6 +264,9 @@ TEST_F(TargetLibraryInfoTest, ValidProto) { "declare double @pow(double, double)\n" "declare float @powf(float, float)\n" "declare x86_fp80 @powl(x86_fp80, x86_fp80)\n" + "declare double @erf(double)\n" + "declare float @erff(float)\n" + "declare x86_fp80 @erfl(x86_fp80)\n" "declare i32 @printf(i8*, ...)\n" "declare i32 @putc(i32, %struct*)\n" "declare i32 @putc_unlocked(i32, %struct*)\n" From d877ab1b99496feb48db1158963abd130e2aee5c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 22 Feb 2024 17:57:15 +0000 Subject: [PATCH 232/546] MachineInstr - update TargetRegisterInfo arguments comments. NFC. "TargetRegisterInfo is passed" -> "TargetRegisterInfo is non-null" - matches the term in the rest of the header. --- llvm/include/llvm/CodeGen/MachineInstr.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h index bd72ac23fc9c0..5bb4a67555222 100644 --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -1438,7 +1438,7 @@ class MachineInstr unsigned getBundleSize() const; /// Return true if the MachineInstr reads the specified register. - /// If TargetRegisterInfo is passed, then it also checks if there + /// If TargetRegisterInfo is non-null, then it also checks if there /// is a read of a super-register. /// This does not count partial redefines of virtual registers as reads: /// %reg1024:6 = OP. @@ -1461,7 +1461,7 @@ class MachineInstr SmallVectorImpl *Ops = nullptr) const; /// Return true if the MachineInstr kills the specified register. 
- /// If TargetRegisterInfo is passed, then it also checks if there is + /// If TargetRegisterInfo is non-null, then it also checks if there is /// a kill of a super-register. bool killsRegister(Register Reg, const TargetRegisterInfo *TRI = nullptr) const { @@ -1469,7 +1469,7 @@ class MachineInstr } /// Return true if the MachineInstr fully defines the specified register. - /// If TargetRegisterInfo is passed, then it also checks + /// If TargetRegisterInfo is non-null, then it also checks /// if there is a def of a super-register. /// NOTE: It's ignoring subreg indices on virtual registers. bool definesRegister(Register Reg, @@ -1486,7 +1486,7 @@ class MachineInstr } /// Returns true if the register is dead in this machine instruction. - /// If TargetRegisterInfo is passed, then it also checks + /// If TargetRegisterInfo is non-null, then it also checks /// if there is a dead def of a super-register. bool registerDefIsDead(Register Reg, const TargetRegisterInfo *TRI = nullptr) const { From cf9201cfdbc10f4606fc4ca22bf1ccaf5ee841b3 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Sat, 24 Feb 2024 17:01:54 +0100 Subject: [PATCH 233/546] [llvm-ar] Use COFF archive format for COFF targets. (#82642) Detect COFF files by default and allow specifying it with --format argument. This is important for ARM64EC, which uses a separated symbol map for EC symbols. Since K_COFF is mostly compatible with K_GNU, this shouldn't really make a difference for other targets. 
--- llvm/include/llvm/Object/Archive.h | 1 + llvm/lib/Object/Archive.cpp | 15 +++-- llvm/lib/Object/ArchiveWriter.cpp | 21 +++--- llvm/test/tools/llvm-ar/coff-symtab.test | 85 ++++++++++++++++++++++++ llvm/tools/llvm-ar/llvm-ar.cpp | 7 +- 5 files changed, 114 insertions(+), 15 deletions(-) create mode 100644 llvm/test/tools/llvm-ar/coff-symtab.test diff --git a/llvm/include/llvm/Object/Archive.h b/llvm/include/llvm/Object/Archive.h index 3dd99a46507a2..66f07939b1105 100644 --- a/llvm/include/llvm/Object/Archive.h +++ b/llvm/include/llvm/Object/Archive.h @@ -339,6 +339,7 @@ class Archive : public Binary { Kind kind() const { return (Kind)Format; } bool isThin() const { return IsThin; } static object::Archive::Kind getDefaultKindForHost(); + static object::Archive::Kind getDefaultKindForTriple(Triple &T); child_iterator child_begin(Error &Err, bool SkipInternal = true) const; child_iterator child_end() const; diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp index e447e5b23316f..d3fdcd9ee8811 100644 --- a/llvm/lib/Object/Archive.cpp +++ b/llvm/lib/Object/Archive.cpp @@ -969,12 +969,19 @@ Archive::Archive(MemoryBufferRef Source, Error &Err) Err = Error::success(); } +object::Archive::Kind Archive::getDefaultKindForTriple(Triple &T) { + if (T.isOSDarwin()) + return object::Archive::K_DARWIN; + if (T.isOSAIX()) + return object::Archive::K_AIXBIG; + if (T.isOSWindows()) + return object::Archive::K_COFF; + return object::Archive::K_GNU; +} + object::Archive::Kind Archive::getDefaultKindForHost() { Triple HostTriple(sys::getProcessTriple()); - return HostTriple.isOSDarwin() - ? object::Archive::K_DARWIN - : (HostTriple.isOSAIX() ? 
object::Archive::K_AIXBIG - : object::Archive::K_GNU); + return getDefaultKindForTriple(HostTriple); } Archive::child_iterator Archive::child_begin(Error &Err, diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index 155926a8c5949..02f72521c8b54 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -62,12 +62,16 @@ object::Archive::Kind NewArchiveMember::detectKindFromObject() const { Expected> OptionalObject = object::ObjectFile::createObjectFile(MemBufferRef); - if (OptionalObject) - return isa(**OptionalObject) - ? object::Archive::K_DARWIN - : (isa(**OptionalObject) - ? object::Archive::K_AIXBIG - : object::Archive::K_GNU); + if (OptionalObject) { + if (isa(**OptionalObject)) + return object::Archive::K_DARWIN; + if (isa(**OptionalObject)) + return object::Archive::K_AIXBIG; + if (isa(**OptionalObject) || + isa(**OptionalObject)) + return object::Archive::K_COFF; + return object::Archive::K_GNU; + } // Squelch the error in case we had a non-object file. consumeError(OptionalObject.takeError()); @@ -80,10 +84,7 @@ object::Archive::Kind NewArchiveMember::detectKindFromObject() const { MemBufferRef, file_magic::bitcode, &Context)) { auto &IRObject = cast(**ObjOrErr); auto TargetTriple = Triple(IRObject.getTargetTriple()); - return TargetTriple.isOSDarwin() - ? object::Archive::K_DARWIN - : (TargetTriple.isOSAIX() ? object::Archive::K_AIXBIG - : object::Archive::K_GNU); + return object::Archive::getDefaultKindForTriple(TargetTriple); } else { // Squelch the error in case this was not a SymbolicFile. consumeError(ObjOrErr.takeError()); diff --git a/llvm/test/tools/llvm-ar/coff-symtab.test b/llvm/test/tools/llvm-ar/coff-symtab.test new file mode 100644 index 0000000000000..50d08fba3b02f --- /dev/null +++ b/llvm/test/tools/llvm-ar/coff-symtab.test @@ -0,0 +1,85 @@ +Verify that llvm-ar uses COFF archive format by ensuring that archive map is sorted. 
+ +RUN: rm -rf %t.dif && split-file %s %t.dir && cd %t.dir + +RUN: yaml2obj coff-symtab.yaml -o coff-symtab.obj +RUN: llvm-ar crs out.a coff-symtab.obj +RUN: llvm-nm --print-armap out.a | FileCheck %s + +RUN: llvm-as coff-symtab.ll -o coff-symtab.bc +RUN: llvm-ar crs out2.a coff-symtab.bc +RUN: llvm-nm --print-armap out2.a | FileCheck %s + +RUN: yaml2obj elf.yaml -o coff-symtab.o +RUN: llvm-ar crs --format coff out3.a coff-symtab.o +RUN: llvm-nm --print-armap out3.a | FileCheck %s + +CHECK: Archive map +CHECK-NEXT: a in coff-symtab +CHECK-NEXT: b in coff-symtab +CHECK-NEXT: c in coff-symtab +CHECK-EMPTY: + +#--- coff-symtab.yaml +--- !COFF +header: + Machine: IMAGE_FILE_MACHINE_UNKNOWN + Characteristics: [ ] +sections: + - Name: .text + Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ] + Alignment: 4 + SectionData: '' +symbols: + - Name: b + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_FUNCTION + StorageClass: IMAGE_SYM_CLASS_EXTERNAL + - Name: c + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_FUNCTION + StorageClass: IMAGE_SYM_CLASS_EXTERNAL + - Name: a + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_FUNCTION + StorageClass: IMAGE_SYM_CLASS_EXTERNAL +... + + +#--- coff-symtab.ll +target triple = "x86_64-unknown-windows-msvc" + +define void @b() { ret void } +define void @c() { ret void } +define void @a() { ret void } + +#--- elf.yaml +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data : ELFDATA2LSB + Type: ET_REL + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + AddressAlign: 0x0000000000000004 + Content: '' +Symbols: + - Name: b + Binding: STB_GLOBAL + Section: .text + - Name: c + Binding: STB_GLOBAL + Section: .text + - Name: a + Binding: STB_GLOBAL + Section: .text +... 
diff --git a/llvm/tools/llvm-ar/llvm-ar.cpp b/llvm/tools/llvm-ar/llvm-ar.cpp index c8800303bc1e4..c58a85c695dac 100644 --- a/llvm/tools/llvm-ar/llvm-ar.cpp +++ b/llvm/tools/llvm-ar/llvm-ar.cpp @@ -82,6 +82,7 @@ static void printArHelp(StringRef ToolName) { =darwin - darwin =bsd - bsd =bigarchive - big archive (AIX OS) + =coff - coff --plugin= - ignored for compatibility -h --help - display this help and exit --output - the directory to extract archive members to @@ -193,7 +194,7 @@ static SmallVector PositionalArgs; static bool MRI; namespace { -enum Format { Default, GNU, BSD, DARWIN, BIGARCHIVE, Unknown }; +enum Format { Default, GNU, COFF, BSD, DARWIN, BIGARCHIVE, Unknown }; } static Format FormatType = Default; @@ -1044,6 +1045,9 @@ static void performWriteOperation(ArchiveOperation Operation, case GNU: Kind = object::Archive::K_GNU; break; + case COFF: + Kind = object::Archive::K_COFF; + break; case BSD: if (Thin) fail("only the gnu format has a thin mode"); @@ -1376,6 +1380,7 @@ static int ar_main(int argc, char **argv) { .Case("darwin", DARWIN) .Case("bsd", BSD) .Case("bigarchive", BIGARCHIVE) + .Case("coff", COFF) .Default(Unknown); if (FormatType == Unknown) fail(std::string("Invalid format ") + Match); From 8a5aa103c52265337b43330e55e05567046f3ede Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Sat, 24 Feb 2024 17:43:59 +0100 Subject: [PATCH 234/546] Revert "[llvm-ar] Use COFF archive format for COFF targets." (#82889) Reverts llvm/llvm-project#82642 for lld/test/ELF/invalid/Output/data-encoding.test.tmp.a failures on Windows. 
--- llvm/include/llvm/Object/Archive.h | 1 - llvm/lib/Object/Archive.cpp | 15 ++--- llvm/lib/Object/ArchiveWriter.cpp | 21 +++--- llvm/test/tools/llvm-ar/coff-symtab.test | 85 ------------------------ llvm/tools/llvm-ar/llvm-ar.cpp | 7 +- 5 files changed, 15 insertions(+), 114 deletions(-) delete mode 100644 llvm/test/tools/llvm-ar/coff-symtab.test diff --git a/llvm/include/llvm/Object/Archive.h b/llvm/include/llvm/Object/Archive.h index 66f07939b1105..3dd99a46507a2 100644 --- a/llvm/include/llvm/Object/Archive.h +++ b/llvm/include/llvm/Object/Archive.h @@ -339,7 +339,6 @@ class Archive : public Binary { Kind kind() const { return (Kind)Format; } bool isThin() const { return IsThin; } static object::Archive::Kind getDefaultKindForHost(); - static object::Archive::Kind getDefaultKindForTriple(Triple &T); child_iterator child_begin(Error &Err, bool SkipInternal = true) const; child_iterator child_end() const; diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp index d3fdcd9ee8811..e447e5b23316f 100644 --- a/llvm/lib/Object/Archive.cpp +++ b/llvm/lib/Object/Archive.cpp @@ -969,19 +969,12 @@ Archive::Archive(MemoryBufferRef Source, Error &Err) Err = Error::success(); } -object::Archive::Kind Archive::getDefaultKindForTriple(Triple &T) { - if (T.isOSDarwin()) - return object::Archive::K_DARWIN; - if (T.isOSAIX()) - return object::Archive::K_AIXBIG; - if (T.isOSWindows()) - return object::Archive::K_COFF; - return object::Archive::K_GNU; -} - object::Archive::Kind Archive::getDefaultKindForHost() { Triple HostTriple(sys::getProcessTriple()); - return getDefaultKindForTriple(HostTriple); + return HostTriple.isOSDarwin() + ? object::Archive::K_DARWIN + : (HostTriple.isOSAIX() ? 
object::Archive::K_AIXBIG + : object::Archive::K_GNU); } Archive::child_iterator Archive::child_begin(Error &Err, diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index 02f72521c8b54..155926a8c5949 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -62,16 +62,12 @@ object::Archive::Kind NewArchiveMember::detectKindFromObject() const { Expected> OptionalObject = object::ObjectFile::createObjectFile(MemBufferRef); - if (OptionalObject) { - if (isa(**OptionalObject)) - return object::Archive::K_DARWIN; - if (isa(**OptionalObject)) - return object::Archive::K_AIXBIG; - if (isa(**OptionalObject) || - isa(**OptionalObject)) - return object::Archive::K_COFF; - return object::Archive::K_GNU; - } + if (OptionalObject) + return isa(**OptionalObject) + ? object::Archive::K_DARWIN + : (isa(**OptionalObject) + ? object::Archive::K_AIXBIG + : object::Archive::K_GNU); // Squelch the error in case we had a non-object file. consumeError(OptionalObject.takeError()); @@ -84,7 +80,10 @@ object::Archive::Kind NewArchiveMember::detectKindFromObject() const { MemBufferRef, file_magic::bitcode, &Context)) { auto &IRObject = cast(**ObjOrErr); auto TargetTriple = Triple(IRObject.getTargetTriple()); - return object::Archive::getDefaultKindForTriple(TargetTriple); + return TargetTriple.isOSDarwin() + ? object::Archive::K_DARWIN + : (TargetTriple.isOSAIX() ? object::Archive::K_AIXBIG + : object::Archive::K_GNU); } else { // Squelch the error in case this was not a SymbolicFile. consumeError(ObjOrErr.takeError()); diff --git a/llvm/test/tools/llvm-ar/coff-symtab.test b/llvm/test/tools/llvm-ar/coff-symtab.test deleted file mode 100644 index 50d08fba3b02f..0000000000000 --- a/llvm/test/tools/llvm-ar/coff-symtab.test +++ /dev/null @@ -1,85 +0,0 @@ -Verify that llvm-ar uses COFF archive format by ensuring that archive map is sorted. 
- -RUN: rm -rf %t.dif && split-file %s %t.dir && cd %t.dir - -RUN: yaml2obj coff-symtab.yaml -o coff-symtab.obj -RUN: llvm-ar crs out.a coff-symtab.obj -RUN: llvm-nm --print-armap out.a | FileCheck %s - -RUN: llvm-as coff-symtab.ll -o coff-symtab.bc -RUN: llvm-ar crs out2.a coff-symtab.bc -RUN: llvm-nm --print-armap out2.a | FileCheck %s - -RUN: yaml2obj elf.yaml -o coff-symtab.o -RUN: llvm-ar crs --format coff out3.a coff-symtab.o -RUN: llvm-nm --print-armap out3.a | FileCheck %s - -CHECK: Archive map -CHECK-NEXT: a in coff-symtab -CHECK-NEXT: b in coff-symtab -CHECK-NEXT: c in coff-symtab -CHECK-EMPTY: - -#--- coff-symtab.yaml ---- !COFF -header: - Machine: IMAGE_FILE_MACHINE_UNKNOWN - Characteristics: [ ] -sections: - - Name: .text - Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ] - Alignment: 4 - SectionData: '' -symbols: - - Name: b - Value: 0 - SectionNumber: 1 - SimpleType: IMAGE_SYM_TYPE_NULL - ComplexType: IMAGE_SYM_DTYPE_FUNCTION - StorageClass: IMAGE_SYM_CLASS_EXTERNAL - - Name: c - Value: 0 - SectionNumber: 1 - SimpleType: IMAGE_SYM_TYPE_NULL - ComplexType: IMAGE_SYM_DTYPE_FUNCTION - StorageClass: IMAGE_SYM_CLASS_EXTERNAL - - Name: a - Value: 0 - SectionNumber: 1 - SimpleType: IMAGE_SYM_TYPE_NULL - ComplexType: IMAGE_SYM_DTYPE_FUNCTION - StorageClass: IMAGE_SYM_CLASS_EXTERNAL -... - - -#--- coff-symtab.ll -target triple = "x86_64-unknown-windows-msvc" - -define void @b() { ret void } -define void @c() { ret void } -define void @a() { ret void } - -#--- elf.yaml ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data : ELFDATA2LSB - Type: ET_REL - Machine: EM_X86_64 -Sections: - - Name: .text - Type: SHT_PROGBITS - Flags: [ SHF_ALLOC, SHF_EXECINSTR ] - AddressAlign: 0x0000000000000004 - Content: '' -Symbols: - - Name: b - Binding: STB_GLOBAL - Section: .text - - Name: c - Binding: STB_GLOBAL - Section: .text - - Name: a - Binding: STB_GLOBAL - Section: .text -... 
diff --git a/llvm/tools/llvm-ar/llvm-ar.cpp b/llvm/tools/llvm-ar/llvm-ar.cpp index c58a85c695dac..c8800303bc1e4 100644 --- a/llvm/tools/llvm-ar/llvm-ar.cpp +++ b/llvm/tools/llvm-ar/llvm-ar.cpp @@ -82,7 +82,6 @@ static void printArHelp(StringRef ToolName) { =darwin - darwin =bsd - bsd =bigarchive - big archive (AIX OS) - =coff - coff --plugin= - ignored for compatibility -h --help - display this help and exit --output - the directory to extract archive members to @@ -194,7 +193,7 @@ static SmallVector PositionalArgs; static bool MRI; namespace { -enum Format { Default, GNU, COFF, BSD, DARWIN, BIGARCHIVE, Unknown }; +enum Format { Default, GNU, BSD, DARWIN, BIGARCHIVE, Unknown }; } static Format FormatType = Default; @@ -1045,9 +1044,6 @@ static void performWriteOperation(ArchiveOperation Operation, case GNU: Kind = object::Archive::K_GNU; break; - case COFF: - Kind = object::Archive::K_COFF; - break; case BSD: if (Thin) fail("only the gnu format has a thin mode"); @@ -1380,7 +1376,6 @@ static int ar_main(int argc, char **argv) { .Case("darwin", DARWIN) .Case("bsd", BSD) .Case("bigarchive", BIGARCHIVE) - .Case("coff", COFF) .Default(Unknown); if (FormatType == Unknown) fail(std::string("Invalid format ") + Match); From 8e22fffc85b36784146041499b716cec74285660 Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Sat, 24 Feb 2024 12:24:13 -0500 Subject: [PATCH 235/546] [clang] Remove trailing whitespace Fix commit 66f6929fec3ae --- clang/docs/HLSL/ExpectedDifferences.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/docs/HLSL/ExpectedDifferences.rst b/clang/docs/HLSL/ExpectedDifferences.rst index 60001b22dc792..d1b6010f10f43 100644 --- a/clang/docs/HLSL/ExpectedDifferences.rst +++ b/clang/docs/HLSL/ExpectedDifferences.rst @@ -93,7 +93,7 @@ behavior between Clang and DXC. Some examples include: fma(X, Y, Z); // DXC: Fails to resolve no known conversion from float to double. // Clang: Resolves to fma(double,double,double). 
#endif - + double D = dot(A, B); // DXC: Resolves to dot(double3, double3), fails DXIL Validation. // FXC: Expands to compute double dot product with fmul/fadd // Clang: Resolves to dot(float3, float3), emits conversion warnings. @@ -102,7 +102,7 @@ behavior between Clang and DXC. Some examples include: .. note:: - In Clang, a conscious decision was made to exclude the ``dot(vector, vector)`` + In Clang, a conscious decision was made to exclude the ``dot(vector, vector)`` overload and allow overload resolution to resolve the ``vector`` overload. This approach provides ``-Wconversion`` diagnostic notifying the user of the conversion rather than silently altering From 1e98d4883d78ac2c65b87e24694e8b2f1dc9f02d Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Sat, 24 Feb 2024 14:35:39 -0500 Subject: [PATCH 236/546] [mlir][linalg] NFC: Use tablegen macro for pass constructors (#82892) This uses the tablegen macros for generating pass constructors, exposing pass options for fold-unit-extent-dims and linalg-detensorize. Additionally aligns some of the pass namings to their text counterpart. 
This includes an API change: createLinalgGeneralizationPass -> createLinalgGeneralizeNamedOpsPass --- mlir/include/mlir/Dialect/Linalg/Passes.h | 38 +------- mlir/include/mlir/Dialect/Linalg/Passes.td | 87 ++++++++----------- .../Dialect/Linalg/Transforms/Bufferize.cpp | 10 +-- .../Dialect/Linalg/Transforms/Detensorize.cpp | 10 +-- .../Linalg/Transforms/DropUnitDims.cpp | 11 ++- .../Linalg/Transforms/ElementwiseOpFusion.cpp | 11 +-- .../Linalg/Transforms/ElementwiseToLinalg.cpp | 10 +-- .../Linalg/Transforms/Generalization.cpp | 15 ++-- .../Transforms/InlineScalarOperands.cpp | 10 +-- mlir/lib/Dialect/Linalg/Transforms/Loops.cpp | 30 +++---- .../Linalg/Transforms/NamedOpConversions.cpp | 13 ++- .../Pipelines/SparseTensorPipelines.cpp | 2 +- 12 files changed, 89 insertions(+), 158 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.h b/mlir/include/mlir/Dialect/Linalg/Passes.h index 5f46affe592a2..d36d1e70f0b14 100644 --- a/mlir/include/mlir/Dialect/Linalg/Passes.h +++ b/mlir/include/mlir/Dialect/Linalg/Passes.h @@ -27,43 +27,7 @@ struct OneShotBufferizationOptions; } // namespace bufferization #define GEN_PASS_DECL -#include "mlir/Dialect/Linalg/Passes.h.inc" - -std::unique_ptr createConvertElementwiseToLinalgPass(); - -std::unique_ptr createLinalgFoldUnitExtentDimsPass(); - -std::unique_ptr createLinalgElementwiseOpFusionPass(); -std::unique_ptr createFoldReshapeOpsByLinearizationPass(); - -std::unique_ptr createLinalgNamedOpConversionPass(); - -std::unique_ptr createLinalgInlineScalarOperandsPass(); - -/// Create a pass to convert Linalg operations to scf.for loops and -/// memref.load/memref.store accesses. -std::unique_ptr createConvertLinalgToLoopsPass(); - -/// Create a pass to convert Linalg operations to scf.parallel loops and -/// memref.load/memref.store accesses. -std::unique_ptr createConvertLinalgToParallelLoopsPass(); - -/// Create a pass to convert Linalg operations to affine.for loops and -/// affine_load/affine_store accesses. 
-/// Placeholder for now, this is NYI. -std::unique_ptr createConvertLinalgToAffineLoopsPass(); - -/// Create a pass to convert Linalg operations which work on tensors to use -/// buffers instead. -std::unique_ptr createLinalgBufferizePass(); - -/// Create a pass to convert named Linalg operations to Linalg generic -/// operations. -std::unique_ptr createLinalgGeneralizationPass(); - -/// Create a pass to convert Linalg operations to equivalent operations that -/// work on primitive types, if possible. -std::unique_ptr createLinalgDetensorizePass(); +#include "mlir/Dialect/Linalg/Passes.h.inc" // IWYU pragma: keep //===----------------------------------------------------------------------===// // Registration diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td index cca50e21d5ce0..85f11c66d29a7 100644 --- a/mlir/include/mlir/Dialect/Linalg/Passes.td +++ b/mlir/include/mlir/Dialect/Linalg/Passes.td @@ -11,7 +11,7 @@ include "mlir/Pass/PassBase.td" -def ConvertElementwiseToLinalg : Pass<"convert-elementwise-to-linalg", ""> { +def ConvertElementwiseToLinalgPass : Pass<"convert-elementwise-to-linalg", ""> { let summary = "Convert ElementwiseMappable ops to linalg"; let description = [{ Convert ops with the `ElementwiseMappable` trait to linalg parallel loops. @@ -20,54 +20,17 @@ def ConvertElementwiseToLinalg : Pass<"convert-elementwise-to-linalg", ""> { run on op which contains linalg ops (most commonly a FunctionOpInterface op). 
}]; - let constructor = "mlir::createConvertElementwiseToLinalgPass()"; let dependentDialects = ["linalg::LinalgDialect", "memref::MemRefDialect"]; } -def LinalgFoldUnitExtentDims : Pass<"linalg-fold-unit-extent-dims", ""> { - let summary = "Remove unit-extent dimension in Linalg ops on tensors"; - let constructor = "mlir::createLinalgFoldUnitExtentDimsPass()"; - let options = [ - Option<"useRankReducingSlices", "use-rank-reducing-slices", "bool", - /*default=*/"false", - "Generate rank-reducing slices instead of reassociative reshapes"> - ]; - let dependentDialects = [ - "linalg::LinalgDialect", "affine::AffineDialect", "memref::MemRefDialect" - ]; -} - -def LinalgElementwiseOpFusion : Pass<"linalg-fuse-elementwise-ops"> { - let summary = "Fuse elementwise operations on tensors"; - let constructor = "mlir::createLinalgElementwiseOpFusionPass()"; - let dependentDialects = [ - "affine::AffineDialect", "linalg::LinalgDialect", "memref::MemRefDialect" - ]; -} - -def LinalgNamedOpConversion: Pass<"linalg-named-op-conversion"> { - let summary = "Convert from one named linalg op to another."; - let constructor = "mlir::createLinalgNamedOpConversionPass()"; - let dependentDialects = ["linalg::LinalgDialect", "tensor::TensorDialect"]; -} - -def LinalgInlineScalarOperands : Pass<"linalg-inline-scalar-operands"> { - let summary = "Inline scalar operands into linalg generic ops"; - let constructor = "mlir::createLinalgInlineScalarOperandsPass()"; - let dependentDialects = [ - "linalg::LinalgDialect" - ]; -} - -def LinalgLowerToAffineLoops : Pass<"convert-linalg-to-affine-loops"> { +def ConvertLinalgToAffineLoopsPass : Pass<"convert-linalg-to-affine-loops"> { let summary = "Lower the operations from the linalg dialect into affine " "loops"; - let constructor = "mlir::createConvertLinalgToAffineLoopsPass()"; let dependentDialects = [ "affine::AffineDialect", "linalg::LinalgDialect", "memref::MemRefDialect"]; } -def LinalgLowerToLoops : Pass<"convert-linalg-to-loops"> { +def 
ConvertLinalgToLoopsPass : Pass<"convert-linalg-to-loops"> { let summary = "Lower the operations from the linalg dialect into loops"; let description = [{ Lowers the `linalg` ops to loop nests using `scf.for`. @@ -76,7 +39,6 @@ def LinalgLowerToLoops : Pass<"convert-linalg-to-loops"> { i.e., tensor operands and results must be converted to memrefs via bufferization. }]; - let constructor = "mlir::createConvertLinalgToLoopsPass()"; let dependentDialects = [ "linalg::LinalgDialect", "scf::SCFDialect", @@ -84,11 +46,10 @@ def LinalgLowerToLoops : Pass<"convert-linalg-to-loops"> { ]; } -def LinalgLowerToParallelLoops +def ConvertLinalgToParallelLoopsPass : Pass<"convert-linalg-to-parallel-loops"> { let summary = "Lower the operations from the linalg dialect into parallel " "loops"; - let constructor = "mlir::createConvertLinalgToParallelLoopsPass()"; let dependentDialects = [ "affine::AffineDialect", "linalg::LinalgDialect", @@ -97,9 +58,39 @@ def LinalgLowerToParallelLoops ]; } -def LinalgBufferize : Pass<"linalg-bufferize"> { +def LinalgFoldUnitExtentDimsPass : Pass<"linalg-fold-unit-extent-dims", ""> { + let summary = "Remove unit-extent dimension in Linalg ops on tensors"; + let options = [ + Option<"useRankReducingSlices", "use-rank-reducing-slices", "bool", + /*default=*/"false", + "Generate rank-reducing slices instead of reassociative reshapes"> + ]; + let dependentDialects = [ + "linalg::LinalgDialect", "affine::AffineDialect", "memref::MemRefDialect" + ]; +} + +def LinalgElementwiseOpFusionPass : Pass<"linalg-fuse-elementwise-ops"> { + let summary = "Fuse elementwise operations on tensors"; + let dependentDialects = [ + "affine::AffineDialect", "linalg::LinalgDialect", "memref::MemRefDialect" + ]; +} + +def LinalgNamedOpConversionPass: Pass<"linalg-named-op-conversion"> { + let summary = "Convert from one named linalg op to another."; + let dependentDialects = ["linalg::LinalgDialect", "tensor::TensorDialect"]; +} + +def LinalgInlineScalarOperandsPass : 
Pass<"linalg-inline-scalar-operands"> { + let summary = "Inline scalar operands into linalg generic ops"; + let dependentDialects = [ + "linalg::LinalgDialect" + ]; +} + +def LinalgBufferizePass : Pass<"linalg-bufferize"> { let summary = "Bufferize the linalg dialect"; - let constructor = "mlir::createLinalgBufferizePass()"; let dependentDialects = [ "affine::AffineDialect", "bufferization::BufferizationDialect", @@ -108,15 +99,13 @@ def LinalgBufferize : Pass<"linalg-bufferize"> { ]; } -def LinalgGeneralization : Pass<"linalg-generalize-named-ops"> { +def LinalgGeneralizeNamedOpsPass : Pass<"linalg-generalize-named-ops"> { let summary = "Convert named ops into generic ops"; - let constructor = "mlir::createLinalgGeneralizationPass()"; let dependentDialects = ["linalg::LinalgDialect"]; } -def LinalgDetensorize : InterfacePass<"linalg-detensorize", "FunctionOpInterface"> { +def LinalgDetensorizePass : InterfacePass<"linalg-detensorize", "FunctionOpInterface"> { let summary = "Detensorize linalg ops"; - let constructor = "mlir::createLinalgDetensorizePass()"; let dependentDialects = []; let description = [{ diff --git a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp index 73c4d4779750d..8812ca14ba610 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp @@ -21,7 +21,7 @@ #include "mlir/Pass/Pass.h" namespace mlir { -#define GEN_PASS_DEF_LINALGBUFFERIZE +#define GEN_PASS_DEF_LINALGBUFFERIZEPASS #include "mlir/Dialect/Linalg/Passes.h.inc" } // namespace mlir @@ -32,7 +32,9 @@ namespace { /// Converts Linalg operations that work on tensor-type operands or results to /// work on buffers. 
struct LinalgBufferizePass - : public impl::LinalgBufferizeBase { + : public impl::LinalgBufferizePassBase { + using impl::LinalgBufferizePassBase< + LinalgBufferizePass>::LinalgBufferizePassBase; void runOnOperation() override { BufferizationOptions options = getPartialBufferizationOptions(); options.opFilter.allowDialect(); @@ -48,7 +50,3 @@ struct LinalgBufferizePass } }; } // namespace - -std::unique_ptr mlir::createLinalgBufferizePass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp index 98cd0444760ec..22968096a6891 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Detensorize.cpp @@ -21,7 +21,7 @@ #include namespace mlir { -#define GEN_PASS_DEF_LINALGDETENSORIZE +#define GEN_PASS_DEF_LINALGDETENSORIZEPASS #include "mlir/Dialect/Linalg/Passes.h.inc" } // namespace mlir @@ -164,7 +164,9 @@ class DetensorizeTypeConverter : public TypeConverter { /// @see LinalgDetensorize in Linalg/Passes.td for more details. 
struct LinalgDetensorize - : public impl::LinalgDetensorizeBase { + : public impl::LinalgDetensorizePassBase { + using impl::LinalgDetensorizePassBase< + LinalgDetensorize>::LinalgDetensorizePassBase; LinalgDetensorize() = default; class CostModel { @@ -576,7 +578,3 @@ struct LinalgDetensorize } }; } // namespace - -std::unique_ptr mlir::createLinalgDetensorizePass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp index c46e3694b70ec..45cab81be4f5f 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @@ -33,7 +33,7 @@ #include "llvm/Support/Debug.h" namespace mlir { -#define GEN_PASS_DEF_LINALGFOLDUNITEXTENTDIMS +#define GEN_PASS_DEF_LINALGFOLDUNITEXTENTDIMSPASS #include "mlir/Dialect/Linalg/Passes.h.inc" } // namespace mlir @@ -689,7 +689,10 @@ void mlir::linalg::populateMoveInitOperandsToInputPattern( namespace { /// Pass that removes unit-extent dims within generic ops. 
struct LinalgFoldUnitExtentDimsPass - : public impl::LinalgFoldUnitExtentDimsBase { + : public impl::LinalgFoldUnitExtentDimsPassBase< + LinalgFoldUnitExtentDimsPass> { + using impl::LinalgFoldUnitExtentDimsPassBase< + LinalgFoldUnitExtentDimsPass>::LinalgFoldUnitExtentDimsPassBase; void runOnOperation() override { Operation *op = getOperation(); MLIRContext *context = op->getContext(); @@ -705,7 +708,3 @@ struct LinalgFoldUnitExtentDimsPass } }; } // namespace - -std::unique_ptr mlir::createLinalgFoldUnitExtentDimsPass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp index 0d8d670904f2a..4977940cfbd79 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @@ -27,8 +27,7 @@ #include namespace mlir { -#define GEN_PASS_DEF_LINALGFOLDUNITEXTENTDIMS -#define GEN_PASS_DEF_LINALGELEMENTWISEOPFUSION +#define GEN_PASS_DEF_LINALGELEMENTWISEOPFUSIONPASS #include "mlir/Dialect/Linalg/Passes.h.inc" } // namespace mlir @@ -1927,8 +1926,10 @@ namespace { // favor of test passes that check the functionality of each of the patterns // added here individually. 
struct LinalgElementwiseOpFusionPass - : public impl::LinalgElementwiseOpFusionBase< + : public impl::LinalgElementwiseOpFusionPassBase< LinalgElementwiseOpFusionPass> { + using impl::LinalgElementwiseOpFusionPassBase< + LinalgElementwiseOpFusionPass>::LinalgElementwiseOpFusionPassBase; void runOnOperation() override { Operation *op = getOperation(); MLIRContext *context = op->getContext(); @@ -1963,7 +1964,3 @@ struct LinalgElementwiseOpFusionPass }; } // namespace - -std::unique_ptr mlir::createLinalgElementwiseOpFusionPass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseToLinalg.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseToLinalg.cpp index 9088a1de837a7..5508aaf9d8753 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseToLinalg.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseToLinalg.cpp @@ -15,7 +15,7 @@ #include "mlir/Transforms/DialectConversion.h" namespace mlir { -#define GEN_PASS_DEF_CONVERTELEMENTWISETOLINALG +#define GEN_PASS_DEF_CONVERTELEMENTWISETOLINALGPASS #include "mlir/Dialect/Linalg/Passes.h.inc" } // namespace mlir @@ -121,8 +121,10 @@ void mlir::linalg::populateElementwiseToLinalgConversionPatterns( namespace { class ConvertElementwiseToLinalgPass - : public impl::ConvertElementwiseToLinalgBase< + : public impl::ConvertElementwiseToLinalgPassBase< ConvertElementwiseToLinalgPass> { + using impl::ConvertElementwiseToLinalgPassBase< + ConvertElementwiseToLinalgPass>::ConvertElementwiseToLinalgPassBase; void runOnOperation() final { auto *func = getOperation(); @@ -140,7 +142,3 @@ class ConvertElementwiseToLinalgPass } }; } // namespace - -std::unique_ptr mlir::createConvertElementwiseToLinalgPass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Linalg/Transforms/Generalization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Generalization.cpp index d03d1f3a163c3..7ab3fef5dd039 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Generalization.cpp +++ 
b/mlir/lib/Dialect/Linalg/Transforms/Generalization.cpp @@ -26,7 +26,7 @@ #include "llvm/Support/Debug.h" namespace mlir { -#define GEN_PASS_DEF_LINALGGENERALIZATION +#define GEN_PASS_DEF_LINALGGENERALIZENAMEDOPSPASS #include "mlir/Dialect/Linalg/Passes.h.inc" } // namespace mlir @@ -76,14 +76,17 @@ FailureOr mlir::linalg::generalizeNamedOp(RewriterBase &rewriter, namespace { -struct LinalgGeneralizationPass - : public impl::LinalgGeneralizationBase { +struct LinalgGeneralizeNamedOpsPass + : public impl::LinalgGeneralizeNamedOpsPassBase< + LinalgGeneralizeNamedOpsPass> { + using impl::LinalgGeneralizeNamedOpsPassBase< + LinalgGeneralizeNamedOpsPass>::LinalgGeneralizeNamedOpsPassBase; void runOnOperation() override; }; } // namespace -void LinalgGeneralizationPass::runOnOperation() { +void LinalgGeneralizeNamedOpsPass::runOnOperation() { RewritePatternSet patterns(&getContext()); populateLinalgNamedOpsGeneralizationPatterns(patterns); (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); @@ -93,7 +96,3 @@ void mlir::linalg::populateLinalgNamedOpsGeneralizationPatterns( RewritePatternSet &patterns) { patterns.add(patterns.getContext()); } - -std::unique_ptr mlir::createLinalgGeneralizationPass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Linalg/Transforms/InlineScalarOperands.cpp b/mlir/lib/Dialect/Linalg/Transforms/InlineScalarOperands.cpp index 34db710b1721d..6db51f4b84d11 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/InlineScalarOperands.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/InlineScalarOperands.cpp @@ -23,7 +23,7 @@ #include "mlir/Transforms/GreedyPatternRewriteDriver.h" namespace mlir { -#define GEN_PASS_DEF_LINALGINLINESCALAROPERANDS +#define GEN_PASS_DEF_LINALGINLINESCALAROPERANDSPASS #include "mlir/Dialect/Linalg/Passes.h.inc" } // namespace mlir @@ -101,8 +101,10 @@ void mlir::linalg::populateInlineConstantOperandsPatterns( namespace { /// Pass that removes unit-extent dims within generic ops. 
struct LinalgInlineScalarOperandsPass - : public impl::LinalgInlineScalarOperandsBase< + : public impl::LinalgInlineScalarOperandsPassBase< LinalgInlineScalarOperandsPass> { + using impl::LinalgInlineScalarOperandsPassBase< + LinalgInlineScalarOperandsPass>::LinalgInlineScalarOperandsPassBase; void runOnOperation() override { Operation *op = getOperation(); MLIRContext &ctx = getContext(); @@ -112,7 +114,3 @@ struct LinalgInlineScalarOperandsPass } }; } // namespace - -std::unique_ptr mlir::createLinalgInlineScalarOperandsPass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp index 4c93da6fe9253..b0a4de2da1e86 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp @@ -27,9 +27,9 @@ #include "llvm/ADT/TypeSwitch.h" namespace mlir { -#define GEN_PASS_DEF_LINALGLOWERTOAFFINELOOPS -#define GEN_PASS_DEF_LINALGLOWERTOLOOPS -#define GEN_PASS_DEF_LINALGLOWERTOPARALLELLOOPS +#define GEN_PASS_DEF_CONVERTLINALGTOAFFINELOOPSPASS +#define GEN_PASS_DEF_CONVERTLINALGTOLOOPSPASS +#define GEN_PASS_DEF_CONVERTLINALGTOPARALLELLOOPSPASS #include "mlir/Dialect/Linalg/Passes.h.inc" } // namespace mlir @@ -326,7 +326,9 @@ static void lowerLinalgToLoopsImpl(Operation *enclosingOp) { } struct LowerToAffineLoops - : public impl::LinalgLowerToAffineLoopsBase { + : public impl::ConvertLinalgToAffineLoopsPassBase { + using impl::ConvertLinalgToAffineLoopsPassBase< + LowerToAffineLoops>::ConvertLinalgToAffineLoopsPassBase; void getDependentDialects(DialectRegistry ®istry) const override { registry.insert(); } @@ -335,7 +337,9 @@ struct LowerToAffineLoops } }; -struct LowerToLoops : public impl::LinalgLowerToLoopsBase { +struct LowerToLoops : public impl::ConvertLinalgToLoopsPassBase { + using impl::ConvertLinalgToLoopsPassBase< + LowerToLoops>::ConvertLinalgToLoopsPassBase; void getDependentDialects(DialectRegistry ®istry) const override { 
registry.insert(); } @@ -345,7 +349,9 @@ struct LowerToLoops : public impl::LinalgLowerToLoopsBase { }; struct LowerToParallelLoops - : public impl::LinalgLowerToParallelLoopsBase { + : public impl::ConvertLinalgToParallelLoopsPassBase { + using impl::ConvertLinalgToParallelLoopsPassBase< + LowerToParallelLoops>::ConvertLinalgToParallelLoopsPassBase; void runOnOperation() override { lowerLinalgToLoopsImpl(getOperation()); } @@ -353,18 +359,6 @@ struct LowerToParallelLoops } // namespace -std::unique_ptr mlir::createConvertLinalgToLoopsPass() { - return std::make_unique(); -} - -std::unique_ptr mlir::createConvertLinalgToParallelLoopsPass() { - return std::make_unique(); -} - -std::unique_ptr mlir::createConvertLinalgToAffineLoopsPass() { - return std::make_unique(); -} - /// Emits a loop nest of `affine.for` with the proper body for `linalgOp`. FailureOr mlir::linalg::linalgOpToAffineLoops(RewriterBase &rewriter, LinalgOp linalgOp) { diff --git a/mlir/lib/Dialect/Linalg/Transforms/NamedOpConversions.cpp b/mlir/lib/Dialect/Linalg/Transforms/NamedOpConversions.cpp index 250360603fa5d..84bde1bc0b846 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/NamedOpConversions.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/NamedOpConversions.cpp @@ -21,7 +21,7 @@ #include "llvm/ADT/TypeSwitch.h" namespace mlir { -#define GEN_PASS_DEF_LINALGNAMEDOPCONVERSION +#define GEN_PASS_DEF_LINALGNAMEDOPCONVERSIONPASS #include "mlir/Dialect/Linalg/Passes.h.inc" } // namespace mlir @@ -143,9 +143,10 @@ struct SimplifyDepthwiseConvQOp }; struct LinalgNamedOpConversionPass - : public impl::LinalgNamedOpConversionBase { - LinalgNamedOpConversionPass() = default; - LinalgNamedOpConversionPass(const LinalgNamedOpConversionPass &) = default; + : public impl::LinalgNamedOpConversionPassBase< + LinalgNamedOpConversionPass> { + using impl::LinalgNamedOpConversionPassBase< + LinalgNamedOpConversionPass>::LinalgNamedOpConversionPassBase; void runOnOperation() override { Operation *op = getOperation(); @@ 
-162,7 +163,3 @@ void mlir::linalg::populateLinalgNamedOpConversionPatterns( patterns.add( patterns.getContext()); } - -std::unique_ptr mlir::createLinalgNamedOpConversionPass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp index 66362d5be4f19..e58503d508ced 100644 --- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp +++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp @@ -32,7 +32,7 @@ void mlir::sparse_tensor::buildSparsifier(OpPassManager &pm, const SparsifierOptions &options) { // Rewrite named linalg ops into generic ops. - pm.addNestedPass(createLinalgGeneralizationPass()); + pm.addNestedPass(createLinalgGeneralizeNamedOpsPass()); // Sparsification and bufferization mini-pipeline. pm.addPass(createSparsificationAndBufferizationPass( From 7b9504fc012e8b96c2bca9d641e16f719696d723 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Sun, 25 Feb 2024 08:22:55 +0900 Subject: [PATCH 237/546] test: Refine InstrProfiling/mcdc.ll --- .../Instrumentation/InstrProfiling/mcdc.ll | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/llvm/test/Instrumentation/InstrProfiling/mcdc.ll b/llvm/test/Instrumentation/InstrProfiling/mcdc.ll index fccb026c25bf2..f61f6ca136ce7 100644 --- a/llvm/test/Instrumentation/InstrProfiling/mcdc.ll +++ b/llvm/test/Instrumentation/InstrProfiling/mcdc.ll @@ -23,24 +23,24 @@ entry: %tobool = icmp ne i32 %0, 0 call void @llvm.instrprof.mcdc.condbitmap.update(ptr @__profn_test, i64 99278, i32 0, ptr %mcdc.addr, i1 %tobool) - ; CHECK: %mcdc.temp = load i32, ptr %mcdc.addr, align 4 - ; CHECK-NEXT: %1 = zext i1 %tobool to i32 - ; CHECK-NEXT: %2 = shl i32 %1, 0 - ; CHECK-NEXT: %3 = or i32 %mcdc.temp, %2 - ; CHECK-NEXT: store i32 %3, ptr %mcdc.addr, align 4 + ; CHECK: %[[TEMP:mcdc.*]] = load i32, ptr %mcdc.addr, align 4 + ; CHECK-NEXT: 
%[[LAB1:[0-9]+]] = zext i1 %tobool to i32 + ; CHECK-NEXT: %[[LAB2:[0-9]+]] = shl i32 %[[LAB1]], 0 + ; CHECK-NEXT: %[[LAB3:[0-9]+]] = or i32 %[[TEMP]], %[[LAB2]] + ; CHECK-NEXT: store i32 %[[LAB3]], ptr %mcdc.addr, align 4 call void @llvm.instrprof.mcdc.tvbitmap.update(ptr @__profn_test, i64 99278, i32 1, i32 0, ptr %mcdc.addr) - ; CHECK: %mcdc.temp1 = load i32, ptr %mcdc.addr, align 4 - ; CHECK-NEXT: %4 = lshr i32 %mcdc.temp1, 3 - ; CHECK-NEXT: %5 = zext i32 %4 to i64 - ; CHECK-NEXT: %6 = add i64 ptrtoint (ptr @__profbm_test to i64), %5 - ; CHECK-NEXT: %7 = inttoptr i64 %6 to ptr - ; CHECK-NEXT: %8 = and i32 %mcdc.temp1, 7 - ; CHECK-NEXT: %9 = trunc i32 %8 to i8 - ; CHECK-NEXT: %10 = shl i8 1, %9 - ; CHECK-NEXT: %mcdc.bits = load i8, ptr %7, align 1 - ; CHECK-NEXT: %11 = or i8 %mcdc.bits, %10 - ; CHECK-NEXT: store i8 %11, ptr %7, align 1 + ; CHECK: %[[TEMP:mcdc.*]] = load i32, ptr %mcdc.addr, align 4 + ; CHECK-NEXT: %[[LAB4:[0-9]+]] = lshr i32 %[[TEMP]], 3 + ; CHECK-NEXT: %[[LAB5:[0-9]+]] = zext i32 %[[LAB4]] to i64 + ; CHECK-NEXT: %[[LAB6:[0-9]+]] = add i64 ptrtoint (ptr @__profbm_test to i64), %[[LAB5]] + ; CHECK-NEXT: %[[LAB7:[0-9]+]] = inttoptr i64 %[[LAB6]] to ptr + ; CHECK-NEXT: %[[LAB8:[0-9]+]] = and i32 %[[TEMP]], 7 + ; CHECK-NEXT: %[[LAB9:[0-9]+]] = trunc i32 %[[LAB8]] to i8 + ; CHECK-NEXT: %[[LAB10:[0-9]+]] = shl i8 1, %[[LAB9]] + ; CHECK-NEXT: %[[BITS:mcdc.*]] = load i8, ptr %[[LAB7]], align 1 + ; CHECK-NEXT: %[[LAB11:[0-9]+]] = or i8 %[[BITS]], %[[LAB10]] + ; CHECK-NEXT: store i8 %[[LAB11]], ptr %[[LAB7]], align 1 ret void } From cc53707a5c104eb7789829ecdb2e3ae2be1a42da Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Sun, 25 Feb 2024 08:23:30 +0900 Subject: [PATCH 238/546] LLVMInstrumentation: Simplify mcdc.tvbitmap.update with GEP. 
--- clang/test/Profile/c-mcdc-class.cpp | 8 ++------ clang/test/Profile/c-mcdc-logicalop-ternary.c | 16 ++++------------ clang/test/Profile/c-mcdc-nested-ternary.c | 4 +--- clang/test/Profile/c-mcdc-not.c | 4 +--- clang/test/Profile/c-mcdc.c | 4 +--- .../Instrumentation/InstrProfiling.cpp | 12 ++---------- llvm/test/Instrumentation/InstrProfiling/mcdc.ll | 4 +--- 7 files changed, 12 insertions(+), 40 deletions(-) diff --git a/clang/test/Profile/c-mcdc-class.cpp b/clang/test/Profile/c-mcdc-class.cpp index 2206a39ee4ffb..6aab55add3280 100644 --- a/clang/test/Profile/c-mcdc-class.cpp +++ b/clang/test/Profile/c-mcdc-class.cpp @@ -54,9 +54,7 @@ Value::~Value(void) { // UPDATE FINAL BITMASK WITH RESULT. // MCDCCTOR-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 // MCDCCTOR: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3 -// MCDCCTOR: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 -// MCDCCTOR: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr @__profbm__ZN5ValueC2Ev to i64), %[[LAB2]] -// MCDCCTOR: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr +// MCDCCTOR: %[[LAB4:[0-9]+]] = getelementptr inbounds i8, ptr @__profbm__ZN5ValueC2Ev, i32 %[[LAB1]] // MCDCCTOR: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 // MCDCCTOR: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 // MCDCCTOR: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] @@ -93,9 +91,7 @@ Value::~Value(void) { // UPDATE FINAL BITMASK WITH RESULT. 
// MCDCDTOR-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 // MCDCDTOR: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3 -// MCDCDTOR: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 -// MCDCDTOR: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr @__profbm__ZN5ValueD2Ev to i64), %[[LAB2]] -// MCDCDTOR: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr +// MCDCDTOR: %[[LAB4:[0-9]+]] = getelementptr inbounds i8, ptr @__profbm__ZN5ValueD2Ev, i32 %[[LAB1]] // MCDCDTOR: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 // MCDCDTOR: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 // MCDCDTOR: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] diff --git a/clang/test/Profile/c-mcdc-logicalop-ternary.c b/clang/test/Profile/c-mcdc-logicalop-ternary.c index 558643f422021..3e6b6b1e380d1 100644 --- a/clang/test/Profile/c-mcdc-logicalop-ternary.c +++ b/clang/test/Profile/c-mcdc-logicalop-ternary.c @@ -20,9 +20,7 @@ int test(int a, int b, int c, int d, int e, int f) { // MCDC-LABEL: cond.true: // MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 // MCDC: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3 -// MCDC: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 -// MCDC: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr @__profbm_test to i64), %[[LAB2]] -// MCDC: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr +// MCDC: %[[LAB4:[0-9]+]] = getelementptr inbounds i8, ptr @__profbm_test, i32 %[[LAB1]] // MCDC: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 // MCDC: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 // MCDC: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] @@ -38,9 +36,7 @@ int test(int a, int b, int c, int d, int e, int f) { // MCDC-LABEL: land.end: // MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 // MCDC: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3 -// MCDC: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 -// MCDC: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr getelementptr inbounds ([3 x i8], ptr @__profbm_test, i32 0, i32 1) to i64), %[[LAB2]] -// MCDC: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] 
to ptr +// MCDC: %[[LAB4:[0-9]+]] = getelementptr inbounds i8, ptr getelementptr inbounds ([3 x i8], ptr @__profbm_test, i32 0, i32 1), i32 %[[LAB1]] // MCDC: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 // MCDC: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 // MCDC: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] @@ -52,9 +48,7 @@ int test(int a, int b, int c, int d, int e, int f) { // MCDC-LABEL: cond.false: // MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 // MCDC: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3 -// MCDC: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 -// MCDC: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr @__profbm_test to i64), %[[LAB2]] -// MCDC: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr +// MCDC: %[[LAB4:[0-9]+]] = getelementptr inbounds i8, ptr @__profbm_test, i32 %[[LAB1]] // MCDC: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 // MCDC: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 // MCDC: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] @@ -70,9 +64,7 @@ int test(int a, int b, int c, int d, int e, int f) { // MCDC-LABEL: lor.end: // MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 // MCDC: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3 -// MCDC: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 -// MCDC: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr getelementptr inbounds ([3 x i8], ptr @__profbm_test, i32 0, i32 2) to i64), %[[LAB2]] -// MCDC: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr +// MCDC: %[[LAB4:[0-9]+]] = getelementptr inbounds i8, ptr getelementptr inbounds ([3 x i8], ptr @__profbm_test, i32 0, i32 2), i32 %[[LAB1]] // MCDC: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 // MCDC: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 // MCDC: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] diff --git a/clang/test/Profile/c-mcdc-nested-ternary.c b/clang/test/Profile/c-mcdc-nested-ternary.c index 4b014e07f6df5..ebea17ca146ae 100644 --- a/clang/test/Profile/c-mcdc-nested-ternary.c +++ b/clang/test/Profile/c-mcdc-nested-ternary.c @@ -57,9 +57,7 @@ int 
test(int b, int c, int d, int e, int f) { // UPDATE FINAL BITMASK WITH RESULT. // MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 // MCDC: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3 -// MCDC: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 -// MCDC: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr @__profbm_test to i64), %[[LAB2]] -// MCDC: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr +// MCDC: %[[LAB4:[0-9]+]] = getelementptr inbounds i8, ptr @__profbm_test, i32 %[[LAB1]] // MCDC: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 // MCDC: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 // MCDC: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] diff --git a/clang/test/Profile/c-mcdc-not.c b/clang/test/Profile/c-mcdc-not.c index aa638b9680b84..165bfbae3349d 100644 --- a/clang/test/Profile/c-mcdc-not.c +++ b/clang/test/Profile/c-mcdc-not.c @@ -77,9 +77,7 @@ int test(int a, int b, int c, int d, int e, int f) { // UPDATE FINAL BITMASK WITH RESULT. // MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 // MCDC: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3 -// MCDC: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 -// MCDC: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr @__profbm_test to i64), %[[LAB2]] -// MCDC: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr +// MCDC: %[[LAB4:[0-9]+]] = getelementptr inbounds i8, ptr @__profbm_test, i32 %[[LAB1]] // MCDC: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 // MCDC: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 // MCDC: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] diff --git a/clang/test/Profile/c-mcdc.c b/clang/test/Profile/c-mcdc.c index ac845d204853d..823160329b31f 100644 --- a/clang/test/Profile/c-mcdc.c +++ b/clang/test/Profile/c-mcdc.c @@ -91,9 +91,7 @@ int test(int a, int b, int c, int d, int e, int f) { // NOPROFPASS: call void @llvm.instrprof.mcdc.tvbitmap.update(ptr @__profn_test, i64 [[HASH]], i32 8, i32 0, ptr %mcdc.addr) // MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 // MCDC: %[[LAB1:[0-9]+]] = lshr 
i32 %[[TEMP]], 3 -// MCDC: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 -// MCDC: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr @__profbm_test to i64), %[[LAB2]] -// MCDC: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr +// MCDC: %[[LAB4:[0-9]+]] = getelementptr inbounds i8, ptr @__profbm_test, i32 %[[LAB1]] // MCDC: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 // MCDC: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 // MCDC: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index a19b140872544..dbd44bd36e114 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -969,9 +969,7 @@ void InstrLowerer::lowerMCDCTestVectorBitmapUpdate( InstrProfMCDCTVBitmapUpdate *Update) { IRBuilder<> Builder(Update); auto *Int8Ty = Type::getInt8Ty(M.getContext()); - auto *Int8PtrTy = PointerType::getUnqual(M.getContext()); auto *Int32Ty = Type::getInt32Ty(M.getContext()); - auto *Int64Ty = Type::getInt64Ty(M.getContext()); auto *MCDCCondBitmapAddr = Update->getMCDCCondBitmapAddr(); auto *BitmapAddr = getBitmapAddress(Update); @@ -984,15 +982,9 @@ void InstrLowerer::lowerMCDCTestVectorBitmapUpdate( auto *BitmapByteOffset = Builder.CreateLShr(Temp, 0x3); // Add byte offset to section base byte address. - // %2 = zext i32 %1 to i64 - // %3 = add i64 ptrtoint (ptr @__profbm_test to i64), %2 + // %4 = getelementptr inbounds i8, ptr @__profbm_test, i32 %1 auto *BitmapByteAddr = - Builder.CreateAdd(Builder.CreatePtrToInt(BitmapAddr, Int64Ty), - Builder.CreateZExtOrBitCast(BitmapByteOffset, Int64Ty)); - - // Convert to a pointer. 
- // %4 = inttoptr i32 %3 to ptr - BitmapByteAddr = Builder.CreateIntToPtr(BitmapByteAddr, Int8PtrTy); + Builder.CreateInBoundsPtrAdd(BitmapAddr, BitmapByteOffset); // Calculate bit offset into bitmap byte by using div8 remainder (AND ~8) // %5 = and i32 %mcdc.temp, 7 diff --git a/llvm/test/Instrumentation/InstrProfiling/mcdc.ll b/llvm/test/Instrumentation/InstrProfiling/mcdc.ll index f61f6ca136ce7..a7f1e606e35fa 100644 --- a/llvm/test/Instrumentation/InstrProfiling/mcdc.ll +++ b/llvm/test/Instrumentation/InstrProfiling/mcdc.ll @@ -32,9 +32,7 @@ entry: call void @llvm.instrprof.mcdc.tvbitmap.update(ptr @__profn_test, i64 99278, i32 1, i32 0, ptr %mcdc.addr) ; CHECK: %[[TEMP:mcdc.*]] = load i32, ptr %mcdc.addr, align 4 ; CHECK-NEXT: %[[LAB4:[0-9]+]] = lshr i32 %[[TEMP]], 3 - ; CHECK-NEXT: %[[LAB5:[0-9]+]] = zext i32 %[[LAB4]] to i64 - ; CHECK-NEXT: %[[LAB6:[0-9]+]] = add i64 ptrtoint (ptr @__profbm_test to i64), %[[LAB5]] - ; CHECK-NEXT: %[[LAB7:[0-9]+]] = inttoptr i64 %[[LAB6]] to ptr + ; CHECK-NEXT: %[[LAB7:[0-9]+]] = getelementptr inbounds i8, ptr @__profbm_test, i32 %[[LAB4]] ; CHECK-NEXT: %[[LAB8:[0-9]+]] = and i32 %[[TEMP]], 7 ; CHECK-NEXT: %[[LAB9:[0-9]+]] = trunc i32 %[[LAB8]] to i8 ; CHECK-NEXT: %[[LAB10:[0-9]+]] = shl i8 1, %[[LAB9]] From a4096eaeb63e7086a2e0a32bd69523c1fa72db3e Mon Sep 17 00:00:00 2001 From: Serge Pavlov Date: Sun, 25 Feb 2024 13:16:09 +0700 Subject: [PATCH 239/546] [AArch64] Add FPSR to reserved registers (#82907) This is follow-up of #81867. FPSR was not added to reserved registers, this resulted in machine verifier fails. 
--- llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 48e1c1bc73022..b919c116445c8 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -450,6 +450,7 @@ AArch64RegisterInfo::getStrictlyReservedRegs(const MachineFunction &MF) const { } markSuperRegs(Reserved, AArch64::FPCR); + markSuperRegs(Reserved, AArch64::FPSR); if (MF.getFunction().getCallingConv() == CallingConv::GRAAL) { markSuperRegs(Reserved, AArch64::X27); From a8c3b3e20db01d2947dbe87d0c557150ed777865 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Sat, 24 Feb 2024 22:30:31 -0800 Subject: [PATCH 240/546] [nfc][compiler-rt]Replace Type::getInt8PtrTy with PointerType::getUnqual as a clean-up (#82434) This is a follow up of https://github.com/llvm/llvm-project/commit/7b9d73c2f90c0ed8497339a16fc39785349d9610 and https://github.com/llvm/llvm-project/commit/5ef9ba74120dcc2da70ec25571d459f81ab8a705 * The definition of `Type::getInt8PtrTy` is deleted. This doesn't cause a compile error because the `Initializer` part of the macro doesn't run. 
--- compiler-rt/include/profile/InstrProfData.inc | 2 +- llvm/include/llvm/ProfileData/InstrProfData.inc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index c907a9736f316..fce407f547f3d 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -119,7 +119,7 @@ INSTR_PROF_VALUE_NODE(uint64_t, llvm::Type::getInt64Ty(Ctx), Value, \ INSTR_PROF_VALUE_NODE(uint64_t, llvm::Type::getInt64Ty(Ctx), Count, \ ConstantInt::get(llvm::Type::GetInt64Ty(Ctx), 0)) INSTR_PROF_VALUE_NODE(PtrToNodeT, llvm::PointerType::getUnqual(Ctx), Next, \ - ConstantInt::get(llvm::Type::GetInt8PtrTy(Ctx), 0)) + ConstantInt::get(llvm::PointerType::getUnqual(Ctx), 0)) #undef INSTR_PROF_VALUE_NODE /* INSTR_PROF_VALUE_NODE end. */ diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index c907a9736f316..fce407f547f3d 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -119,7 +119,7 @@ INSTR_PROF_VALUE_NODE(uint64_t, llvm::Type::getInt64Ty(Ctx), Value, \ INSTR_PROF_VALUE_NODE(uint64_t, llvm::Type::getInt64Ty(Ctx), Count, \ ConstantInt::get(llvm::Type::GetInt64Ty(Ctx), 0)) INSTR_PROF_VALUE_NODE(PtrToNodeT, llvm::PointerType::getUnqual(Ctx), Next, \ - ConstantInt::get(llvm::Type::GetInt8PtrTy(Ctx), 0)) + ConstantInt::get(llvm::PointerType::getUnqual(Ctx), 0)) #undef INSTR_PROF_VALUE_NODE /* INSTR_PROF_VALUE_NODE end. 
*/ From 3b27cc2ceec265d751c909c431ed62c0d7ed9b51 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Sun, 25 Feb 2024 14:59:52 +0900 Subject: [PATCH 241/546] Regenerate llvm-cov tests --- llvm/test/tools/llvm-cov/Inputs/mcdc-const.o | Bin 5112 -> 5208 bytes .../test/tools/llvm-cov/Inputs/mcdc-general.o | Bin 6200 -> 6456 bytes llvm/test/tools/llvm-cov/Inputs/mcdc-macro.o | Bin 6424 -> 6480 bytes llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.o | Bin 4632 -> 4112 bytes llvm/test/tools/llvm-cov/mcdc-macro.test | 3 ++- llvm/test/tools/llvm-cov/mcdc-maxbs.test | 4 +++- 6 files changed, 5 insertions(+), 2 deletions(-) diff --git a/llvm/test/tools/llvm-cov/Inputs/mcdc-const.o b/llvm/test/tools/llvm-cov/Inputs/mcdc-const.o index 1232088ee60fa0e1c96960f1ba11a0e7e7e982eb..1145fcc6f7125452262d0745b9de0591c1e48cda 100644 GIT binary patch literal 5208 zcmb_fU1(fY5T0{yHb4EzZW>eVuNx>*)ZA`jn+#Kd7kCHw#jI5PT6tPy}i1gH#`iP(@L&4+X0)RtpMNj5FuX+ zb7sz*Ip^LzJhpe-QOX4-1$ToI!QkJniiETtreQN$W|m?k9a-`RF2^f+JQZI!GZnvb zYH>AboXhdU8fMGP(Ypx#nH~(;j^YtapiVy|u0br@yjh%0I>gIAUThfZicYGD@AWuI zdYrvfLlz}!s*qs3zAzc7<2jvUXvJCepM2~zElHEZT=s0qqF2s*Sz7w^pEq7WyVoxa zoqZemyU&$(eVF;BqLib0HuZL^6}PoG38g?O)dq0|ZLonJ$*mA0rl1`M#L@gzYqZst6Ba4fR?J*UD zEx`i*`ugZkgN0+?ccXYy02z6LtGZN&lZauC2?rnAf^3Oe@#iD}I!pONzAs-X*L^Qv zt!mx>^^iJB4X-tC2OG3lhO$msqouw|lzkON0*1ORFh(w(&_?8j-LPEi!rJ8kN}F+AYmE9!kKB*CMp>Rk zob1I;_YY#!W4#V_PCzG^Q`veQ=_G({jLVCOt`6FTzxjTX-V%HZ+v`go$Ar^tVt)(k zZTmh8myX$P;FN>RGt4;IQB?&`GcNPXS;|1wa5GQ2*kpd0r&Zq@QT-*) zPR2<#^9=5p$X@ztqcIP~mHz27aH@~k@33&`+ucp@jD_FM>)dDH=6WA7aC6_Y22N{} zI!_z8XN4ir+kPLnR+@$rc8RWqxYBCCvi+f`$az?^L`+_z>1!= zt?esTu8t?VAe+rs7HY+Oc64NT$Kdum2FG>|jgIaZ-#s6xh6l#SM#jd+2S@je>=_j?Oih-u^mV~fJ`lWe#V>mI?t8ee?&tDPf~Ql;6$<#B zG0mwLXR=c@Ody)a_+kmCGcgbI^mCKo)t@S9P9-pwTgdr2oV-HZ%j4J1Ag*oC#C`qQ z{t(6~`aKi_JSE%opZTrWZ}RRjC)v**InQuEip~<@PE-&Uz3BqWbim-C_G23>ObmnBI(-)aDED5e-WJIxJfcLNX~)q7g2ZhR|dck$CY>#pT6fL zf8liB>rhS-M-iRp_$MMrk7Pc9o5fEzh)-WD5?6S$_%BhhkhTRZeA@Akqb5i-8QRd` 
zBlh3242|k9aI^d~4f1b8C&^p*0mLZ2%kgli=?9j@PH--(h|nzl1Ut$;)Be#{ugoWW z0UGxtQiJ-_ ze=dnH^{*mE^>ZMJ&xqdV`1ZX-wkAy>K-ho@;`lT6m;ZU97JuO*sGza*iP(L^{_6x_ zE`Gw#AueOn^jG#bF&f*pe+F39YG1g-{uyfmsh_~V=xes<`-*5i!&3lu{e$DT8 z&g9ux?MQny`#Pp5N@+U6M?Ph7+RT+2p6{&uzffwZrG_%Kys!}x(us9eoM**PzlvI~ zRlMK0WsA_#^a)J$8;8ZS68dR(%uz4>74uH*QKUB1TKiV5J=Ag&NmR2Pv+7qf=}jtB z*r8_HskIFhIq!xR=T_Jm_24MF+*3B$UYozo)) z#zvt(+8L9vaTtl7s^UTXZTM!Ro}0=a91AiCzbp1{b|w?uBLTRv?0AN7TIdtP zseU%VWdqM7$Np&tFT_HygLxUiXWT#*`ZmC2p-Y5gE~#H792=<0g)d}*cJMvGWd}b6 zN1q<`Wp#Sq3s^$ z=PwiS+vwR*k_b266aS7r!f^wscLQANO93wb9p@;=-(M~a(FM*6{6V`!I6a9G!fD49 z!Gj%Jemd`?fqL}ebKGjfr1&L4qap)7#B)rCY@#PEzZ#irq$L*D4bz&tkQipLpr40a zvMr`2{P^3@pLE$cj3@q&s0N9M0l#9rG(~j<2B3HA}F~$2D+lm;HbWAD8)V fHoUhB>TVqV*aL7Q6+3`8h#Tff@kc|4n>^BgBsNOB diff --git a/llvm/test/tools/llvm-cov/Inputs/mcdc-general.o b/llvm/test/tools/llvm-cov/Inputs/mcdc-general.o index afe6f95cf0b829a79efebe9718d3cbd04718d4aa..e802f51132c068f0f4e2921c952710cb5c5665e5 100644 GIT binary patch delta 1587 zcmZ`(Z%9*76o2)?k#C?QC~pct4!ff|KI+#0wV9yIkmit+5-auB<{9FXCf|iDHuJ0&3kDeGl_uYY zGNOoFV94yUfu%N+xd$Zn$s~qn?fu$9u$o2qwL~y~rCaCp;@X0wO<*m?U(TV0#rSRX z$u+5bfl1Q9C?fIgL=d)$cqfTBw&XR{$W-Ms1~8R>#7xDTwbz5~Y>mA?!=anJ+BZpy zU?!B4^0r-@CF4U|4X~7HtSw-C@H_+93yiza%?uPi`o7REF#;-v3%|TC|RoeI>#Dt zol*{iFH0q((2FtmxfovIF=fddjG=!?eM|g5ks>{8G}=Gyu%2$v%xU<6BrHe}&Q|-? 
zG`5DZJJaxE4BwZAcQM@0;k18tc!^dHr(LzO`1T|Ywb=^kfGZZ2bVE0(P+QM;f+CCRp5<7o%(lWj1dx5jvGjJPzSKv(!!LSo zYg`OJ=?3ksWgELSzcY*!ka`d;T?EJr1Sx(`7V*CVtRxut8~OUp`WM)ZEB)7X^e8(nRQ zdKk7|#UETGPx?WK3^t{Se1}e@);vyIL|cZNO0A*7AV81F3Cl_e$SxDScUYj+dxgZmkX;(iqryNB%2(G&J^vrGS*6a*tIW){GP;=;VB-6Ixeg3ZwYveN_| zj=e;J8;%CjhG`zM0gBUu7cqc-1Hw*^h}kjRNMOrtpmlRSN=Tw$gdng74TKeu^r13RwsU9i!ZPTAd*__*JLfy+-kF&^ zI9WZsk2sV25A7}sX9_K7j*0i7gg6+VirT^n16etcjCuP{GQQq1F_RFdkw3!taK>`~ z<8;~@TP%7_=6-||cB&(I1XdAF)KMT!x0RCZQ{(f|t1P`Ko{8RMek?16Q8}%ynQe;y zj&?DUPJ*L0*T+|*Ezn1*jvveB6PlN~UAjDgHCc$OnO*4{CvyveaU`bjhF}6M?}Sfe zkEk&ihY6rwc~yAegK$1{A=K5mOxxqt;K{c;K)X-?E!>h7lTrce3oEA+_`;WYo)W1t zBvmv>z6Qw|l7uFy`ci6(Kf&2`+8V6|mlOPjZn$U@XxTN}pc*tNi-p5yF)oqCZQ$Rf zsNE*Eip6(e-KMB<6I;OI2XGH-hgn2X53bEj*2ZWpNx>`3dB<{c-5Xgv1FmAF|Felb z%;I?y-p=BmCcJ~ifAeq~`fQMlgHKIbwpto2d0b{QN z9sNcFKLbfoQ9Dff*O~q;m`3)liG9e}Ay`NDv5D=W?YmHx3Erh1!KMNgW$^RIch zGY|ishts$Jk6q1@2^U%S)hym%d@g*R(61c|Hcj|mCd5HVQdAqgc{KW<<8HC`F6c#8 zF|oPV7JyMnAt7TAgN}YHEZP-v&d|JUs>*5JpgC&P#O5>~8ET_+hhfJM{vD}O=O02G zAj{3IcHExZE5e#CWyEQTnE|11- z@X+Q7+P=|`2#p6nUk)3x4=Z$}7Yac321y@8y!Awa5%is)do^hF3g9a_N_-G0$wtpG sn!-Gdo033QjGl@{WJuzE8g$&RfXk?`2YX_ diff --git a/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.o b/llvm/test/tools/llvm-cov/Inputs/mcdc-macro.o index 667ccd132d2fb848ae86b9bbbc72f7b3a917feab..c0a9d6044553030ed1cc28bce2e764bcf3877869 100644 GIT binary patch delta 1110 zcmZ8dO-vI}5bi7mc4@oamZiJ(mnI~Da$xabtA=F}wDF>7L}N7K(ZobbIB-Jb$I!$^ znhC~(2Dox^K@&MiFQj;ozl-rE2Lq`G_9Bp|bsl|?rjzXVX6E~5-nZ{f%WBI+gP$m+CgmL; z^z+tSd9Kb_%n8Ztk*0Gj@1M-TvXq&tWclbT+Md}et6iIe_04$7$8HlOK@<2CC z32zShTv!8j@B}ZwsbiV`>(-tVk{mVhw4h^~ zQ0r$_i(YSOAN!8xt__{lQr@BE`<<^T#p=C_Q#KlwsPxaQG}lSM51 zo6&;URaeyaP>(;3s~N$ofrSyD2G`J_yqoQT1U~X=;enzlw(Tw7_8QZ~zAIs0a`q!; zXwT<^o!`+RikA`jv%`*1st+d{r)`Jj1Mx=93CRotNI%hi$xzK_8|KIcf{Xfq=ccE## zX~e@UbEEZpJY4mVhO936w0!DBN8-Jl*b3(L#FY5c=FP|RGB-9C?HKJ92j;BNeUwq< z&9vmW2_>n!+i#+#oZFi&(YE*&H>RKrtB|*$dnD_8Vpdt#)|LgHp@b4x`W8$Uss)VA 
z42@eO)p0=d7B$4o!~#63X}rA2^%eYkxL*am$qf{=Hy!`Tjiu$}(K%gIqDgai;0D^S zWxN@9f?Na2V1{YQMZ0L)?Kab7h_v!q$0{wkejkku=5epenl`plHiMSF1e1j(j3Cc*VmMRjyP3tmj$s_0p|{{R1s#aqOUqKBROQRxq7Z(fgD>FVvk?(JPcLf@~JhSK@0XbCDuc2Pho@iG0S?WcldGTY$XX6t0cy0DCNh^6%FgWpK%+A zD>H9~Yl2@NGT@1z)5b$u@Sq4}1&J9hMh2W7Gy-9##QZeFW2YatnRSwhL|5HETM85u diff --git a/llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.o b/llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.o index 8e592cf6563da92b98429e83244084649d49fd81..bc993b19e84733fdf7670a5aec58219519837ec2 100644 GIT binary patch delta 381 zcmYk1ze~eF6vuNf(HMJel0_pOELd!DvKef-X0vwe7*u0(Sh>pY^S zjQ8);i^AP~_#u1}Wvm(dA|0L@kHYFtEmk_*oxcz(gV{+VrsVY<&Gd{nbJ=@|U4v== zb5XG_ML&G9t^~cBXe18YQC#{50}qMZV(&Marga3q5?j>O;Y%elsLO5Y=?J7kO)g*} zOIB$bftAIXEYh`(MpDrUAz<9-_+(TGeo9e~&|vJC2p9wQ7-K>~U&0`nry=*+${NQH zIw+@#G?_KkO!$tX5|=RcID(_ZF$#7Lp6iJ!>Zu%#T#vf^EmzTyhcHGw#9jXZIyh(G delta 807 zcmZ8eO-LI-6rOpzG45_OruDZ7ii8-V)F`4U$mqoq^w>iS;=zkxK=21lkWz&RT0t;A z5IyOk(2|pogT_<5we-+{7xC7fL@%ZErm()rjxzdS`R08;@6E8=?R(+vC^S2{SR3?w z%8j2Url(&A+t!$@r0bK=TgtldaYl6tvTn~(=i~WJdOY_bVmEm9qzk#t$mGvQd1CS9 z2`8;6=Owo)wwq%m_m+xM%eqqwUlmm}D#UVn*NZt?Y?aPbg?8QtJb~|wQGQw2H-2%e z>?!QBFz;LEE$6BQ!T^1Du2;V?L+4(+ZO}qLNvu*2{*fO5^2QZq>|M zjhB8?nyFrCyzUQcy!5wL+>8m2U#TpXfrm?vX#suEdySWa!fMd$Grn8#OV9X+#h;(> z9gOv>z?UZiV?&C^5&s4+7k>B;IQ+3NWxWmMl~p-d9iCa8cBabf%Ygr^0hlu}W()oo zXY66fn3%T(lmM@7VO7h&P8G&^dk|JlOgRF&4a^g4n0QSPH!v0ukTEbtkhCyQutyrf ziGfqlh$SZqhm@@mc;(b#A`pk~o;_n>tX4oZi3BfbH$j&vQ#-*Z#OY>10pI8`1Q#@$ J;D}}iBlks%u*LuY diff --git a/llvm/test/tools/llvm-cov/mcdc-macro.test b/llvm/test/tools/llvm-cov/mcdc-macro.test index 339284bba2c9b..14dd5ebd68eb1 100644 --- a/llvm/test/tools/llvm-cov/mcdc-macro.test +++ b/llvm/test/tools/llvm-cov/mcdc-macro.test @@ -89,11 +89,12 @@ Instructions for regenerating the test: cd %S/Inputs # Or copy mcdc-macro.c into the working directory clang -fcoverage-mcdc -fprofile-instr-generate -fcoverage-compilation-dir=. 
\ - -O3 -mllvm -enable-name-compression=false \ + -Os -mllvm -enable-name-compression=false \ -fcoverage-mapping mcdc-macro.c -c # Instructions for generating proftext clang -fprofile-instr-generate mcdc-macro.o +rm -f default.prof* ./a.out llvm-profdata merge --sparse -o default.profdata default.profraw llvm-profdata merge --text -o mcdc-macro.proftext default.profdata diff --git a/llvm/test/tools/llvm-cov/mcdc-maxbs.test b/llvm/test/tools/llvm-cov/mcdc-maxbs.test index bbb5fb8b6059a..c6bd18048fb95 100644 --- a/llvm/test/tools/llvm-cov/mcdc-maxbs.test +++ b/llvm/test/tools/llvm-cov/mcdc-maxbs.test @@ -31,7 +31,8 @@ # Instructions for regenerating the test object: cd %S/Inputs # or copy %S/Inputs/mcdc-maxbs.c into the working directory -clang -O3 -fcoverage-mcdc -fprofile-instr-generate \ + +clang -Os -fcoverage-mcdc -fprofile-instr-generate \ -fcoverage-mapping -fcoverage-compilation-dir=. \ -mllvm -enable-name-compression=false \ mcdc-maxbs.c -c -o mcdc-maxbs.o @@ -39,6 +40,7 @@ clang -O3 -fcoverage-mcdc -fprofile-instr-generate \ # Instructions for regenerating the test vector: clang -fprofile-instr-generate mcdc-maxbs.o +rm -f default.prof* # Doesn't crash if argc > 1 ./a.out From 1f6a347c8abf8868fb4630c404480226c2efc2c2 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Sun, 25 Feb 2024 09:21:17 +0900 Subject: [PATCH 242/546] Refactor: Let MCDC::State have DecisionByStmt and BranchByStmt - Prune `RegionMCDCBitmapMap` and `RegionCondIDMap`. They are handled by `MCDCState`. - Rename `s/BitmapMap/DecisionByStmt/`. It can handle Decision stuff. - Rename `s/CondIDMap/BranchByStmt/`. It can be handle Branch stuff. - `MCDCRecordProcessor`: Use `DecisionParams.BitmapIdx` directly. 
--- clang/lib/CodeGen/CodeGenPGO.cpp | 27 ++++++++++--------- clang/lib/CodeGen/CodeGenPGO.h | 2 -- clang/lib/CodeGen/CoverageMappingGen.cpp | 25 ++++++++--------- clang/lib/CodeGen/MCDCState.h | 14 ++++++++-- .../ProfileData/Coverage/CoverageMapping.cpp | 5 +--- 5 files changed, 40 insertions(+), 33 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index 1ef7be3c72593..8aebd3557690a 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -165,6 +165,7 @@ struct MapRegionCounters : public RecursiveASTVisitor { llvm::DenseMap &CounterMap; /// The next bitmap byte index to assign. unsigned NextMCDCBitmapIdx; + /// The state of MC/DC Coverage in this function. MCDC::State &MCDCState; /// Maximum number of supported MC/DC conditions in a boolean expression. unsigned MCDCMaxCond; @@ -311,7 +312,7 @@ struct MapRegionCounters : public RecursiveASTVisitor { // Otherwise, allocate the number of bytes required for the bitmap // based on the number of conditions. Must be at least 1-byte long. 
- MCDCState.BitmapMap[BinOp] = NextMCDCBitmapIdx; + MCDCState.DecisionByStmt[BinOp].BitmapIdx = NextMCDCBitmapIdx; unsigned SizeInBits = std::max(1L << NumCond, CHAR_BIT); NextMCDCBitmapIdx += SizeInBits / CHAR_BIT; } @@ -1034,7 +1035,7 @@ void CodeGenPGO::emitCounterRegionMapping(const Decl *D) { std::string CoverageMapping; llvm::raw_string_ostream OS(CoverageMapping); - RegionCondIDMap.reset(new llvm::DenseMap); + RegionMCDCState->BranchByStmt.clear(); CoverageMappingGen MappingGen( *CGM.getCoverageMapping(), CGM.getContext().getSourceManager(), CGM.getLangOpts(), RegionCounterMap.get(), RegionMCDCState.get()); @@ -1142,12 +1143,12 @@ void CodeGenPGO::emitMCDCTestVectorBitmapUpdate(CGBuilderTy &Builder, S = S->IgnoreParens(); - auto ExprMCDCBitmapMapIterator = RegionMCDCState->BitmapMap.find(S); - if (ExprMCDCBitmapMapIterator == RegionMCDCState->BitmapMap.end()) + auto DecisionStateIter = RegionMCDCState->DecisionByStmt.find(S); + if (DecisionStateIter == RegionMCDCState->DecisionByStmt.end()) return; - // Extract the ID of the global bitmap associated with this expression. - unsigned MCDCTestVectorBitmapID = ExprMCDCBitmapMapIterator->second; + // Extract the offset of the global bitmap associated with this expression. 
+ unsigned MCDCTestVectorBitmapOffset = DecisionStateIter->second.BitmapIdx; auto *I8PtrTy = llvm::PointerType::getUnqual(CGM.getLLVMContext()); // Emit intrinsic responsible for updating the global bitmap corresponding to @@ -1158,7 +1159,7 @@ void CodeGenPGO::emitMCDCTestVectorBitmapUpdate(CGBuilderTy &Builder, llvm::Value *Args[5] = {llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy), Builder.getInt64(FunctionHash), Builder.getInt32(RegionMCDCState->BitmapBytes), - Builder.getInt32(MCDCTestVectorBitmapID), + Builder.getInt32(MCDCTestVectorBitmapOffset), MCDCCondBitmapAddr.getPointer()}; Builder.CreateCall( CGM.getIntrinsic(llvm::Intrinsic::instrprof_mcdc_tvbitmap_update), Args); @@ -1171,7 +1172,7 @@ void CodeGenPGO::emitMCDCCondBitmapReset(CGBuilderTy &Builder, const Expr *S, S = S->IgnoreParens(); - if (!RegionMCDCState->BitmapMap.contains(S)) + if (!RegionMCDCState->DecisionByStmt.contains(S)) return; // Emit intrinsic that resets a dedicated temporary value on the stack to 0. @@ -1193,13 +1194,13 @@ void CodeGenPGO::emitMCDCCondBitmapUpdate(CGBuilderTy &Builder, const Expr *S, // also make debugging a bit easier. S = CodeGenFunction::stripCond(S); - auto ExprMCDCConditionIDMapIterator = RegionMCDCState->CondIDMap.find(S); - if (ExprMCDCConditionIDMapIterator == RegionMCDCState->CondIDMap.end()) + auto BranchStateIter = RegionMCDCState->BranchByStmt.find(S); + if (BranchStateIter == RegionMCDCState->BranchByStmt.end()) return; // Extract the ID of the condition we are setting in the bitmap. - auto CondID = ExprMCDCConditionIDMapIterator->second; - assert(CondID >= 0 && "Condition has no ID!"); + const auto &Branch = BranchStateIter->second; + assert(Branch.ID >= 0 && "Condition has no ID!"); auto *I8PtrTy = llvm::PointerType::getUnqual(CGM.getLLVMContext()); @@ -1208,7 +1209,7 @@ void CodeGenPGO::emitMCDCCondBitmapUpdate(CGBuilderTy &Builder, const Expr *S, // the resulting value is used to update the boolean expression's bitmap. 
llvm::Value *Args[5] = {llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy), Builder.getInt64(FunctionHash), - Builder.getInt32(CondID), + Builder.getInt32(Branch.ID), MCDCCondBitmapAddr.getPointer(), Val}; Builder.CreateCall( CGM.getIntrinsic(llvm::Intrinsic::instrprof_mcdc_condbitmap_update), diff --git a/clang/lib/CodeGen/CodeGenPGO.h b/clang/lib/CodeGen/CodeGenPGO.h index 369bf05b59a0d..d3c2b277238fc 100644 --- a/clang/lib/CodeGen/CodeGenPGO.h +++ b/clang/lib/CodeGen/CodeGenPGO.h @@ -36,8 +36,6 @@ class CodeGenPGO { unsigned NumRegionCounters; uint64_t FunctionHash; std::unique_ptr> RegionCounterMap; - std::unique_ptr> RegionMCDCBitmapMap; - std::unique_ptr> RegionCondIDMap; std::unique_ptr> StmtCountMap; std::unique_ptr ProfRecord; std::unique_ptr RegionMCDCState; diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp index d8fa69d825b8d..d98ab79eac3ae 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.cpp +++ b/clang/lib/CodeGen/CoverageMappingGen.cpp @@ -684,7 +684,6 @@ struct MCDCCoverageBuilder { llvm::SmallVector DecisionStack; MCDC::State &MCDCState; - llvm::DenseMap &CondIDs; mcdc::ConditionID NextID = 0; bool NotMapped = false; @@ -699,8 +698,8 @@ struct MCDCCoverageBuilder { public: MCDCCoverageBuilder(CodeGenModule &CGM, MCDC::State &MCDCState) - : CGM(CGM), DecisionStack(1, DecisionStackSentinel), MCDCState(MCDCState), - CondIDs(MCDCState.CondIDMap) {} + : CGM(CGM), DecisionStack(1, DecisionStackSentinel), + MCDCState(MCDCState) {} /// Return whether the build of the control flow map is at the top-level /// (root) of a logical operator nest in a boolean expression prior to the @@ -714,16 +713,16 @@ struct MCDCCoverageBuilder { /// Set the given condition's ID. void setCondID(const Expr *Cond, mcdc::ConditionID ID) { - CondIDs[CodeGenFunction::stripCond(Cond)] = ID; + MCDCState.BranchByStmt[CodeGenFunction::stripCond(Cond)].ID = ID; } /// Return the ID of a given condition. 
mcdc::ConditionID getCondID(const Expr *Cond) const { - auto I = CondIDs.find(CodeGenFunction::stripCond(Cond)); - if (I == CondIDs.end()) + auto I = MCDCState.BranchByStmt.find(CodeGenFunction::stripCond(Cond)); + if (I == MCDCState.BranchByStmt.end()) return -1; else - return I->second; + return I->second.ID; } /// Return the LHS Decision ([0,0] if not set). @@ -738,7 +737,7 @@ struct MCDCCoverageBuilder { // If binary expression is disqualified, don't do mapping. if (!isBuilding() && - !MCDCState.BitmapMap.contains(CodeGenFunction::stripCond(E))) + !MCDCState.DecisionByStmt.contains(CodeGenFunction::stripCond(E))) NotMapped = true; // Don't go any further if we don't need to map condition IDs. @@ -750,7 +749,7 @@ struct MCDCCoverageBuilder { // If the operator itself has an assigned ID, this means it represents a // larger subtree. In this case, assign that ID to its LHS node. Its RHS // will receive a new ID below. Otherwise, assign ID+1 to LHS. - if (CondIDs.contains(CodeGenFunction::stripCond(E))) + if (MCDCState.BranchByStmt.contains(CodeGenFunction::stripCond(E))) setCondID(E->getLHS(), getCondID(E)); else setCondID(E->getLHS(), NextID++); @@ -853,7 +852,9 @@ struct CounterCoverageMappingBuilder return Counter::getCounter(CounterMap[S]); } - unsigned getRegionBitmap(const Stmt *S) { return MCDCState.BitmapMap[S]; } + auto getBitmapIdx(const Stmt *S) { + return MCDCState.DecisionByStmt[S].BitmapIdx; + } /// Push a region onto the stack. /// @@ -1984,7 +1985,7 @@ struct CounterCoverageMappingBuilder // Create MCDC Decision Region if at top-level (root). unsigned NumConds = 0; if (IsRootNode && (NumConds = MCDCBuilder.getTotalConditionsAndReset(E))) - createDecisionRegion(E, getRegionBitmap(E), NumConds); + createDecisionRegion(E, getBitmapIdx(E), NumConds); // Extract the RHS's Execution Counter. Counter RHSExecCnt = getRegionCounter(E); @@ -2037,7 +2038,7 @@ struct CounterCoverageMappingBuilder // Create MCDC Decision Region if at top-level (root). 
unsigned NumConds = 0; if (IsRootNode && (NumConds = MCDCBuilder.getTotalConditionsAndReset(E))) - createDecisionRegion(E, getRegionBitmap(E), NumConds); + createDecisionRegion(E, getBitmapIdx(E), NumConds); // Extract the RHS's Execution Counter. Counter RHSExecCnt = getRegionCounter(E); diff --git a/clang/lib/CodeGen/MCDCState.h b/clang/lib/CodeGen/MCDCState.h index e6e39237a1b41..3b86cd9cedeea 100644 --- a/clang/lib/CodeGen/MCDCState.h +++ b/clang/lib/CodeGen/MCDCState.h @@ -27,8 +27,18 @@ using namespace llvm::coverage::mcdc; /// Per-Function MC/DC state struct State { unsigned BitmapBytes = 0; - llvm::DenseMap BitmapMap; - llvm::DenseMap CondIDMap; + + struct Decision { + unsigned BitmapIdx; + }; + + llvm::DenseMap DecisionByStmt; + + struct Branch { + ConditionID ID; + }; + + llvm::DenseMap BranchByStmt; }; } // namespace clang::CodeGen::MCDC diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index ddce758072917..8f9d1eadc2daf 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -243,8 +243,6 @@ class MCDCRecordProcessor { /// Total number of conditions in the boolean expression. unsigned NumConditions; - unsigned BitmapIdx; - /// Mapping of a condition ID to its corresponding branch params. llvm::DenseMap CondsMap; @@ -265,7 +263,6 @@ class MCDCRecordProcessor { : Bitmap(Bitmap), Region(Region), DecisionParams(Region.getDecisionParams()), Branches(Branches), NumConditions(DecisionParams.NumConditions), - BitmapIdx(DecisionParams.BitmapIdx * CHAR_BIT), Folded(NumConditions, false), IndependencePairs(NumConditions) {} private: @@ -287,7 +284,7 @@ class MCDCRecordProcessor { continue; } - if (!Bitmap[BitmapIdx + Index]) + if (!Bitmap[DecisionParams.BitmapIdx * CHAR_BIT + Index]) continue; // Copy the completed test vector to the vector of testvectors. 
From 12d29cd171fdf20ab8a516998ad6be0d24a9c050 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?= Date: Sun, 25 Feb 2024 11:37:43 +0100 Subject: [PATCH 243/546] test overflow intrinsics --- llvm/test/CodeGen/AArch64/overflow.ll | 242 ++++++++++++++++++++++++++ 1 file changed, 242 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/overflow.ll diff --git a/llvm/test/CodeGen/AArch64/overflow.ll b/llvm/test/CodeGen/AArch64/overflow.ll new file mode 100644 index 0000000000000..444aaeb0f3fe7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/overflow.ll @@ -0,0 +1,242 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=arm64-eabi -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,SDAG +; RUN: llc < %s -mtriple=arm64-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,GISEL + + +define zeroext i1 @saddo1.i32.unused(i32 %v1, i32 %v2, ptr %res) { +; CHECK-LABEL: saddo1.i32.unused: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add w8, w0, w1 +; CHECK-NEXT: mov w0, #1 // =0x1 +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: ret +entry: + %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) + %obit = extractvalue {i32, i1} %t, 1 + %val = extractvalue {i32, i1} %t, 0 + store i32 %val, ptr %res + ret i1 1 +} + +define zeroext i1 @saddo1.i32.fold(i32 %v1, i32 %v2, ptr %res) { +; SDAG-LABEL: saddo1.i32.fold: +; SDAG: // %bb.0: // %entry +; SDAG-NEXT: mov w8, #20 // =0x14 +; SDAG-NEXT: mov w0, wzr +; SDAG-NEXT: str w8, [x2] +; SDAG-NEXT: ret +; +; GISEL-LABEL: saddo1.i32.fold: +; GISEL: // %bb.0: // %entry +; GISEL-NEXT: mov w8, #9 // =0x9 +; GISEL-NEXT: adds w8, w8, #11 +; GISEL-NEXT: cset w0, vs +; GISEL-NEXT: str w8, [x2] +; GISEL-NEXT: ret +entry: + %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 9, i32 11) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + store i32 %val, ptr %res + ret i1 %obit +} + +define 
zeroext i1 @saddo1.i32.addzero(i32 %v1, i32 %v2, ptr %res) { +; CHECK-LABEL: saddo1.i32.addzero: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: ret +entry: + %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 0) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + store i32 %val, ptr %res + ret i1 %obit +} + +define zeroext i1 @uaddo1.i32.addzero(i32 %v1, i32 %v2, ptr %res) { +; CHECK-LABEL: uaddo1.i32.addzero: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: str w8, [x2] +; CHECK-NEXT: ret +entry: + %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 0) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + store i32 %val, ptr %res + ret i1 %obit +} + +define i32 @saddo.select.i64(i32 %v1, i32 %v2, i1 %v3, i64 %v4, i64 %v5) { +; SDAG-LABEL: saddo.select.i64: +; SDAG: // %bb.0: // %entry +; SDAG-NEXT: mov w0, w1 +; SDAG-NEXT: ret +; +; GISEL-LABEL: saddo.select.i64: +; GISEL: // %bb.0: // %entry +; GISEL-NEXT: mov w8, #13 // =0xd +; GISEL-NEXT: and x9, x3, #0xc +; GISEL-NEXT: and x8, x4, x8 +; GISEL-NEXT: cmn x9, x8 +; GISEL-NEXT: cset w8, vs +; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: ret +entry: + %lhs = and i64 %v4, 12 + %rhs = and i64 %v5, 13 + %t = call {i64, i1} @llvm.sadd.with.overflow.64(i64 %lhs, i64 %rhs) + %obit = extractvalue {i64, i1} %t, 1 + %ret = select i1 %obit, i32 %v1, i32 %v2 + ret i32 %ret +} + +define i32 @uaddo.select.i64(i32 %v1, i32 %v2, i1 %v3, i64 %v4, i64 %v5) { +; SDAG-LABEL: uaddo.select.i64: +; SDAG: // %bb.0: // %entry +; SDAG-NEXT: mov w0, w1 +; SDAG-NEXT: ret +; +; GISEL-LABEL: uaddo.select.i64: +; GISEL: // %bb.0: // %entry +; GISEL-NEXT: mov w8, #9 // =0x9 +; GISEL-NEXT: mov w9, #10 // =0xa +; GISEL-NEXT: and x8, x3, x8 +; GISEL-NEXT: and x9, x4, x9 +; GISEL-NEXT: cmn x8, x9 +; GISEL-NEXT: cset w8, hs 
+; GISEL-NEXT: tst w8, #0x1 +; GISEL-NEXT: csel w0, w0, w1, ne +; GISEL-NEXT: ret +entry: + %lhs = and i64 %v4, 9 + %rhs = and i64 %v5, 10 + %t = call {i64, i1} @llvm.uadd.with.overflow.64(i64 %lhs, i64 %rhs) + %obit = extractvalue {i64, i1} %t, 1 + %ret = select i1 %obit, i32 %v1, i32 %v2 + ret i32 %ret +} + +define zeroext i1 @saddo.canon.i32(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) { +; SDAG-LABEL: saddo.canon.i32: +; SDAG: // %bb.0: // %entry +; SDAG-NEXT: mov w0, wzr +; SDAG-NEXT: str w4, [x5] +; SDAG-NEXT: ret +; +; GISEL-LABEL: saddo.canon.i32: +; GISEL: // %bb.0: // %entry +; GISEL-NEXT: adds w8, wzr, w4 +; GISEL-NEXT: cset w0, vs +; GISEL-NEXT: str w8, [x5] +; GISEL-NEXT: ret +entry: + %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 0, i32 %v5) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + store i32 %val, ptr %res + ret i1 %obit +} +define zeroext i1 @saddo.add.i32(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) { +; CHECK-LABEL: saddo.add.i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add w8, w4, #100 +; CHECK-NEXT: subs w8, w8, #100 +; CHECK-NEXT: cset w0, vs +; CHECK-NEXT: str w8, [x5] +; CHECK-NEXT: ret +entry: + %lhs = add nsw i32 %v5, 100 + %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %lhs, i32 -100) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + store i32 %val, ptr %res + ret i1 %obit +} + +define zeroext i1 @uaddo.add.i32(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) { +; CHECK-LABEL: uaddo.add.i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add w8, w4, #5 +; CHECK-NEXT: adds w8, w8, #5 +; CHECK-NEXT: cset w0, hs +; CHECK-NEXT: str w8, [x5] +; CHECK-NEXT: ret +entry: + %lhs = add nuw i32 %v5, 5 + %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %lhs, i32 5) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + store i32 %val, ptr %res + ret i1 %obit +} + +define zeroext i1 @uaddo.negative.i32(i32 %v1, i32 
%v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) { +; CHECK-LABEL: uaddo.negative.i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adds w8, w3, #5 +; CHECK-NEXT: cset w0, hs +; CHECK-NEXT: str w8, [x5] +; CHECK-NEXT: ret +entry: + %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v4, i32 5) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + store i32 %val, ptr %res + ret i1 %obit +} + +define i32 @saddo.always.i8(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) { +; CHECK-LABEL: saddo.always.i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w0, w1 +; CHECK-NEXT: ret +entry: + %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 255, i8 254) + %obit = extractvalue {i8, i1} %t, 1 + %ret = select i1 %obit, i32 %v1, i32 %v2 + ret i32 %ret +} + +define i32 @uaddo.never.i8(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, ptr %res) { +; CHECK-LABEL: uaddo.never.i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w0, w1 +; CHECK-NEXT: ret +entry: + %t = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 2, i8 1) + %obit = extractvalue {i8, i1} %t, 1 + %ret = select i1 %obit, i32 %v1, i32 %v2 + ret i32 %ret +} + + +declare {i8, i1} @llvm.sadd.with.overflow.i8(i8, i8) nounwind readnone +declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) nounwind readnone +declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone +declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone +declare {i8, i1} @llvm.uadd.with.overflow.i8(i8, i8) nounwind readnone +declare {i16, i1} @llvm.uadd.with.overflow.i16(i16, i16) nounwind readnone +declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone +declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone +declare {i8, i1} @llvm.ssub.with.overflow.i8(i8, i8) nounwind readnone +declare {i16, i1} @llvm.ssub.with.overflow.i16(i16, i16) nounwind readnone +declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone +declare {i64, i1} 
@llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone +declare {i8, i1} @llvm.usub.with.overflow.i8(i8, i8) nounwind readnone +declare {i16, i1} @llvm.usub.with.overflow.i16(i16, i16) nounwind readnone +declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone +declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone +declare {i8, i1} @llvm.smul.with.overflow.i8(i8, i8) nounwind readnone +declare {i16, i1} @llvm.smul.with.overflow.i16(i16, i16) nounwind readnone +declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone +declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone +declare {i8, i1} @llvm.umul.with.overflow.i8(i8, i8) nounwind readnone +declare {i16, i1} @llvm.umul.with.overflow.i16(i16, i16) nounwind readnone +declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone +declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone From 85da9f80b89be7b96bc1b22909062286fab9dc31 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 25 Feb 2024 12:14:44 +0000 Subject: [PATCH 244/546] [VPlan] Remove unused VPTransformState::VPValue2Value (NFCI). Clean up unused member variable. --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 4 ---- llvm/lib/Transforms/Vectorize/VPlan.h | 2 -- 2 files changed, 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 56310dc11786c..4ffbed4b705ca 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -829,10 +829,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, /// Assumes a single pre-header basic-block was created for this. Introduce /// additional basic-blocks as needed, and fill them all. void VPlan::execute(VPTransformState *State) { - // Set the reverse mapping from VPValues to Values for code generation. 
- for (auto &Entry : Value2VPValue) - State->VPValue2Value[Entry.second] = Entry.first; - // Initialize CFG state. State->CFG.PrevVPBB = nullptr; State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index a2a203c420051..aaddaaff2aba2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -383,8 +383,6 @@ struct VPTransformState { /// Hold a reference to the IRBuilder used to generate output IR code. IRBuilderBase &Builder; - VPValue2ValueTy VPValue2Value; - /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods. InnerLoopVectorizer *ILV; From 4bf06c16fcddcfcea332069bdde5cbf1401513cf Mon Sep 17 00:00:00 2001 From: Martin Wehking Date: Sun, 25 Feb 2024 12:56:12 +0000 Subject: [PATCH 245/546] Initialize unsigned integer when declared (#81894) Initialize ModOpcode directly before the loop execution to silence static analyzer warnings about the usage of an uninitialized variable. This leads to a redundant assignment of ElV2F16 inside the first loop execution, but also avoids superfluous emptiness checks of EltsV2F16 after the first execution of the loop. 
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 9 ++++--- .../AMDGPU/AMDGPUInstructionSelector.cpp | 27 ++++++++++--------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 024adcda0fa06..4896ae8bad9ef 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3261,15 +3261,16 @@ bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; unsigned Mods = SISrcMods::OP_SEL_1; - unsigned ModOpcode; SmallVector EltsF32; if (auto *BV = dyn_cast(stripBitcast(In))) { + assert(BV->getNumOperands() > 0); + // Based on first element decide which mod we match, neg or abs + SDValue ElF32 = stripBitcast(BV->getOperand(0)); + unsigned ModOpcode = + (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS; for (unsigned i = 0; i < BV->getNumOperands(); ++i) { SDValue ElF32 = stripBitcast(BV->getOperand(i)); - // Based on first element decide which mod we match, neg or abs - if (EltsF32.empty()) - ModOpcode = (ElF32.getOpcode() == ISD::FNEG) ? 
ISD::FNEG : ISD::FABS; if (ElF32.getOpcode() != ModOpcode) break; EltsF32.push_back(ElF32.getOperand(0)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index aed9bffc551f4..aacc3590a5dbf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4019,16 +4019,17 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const { Register Src = Root.getReg(); unsigned Mods = SISrcMods::OP_SEL_1; - unsigned ModOpcode; SmallVector EltsF32; if (GBuildVector *BV = dyn_cast(MRI->getVRegDef(Src))) { + assert(BV->getNumSources() > 0); + // Based on first element decide which mod we match, neg or abs + MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0)); + unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG) + ? AMDGPU::G_FNEG + : AMDGPU::G_FABS; for (unsigned i = 0; i < BV->getNumSources(); ++i) { - MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(i)); - // Based on first element decide which mod we match, neg or abs - if (EltsF32.empty()) - ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG - : AMDGPU::G_FABS; + ElF32 = MRI->getVRegDef(BV->getSourceReg(i)); if (ElF32->getOpcode() != ModOpcode) break; EltsF32.push_back(ElF32->getOperand(1).getReg()); @@ -4075,16 +4076,18 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const { Register Src = Root.getReg(); unsigned Mods = SISrcMods::OP_SEL_1; - unsigned ModOpcode; SmallVector EltsV2F16; if (GConcatVectors *CV = dyn_cast(MRI->getVRegDef(Src))) { + assert(CV->getNumSources() > 0); + MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0)); + // Based on first element decide which mod we match, neg or abs + unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) + ? 
AMDGPU::G_FNEG + : AMDGPU::G_FABS; + for (unsigned i = 0; i < CV->getNumSources(); ++i) { - MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i)); - // Based on first element decide which mod we match, neg or abs - if (EltsV2F16.empty()) - ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG - : AMDGPU::G_FABS; + ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i)); if (ElV2F16->getOpcode() != ModOpcode) break; EltsV2F16.push_back(ElV2F16->getOperand(1).getReg()); From fe42e72db29e48aa81eac2aa922afd90a7f01517 Mon Sep 17 00:00:00 2001 From: Rishabh Bali Date: Sun, 25 Feb 2024 18:42:22 +0530 Subject: [PATCH 246/546] [CodeGen] Port AtomicExpand to new Pass Manager (#71220) Port the `atomicexpand` pass to the new Pass Manager. Fixes #64559 --- llvm/include/llvm/CodeGen/AtomicExpand.h | 30 ++++ llvm/include/llvm/CodeGen/Passes.h | 2 +- llvm/include/llvm/InitializePasses.h | 2 +- llvm/include/llvm/LinkAllPasses.h | 1 + llvm/lib/CodeGen/AtomicExpandPass.cpp | 143 +++++++++++------- llvm/lib/CodeGen/CodeGen.cpp | 2 +- llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassRegistry.def | 1 + .../Target/AArch64/AArch64TargetMachine.cpp | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- llvm/lib/Target/ARC/ARCTargetMachine.cpp | 2 +- llvm/lib/Target/ARM/ARMTargetMachine.cpp | 2 +- llvm/lib/Target/BPF/BPFTargetMachine.cpp | 2 +- llvm/lib/Target/CSKY/CSKYTargetMachine.cpp | 2 +- .../Target/Hexagon/HexagonTargetMachine.cpp | 2 +- llvm/lib/Target/Lanai/LanaiTargetMachine.cpp | 2 +- .../LoongArch/LoongArchTargetMachine.cpp | 2 +- llvm/lib/Target/M68k/M68kTargetMachine.cpp | 2 +- .../lib/Target/MSP430/MSP430TargetMachine.cpp | 2 +- llvm/lib/Target/Mips/MipsTargetMachine.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 2 +- llvm/lib/Target/PowerPC/PPCTargetMachine.cpp | 2 +- llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 2 +- llvm/lib/Target/Sparc/SparcTargetMachine.cpp | 2 +- .../Target/SystemZ/SystemZTargetMachine.cpp | 2 +- 
llvm/lib/Target/VE/VETargetMachine.cpp | 2 +- .../WebAssembly/WebAssemblyTargetMachine.cpp | 2 +- llvm/lib/Target/X86/X86TargetMachine.cpp | 2 +- llvm/lib/Target/XCore/XCoreTargetMachine.cpp | 2 +- .../test/CodeGen/AMDGPU/idemponent-atomics.ll | 2 +- .../CodeGen/AMDGPU/private-memory-atomics.ll | 2 +- .../AtomicExpand/AArch64/atomicrmw-fp.ll | 2 +- .../AArch64/expand-atomicrmw-xchg-fp.ll | 4 +- .../AtomicExpand/AArch64/pcsections.ll | 2 +- .../AMDGPU/expand-atomic-i16-system.ll | 2 +- .../AtomicExpand/AMDGPU/expand-atomic-i16.ll | 4 +- .../AMDGPU/expand-atomic-i8-system.ll | 2 +- .../AtomicExpand/AMDGPU/expand-atomic-i8.ll | 4 +- ...and-atomic-rmw-fadd-flat-specialization.ll | 8 +- .../AMDGPU/expand-atomic-rmw-fadd.ll | 12 +- .../AMDGPU/expand-atomic-rmw-fmax.ll | 4 +- .../AMDGPU/expand-atomic-rmw-fmin.ll | 4 +- .../AMDGPU/expand-atomic-rmw-fsub.ll | 4 +- .../AMDGPU/expand-atomic-rmw-nand.ll | 4 +- .../expand-atomic-simplify-cfg-CAS-block.ll | 2 +- .../AtomicExpand/AMDGPU/unaligned-atomic.ll | 2 +- .../AtomicExpand/ARM/atomic-expansion-v7.ll | 2 +- .../AtomicExpand/ARM/atomic-expansion-v8.ll | 2 +- .../AtomicExpand/ARM/atomicrmw-fp.ll | 2 +- .../AtomicExpand/ARM/cmpxchg-weak.ll | 2 +- .../AtomicExpand/Hexagon/atomicrmw-fp.ll | 2 +- .../AtomicExpand/LoongArch/atomicrmw-fp.ll | 2 +- .../LoongArch/load-store-atomic.ll | 4 +- .../AtomicExpand/Mips/atomicrmw-fp.ll | 2 +- .../AtomicExpand/PowerPC/atomicrmw-fp.ll | 2 +- .../AtomicExpand/PowerPC/cfence-double.ll | 4 +- .../AtomicExpand/PowerPC/cfence-float.ll | 4 +- .../AtomicExpand/PowerPC/cmpxchg.ll | 4 +- .../AtomicExpand/PowerPC/issue55983.ll | 4 +- .../AtomicExpand/RISCV/atomicrmw-fp.ll | 2 +- .../Transforms/AtomicExpand/SPARC/libcalls.ll | 2 +- .../Transforms/AtomicExpand/SPARC/partword.ll | 2 +- .../AtomicExpand/X86/expand-atomic-libcall.ll | 2 +- .../X86/expand-atomic-non-integer.ll | 2 +- .../AtomicExpand/X86/expand-atomic-rmw-fp.ll | 2 +- .../X86/expand-atomic-rmw-initial-load.ll | 2 +- 
.../AtomicExpand/X86/expand-atomic-xchg-fp.ll | 2 +- llvm/tools/opt/optdriver.cpp | 2 +- 68 files changed, 201 insertions(+), 141 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/AtomicExpand.h diff --git a/llvm/include/llvm/CodeGen/AtomicExpand.h b/llvm/include/llvm/CodeGen/AtomicExpand.h new file mode 100644 index 0000000000000..1b8a988ef4866 --- /dev/null +++ b/llvm/include/llvm/CodeGen/AtomicExpand.h @@ -0,0 +1,30 @@ +//===-- AtomicExpand.h - Expand Atomic Instructions -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_ATOMICEXPAND_H +#define LLVM_CODEGEN_ATOMICEXPAND_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class Function; +class TargetMachine; + +class AtomicExpandPass : public PassInfoMixin { +private: + const TargetMachine *TM; + +public: + AtomicExpandPass(const TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_CODEGEN_ATOMICEXPAND_H diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index bbfb8a0dbe26a..3f0d81fa1d14b 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -44,7 +44,7 @@ namespace llvm { /// AtomicExpandPass - At IR level this pass replace atomic instructions with /// __atomic_* library calls, or target specific instruction which implement the /// same semantics in a way which better fits the target backend. 
- FunctionPass *createAtomicExpandPass(); + FunctionPass *createAtomicExpandLegacyPass(); /// createUnreachableBlockEliminationPass - The LLVM code generator does not /// work well with unreachable basic blocks (what live ranges make sense for a diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index f673bef2fd857..ee91c3ec3ddc2 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -52,7 +52,7 @@ void initializeAAResultsWrapperPassPass(PassRegistry&); void initializeAlwaysInlinerLegacyPassPass(PassRegistry&); void initializeAssignmentTrackingAnalysisPass(PassRegistry &); void initializeAssumptionCacheTrackerPass(PassRegistry&); -void initializeAtomicExpandPass(PassRegistry&); +void initializeAtomicExpandLegacyPass(PassRegistry &); void initializeBasicBlockPathCloningPass(PassRegistry &); void initializeBasicBlockSectionsProfileReaderWrapperPassPass(PassRegistry &); void initializeBasicBlockSectionsPass(PassRegistry &); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h index 729b12dc29e6d..2eaed42e62fcd 100644 --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -60,6 +60,7 @@ namespace { if (std::getenv("bar") != (char*) -1) return; + (void)llvm::createAtomicExpandLegacyPass(); (void) llvm::createBasicAAWrapperPass(); (void) llvm::createSCEVAAWrapperPass(); (void) llvm::createTypeBasedAAWrapperPass(); diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index faa3edb2b03c8..894285a7eb256 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/InstSimplifyFolder.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/CodeGen/AtomicExpand.h" #include "llvm/CodeGen/AtomicExpandUtils.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include 
"llvm/CodeGen/TargetLowering.h" @@ -59,19 +60,10 @@ using namespace llvm; namespace { -class AtomicExpand : public FunctionPass { +class AtomicExpandImpl { const TargetLowering *TLI = nullptr; const DataLayout *DL = nullptr; -public: - static char ID; // Pass identification, replacement for typeid - - AtomicExpand() : FunctionPass(ID) { - initializeAtomicExpandPass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override; - private: bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); @@ -124,6 +116,20 @@ class AtomicExpand : public FunctionPass { friend bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, CreateCmpXchgInstFun CreateCmpXchg); + +public: + bool run(Function &F, const TargetMachine *TM); +}; + +class AtomicExpandLegacy : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + + AtomicExpandLegacy() : FunctionPass(ID) { + initializeAtomicExpandLegacyPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; }; // IRBuilder to be used for replacement atomic instructions. @@ -138,14 +144,15 @@ struct ReplacementIRBuilder : IRBuilder { } // end anonymous namespace -char AtomicExpand::ID = 0; - -char &llvm::AtomicExpandID = AtomicExpand::ID; +char AtomicExpandLegacy::ID = 0; -INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", false, - false) +char &llvm::AtomicExpandID = AtomicExpandLegacy::ID; -FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); } +INITIALIZE_PASS_BEGIN(AtomicExpandLegacy, DEBUG_TYPE, + "Expand Atomic instructions", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AtomicExpandLegacy, DEBUG_TYPE, + "Expand Atomic instructions", false, false) // Helper functions to retrieve the size of atomic instructions. 
static unsigned getAtomicOpSize(LoadInst *LI) { @@ -179,13 +186,8 @@ static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) { Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8; } -bool AtomicExpand::runOnFunction(Function &F) { - auto *TPC = getAnalysisIfAvailable(); - if (!TPC) - return false; - - auto &TM = TPC->getTM(); - const auto *Subtarget = TM.getSubtargetImpl(F); +bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) { + const auto *Subtarget = TM->getSubtargetImpl(F); if (!Subtarget->enableAtomicExpand()) return false; TLI = Subtarget->getTargetLowering(); @@ -330,7 +332,33 @@ bool AtomicExpand::runOnFunction(Function &F) { return MadeChange; } -bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) { +bool AtomicExpandLegacy::runOnFunction(Function &F) { + + auto *TPC = getAnalysisIfAvailable(); + if (!TPC) + return false; + auto *TM = &TPC->getTM(); + AtomicExpandImpl AE; + return AE.run(F, TM); +} + +FunctionPass *llvm::createAtomicExpandLegacyPass() { + return new AtomicExpandLegacy(); +} + +PreservedAnalyses AtomicExpandPass::run(Function &F, + FunctionAnalysisManager &AM) { + AtomicExpandImpl AE; + + bool Changed = AE.run(F, TM); + if (!Changed) + return PreservedAnalyses::all(); + + return PreservedAnalyses::none(); +} + +bool AtomicExpandImpl::bracketInstWithFences(Instruction *I, + AtomicOrdering Order) { ReplacementIRBuilder Builder(I, *DL); auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order); @@ -345,8 +373,8 @@ bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) { } /// Get the iX type with the same bitwidth as T. 
-IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T, - const DataLayout &DL) { +IntegerType * +AtomicExpandImpl::getCorrespondingIntegerType(Type *T, const DataLayout &DL) { EVT VT = TLI->getMemValueType(DL, T); unsigned BitWidth = VT.getStoreSizeInBits(); assert(BitWidth == VT.getSizeInBits() && "must be a power of two"); @@ -356,7 +384,7 @@ IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T, /// Convert an atomic load of a non-integral type to an integer load of the /// equivalent bitwidth. See the function comment on /// convertAtomicStoreToIntegerType for background. -LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) { +LoadInst *AtomicExpandImpl::convertAtomicLoadToIntegerType(LoadInst *LI) { auto *M = LI->getModule(); Type *NewTy = getCorrespondingIntegerType(LI->getType(), M->getDataLayout()); @@ -377,7 +405,7 @@ LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) { } AtomicRMWInst * -AtomicExpand::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) { +AtomicExpandImpl::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) { auto *M = RMWI->getModule(); Type *NewTy = getCorrespondingIntegerType(RMWI->getType(), M->getDataLayout()); @@ -390,9 +418,8 @@ AtomicExpand::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) { ? 
Builder.CreatePtrToInt(Val, NewTy) : Builder.CreateBitCast(Val, NewTy); - auto *NewRMWI = - Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, Addr, NewVal, - RMWI->getAlign(), RMWI->getOrdering()); + auto *NewRMWI = Builder.CreateAtomicRMW( + AtomicRMWInst::Xchg, Addr, NewVal, RMWI->getAlign(), RMWI->getOrdering()); NewRMWI->setVolatile(RMWI->isVolatile()); LLVM_DEBUG(dbgs() << "Replaced " << *RMWI << " with " << *NewRMWI << "\n"); @@ -404,7 +431,7 @@ AtomicExpand::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) { return NewRMWI; } -bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) { +bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) { switch (TLI->shouldExpandAtomicLoadInIR(LI)) { case TargetLoweringBase::AtomicExpansionKind::None: return false; @@ -426,7 +453,7 @@ bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) { } } -bool AtomicExpand::tryExpandAtomicStore(StoreInst *SI) { +bool AtomicExpandImpl::tryExpandAtomicStore(StoreInst *SI) { switch (TLI->shouldExpandAtomicStoreInIR(SI)) { case TargetLoweringBase::AtomicExpansionKind::None: return false; @@ -441,7 +468,7 @@ bool AtomicExpand::tryExpandAtomicStore(StoreInst *SI) { } } -bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) { +bool AtomicExpandImpl::expandAtomicLoadToLL(LoadInst *LI) { ReplacementIRBuilder Builder(LI, *DL); // On some architectures, load-linked instructions are atomic for larger @@ -457,7 +484,7 @@ bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) { return true; } -bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) { +bool AtomicExpandImpl::expandAtomicLoadToCmpXchg(LoadInst *LI) { ReplacementIRBuilder Builder(LI, *DL); AtomicOrdering Order = LI->getOrdering(); if (Order == AtomicOrdering::Unordered) @@ -486,7 +513,7 @@ bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) { /// instruction select from the original atomic store, but as a migration /// mechanism, we convert back to the old format which the backends understand. 
/// Each backend will need individual work to recognize the new format. -StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) { +StoreInst *AtomicExpandImpl::convertAtomicStoreToIntegerType(StoreInst *SI) { ReplacementIRBuilder Builder(SI, *DL); auto *M = SI->getModule(); Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(), @@ -504,7 +531,7 @@ StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) { return NewSI; } -void AtomicExpand::expandAtomicStore(StoreInst *SI) { +void AtomicExpandImpl::expandAtomicStore(StoreInst *SI) { // This function is only called on atomic stores that are too large to be // atomic if implemented as a native store. So we replace them by an // atomic swap, that can be implemented for example as a ldrex/strex on ARM @@ -551,7 +578,7 @@ static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr, NewLoaded = Builder.CreateBitCast(NewLoaded, OrigTy); } -bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { +bool AtomicExpandImpl::tryExpandAtomicRMW(AtomicRMWInst *AI) { LLVMContext &Ctx = AI->getModule()->getContext(); TargetLowering::AtomicExpansionKind Kind = TLI->shouldExpandAtomicRMWInIR(AI); switch (Kind) { @@ -844,7 +871,7 @@ static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op, /// way as a typical atomicrmw expansion. The only difference here is /// that the operation inside of the loop may operate upon only a /// part of the value. -void AtomicExpand::expandPartwordAtomicRMW( +void AtomicExpandImpl::expandPartwordAtomicRMW( AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) { // Widen And/Or/Xor and give the target another chance at expanding it. 
AtomicRMWInst::BinOp Op = AI->getOperation(); @@ -853,7 +880,6 @@ void AtomicExpand::expandPartwordAtomicRMW( tryExpandAtomicRMW(widenPartwordAtomicRMW(AI)); return; } - AtomicOrdering MemOpOrder = AI->getOrdering(); SyncScope::ID SSID = AI->getSyncScopeID(); @@ -894,7 +920,7 @@ void AtomicExpand::expandPartwordAtomicRMW( } // Widen the bitwise atomicrmw (or/xor/and) to the minimum supported width. -AtomicRMWInst *AtomicExpand::widenPartwordAtomicRMW(AtomicRMWInst *AI) { +AtomicRMWInst *AtomicExpandImpl::widenPartwordAtomicRMW(AtomicRMWInst *AI) { ReplacementIRBuilder Builder(AI, *DL); AtomicRMWInst::BinOp Op = AI->getOperation(); @@ -929,7 +955,7 @@ AtomicRMWInst *AtomicExpand::widenPartwordAtomicRMW(AtomicRMWInst *AI) { return NewAI; } -bool AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) { +bool AtomicExpandImpl::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) { // The basic idea here is that we're expanding a cmpxchg of a // smaller memory size up to a word-sized cmpxchg. To do this, we // need to add a retry-loop for strong cmpxchg, so that @@ -1054,7 +1080,7 @@ bool AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) { return true; } -void AtomicExpand::expandAtomicOpToLLSC( +void AtomicExpandImpl::expandAtomicOpToLLSC( Instruction *I, Type *ResultType, Value *Addr, Align AddrAlign, AtomicOrdering MemOpOrder, function_ref PerformOp) { @@ -1066,7 +1092,7 @@ void AtomicExpand::expandAtomicOpToLLSC( I->eraseFromParent(); } -void AtomicExpand::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) { +void AtomicExpandImpl::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) { ReplacementIRBuilder Builder(AI, *DL); PartwordMaskValues PMV = @@ -1092,7 +1118,8 @@ void AtomicExpand::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) { AI->eraseFromParent(); } -void AtomicExpand::expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI) { +void AtomicExpandImpl::expandAtomicCmpXchgToMaskedIntrinsic( + AtomicCmpXchgInst *CI) { ReplacementIRBuilder 
Builder(CI, *DL); PartwordMaskValues PMV = createMaskInstrs( @@ -1119,7 +1146,7 @@ void AtomicExpand::expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI) { CI->eraseFromParent(); } -Value *AtomicExpand::insertRMWLLSCLoop( +Value *AtomicExpandImpl::insertRMWLLSCLoop( IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign, AtomicOrdering MemOpOrder, function_ref PerformOp) { @@ -1175,7 +1202,7 @@ Value *AtomicExpand::insertRMWLLSCLoop( /// way to represent a pointer cmpxchg so that we can update backends one by /// one. AtomicCmpXchgInst * -AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) { +AtomicExpandImpl::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) { auto *M = CI->getModule(); Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(), M->getDataLayout()); @@ -1208,7 +1235,7 @@ AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) { return NewCI; } -bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { +bool AtomicExpandImpl::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { AtomicOrdering SuccessOrder = CI->getSuccessOrdering(); AtomicOrdering FailureOrder = CI->getFailureOrdering(); Value *Addr = CI->getPointerOperand(); @@ -1454,7 +1481,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { return true; } -bool AtomicExpand::isIdempotentRMW(AtomicRMWInst *RMWI) { +bool AtomicExpandImpl::isIdempotentRMW(AtomicRMWInst *RMWI) { auto C = dyn_cast(RMWI->getValOperand()); if (!C) return false; @@ -1474,7 +1501,7 @@ bool AtomicExpand::isIdempotentRMW(AtomicRMWInst *RMWI) { } } -bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst *RMWI) { +bool AtomicExpandImpl::simplifyIdempotentRMW(AtomicRMWInst *RMWI) { if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) { tryExpandAtomicLoad(ResultingLoad); return true; @@ -1482,7 +1509,7 @@ bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst *RMWI) { return false; } -Value *AtomicExpand::insertRMWCmpXchgLoop( 
+Value *AtomicExpandImpl::insertRMWCmpXchgLoop( IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign, AtomicOrdering MemOpOrder, SyncScope::ID SSID, function_ref PerformOp, @@ -1543,7 +1570,7 @@ Value *AtomicExpand::insertRMWCmpXchgLoop( return NewLoaded; } -bool AtomicExpand::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) { +bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) { unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8; unsigned ValueSize = getAtomicOpSize(CI); @@ -1574,7 +1601,7 @@ bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, // FIXME: If FP exceptions are observable, we should force them off for the // loop for the FP atomics. - Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop( + Value *Loaded = AtomicExpandImpl::insertRMWCmpXchgLoop( Builder, AI->getType(), AI->getPointerOperand(), AI->getAlign(), AI->getOrdering(), AI->getSyncScopeID(), [&](IRBuilderBase &Builder, Value *Loaded) { @@ -1608,7 +1635,7 @@ static bool canUseSizedAtomicCall(unsigned Size, Align Alignment, Size <= LargestSize; } -void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) { +void AtomicExpandImpl::expandAtomicLoadToLibcall(LoadInst *I) { static const RTLIB::Libcall Libcalls[6] = { RTLIB::ATOMIC_LOAD, RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2, RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16}; @@ -1621,7 +1648,7 @@ void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) { report_fatal_error("expandAtomicOpToLibcall shouldn't fail for Load"); } -void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) { +void AtomicExpandImpl::expandAtomicStoreToLibcall(StoreInst *I) { static const RTLIB::Libcall Libcalls[6] = { RTLIB::ATOMIC_STORE, RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2, RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16}; @@ -1634,7 +1661,7 @@ void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) { report_fatal_error("expandAtomicOpToLibcall shouldn't fail for Store"); } 
-void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) { +void AtomicExpandImpl::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) { static const RTLIB::Libcall Libcalls[6] = { RTLIB::ATOMIC_COMPARE_EXCHANGE, RTLIB::ATOMIC_COMPARE_EXCHANGE_1, RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4, @@ -1712,7 +1739,7 @@ static ArrayRef GetRMWLibcall(AtomicRMWInst::BinOp Op) { llvm_unreachable("Unexpected AtomicRMW operation."); } -void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) { +void AtomicExpandImpl::expandAtomicRMWToLibcall(AtomicRMWInst *I) { ArrayRef Libcalls = GetRMWLibcall(I->getOperation()); unsigned Size = getAtomicOpSize(I); @@ -1751,7 +1778,7 @@ void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) { // ATOMIC libcalls to be emitted. All of the other arguments besides // 'I' are extracted from the Instruction subclass by the // caller. Depending on the particular call, some will be null. -bool AtomicExpand::expandAtomicOpToLibcall( +bool AtomicExpandImpl::expandAtomicOpToLibcall( Instruction *I, unsigned Size, Align Alignment, Value *PointerOperand, Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering, AtomicOrdering Ordering2, ArrayRef Libcalls) { diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 418066452c172..be1813451228d 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -19,7 +19,7 @@ using namespace llvm; /// initializeCodeGen - Initialize all passes linked into the CodeGen library. 
void llvm::initializeCodeGen(PassRegistry &Registry) { initializeAssignmentTrackingAnalysisPass(Registry); - initializeAtomicExpandPass(Registry); + initializeAtomicExpandLegacyPass(Registry); initializeBasicBlockPathCloningPass(Registry); initializeBasicBlockSectionsPass(Registry); initializeBranchFolderPassPass(Registry); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index fed7a14c8a2e3..e0bc57f8bf72f 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -73,6 +73,7 @@ #include "llvm/Analysis/TypeBasedAliasAnalysis.h" #include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/CodeGen/AssignmentTrackingAnalysis.h" +#include "llvm/CodeGen/AtomicExpand.h" #include "llvm/CodeGen/BasicBlockSectionsProfileReader.h" #include "llvm/CodeGen/CallBrPrepare.h" #include "llvm/CodeGen/CodeGenPrepare.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 093c1f8aaad43..a345e8d72d939 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -302,6 +302,7 @@ FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass()) FUNCTION_PASS("annotation-remarks", AnnotationRemarksPass()) FUNCTION_PASS("assume-builder", AssumeBuilderPass()) FUNCTION_PASS("assume-simplify", AssumeSimplifyPass()) +FUNCTION_PASS("atomic-expand", AtomicExpandPass(TM)) FUNCTION_PASS("bdce", BDCEPass()) FUNCTION_PASS("bounds-checking", BoundsCheckingPass()) FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass()) diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 81bb6e59422fa..64c4ecd1fd6d5 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -569,7 +569,7 @@ std::unique_ptr AArch64PassConfig::getCSEConfig() const { void AArch64PassConfig::addIRPasses() { // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg // 
ourselves. - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); // Expand any SVE vector library calls that we can't code generate directly. if (EnableSVEIntrinsicOpts && diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index a0c6e3a1b49b7..0d830df1f1f1d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1058,7 +1058,7 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy)); } - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); if (TM.getOptLevel() > CodeGenOptLevel::None) { addPass(createAMDGPUPromoteAlloca()); diff --git a/llvm/lib/Target/ARC/ARCTargetMachine.cpp b/llvm/lib/Target/ARC/ARCTargetMachine.cpp index 4f612ae623b98..f50c3c0265e34 100644 --- a/llvm/lib/Target/ARC/ARCTargetMachine.cpp +++ b/llvm/lib/Target/ARC/ARCTargetMachine.cpp @@ -70,7 +70,7 @@ TargetPassConfig *ARCTargetMachine::createPassConfig(PassManagerBase &PM) { } void ARCPassConfig::addIRPasses() { - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); TargetPassConfig::addIRPasses(); } diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index a99773691df12..4ef00df57ef9a 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -418,7 +418,7 @@ void ARMPassConfig::addIRPasses() { if (TM->Options.ThreadModel == ThreadModel::Single) addPass(createLowerAtomicPass()); else - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); // Cmpxchg instructions are often used with a subsequent comparison to // determine whether it succeeded. 
We can exploit existing control-flow in diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp index 8a6e7ae3663e0..08ac4b25540f7 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp +++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp @@ -149,7 +149,7 @@ void BPFTargetMachine::registerPassBuilderCallbacks( } void BPFPassConfig::addIRPasses() { - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); addPass(createBPFCheckAndAdjustIR()); TargetPassConfig::addIRPasses(); diff --git a/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp b/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp index 8c268dc316141..0bbfabe93147c 100644 --- a/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp +++ b/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp @@ -118,7 +118,7 @@ TargetPassConfig *CSKYTargetMachine::createPassConfig(PassManagerBase &PM) { } void CSKYPassConfig::addIRPasses() { - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); TargetPassConfig::addIRPasses(); } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index a5ebd64f1f8af..170684276ac3c 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -366,7 +366,7 @@ void HexagonPassConfig::addIRPasses() { addPass(createDeadCodeEliminationPass()); } - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); if (!NoOpt) { if (EnableInitialCFGCleanup) diff --git a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp index 33479720183b4..2357221b0120f 100644 --- a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp +++ b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp @@ -106,7 +106,7 @@ LanaiTargetMachine::createPassConfig(PassManagerBase &PassManager) { } void LanaiPassConfig::addIRPasses() { - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); 
TargetPassConfig::addIRPasses(); } diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index 62ae1dea00d6f..e5494488e1135 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -161,7 +161,7 @@ void LoongArchPassConfig::addIRPasses() { // pointer values N iterations ahead. if (TM->getOptLevel() != CodeGenOptLevel::None && EnableLoopDataPrefetch) addPass(createLoopDataPrefetchPass()); - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); TargetPassConfig::addIRPasses(); } diff --git a/llvm/lib/Target/M68k/M68kTargetMachine.cpp b/llvm/lib/Target/M68k/M68kTargetMachine.cpp index af8cb9a83a050..bbbcb1556ed55 100644 --- a/llvm/lib/Target/M68k/M68kTargetMachine.cpp +++ b/llvm/lib/Target/M68k/M68kTargetMachine.cpp @@ -171,7 +171,7 @@ TargetPassConfig *M68kTargetMachine::createPassConfig(PassManagerBase &PM) { } void M68kPassConfig::addIRPasses() { - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); TargetPassConfig::addIRPasses(); } diff --git a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp index 283de46e57d5c..ed0fcf7110b78 100644 --- a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -83,7 +83,7 @@ MachineFunctionInfo *MSP430TargetMachine::createMachineFunctionInfo( } void MSP430PassConfig::addIRPasses() { - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); TargetPassConfig::addIRPasses(); } diff --git a/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/llvm/lib/Target/Mips/MipsTargetMachine.cpp index 0742228369292..4c4bf70e22c6c 100644 --- a/llvm/lib/Target/Mips/MipsTargetMachine.cpp +++ b/llvm/lib/Target/Mips/MipsTargetMachine.cpp @@ -263,7 +263,7 @@ std::unique_ptr MipsPassConfig::getCSEConfig() const { void MipsPassConfig::addIRPasses() { 
TargetPassConfig::addIRPasses(); - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); if (getMipsSubtarget().os16()) addPass(createMipsOs16Pass()); if (getMipsSubtarget().inMips16HardFloat()) diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index fad69f5e80a7a..69d4596f7843e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -380,7 +380,7 @@ void NVPTXPassConfig::addIRPasses() { addStraightLineScalarOptimizationPasses(); } - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); addPass(createNVPTXCtorDtorLoweringLegacyPass()); // === LSR and other generic IR passes === diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index d676fa86a10e7..714cf69827a1e 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -457,7 +457,7 @@ TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) { void PPCPassConfig::addIRPasses() { if (TM->getOptLevel() != CodeGenOptLevel::None) addPass(createPPCBoolRetToIntPass()); - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); // Lower generic MASSV routines to PowerPC subtarget-specific entries. 
addPass(createPPCLowerMASSVEntriesPass()); diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 3e20e451410f6..be2d4c5c17d1e 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -422,7 +422,7 @@ bool RISCVPassConfig::addRegAssignAndRewriteOptimized() { } void RISCVPassConfig::addIRPasses() { - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); if (getOptLevel() != CodeGenOptLevel::None) { if (EnableLoopDataPrefetch) diff --git a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp index b408af2ea5943..20ddafb0e43d0 100644 --- a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp @@ -175,7 +175,7 @@ TargetPassConfig *SparcTargetMachine::createPassConfig(PassManagerBase &PM) { } void SparcPassConfig::addIRPasses() { - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); TargetPassConfig::addIRPasses(); } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp index 73e01e3ec1844..121512d5a7e58 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -228,7 +228,7 @@ void SystemZPassConfig::addIRPasses() { addPass(createLoopDataPrefetchPass()); } - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); TargetPassConfig::addIRPasses(); } diff --git a/llvm/lib/Target/VE/VETargetMachine.cpp b/llvm/lib/Target/VE/VETargetMachine.cpp index 6d102bfd3926a..6f4e137e4d2f1 100644 --- a/llvm/lib/Target/VE/VETargetMachine.cpp +++ b/llvm/lib/Target/VE/VETargetMachine.cpp @@ -134,7 +134,7 @@ TargetPassConfig *VETargetMachine::createPassConfig(PassManagerBase &PM) { void VEPassConfig::addIRPasses() { // VE requires atomic expand pass. 
- addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); TargetPassConfig::addIRPasses(); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 3120b6b67906e..b2f7ee970a732 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -493,7 +493,7 @@ void WebAssemblyPassConfig::addISelPrepare() { addPass(new CoalesceFeaturesAndStripAtomics(&getWebAssemblyTargetMachine())); // This is a no-op if atomics are not used in the module - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); TargetPassConfig::addISelPrepare(); } diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 279a1efdff978..276bc7f08d4cd 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -440,7 +440,7 @@ MachineFunctionInfo *X86TargetMachine::createMachineFunctionInfo( } void X86PassConfig::addIRPasses() { - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); // We add both pass anyway and when these two passes run, we skip the pass // based on the option level and option attribute. 
diff --git a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp index 345a8365ed49b..374e91d01bdac 100644 --- a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp @@ -84,7 +84,7 @@ TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) { } void XCorePassConfig::addIRPasses() { - addPass(createAtomicExpandPass()); + addPass(createAtomicExpandLegacyPass()); TargetPassConfig::addIRPasses(); } diff --git a/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll b/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll index f45fc22783d1f..6385466e13416 100644 --- a/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/idemponent-atomics.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -atomic-expand < %s | FileCheck --check-prefix=OPT %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand < %s | FileCheck --check-prefix=OPT %s define i32 @global_agent_monotonic_idempotent_or(ptr addrspace(1) %in) { ; GFX940-LABEL: global_agent_monotonic_idempotent_or: diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll index 81ad160475683..6fdc0d5834ef6 100644 --- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -atomic-expand < %s | FileCheck -check-prefix=IR %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -passes=atomic-expand < %s | FileCheck -check-prefix=IR %s ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s define i32 @load_atomic_private_seq_cst_i32(ptr addrspace(5) 
%ptr) { diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll index 2fc848a3a810b..ba6802f85c03c 100644 --- a/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll +++ b/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=aarch64-linux-gnu -atomic-expand %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64-linux-gnu -passes=atomic-expand %s | FileCheck %s define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) { ; CHECK-LABEL: @test_atomicrmw_fadd_f32( diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll index 47d626261bfc4..ef2b5fe3672be 100644 --- a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll +++ b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -codegen-opt-level=1 -S -mtriple=aarch64-- -atomic-expand %s | FileCheck %s -; RUN: opt -codegen-opt-level=1 -S -mtriple=aarch64-- -mattr=+outline-atomics -atomic-expand %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS +; RUN: opt -codegen-opt-level=1 -S -mtriple=aarch64-- -passes=atomic-expand %s | FileCheck %s +; RUN: opt -codegen-opt-level=1 -S -mtriple=aarch64-- -mattr=+outline-atomics -passes=atomic-expand %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS define void @atomic_swap_f16(ptr %ptr, half %val) nounwind { ; CHECK-LABEL: @atomic_swap_f16( diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/pcsections.ll b/llvm/test/Transforms/AtomicExpand/AArch64/pcsections.ll index 2e9efe911e6d6..cc42407c0210e 100644 --- a/llvm/test/Transforms/AtomicExpand/AArch64/pcsections.ll +++ b/llvm/test/Transforms/AtomicExpand/AArch64/pcsections.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions 
have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=aarch64-linux-gnu -atomic-expand %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64-linux-gnu -passes=atomic-expand %s | FileCheck %s define i8 @atomic8_load_unordered(ptr %a) nounwind uwtable { ; CHECK-LABEL: @atomic8_load_unordered( diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll index b846c1f77538e..08d990fb58039 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -atomic-expand %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand %s | FileCheck %s target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll index 7f5d6e7cb76f8..94f1b733877ed 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -atomic-expand %s | FileCheck %s -; RUN: opt -mtriple=r600-mesa-mesa3d -S -atomic-expand %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand %s | FileCheck %s +; RUN: opt -mtriple=r600-mesa-mesa3d -S -passes=atomic-expand %s | FileCheck %s target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll index f796d3cca3036..80cf19ed8c636 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -atomic-expand %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand %s | FileCheck %s define i8 @test_atomicrmw_xchg_i8_global_system(ptr addrspace(1) %ptr, i8 %value) { ; CHECK-LABEL: @test_atomicrmw_xchg_i8_global_system( diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll index 6a6e416bdbc89..711580158e725 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -atomic-expand %s | FileCheck %s --check-prefixes=CHECK,GCN -; RUN: opt -mtriple=r600-mesa-mesa3d -S -atomic-expand %s | FileCheck %s --check-prefixes=CHECK,R600 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand %s | FileCheck %s --check-prefixes=CHECK,GCN +; RUN: opt -mtriple=r600-mesa-mesa3d -S -passes=atomic-expand %s | FileCheck %s --check-prefixes=CHECK,R600 define i8 @test_atomicrmw_xchg_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) { ; GCN-LABEL: @test_atomicrmw_xchg_i8_global_agent( diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll 
b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll index 5d7825bb37887..c2f7057dc26f3 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -atomic-expand %s | FileCheck -check-prefix=GFX908 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -atomic-expand %s | FileCheck -check-prefix=GFX90A %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -atomic-expand %s | FileCheck -check-prefix=GFX940 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -atomic-expand %s | FileCheck -check-prefix=GFX1100 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefix=GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefix=GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefix=GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefix=GFX1100 %s define float @syncscope_system(ptr %addr, float %val) #0 { ; GFX908-LABEL: @syncscope_system( diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index 97c041168d147..b6e6b26024952 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -atomic-expand %s | FileCheck -check-prefix=CI %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 
-atomic-expand %s | FileCheck -check-prefix=GFX9 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -atomic-expand %s | FileCheck -check-prefix=GFX908 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -atomic-expand %s | FileCheck -check-prefix=GFX90A %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -atomic-expand %s | FileCheck -check-prefix=GFX940 %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -atomic-expand %s | FileCheck -check-prefix=GFX11 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=atomic-expand %s | FileCheck -check-prefix=CI %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefix=GFX9 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefix=GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefix=GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefix=GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=atomic-expand %s | FileCheck -check-prefix=GFX11 %s define void @test_atomicrmw_fadd_f32_global_no_use_unsafe(ptr addrspace(1) %ptr, float %value) #0 { ; CI-LABEL: @test_atomicrmw_fadd_f32_global_no_use_unsafe( diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll index 9dfbe9b4eb741..5a79bc2680769 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -atomic-expand %s | FileCheck -check-prefix=GCN %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -atomic-expand %s | FileCheck -check-prefix=GCN %s +; RUN: opt -S 
-mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=atomic-expand %s | FileCheck -check-prefix=GCN %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefix=GCN %s define float @test_atomicrmw_fmax_f32_flat(ptr %ptr, float %value) { ; GCN-LABEL: @test_atomicrmw_fmax_f32_flat( diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll index 5a732653b48b1..e3d3bfde3be68 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -atomic-expand %s | FileCheck -check-prefix=GCN %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -atomic-expand %s | FileCheck -check-prefix=GCN %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=atomic-expand %s | FileCheck -check-prefix=GCN %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefix=GCN %s define float @test_atomicrmw_fmin_f32_flat(ptr %ptr, float %value) { ; GCN-LABEL: @test_atomicrmw_fmin_f32_flat( diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll index 9805c317b9215..bbcc6b8a2724f 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -atomic-expand %s | FileCheck -check-prefix=GCN %s -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -atomic-expand %s | FileCheck -check-prefix=GCN %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii 
-passes=atomic-expand %s | FileCheck -check-prefix=GCN %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefix=GCN %s define float @test_atomicrmw_fsub_f32_flat(ptr %ptr, float %value) { ; GCN-LABEL: @test_atomicrmw_fsub_f32_flat( diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-nand.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-nand.ll index 5fa9dcc4ad9bf..ac88ff1dd8807 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-nand.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-nand.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -atomic-expand %s | FileCheck %s -; RUN: opt -mtriple=r600-mesa-mesa3d -S -atomic-expand %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand %s | FileCheck %s +; RUN: opt -mtriple=r600-mesa-mesa3d -S -passes=atomic-expand %s | FileCheck %s define i32 @test_atomicrmw_nand_i32_flat(ptr %ptr, i32 %value) { ; CHECK-LABEL: @test_atomicrmw_nand_i32_flat( diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-simplify-cfg-CAS-block.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-simplify-cfg-CAS-block.ll index aceb897a7d487..e926519783594 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-simplify-cfg-CAS-block.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-simplify-cfg-CAS-block.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -atomic-expand %s | FileCheck -check-prefix=GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefix=GFX90A %s declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/unaligned-atomic.ll 
b/llvm/test/Transforms/AtomicExpand/AMDGPU/unaligned-atomic.ll index 6c84474edc05b..acf726a7de5e0 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/unaligned-atomic.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/unaligned-atomic.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -atomic-expand %s 2>&1 | FileCheck --check-prefix=GCN %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s 2>&1 | FileCheck --check-prefix=GCN %s define i32 @atomic_load_global_align1(ptr addrspace(1) %ptr) { ; GCN-LABEL: @atomic_load_global_align1( diff --git a/llvm/test/Transforms/AtomicExpand/ARM/atomic-expansion-v7.ll b/llvm/test/Transforms/AtomicExpand/ARM/atomic-expansion-v7.ll index 353aafb9727a5..2e72d26ed4566 100644 --- a/llvm/test/Transforms/AtomicExpand/ARM/atomic-expansion-v7.ll +++ b/llvm/test/Transforms/AtomicExpand/ARM/atomic-expansion-v7.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -o - -mtriple=armv7-apple-ios7.0 -atomic-expand -codegen-opt-level=1 %s | FileCheck %s +; RUN: opt -S -o - -mtriple=armv7-apple-ios7.0 -passes=atomic-expand -codegen-opt-level=1 %s | FileCheck %s define i8 @test_atomic_xchg_i8(ptr %ptr, i8 %xchgend) { ; CHECK-LABEL: @test_atomic_xchg_i8 diff --git a/llvm/test/Transforms/AtomicExpand/ARM/atomic-expansion-v8.ll b/llvm/test/Transforms/AtomicExpand/ARM/atomic-expansion-v8.ll index bad28b2b6824e..10073e23f5d46 100644 --- a/llvm/test/Transforms/AtomicExpand/ARM/atomic-expansion-v8.ll +++ b/llvm/test/Transforms/AtomicExpand/ARM/atomic-expansion-v8.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -o - -mtriple=armv8-linux-gnueabihf -atomic-expand %s -codegen-opt-level=1 | FileCheck %s +; RUN: opt -S -o - -mtriple=armv8-linux-gnueabihf -passes=atomic-expand %s -codegen-opt-level=1 | FileCheck %s define i8 @test_atomic_xchg_i8(ptr %ptr, i8 %xchgend) { ; CHECK-LABEL: @test_atomic_xchg_i8 diff --git a/llvm/test/Transforms/AtomicExpand/ARM/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/ARM/atomicrmw-fp.ll index 
d0268bf3e0079..9c4ce50da6917 100644 --- a/llvm/test/Transforms/AtomicExpand/ARM/atomicrmw-fp.ll +++ b/llvm/test/Transforms/AtomicExpand/ARM/atomicrmw-fp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=armv7-apple-ios7.0 -atomic-expand %s | FileCheck %s +; RUN: opt -S -mtriple=armv7-apple-ios7.0 -passes=atomic-expand %s | FileCheck %s define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) { ; CHECK-LABEL: @test_atomicrmw_fadd_f32( diff --git a/llvm/test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll b/llvm/test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll index f7a210d631bf9..23aa57e18ecc5 100644 --- a/llvm/test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll +++ b/llvm/test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll @@ -1,4 +1,4 @@ -; RUN: opt -atomic-expand -codegen-opt-level=1 -S -mtriple=thumbv7s-apple-ios7.0 %s | FileCheck %s +; RUN: opt -passes=atomic-expand -codegen-opt-level=1 -S -mtriple=thumbv7s-apple-ios7.0 %s | FileCheck %s define i32 @test_cmpxchg_seq_cst(ptr %addr, i32 %desired, i32 %new) { ; CHECK-LABEL: @test_cmpxchg_seq_cst diff --git a/llvm/test/Transforms/AtomicExpand/Hexagon/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/Hexagon/atomicrmw-fp.ll index 8827eb5d8e108..9e64db0a5e31d 100644 --- a/llvm/test/Transforms/AtomicExpand/Hexagon/atomicrmw-fp.ll +++ b/llvm/test/Transforms/AtomicExpand/Hexagon/atomicrmw-fp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=hexagon-- -atomic-expand %s | FileCheck %s +; RUN: opt -S -mtriple=hexagon-- -passes=atomic-expand %s | FileCheck %s define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) { ; CHECK-LABEL: @test_atomicrmw_fadd_f32( diff --git a/llvm/test/Transforms/AtomicExpand/LoongArch/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/LoongArch/atomicrmw-fp.ll index 43fdd25e257b8..5ce831a4c4c83 100644 --- 
a/llvm/test/Transforms/AtomicExpand/LoongArch/atomicrmw-fp.ll +++ b/llvm/test/Transforms/AtomicExpand/LoongArch/atomicrmw-fp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S --mtriple=loongarch64 --atomic-expand --mattr=+d %s | FileCheck %s +; RUN: opt -S --mtriple=loongarch64 --passes=atomic-expand --mattr=+d %s | FileCheck %s define float @atomicrmw_fadd_float(ptr %ptr, float %value) { ; CHECK-LABEL: @atomicrmw_fadd_float( diff --git a/llvm/test/Transforms/AtomicExpand/LoongArch/load-store-atomic.ll b/llvm/test/Transforms/AtomicExpand/LoongArch/load-store-atomic.ll index b0875669bc3a2..77c237a38a489 100644 --- a/llvm/test/Transforms/AtomicExpand/LoongArch/load-store-atomic.ll +++ b/llvm/test/Transforms/AtomicExpand/LoongArch/load-store-atomic.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S --mtriple=loongarch32 --atomic-expand %s | FileCheck %s --check-prefix=LA32 -; RUN: opt -S --mtriple=loongarch64 --atomic-expand %s | FileCheck %s --check-prefix=LA64 +; RUN: opt -S --mtriple=loongarch32 --passes=atomic-expand %s | FileCheck %s --check-prefix=LA32 +; RUN: opt -S --mtriple=loongarch64 --passes=atomic-expand %s | FileCheck %s --check-prefix=LA64 define i8 @load_acquire_i8(ptr %ptr) { ; LA32-LABEL: @load_acquire_i8( diff --git a/llvm/test/Transforms/AtomicExpand/Mips/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/Mips/atomicrmw-fp.ll index 2c90a70bd0ad0..0fd1cbe8bdd0e 100644 --- a/llvm/test/Transforms/AtomicExpand/Mips/atomicrmw-fp.ll +++ b/llvm/test/Transforms/AtomicExpand/Mips/atomicrmw-fp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=mips64-mti-linux-gnu -atomic-expand %s | FileCheck %s +; RUN: opt -S -mtriple=mips64-mti-linux-gnu -passes=atomic-expand %s | FileCheck %s define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) { ; CHECK-LABEL: 
@test_atomicrmw_fadd_f32( diff --git a/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll index 7e42735feabff..a3d62e06a7cd6 100644 --- a/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll +++ b/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=powerpc64-unknown-unknown -atomic-expand %s | FileCheck %s +; RUN: opt -S -mtriple=powerpc64-unknown-unknown -passes=atomic-expand %s | FileCheck %s define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) { ; CHECK-LABEL: @test_atomicrmw_fadd_f32( diff --git a/llvm/test/Transforms/AtomicExpand/PowerPC/cfence-double.ll b/llvm/test/Transforms/AtomicExpand/PowerPC/cfence-double.ll index 19e5f56821d74..1bd2c6cb5607f 100644 --- a/llvm/test/Transforms/AtomicExpand/PowerPC/cfence-double.ll +++ b/llvm/test/Transforms/AtomicExpand/PowerPC/cfence-double.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -atomic-expand -mtriple=powerpc64le-unknown-unknown \ +; RUN: opt -S -passes=atomic-expand -mtriple=powerpc64le-unknown-unknown \ ; RUN: < %s 2>&1 | FileCheck %s -; RUN: opt -S -atomic-expand -mtriple=powerpc64-unknown-unknown \ +; RUN: opt -S -passes=atomic-expand -mtriple=powerpc64-unknown-unknown \ ; RUN: < %s 2>&1 | FileCheck %s define double @foo(ptr %dp) { diff --git a/llvm/test/Transforms/AtomicExpand/PowerPC/cfence-float.ll b/llvm/test/Transforms/AtomicExpand/PowerPC/cfence-float.ll index 62f0db00df800..2233ce0fcf689 100644 --- a/llvm/test/Transforms/AtomicExpand/PowerPC/cfence-float.ll +++ b/llvm/test/Transforms/AtomicExpand/PowerPC/cfence-float.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -atomic-expand -mtriple=powerpc64le-unknown-unknown \ +; RUN: opt -S -passes=atomic-expand 
-mtriple=powerpc64le-unknown-unknown \ ; RUN: < %s 2>&1 | FileCheck %s -; RUN: opt -S -atomic-expand -mtriple=powerpc64-unknown-unknown \ +; RUN: opt -S -passes=atomic-expand -mtriple=powerpc64-unknown-unknown \ ; RUN: < %s 2>&1 | FileCheck %s define float @bar(ptr %fp) { diff --git a/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll b/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll index 169d73cc0308d..b94023b97a295 100644 --- a/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll +++ b/llvm/test/Transforms/AtomicExpand/PowerPC/cmpxchg.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -atomic-expand -S -mtriple=powerpc64-unknown-unknown \ +; RUN: opt -passes=atomic-expand -S -mtriple=powerpc64-unknown-unknown \ ; RUN: -mcpu=pwr8 %s | FileCheck %s -; RUN: opt -atomic-expand -S -mtriple=powerpc64-unknown-unknown \ +; RUN: opt -passes=atomic-expand -S -mtriple=powerpc64-unknown-unknown \ ; RUN: -mcpu=pwr7 %s | FileCheck --check-prefix=PWR7 %s define i1 @test_cmpxchg_seq_cst(ptr %addr, i128 %desire, i128 %new) { diff --git a/llvm/test/Transforms/AtomicExpand/PowerPC/issue55983.ll b/llvm/test/Transforms/AtomicExpand/PowerPC/issue55983.ll index 342506301d004..4a0df4c6739e4 100644 --- a/llvm/test/Transforms/AtomicExpand/PowerPC/issue55983.ll +++ b/llvm/test/Transforms/AtomicExpand/PowerPC/issue55983.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -atomic-expand -S -mtriple=powerpc64le-unknown-unknown \ +; RUN: opt -passes=atomic-expand -S -mtriple=powerpc64le-unknown-unknown \ ; RUN: %s | FileCheck %s -; RUN: opt -atomic-expand -S -mtriple=powerpc64-unknown-unknown \ +; RUN: opt -passes=atomic-expand -S -mtriple=powerpc64-unknown-unknown \ ; RUN: %s | FileCheck %s define ptr @foo(ptr %p) { diff --git a/llvm/test/Transforms/AtomicExpand/RISCV/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/RISCV/atomicrmw-fp.ll index ceaafd89990b0..7e41583189c3d 
100644 --- a/llvm/test/Transforms/AtomicExpand/RISCV/atomicrmw-fp.ll +++ b/llvm/test/Transforms/AtomicExpand/RISCV/atomicrmw-fp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=riscv32-- -atomic-expand %s | FileCheck %s +; RUN: opt -S -mtriple=riscv32-- -passes=atomic-expand %s | FileCheck %s define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) { ; CHECK-LABEL: @test_atomicrmw_fadd_f32( diff --git a/llvm/test/Transforms/AtomicExpand/SPARC/libcalls.ll b/llvm/test/Transforms/AtomicExpand/SPARC/libcalls.ll index 4427c5e7ed23d..682c1e6848b31 100644 --- a/llvm/test/Transforms/AtomicExpand/SPARC/libcalls.ll +++ b/llvm/test/Transforms/AtomicExpand/SPARC/libcalls.ll @@ -1,4 +1,4 @@ -; RUN: opt -S %s -atomic-expand | FileCheck %s +; RUN: opt -S %s -passes=atomic-expand | FileCheck %s ;;; NOTE: this test is actually target-independent -- any target which ;;; doesn't support inline atomics can be used. (E.g. X86 i386 would diff --git a/llvm/test/Transforms/AtomicExpand/SPARC/partword.ll b/llvm/test/Transforms/AtomicExpand/SPARC/partword.ll index 5bcb21105df8b..e8be1bc187483 100644 --- a/llvm/test/Transforms/AtomicExpand/SPARC/partword.ll +++ b/llvm/test/Transforms/AtomicExpand/SPARC/partword.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S %s -atomic-expand | FileCheck %s +; RUN: opt -S %s -passes=atomic-expand | FileCheck %s ;; Verify the cmpxchg and atomicrmw expansions where sub-word-size ;; instructions are not available. 
diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll index 8d71966c04d03..20a9e9f6cb86a 100644 --- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll +++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-libcall.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=i686-linux-gnu -atomic-expand %s | FileCheck %s +; RUN: opt -S -mtriple=i686-linux-gnu -passes=atomic-expand %s | FileCheck %s define i256 @atomic_load256_libcall(ptr %ptr) nounwind { diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-non-integer.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-non-integer.ll index dab7677086e91..5929c153d5961 100644 --- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-non-integer.ll +++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-non-integer.ll @@ -1,4 +1,4 @@ -; RUN: opt -S %s -atomic-expand -mtriple=x86_64-linux-gnu | FileCheck %s +; RUN: opt -S %s -passes=atomic-expand -mtriple=x86_64-linux-gnu | FileCheck %s ; This file tests the functions `llvm::convertAtomicLoadToIntegerType` and ; `llvm::convertAtomicStoreToIntegerType`. 
If X86 stops using this diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-fp.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-fp.ll index 69837b96a90d0..3866530abb796 100644 --- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-fp.ll +++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-fp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=i686-linux-gnu -atomic-expand %s | FileCheck %s +; RUN: opt -S -mtriple=i686-linux-gnu -passes=atomic-expand %s | FileCheck %s define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) { ; CHECK-LABEL: @test_atomicrmw_fadd_f32( diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-initial-load.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-initial-load.ll index fba1512368ea2..316660ddfc49e 100644 --- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-initial-load.ll +++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-initial-load.ll @@ -1,4 +1,4 @@ -; RUN: opt -S %s -atomic-expand -mtriple=i686-linux-gnu | FileCheck %s +; RUN: opt -S %s -passes=atomic-expand -mtriple=i686-linux-gnu | FileCheck %s ; This file tests the function `llvm::expandAtomicRMWToCmpXchg`. ; It isn't technically target specific, but is exposed through a pass that is. 
diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll index 2464af3336ef3..211c6c5886413 100644 --- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll +++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=i686-linux-gnu -atomic-expand %s | FileCheck %s +; RUN: opt -S -mtriple=i686-linux-gnu -passes=atomic-expand %s | FileCheck %s define double @atomic_xchg_f64(ptr %ptr) nounwind { ; CHECK-LABEL: @atomic_xchg_f64( diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp index 85f52941a85b4..7d05cb412b3d5 100644 --- a/llvm/tools/opt/optdriver.cpp +++ b/llvm/tools/opt/optdriver.cpp @@ -429,7 +429,7 @@ extern "C" int optMain( initializeSelectOptimizePass(Registry); initializeCallBrPreparePass(Registry); initializeCodeGenPrepareLegacyPassPass(Registry); - initializeAtomicExpandPass(Registry); + initializeAtomicExpandLegacyPass(Registry); initializeWinEHPreparePass(Registry); initializeDwarfEHPrepareLegacyPassPass(Registry); initializeSafeStackLegacyPassPass(Registry); From 711014714716753f791291ed6a152e00899469a3 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 25 Feb 2024 13:17:08 +0000 Subject: [PATCH 247/546] [TBAA] Test for tbaa.struct creation for struct with named bitfields. Add test for tbaa.struct metadata creation for copies of a struct with named bitfields. Test for https://github.com/llvm/llvm-project/issues/82586. 
--- clang/test/CodeGen/tbaa-struct.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/clang/test/CodeGen/tbaa-struct.cpp b/clang/test/CodeGen/tbaa-struct.cpp index 47ccec3fb4162..ff5521fcf3f60 100644 --- a/clang/test/CodeGen/tbaa-struct.cpp +++ b/clang/test/CodeGen/tbaa-struct.cpp @@ -102,6 +102,23 @@ void copy7(A *a1, AA *a2) { *a1 = *a2; } +struct NamedBitfields { + signed f0 : 9; + unsigned f1 : 2; + char f2; + double f3; +}; + +NamedBitfields g; + +void copy8(NamedBitfields *a1, NamedBitfields *a2) { +// CHECK-LABEL: _Z5copy8P14NamedBitfieldsS0_ +// CHECK: tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) %a1, ptr noundef nonnull align 8 dereferenceable(16) %a2, i64 16, i1 false), +// CHECK-OLD-SAME: !tbaa.struct [[TS6:!.*]] +// CHECK-NEW-SAME: !tbaa [[TAG_NamedBitfields:!.+]], !tbaa.struct + *a1 = *a2; +} + // CHECK-OLD: [[TS]] = !{i64 0, i64 2, !{{.*}}, i64 4, i64 4, !{{.*}}, i64 8, i64 1, !{{.*}}, i64 12, i64 4, !{{.*}}} // CHECK-OLD: [[CHAR:!.*]] = !{!"omnipotent char", !{{.*}}} // CHECK-OLD: [[TAG_INT:!.*]] = !{[[INT:!.*]], [[INT]], i64 0} @@ -113,6 +130,9 @@ void copy7(A *a1, AA *a2) { // CHECK-OLD: [[TS3]] = !{i64 0, i64 8, !{{.*}}, i64 0, i64 2, !{{.*}}, i64 4, i64 8, !{{.*}}} // CHECK-OLD: [[TS4]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]]} // CHECK-OLD: [[TS5]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 4, i64 1, [[TAG_CHAR]], i64 5, i64 1, [[TAG_CHAR]]} +// CHECK-OLD: [[TS6]] = !{i64 0, i64 4, [[TAG_INT]], i64 1, i64 4, [[TAG_INT]], i64 2, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE:!.+]]} +// CHECK-OLD: [[TAG_DOUBLE]] = !{[[DOUBLE:!.+]], [[DOUBLE]], i64 0} +// CHECK-OLD [[DOUBLE]] = !{!"double", [[CHAR]], i64 0} // CHECK-NEW-DAG: [[TYPE_char:!.*]] = !{{{.*}}, i64 1, !"omnipotent char"} // CHECK-NEW-DAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0, i64 0} @@ -127,3 +147,6 @@ void copy7(A *a1, AA *a2) { // CHECK-NEW-DAG: [[TAG_C]] = 
!{[[TYPE_C]], [[TYPE_C]], i64 0, i64 3} // CHECK-NEW-DAG: [[TYPE_D:!.*]] = !{[[TYPE_char]], i64 6, !"_ZTS1D", [[TYPE_char]], i64 0, i64 1, [[TYPE_char]], i64 4, i64 1, [[TYPE_char]], i64 5, i64 1} // CHECK-NEW-DAG: [[TAG_D]] = !{[[TYPE_D]], [[TYPE_D]], i64 0, i64 6} +// CHECK-NEW-DAG: [[TAG_NamedBitfields]] = !{[[TYPE_NamedBitfields:!.+]], [[TYPE_NamedBitfields]], i64 0, i64 16} +// CHECK-NEW-DAG: [[TYPE_NamedBitfields]] = !{[[TYPE_char]], i64 16, !"_ZTS14NamedBitfields", [[TYPE_int]], i64 0, i64 4, [[TYPE_int]], i64 1, i64 4, [[TYPE_char]], i64 2, i64 1, [[TYPE_double:!.+]], i64 8, i64 8} +// CHECK-NEW-DAG: [[TYPE_double]] = !{[[TYPE_char]], i64 8, !"double"} From f920b746ea818f1d21f317116cbb105e3e85979a Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Sun, 25 Feb 2024 22:01:13 +0800 Subject: [PATCH 248/546] [FlattenCFG] Fix the miscompilation where phi nodes exist in the merge point (#81987) When there are phi nodes in the merge point of the if-region, we cannot do the merge. Alive2: https://alive2.llvm.org/ce/z/DbgEan Fixes #70900. --- llvm/lib/Transforms/Utils/FlattenCFG.cpp | 14 ++++---------- llvm/test/Transforms/Util/flatten-cfg.ll | 15 +++++++++------ 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/llvm/lib/Transforms/Utils/FlattenCFG.cpp index db6ee39baece7..16b4bb1981d8b 100644 --- a/llvm/lib/Transforms/Utils/FlattenCFG.cpp +++ b/llvm/lib/Transforms/Utils/FlattenCFG.cpp @@ -407,6 +407,10 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Block1, BasicBlock *Block2, /// form, by inverting the condition and the branch successors. The same /// approach goes for the opposite case. bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { + // We cannot merge the if-region if the merge point has phi nodes. 
+ if (isa(BB->front())) + return false; + BasicBlock *IfTrue2, *IfFalse2; BranchInst *DomBI2 = GetIfCondition(BB, IfTrue2, IfFalse2); if (!DomBI2) @@ -493,16 +497,6 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { PBI->replaceUsesOfWith(PBI->getCondition(), NC); Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt); - // Handle PHI node to replace its predecessors to FirstEntryBlock. - for (BasicBlock *Succ : successors(PBI)) { - for (PHINode &Phi : Succ->phis()) { - for (unsigned i = 0, e = Phi.getNumIncomingValues(); i != e; ++i) { - if (Phi.getIncomingBlock(i) == SecondEntryBlock) - Phi.setIncomingBlock(i, FirstEntryBlock); - } - } - } - // Remove IfTrue1 if (IfTrue1 != FirstEntryBlock) { IfTrue1->dropAllReferences(); diff --git a/llvm/test/Transforms/Util/flatten-cfg.ll b/llvm/test/Transforms/Util/flatten-cfg.ll index d6e34bda1cc60..038dcaa47419a 100644 --- a/llvm/test/Transforms/Util/flatten-cfg.ll +++ b/llvm/test/Transforms/Util/flatten-cfg.ll @@ -77,13 +77,16 @@ define void @test_not_crash3(i32 %a) #0 { ; CHECK-SAME: (i32 [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A_EQ_0:%.*]] = icmp eq i32 [[A]], 0 +; CHECK-NEXT: br i1 [[A_EQ_0]], label [[BB0:%.*]], label [[BB1:%.*]] +; CHECK: bb0: +; CHECK-NEXT: br label [[BB1]] +; CHECK: bb1: ; CHECK-NEXT: [[A_EQ_1:%.*]] = icmp eq i32 [[A]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = or i1 [[A_EQ_0]], [[A_EQ_1]] -; CHECK-NEXT: br i1 [[TMP0]], label [[BB2:%.*]], label [[BB3:%.*]] +; CHECK-NEXT: br i1 [[A_EQ_1]], label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[CHECK_BADREF:%.*]] = phi i32 [ 17, [[ENTRY:%.*]] ], [ 11, [[BB2]] ] +; CHECK-NEXT: [[CHECK_BADREF:%.*]] = phi i32 [ 17, [[BB1]] ], [ 11, [[BB2]] ] ; CHECK-NEXT: ret void ; entry: @@ -278,9 +281,9 @@ define i1 @test_cond_multi_use(i32 %x, i32 %y, i32 %z) { ; CHECK-NEXT: entry.x: ; CHECK-NEXT: [[CMP_X:%.*]] = icmp ne i32 [[X]], 0 ; CHECK-NEXT: [[CMP_Y:%.*]] = icmp eq i32 [[Y]], 0 -; 
CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[CMP_Y]], true -; CHECK-NEXT: [[TMP1:%.*]] = or i1 [[CMP_X]], [[TMP0]] -; CHECK-NEXT: br i1 [[TMP1]], label [[IF_THEN_Y:%.*]], label [[EXIT:%.*]] +; CHECK-NEXT: [[CMP_Y_NOT:%.*]] = xor i1 [[CMP_Y]], true +; CHECK-NEXT: [[TMP0:%.*]] = or i1 [[CMP_X]], [[CMP_Y_NOT]] +; CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN_Y:%.*]], label [[EXIT:%.*]] ; CHECK: if.then.y: ; CHECK-NEXT: store i32 [[Z]], ptr @g, align 4 ; CHECK-NEXT: br label [[EXIT]] From 9e7c0b1385baa1acffb62e0589ff100dd972cc0d Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Sun, 25 Feb 2024 14:13:04 +0000 Subject: [PATCH 249/546] [OpenMP] Implement __kmp_is_address_mapped on DragonFlyBSD. (#82895) implement internal __kmp_is_address_mapped. --- openmp/runtime/cmake/LibompHandleFlags.cmake | 3 ++ openmp/runtime/src/z_Linux_util.cpp | 51 ++++++++++++++++++-- 2 files changed, 49 insertions(+), 5 deletions(-) diff --git a/openmp/runtime/cmake/LibompHandleFlags.cmake b/openmp/runtime/cmake/LibompHandleFlags.cmake index e5c7a3ec6aaa9..eed98036dd51a 100644 --- a/openmp/runtime/cmake/LibompHandleFlags.cmake +++ b/openmp/runtime/cmake/LibompHandleFlags.cmake @@ -144,6 +144,9 @@ function(libomp_get_libflags libflags) libomp_append(libflags_local "-Wl,--no-as-needed" LIBOMP_HAVE_AS_NEEDED_FLAG) libomp_append(libflags_local "-lm") libomp_append(libflags_local "-Wl,--as-needed" LIBOMP_HAVE_AS_NEEDED_FLAG) + if (${CMAKE_SYSTEM_NAME} STREQUAL "DragonFly") + libomp_append(libflags_local "-lkvm") + endif() elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux|NetBSD|Solaris") libomp_append(libflags_local -lm) endif() diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp index 513ec6517d00b..3636266677d99 100644 --- a/openmp/runtime/src/z_Linux_util.cpp +++ b/openmp/runtime/src/z_Linux_util.cpp @@ -59,6 +59,9 @@ #include #include #include +#if KMP_OS_DRAGONFLY +#include +#endif #elif KMP_OS_NETBSD || KMP_OS_OPENBSD #include #include @@ -1042,9 +1045,7 @@ extern "C" void 
__kmp_reap_monitor(kmp_info_t *th) { #else // Empty symbol to export (see exports_so.txt) when // monitor thread feature is disabled -extern "C" void __kmp_reap_monitor(kmp_info_t *th) { - (void)th; -} +extern "C" void __kmp_reap_monitor(kmp_info_t *th) { (void)th; } #endif // KMP_USE_MONITOR void __kmp_reap_worker(kmp_info_t *th) { @@ -2133,7 +2134,47 @@ int __kmp_is_address_mapped(void *addr) { lw += cursz; } kmpc_free(buf); +#elif KMP_OS_DRAGONFLY + char err[_POSIX2_LINE_MAX]; + kinfo_proc *proc; + vmspace sp; + vm_map *cur; + vm_map_entry entry, *c; + struct proc p; + kvm_t *fd; + uintptr_t uaddr; + int num; + + fd = kvm_openfiles(nullptr, nullptr, nullptr, O_RDONLY, err); + if (!fd) { + return 0; + } + + proc = kvm_getprocs(fd, KERN_PROC_PID, getpid(), &num); + + if (kvm_read(fd, static_cast(proc->kp_paddr), &p, sizeof(p)) != + sizeof(p) || + kvm_read(fd, reinterpret_cast(p.p_vmspace), &sp, sizeof(sp)) != + sizeof(sp)) { + kvm_close(fd); + return 0; + } + + (void)rc; + cur = &sp.vm_map; + uaddr = reinterpret_cast(addr); + for (c = kvm_vm_map_entry_first(fd, cur, &entry); c; + c = kvm_vm_map_entry_next(fd, c, &entry)) { + if ((uaddr >= entry.ba.start) && (uaddr <= entry.ba.end)) { + if ((entry.protection & VM_PROT_READ) != 0 && + (entry.protection & VM_PROT_WRITE) != 0) { + found = 1; + break; + } + } + } + kvm_close(fd); #elif KMP_OS_DARWIN /* On OS X*, /proc pseudo filesystem is not available. Try to read memory @@ -2212,9 +2253,9 @@ int __kmp_is_address_mapped(void *addr) { } #elif KMP_OS_WASI found = (int)addr < (__builtin_wasm_memory_size(0) * PAGESIZE); -#elif KMP_OS_DRAGONFLY || KMP_OS_SOLARIS || KMP_OS_AIX +#elif KMP_OS_SOLARIS || KMP_OS_AIX - // FIXME(DragonFly, Solaris, AIX): Implement this + // FIXME(Solaris, AIX): Implement this found = 1; #else From 8eb6757564ccea8f9fc3bb75480f1c1d1784415a Mon Sep 17 00:00:00 2001 From: Dani Date: Sun, 25 Feb 2024 15:50:05 +0100 Subject: [PATCH 250/546] [NFC] Turn the StrictFP attribute check to a CompatRule. 
(#82600) --- llvm/include/llvm/IR/Attributes.td | 1 + llvm/lib/IR/Attributes.cpp | 7 +++++++ llvm/lib/Transforms/Utils/InlineFunction.cpp | 7 ------- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index d22eb76d2292d..08afecf320151 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -371,6 +371,7 @@ def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"checkDenormMode">; +def : CompatRule<"checkStrictFP">; def : CompatRuleStrAttr<"isEqual", "sign-return-address">; def : CompatRuleStrAttr<"isEqual", "sign-return-address-key">; def : CompatRuleStrAttr<"isEqual", "branch-protection-pauth-lr">; diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index 19076771ff2ea..00acbbe7989d8 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -2039,6 +2039,13 @@ static bool checkDenormMode(const Function &Caller, const Function &Callee) { return false; } +static bool checkStrictFP(const Function &Caller, const Function &Callee) { + // Do not inline strictfp function into non-strictfp one. It would require + // conversion of all FP operations in host function to constrained intrinsics. 
+ return !Callee.getAttributes().hasFnAttr(Attribute::StrictFP) || + Caller.getAttributes().hasFnAttr(Attribute::StrictFP); +} + template static bool isEqual(const Function &Caller, const Function &Callee) { return Caller.getFnAttribute(AttrClass::getKind()) == diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 78317df8a9cae..f68fdb26f2817 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -2103,13 +2103,6 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, BasicBlock *OrigBB = CB.getParent(); Function *Caller = OrigBB->getParent(); - // Do not inline strictfp function into non-strictfp one. It would require - // conversion of all FP operations in host function to constrained intrinsics. - if (CalledFunc->getAttributes().hasFnAttr(Attribute::StrictFP) && - !Caller->getAttributes().hasFnAttr(Attribute::StrictFP)) { - return InlineResult::failure("incompatible strictfp attributes"); - } - // GC poses two hazards to inlining, which only occur when the callee has GC: // 1. If the caller has no GC, then the callee's GC must be propagated to the // caller. From 2c5a68858b046c8a2ca3ba07ecd82771a5a9b884 Mon Sep 17 00:00:00 2001 From: Owen Anderson Date: Sun, 25 Feb 2024 10:13:05 -0500 Subject: [PATCH 251/546] Fix non-splat vector SREM expansion when one of the divisors is a power of two. (#82706) The expansion previously used, derived from Hacker's Delight, does not work correctly when the dividend is INT_MIN and the divisor is a power of two. We now use an alternate derivation of the A and Q constants specifically for the power-of-two divisor case to avoid this problem. Credit to Fabian Giesen for the new derivation. 
Fixes https://github.com/llvm/llvm-project/issues/77169 --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 20 ++++ .../CodeGen/X86/srem-seteq-vec-nonsplat.ll | 103 +++++++++--------- 2 files changed, 70 insertions(+), 53 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 07fb89127a737..6970b230837fb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6803,6 +6803,7 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode, SDValue CompTargetNode, ISD::CondCode Cond, DAGCombinerInfo &DCI, const SDLoc &DL, SmallVectorImpl &Created) const { + // Derived from Hacker's Delight, 2nd Edition, by Hank Warren. Section 10-17. // Fold: // (seteq/ne (srem N, D), 0) // To: @@ -6813,6 +6814,17 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode, // - A = bitwiseand(floor((2^(W - 1) - 1) / D0), (-(2^k))) // - Q = floor((2 * A) / (2^K)) // where W is the width of the common type of N and D. + // + // When D is a power of two (and thus D0 is 1), the normal + // formula for A and Q don't apply, because the derivation + // depends on D not dividing 2^(W-1), and thus theorem ZRS + // does not apply. This specifically fails when N = INT_MIN. + // + // Instead, for power-of-two D, we use: + // - A = 2^(W-1) + // |-> Order-preserving map from [-2^(W-1), 2^(W-1) - 1] to [0,2^W - 1]) + // - Q = 2^(W-K) - 1 + // |-> Test that the top K bits are zero after rotation assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) && "Only applicable for (in)equality comparisons."); @@ -6896,6 +6908,14 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode, // Q = floor((2 * A) / (2^K)) APInt Q = (2 * A).udiv(APInt::getOneBitSet(W, K)); + // If D was a power of two, apply the alternate constant derivation. 
+ if (D0.isOne()) { + // A = 2^(W-1) + A = APInt::getSignedMinValue(W); + // - Q = 2^(W-K) - 1 + Q = APInt::getAllOnes(W - K).zext(W); + } + assert(APInt::getAllOnes(SVT.getSizeInBits()).ugt(A) && "We are expecting that A is always less than all-ones for SVT"); assert(APInt::getAllOnes(ShSVT.getSizeInBits()).ugt(K) && diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index 8509e930ba74a..3dde5c1c8a40c 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -560,7 +560,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 ; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,268435454,858993458] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,268435455,858993458] ; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -646,7 +646,7 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,268435454,306783378] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,268435455,306783378] ; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -735,7 +735,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = 
[858993458,306783378,268435454,42949672] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,306783378,268435455,42949672] ; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -1041,7 +1041,7 @@ define <4 x i32> @test_srem_odd_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm2 ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,0,858993458] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,858993458,1,858993458] ; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7] @@ -1135,21 +1135,21 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3067833783,3067833783,1,3067833783] ; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] -; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; CHECK-SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: por %xmm5, %xmm4 -; CHECK-SSE41-NEXT: pminud %xmm4, %xmm3 -; CHECK-SSE41-NEXT: pcmpeqd %xmm4, %xmm3 +; CHECK-SSE41-NEXT: pshufd 
{{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: por %xmm4, %xmm3 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [306783378,306783378,1,306783378] +; CHECK-SSE41-NEXT: pminud %xmm3, %xmm2 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm2 ; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; @@ -1157,17 +1157,16 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-AVX1: # %bb.0: ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] -; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; CHECK-AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; CHECK-AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; 
CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 @@ -1179,12 +1178,11 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm4 +; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 ; CHECK-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpor %xmm4, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpminud %xmm3, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] ; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 @@ -1196,15 +1194,14 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-AVX512VL-LABEL: test_srem_even_INT_MIN: ; CHECK-AVX512VL: # %bb.0: ; CHECK-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; CHECK-AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] -; CHECK-AVX512VL-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: 
vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: vpminud %xmm3, %xmm2, %xmm3 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] +; CHECK-AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 +; CHECK-AVX512VL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vprorvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; CHECK-AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; CHECK-AVX512VL-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX512VL-NEXT: retq %srem = srem <4 x i32> %X, @@ -1263,7 +1260,7 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: por %xmm4, %xmm3 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [858993458,306783378,0,42949672] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [858993458,306783378,1,42949672] ; CHECK-SSE41-NEXT: pminud %xmm3, %xmm2 ; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm2 ; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -1362,7 +1359,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 ; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,858993458] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = 
[858993458,4294967295,268435455,858993458] ; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -1447,7 +1444,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435454,306783378] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435455,306783378] ; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -1536,7 +1533,7 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,42949672] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435455,42949672] ; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -1844,7 +1841,7 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435454,4294967295,858993458] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435455,4294967295,858993458] ; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -1932,7 +1929,7 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: pshufd 
{{.*#+}} xmm1 = xmm1[0,0,2,2] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,268435454,4294967295,306783378] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,268435455,4294967295,306783378] ; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -2016,7 +2013,7 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: por %xmm2, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435454,4294967295,42949672] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,268435455,4294967295,42949672] ; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -2093,7 +2090,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm0 ; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435454,4294967295] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [858993458,4294967295,268435455,4294967295] ; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -2166,7 +2163,7 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm0 ; CHECK-SSE41-NEXT: por %xmm1, %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,268435454,4294967295] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = 
[306783378,4294967295,268435455,4294967295] ; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 @@ -2237,7 +2234,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 ; CHECK-SSE2-NEXT: psrlw $8, %xmm6 ; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [84,2,36,42,2,0,2,4,2,255,4,36,126,30,2,2] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2] ; CHECK-SSE2-NEXT: pminub %xmm6, %xmm7 ; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm7 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] @@ -2264,7 +2261,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: psrlw $8, %xmm0 ; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19,51,13,7,127,31,127,3,5,5,51,37,3,127,85,5] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5] ; CHECK-SSE2-NEXT: pmaxub %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm3 @@ -2300,7 +2297,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 ; CHECK-SSE41-NEXT: psrlw $8, %xmm6 ; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm6 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [84,2,36,42,2,0,2,4,2,255,4,36,126,30,2,2] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2] ; CHECK-SSE41-NEXT: pminub %xmm6, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm7, %xmm7 @@ -2326,7 +2323,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: psrlw $8, %xmm0 ; CHECK-SSE41-NEXT: packuswb %xmm4, %xmm0 -; 
CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [19,51,13,7,127,31,127,3,5,5,51,37,3,127,85,5] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5] ; CHECK-SSE41-NEXT: pmaxub %xmm0, %xmm4 ; CHECK-SSE41-NEXT: pcmpeqb %xmm0, %xmm4 ; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm3 From d6ded91121fa02837ef6c8c7f06d98ccf4a0fe16 Mon Sep 17 00:00:00 2001 From: David Green Date: Sun, 25 Feb 2024 16:22:57 +0000 Subject: [PATCH 252/546] [Codegen] Change getSpillSize/getReloadSize to LocationSize. NFC (#82636) This is a small part of #70452, attempting to take a small simpler part of it in isolation to simplify what remains. It changes the getSpillSize, getFoldedSpillSize, getRestoreSize and getFoldedRestoreSize methods to return optional instead of unsigned. The code is intended to be the same, keeping the optional<> to specify when there was no size found, with some minor adjustments to make sure that unknown (~UINT64_C(0)) sizes are handled sensibly. Hopefully as more unsigned's are converted to LocationSize's the use of ~UINT64_C(0) can be cleaned up too. --- llvm/include/llvm/CodeGen/MachineInstr.h | 10 ++++--- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 26 +++++++---------- llvm/lib/CodeGen/MachineInstr.cpp | 34 +++++++++++++--------- 3 files changed, 38 insertions(+), 32 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h index 5bb4a67555222..fcdd73d8b65fd 100644 --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -20,6 +20,7 @@ #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/TargetOpcodes.h" @@ -1744,16 +1745,17 @@ class MachineInstr bool allImplicitDefsAreDead() const; /// Return a valid size if the instruction is a spill instruction. 
- std::optional getSpillSize(const TargetInstrInfo *TII) const; + std::optional getSpillSize(const TargetInstrInfo *TII) const; /// Return a valid size if the instruction is a folded spill instruction. - std::optional getFoldedSpillSize(const TargetInstrInfo *TII) const; + std::optional + getFoldedSpillSize(const TargetInstrInfo *TII) const; /// Return a valid size if the instruction is a restore instruction. - std::optional getRestoreSize(const TargetInstrInfo *TII) const; + std::optional getRestoreSize(const TargetInstrInfo *TII) const; /// Return a valid size if the instruction is a folded restore instruction. - std::optional + std::optional getFoldedRestoreSize(const TargetInstrInfo *TII) const; /// Copy implicit register operands from specified diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index e89a1c26c23e6..0efe7a0e73367 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1106,25 +1106,21 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) { // We assume a single instruction only has a spill or reload, not // both. 
- std::optional Size; + std::optional Size; if ((Size = MI.getRestoreSize(TII))) { - CommentOS << *Size << "-byte Reload\n"; + CommentOS << Size->getValue() << "-byte Reload\n"; } else if ((Size = MI.getFoldedRestoreSize(TII))) { - if (*Size) { - if (*Size == unsigned(MemoryLocation::UnknownSize)) - CommentOS << "Unknown-size Folded Reload\n"; - else - CommentOS << *Size << "-byte Folded Reload\n"; - } + if (!Size->hasValue()) + CommentOS << "Unknown-size Folded Reload\n"; + else if (Size->getValue()) + CommentOS << Size->getValue() << "-byte Folded Reload\n"; } else if ((Size = MI.getSpillSize(TII))) { - CommentOS << *Size << "-byte Spill\n"; + CommentOS << Size->getValue() << "-byte Spill\n"; } else if ((Size = MI.getFoldedSpillSize(TII))) { - if (*Size) { - if (*Size == unsigned(MemoryLocation::UnknownSize)) - CommentOS << "Unknown-size Folded Spill\n"; - else - CommentOS << *Size << "-byte Folded Spill\n"; - } + if (!Size->hasValue()) + CommentOS << "Unknown-size Folded Spill\n"; + else if (Size->getValue()) + CommentOS << Size->getValue() << "-byte Folded Spill\n"; } // Check for spill-induced copies diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index efbcc839f22d0..6654e1d6cecee 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -2354,29 +2354,35 @@ void MachineInstr::changeDebugValuesDefReg(Register Reg) { using MMOList = SmallVector; -static unsigned getSpillSlotSize(const MMOList &Accesses, - const MachineFrameInfo &MFI) { - unsigned Size = 0; +static LocationSize getSpillSlotSize(const MMOList &Accesses, + const MachineFrameInfo &MFI) { + uint64_t Size = 0; for (const auto *A : Accesses) if (MFI.isSpillSlotObjectIndex( cast(A->getPseudoValue()) - ->getFrameIndex())) - Size += A->getSize(); + ->getFrameIndex())) { + uint64_t S = A->getSize(); + if (S == ~UINT64_C(0)) + return LocationSize::beforeOrAfterPointer(); + Size += S; + } return Size; } -std::optional +std::optional 
MachineInstr::getSpillSize(const TargetInstrInfo *TII) const { int FI; if (TII->isStoreToStackSlotPostFE(*this, FI)) { const MachineFrameInfo &MFI = getMF()->getFrameInfo(); - if (MFI.isSpillSlotObjectIndex(FI)) - return (*memoperands_begin())->getSize(); + if (MFI.isSpillSlotObjectIndex(FI)) { + uint64_t Size = (*memoperands_begin())->getSize(); + return Size == ~UINT64_C(0) ? LocationSize::beforeOrAfterPointer() : Size; + } } return std::nullopt; } -std::optional +std::optional MachineInstr::getFoldedSpillSize(const TargetInstrInfo *TII) const { MMOList Accesses; if (TII->hasStoreToStackSlot(*this, Accesses)) @@ -2384,18 +2390,20 @@ MachineInstr::getFoldedSpillSize(const TargetInstrInfo *TII) const { return std::nullopt; } -std::optional +std::optional MachineInstr::getRestoreSize(const TargetInstrInfo *TII) const { int FI; if (TII->isLoadFromStackSlotPostFE(*this, FI)) { const MachineFrameInfo &MFI = getMF()->getFrameInfo(); - if (MFI.isSpillSlotObjectIndex(FI)) - return (*memoperands_begin())->getSize(); + if (MFI.isSpillSlotObjectIndex(FI)) { + uint64_t Size = (*memoperands_begin())->getSize(); + return Size == ~UINT64_C(0) ? LocationSize::beforeOrAfterPointer() : Size; + } } return std::nullopt; } -std::optional +std::optional MachineInstr::getFoldedRestoreSize(const TargetInstrInfo *TII) const { MMOList Accesses; if (TII->hasLoadFromStackSlot(*this, Accesses)) From 411c5dde59fa4c427941143ca0ec8cd8fdaee407 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Sun, 25 Feb 2024 13:50:38 +0100 Subject: [PATCH 253/546] [clang][Interp] Handle null function pointers We were instead asserting that they are non-null before. 
--- clang/lib/AST/Interp/FunctionPointer.h | 1 + clang/lib/AST/Interp/Interp.h | 6 ++++++ clang/test/AST/Interp/functions.cpp | 5 ++++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/FunctionPointer.h b/clang/lib/AST/Interp/FunctionPointer.h index 4a3f993d4882e..bb3da9b50aa55 100644 --- a/clang/lib/AST/Interp/FunctionPointer.h +++ b/clang/lib/AST/Interp/FunctionPointer.h @@ -26,6 +26,7 @@ class FunctionPointer final { FunctionPointer(const Function *Func) : Func(Func) { assert(Func); } const Function *getFunction() const { return Func; } + bool isZero() const { return !Func; } APValue toAPValue() const { if (!Func) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 23a2756a18f69..d885d19ce7064 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -2087,6 +2087,12 @@ inline bool CallPtr(InterpState &S, CodePtr OpPC, uint32_t ArgSize) { const FunctionPointer &FuncPtr = S.Stk.pop(); const Function *F = FuncPtr.getFunction(); + if (!F) { + const Expr *E = S.Current->getExpr(OpPC); + S.FFDiag(E, diag::note_constexpr_null_callee) + << const_cast(E) << E->getSourceRange(); + return false; + } assert(F); assert(ArgSize >= F->getWrittenArgSize()); diff --git a/clang/test/AST/Interp/functions.cpp b/clang/test/AST/Interp/functions.cpp index 51269741eb901..9daf8722050f0 100644 --- a/clang/test/AST/Interp/functions.cpp +++ b/clang/test/AST/Interp/functions.cpp @@ -123,9 +123,12 @@ namespace FunctionPointers { } constexpr int applyBinOp(int a, int b, int (*op)(int, int)) { - return op(a, b); + return op(a, b); // both-note {{evaluates to a null function pointer}} } static_assert(applyBinOp(1, 2, add) == 3, ""); + static_assert(applyBinOp(1, 2, nullptr) == 3, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} + constexpr int ignoreReturnValue() { int (*foo)(int, int) = add; From 8dfc023e80c35aded33b3e5e4739d3a487b95a7a Mon Sep 17 00:00:00 2001 
From: Alexander Scholz Date: Sun, 25 Feb 2024 17:42:57 +0100 Subject: [PATCH 254/546] [run-clang-tidy.py] Add option to ignore source files from compilation database (#82416) I added the option -source-filter to the `run-clang-tidy.py` script in the clang-tools-extra. This option allows for handing over a regex, to filter out source files from the compilation database (not run `clang-tidy` on them). --- .../clang-tidy/tool/run-clang-tidy.py | 20 +++++++++++++++++++ clang-tools-extra/docs/ReleaseNotes.rst | 4 ++++ 2 files changed, 24 insertions(+) diff --git a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py index 70f8cbcdcb2f1..1bd4a5b283091 100755 --- a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py +++ b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py @@ -300,6 +300,13 @@ def main(): "the main file of each translation unit are always " "displayed.", ) + parser.add_argument( + "-source-filter", + default=None, + help="Regular expression matching the names of the " + "source files from compilation database to output " + "diagnostics from.", + ) parser.add_argument( "-line-filter", default=None, @@ -462,6 +469,19 @@ def main(): [make_absolute(entry["file"], entry["directory"]) for entry in database] ) + # Filter source files from compilation database. + if args.source_filter: + try: + source_filter_re = re.compile(args.source_filter) + except: + print( + "Error: unable to compile regex from arg -source-filter:", + file=sys.stderr, + ) + traceback.print_exc() + sys.exit(1) + files = {f for f in files if source_filter_re.match(f)} + max_task = args.j if max_task == 0: max_task = multiprocessing.cpu_count() diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index a0b9fcfe0d777..6fd01ed9d471c 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -97,6 +97,10 @@ The improvements are... 
Improvements to clang-tidy -------------------------- +- Improved :program:`run-clang-tidy.py` script. Added argument `-source-filter` + to filter source files from the compilation database, via a RegEx. In a + similar fashion to what `-header-filter` does for header files. + New checks ^^^^^^^^^^ From cb4f94db83d9c4373b485493ef079e318f63bf13 Mon Sep 17 00:00:00 2001 From: SingleAccretion <62474226+SingleAccretion@users.noreply.github.com> Date: Sun, 25 Feb 2024 19:43:11 +0300 Subject: [PATCH 255/546] [lld][WebAssembly] Add `--no-growable-memory` (#82890) We recently added `--initial-heap` - an option that allows one to up the initial memory size without the burden of having to know exactly how much is needed. However, in the process of implementing support for this in Emscripten (https://github.com/emscripten-core/emscripten/pull/21071), we have realized that `--initial-heap` cannot support the use-case of non-growable memories by itself, since with it we don't know what to set `--max-memory` to. We have thus agreed to move the above work forward by introducing another option to the linker (see https://github.com/emscripten-core/emscripten/pull/21071#discussion_r1491755616), one that would allow users to explicitly specify they want a non-growable memory. This change does this by introducing `--no-growable-memory`: an option that is mutally exclusive with `--max-memory` (for simplicity - we can also decide that it should override or be overridable by `--max-memory`. In Emscripten a similar mix of options results in `--no-growable-memory` taking precedence). The option specifies that the maximum memory size should be set to the initial memory size, effectively disallowing memory growth. Closes #81932. 
--- lld/docs/WebAssembly.rst | 4 ++++ lld/test/wasm/data-layout.s | 16 ++++++++++++++++ lld/wasm/Config.h | 1 + lld/wasm/Driver.cpp | 6 ++++++ lld/wasm/Options.td | 3 +++ lld/wasm/Writer.cpp | 28 ++++++++++++++++------------ 6 files changed, 46 insertions(+), 12 deletions(-) diff --git a/lld/docs/WebAssembly.rst b/lld/docs/WebAssembly.rst index 3f554de46d38a..1dd05d67983c7 100644 --- a/lld/docs/WebAssembly.rst +++ b/lld/docs/WebAssembly.rst @@ -135,6 +135,10 @@ WebAssembly-specific options: Maximum size of the linear memory. Default: unlimited. +.. option:: --no-growable-memory + + Set maximum size of the linear memory to its initial size, disallowing memory growth. + By default the function table is neither imported nor exported, but defined for internal use only. diff --git a/lld/test/wasm/data-layout.s b/lld/test/wasm/data-layout.s index 2a447aad62216..a68bc032e4840 100644 --- a/lld/test/wasm/data-layout.s +++ b/lld/test/wasm/data-layout.s @@ -103,6 +103,22 @@ local_struct_internal_ptr: # CHECK-MAX-NEXT: Minimum: 0x2 # CHECK-MAX-NEXT: Maximum: 0x2 +# RUN: wasm-ld --no-entry --initial-memory=327680 --no-growable-memory \ +# RUN: -o %t_max.wasm %t.hello32.o +# RUN: obj2yaml %t_max.wasm | FileCheck %s -check-prefix=CHECK-NO-GROWTH + +# CHECK-NO-GROWTH: - Type: MEMORY +# CHECK-NO-GROWTH-NEXT: Memories: +# CHECK-NO-GROWTH-NEXT: - Flags: [ HAS_MAX ] +# CHECK-NO-GROWTH-NEXT: Minimum: 0x5 +# CHECK-NO-GROWTH-NEXT: Maximum: 0x5 + +# RUN: not wasm-ld --max-memory=262144 --no-growable-memory \ +# RUN: --no-entry -o %t_max.wasm %t.hello32.o 2>&1 \ +# RUN: | FileCheck %s --check-prefix CHECK-NO-GROWTH-COMPAT-ERROR + +# CHECK-NO-GROWTH-COMPAT-ERROR: --max-memory is incompatible with --no-growable-memory + # RUN: wasm-ld -no-gc-sections --allow-undefined --no-entry --shared-memory \ # RUN: --features=atomics,bulk-memory --initial-memory=131072 \ # RUN: --max-memory=131072 -o %t_max.wasm %t32.o %t.hello32.o diff --git a/lld/wasm/Config.h b/lld/wasm/Config.h index 
97c508bda6a1c..266348fef4031 100644 --- a/lld/wasm/Config.h +++ b/lld/wasm/Config.h @@ -78,6 +78,7 @@ struct Configuration { uint64_t initialHeap; uint64_t initialMemory; uint64_t maxMemory; + bool noGrowableMemory; // The table offset at which to place function addresses. We reserve zero // for the null function pointer. This gets set to 1 for executables and 0 // for shared libraries (since they always added to a dynamic offset at diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 635f19f78b15e..df7d4d1cc3d67 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -542,9 +542,15 @@ static void readConfigs(opt::InputArgList &args) { config->initialHeap = args::getInteger(args, OPT_initial_heap, 0); config->initialMemory = args::getInteger(args, OPT_initial_memory, 0); config->maxMemory = args::getInteger(args, OPT_max_memory, 0); + config->noGrowableMemory = args.hasArg(OPT_no_growable_memory); config->zStackSize = args::getZOptionValue(args, OPT_z, "stack-size", WasmPageSize); + if (config->maxMemory != 0 && config->noGrowableMemory) { + // Erroring out here is simpler than defining precedence rules. 
+ error("--max-memory is incompatible with --no-growable-memory"); + } + // Default value of exportDynamic depends on `-shared` config->exportDynamic = args.hasFlag(OPT_export_dynamic, OPT_no_export_dynamic, config->shared); diff --git a/lld/wasm/Options.td b/lld/wasm/Options.td index 8190717cef63b..70b5aadc26c2a 100644 --- a/lld/wasm/Options.td +++ b/lld/wasm/Options.td @@ -230,6 +230,9 @@ def initial_memory: JJ<"initial-memory=">, def max_memory: JJ<"max-memory=">, HelpText<"Maximum size of the linear memory">; +def no_growable_memory: FF<"no-growable-memory">, + HelpText<"Set maximum size of the linear memory to its initial size">; + def no_entry: FF<"no-entry">, HelpText<"Do not output any entry point">; diff --git a/lld/wasm/Writer.cpp b/lld/wasm/Writer.cpp index d1a06c9ac9c2a..55eff995fb8a1 100644 --- a/lld/wasm/Writer.cpp +++ b/lld/wasm/Writer.cpp @@ -473,6 +473,7 @@ void Writer::layoutMemory() { WasmSym::heapEnd->setVA(memoryPtr); } + uint64_t maxMemory = 0; if (config->maxMemory != 0) { if (config->maxMemory != alignTo(config->maxMemory, WasmPageSize)) error("maximum memory must be " + Twine(WasmPageSize) + "-byte aligned"); @@ -481,20 +482,23 @@ void Writer::layoutMemory() { if (config->maxMemory > maxMemorySetting) error("maximum memory too large, cannot be greater than " + Twine(maxMemorySetting)); + + maxMemory = config->maxMemory; + } else if (config->noGrowableMemory) { + maxMemory = memoryPtr; } - // Check max if explicitly supplied or required by shared memory - if (config->maxMemory != 0 || config->sharedMemory) { - uint64_t max = config->maxMemory; - if (max == 0) { - // If no maxMemory config was supplied but we are building with - // shared memory, we need to pick a sensible upper limit. - if (ctx.isPic) - max = maxMemorySetting; - else - max = memoryPtr; - } - out.memorySec->maxMemoryPages = max / WasmPageSize; + // If no maxMemory config was supplied but we are building with + // shared memory, we need to pick a sensible upper limit. 
+ if (config->sharedMemory && maxMemory == 0) { + if (ctx.isPic) + maxMemory = maxMemorySetting; + else + maxMemory = memoryPtr; + } + + if (maxMemory != 0) { + out.memorySec->maxMemoryPages = maxMemory / WasmPageSize; log("mem: max pages = " + Twine(out.memorySec->maxMemoryPages)); } } From d99b1481770e4f8454c7f238dcd40c8e977e6b70 Mon Sep 17 00:00:00 2001 From: FruitClover Date: Sun, 25 Feb 2024 20:34:38 +0300 Subject: [PATCH 256/546] [TableGen] Fix __CLAUSE_NO_CLASS macro leak in directive emitter (#82912) `__CLAUSE_NO_CLASS` was not undefined inside the `GEN_CLANG_CLAUSE_CLASS` block, resulting in macro redifinition warnings when several generated directives are used simultaneously. --- llvm/test/TableGen/directive1.td | 1 + llvm/test/TableGen/directive2.td | 1 + llvm/utils/TableGen/DirectiveEmitter.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/llvm/test/TableGen/directive1.td b/llvm/test/TableGen/directive1.td index b249f2bf5fc68..bcd9fa34c99d3 100644 --- a/llvm/test/TableGen/directive1.td +++ b/llvm/test/TableGen/directive1.td @@ -258,6 +258,7 @@ def TDL_DirA : Directive<"dira"> { // IMPL-EMPTY: // IMPL-NEXT: #undef __IMPLICIT_CLAUSE_NO_CLASS // IMPL-NEXT: #undef __IMPLICIT_CLAUSE_CLASS +// IMPL-NEXT: #undef __CLAUSE_NO_CLASS // IMPL-NEXT: #undef __CLAUSE // IMPL-NEXT: #undef CLAUSE_NO_CLASS // IMPL-NEXT: #undef CLAUSE_CLASS diff --git a/llvm/test/TableGen/directive2.td b/llvm/test/TableGen/directive2.td index 154d1e86ffb1d..01741152956d2 100644 --- a/llvm/test/TableGen/directive2.td +++ b/llvm/test/TableGen/directive2.td @@ -206,6 +206,7 @@ def TDL_DirA : Directive<"dira"> { // IMPL-EMPTY: // IMPL-NEXT: #undef __IMPLICIT_CLAUSE_NO_CLASS // IMPL-NEXT: #undef __IMPLICIT_CLAUSE_CLASS +// IMPL-NEXT: #undef __CLAUSE_NO_CLASS // IMPL-NEXT: #undef __CLAUSE // IMPL-NEXT: #undef CLAUSE_NO_CLASS // IMPL-NEXT: #undef CLAUSE_CLASS diff --git a/llvm/utils/TableGen/DirectiveEmitter.cpp b/llvm/utils/TableGen/DirectiveEmitter.cpp index b6aee665f8ee0..99eebacd6b3f3 
100644 --- a/llvm/utils/TableGen/DirectiveEmitter.cpp +++ b/llvm/utils/TableGen/DirectiveEmitter.cpp @@ -842,6 +842,7 @@ static void GenerateClauseClassMacro(const DirectiveLanguage &DirLang, OS << "\n"; OS << "#undef __IMPLICIT_CLAUSE_NO_CLASS\n"; OS << "#undef __IMPLICIT_CLAUSE_CLASS\n"; + OS << "#undef __CLAUSE_NO_CLASS\n"; OS << "#undef __CLAUSE\n"; OS << "#undef CLAUSE_NO_CLASS\n"; OS << "#undef CLAUSE_CLASS\n"; From ac9e67756e0157793d565c2cceaf82e4403f58ba Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Mon, 26 Feb 2024 01:53:16 +0800 Subject: [PATCH 257/546] [ValueTracking][NFC] Early exit when enumerating guaranteed well-defined/non-poison operands. (#82812) According to the [coverage result](https://dtcxzyw.github.io/llvm-opt-benchmark/coverage/home/dtcxzyw/llvm-project/llvm/lib/Analysis/ValueTracking.cpp.html#L7193) on my benchmark, `llvm::mustTriggerUB` returns true with an average of 35.0M/12.3M=2.85 matches. I think we can stop enumerating when one of the matches succeeds to avoid filling the temporary buffer `NonPoisonOps`. This patch introduces two template functions `handleGuaranteedWellDefinedOps/handleGuaranteedNonPoisonOps`. They will pass well-defined/non-poison operands to inlinable callbacks `Handle`. If the callback returns true, stop processing and return true. Otherwise, return false. 
Compile-time improvement: https://llvm-compile-time-tracker.com/compare.php?from=13acb3af5ad48e850cf37dcf02270ede3f267bd4&to=2b55f513c1b6dd2732cb79a25f3eaf6c5e4d6619&stat=instructions:u |stage1-O3|stage1-ReleaseThinLTO|stage1-ReleaseLTO-g|stage1-O0-g|stage2-O3|stage2-O0-g|stage2-clang| |--|--|--|--|--|--|--| |-0.03%|-0.04%|-0.06%|-0.03%|-0.05%|+0.03%|-0.02%| --- llvm/lib/Analysis/ValueTracking.cpp | 96 ++++++++++++++++++----------- 1 file changed, 60 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 653b3d4ffd988..de105dfe53ab2 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7226,84 +7226,108 @@ bool llvm::propagatesPoison(const Use &PoisonOp) { } } -void llvm::getGuaranteedWellDefinedOps( - const Instruction *I, SmallVectorImpl &Operands) { +/// Enumerates all operands of \p I that are guaranteed to not be undef or +/// poison. If the callback \p Handle returns true, stop processing and return +/// true. Otherwise, return false. 
+template +static bool handleGuaranteedWellDefinedOps(const Instruction *I, + const CallableT &Handle) { switch (I->getOpcode()) { case Instruction::Store: - Operands.push_back(cast(I)->getPointerOperand()); + if (Handle(cast(I)->getPointerOperand())) + return true; break; case Instruction::Load: - Operands.push_back(cast(I)->getPointerOperand()); + if (Handle(cast(I)->getPointerOperand())) + return true; break; // Since dereferenceable attribute imply noundef, atomic operations // also implicitly have noundef pointers too case Instruction::AtomicCmpXchg: - Operands.push_back(cast(I)->getPointerOperand()); + if (Handle(cast(I)->getPointerOperand())) + return true; break; case Instruction::AtomicRMW: - Operands.push_back(cast(I)->getPointerOperand()); + if (Handle(cast(I)->getPointerOperand())) + return true; break; case Instruction::Call: case Instruction::Invoke: { const CallBase *CB = cast(I); - if (CB->isIndirectCall()) - Operands.push_back(CB->getCalledOperand()); - for (unsigned i = 0; i < CB->arg_size(); ++i) { - if (CB->paramHasAttr(i, Attribute::NoUndef) || - CB->paramHasAttr(i, Attribute::Dereferenceable) || - CB->paramHasAttr(i, Attribute::DereferenceableOrNull)) - Operands.push_back(CB->getArgOperand(i)); - } + if (CB->isIndirectCall() && Handle(CB->getCalledOperand())) + return true; + for (unsigned i = 0; i < CB->arg_size(); ++i) + if ((CB->paramHasAttr(i, Attribute::NoUndef) || + CB->paramHasAttr(i, Attribute::Dereferenceable) || + CB->paramHasAttr(i, Attribute::DereferenceableOrNull)) && + Handle(CB->getArgOperand(i))) + return true; break; } case Instruction::Ret: - if (I->getFunction()->hasRetAttribute(Attribute::NoUndef)) - Operands.push_back(I->getOperand(0)); + if (I->getFunction()->hasRetAttribute(Attribute::NoUndef) && + Handle(I->getOperand(0))) + return true; break; case Instruction::Switch: - Operands.push_back(cast(I)->getCondition()); + if (Handle(cast(I)->getCondition())) + return true; break; case Instruction::Br: { auto *BR = cast(I); 
- if (BR->isConditional()) - Operands.push_back(BR->getCondition()); + if (BR->isConditional() && Handle(BR->getCondition())) + return true; break; } default: break; } + + return false; } -void llvm::getGuaranteedNonPoisonOps(const Instruction *I, - SmallVectorImpl &Operands) { - getGuaranteedWellDefinedOps(I, Operands); +void llvm::getGuaranteedWellDefinedOps( + const Instruction *I, SmallVectorImpl &Operands) { + handleGuaranteedWellDefinedOps(I, [&](const Value *V) { + Operands.push_back(V); + return false; + }); +} + +/// Enumerates all operands of \p I that are guaranteed to not be poison. +template +static bool handleGuaranteedNonPoisonOps(const Instruction *I, + const CallableT &Handle) { + if (handleGuaranteedWellDefinedOps(I, Handle)) + return true; switch (I->getOpcode()) { // Divisors of these operations are allowed to be partially undef. case Instruction::UDiv: case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: - Operands.push_back(I->getOperand(1)); - break; + return Handle(I->getOperand(1)); default: - break; + return false; } } +void llvm::getGuaranteedNonPoisonOps(const Instruction *I, + SmallVectorImpl &Operands) { + handleGuaranteedNonPoisonOps(I, [&](const Value *V) { + Operands.push_back(V); + return false; + }); +} + bool llvm::mustTriggerUB(const Instruction *I, const SmallPtrSetImpl &KnownPoison) { - SmallVector NonPoisonOps; - getGuaranteedNonPoisonOps(I, NonPoisonOps); - - for (const auto *V : NonPoisonOps) - if (KnownPoison.count(V)) - return true; - - return false; + return handleGuaranteedNonPoisonOps( + I, [&](const Value *V) { return KnownPoison.count(V); }); } static bool programUndefinedIfUndefOrPoison(const Value *V, @@ -7346,9 +7370,9 @@ static bool programUndefinedIfUndefOrPoison(const Value *V, if (--ScanLimit == 0) break; - SmallVector WellDefinedOps; - getGuaranteedWellDefinedOps(&I, WellDefinedOps); - if (is_contained(WellDefinedOps, V)) + if (handleGuaranteedWellDefinedOps(&I, [V](const Value 
*WellDefinedOp) { + return WellDefinedOp == V; + })) return true; if (!isGuaranteedToTransferExecutionToSuccessor(&I)) From eca0bd171e6ab0f1c60e3950f5fa5fa1eaf1fa32 Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Fri, 23 Feb 2024 12:54:07 -0600 Subject: [PATCH 258/546] [ValueTracking] Add tests for tracking `(and/or cond0, cond1)` on both sides of branch; NFC --- .../test/Transforms/InstCombine/known-bits.ll | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll index 246579cc4cd0c..2e39f7f7d627a 100644 --- a/llvm/test/Transforms/InstCombine/known-bits.ll +++ b/llvm/test/Transforms/InstCombine/known-bits.ll @@ -124,6 +124,67 @@ exit: ret i8 %or2 } + +define i8 @test_cond_and_bothways(i8 %x) { +; CHECK-LABEL: @test_cond_and_bothways( +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], 91 +; CHECK-NEXT: [[CMP0:%.*]] = icmp ne i8 [[AND]], 24 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i8 [[X]], 0 +; CHECK-NEXT: [[COND:%.*]] = and i1 [[CMP0]], [[CMP1]] +; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] +; CHECK: if: +; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4 +; CHECK-NEXT: ret i8 [[OR1]] +; CHECK: exit: +; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4 +; CHECK-NEXT: ret i8 [[OR2]] +; + %and = and i8 %x, 91 + %cmp0 = icmp ne i8 %and, 24 + %cmp1 = icmp ne i8 %x, 0 + %cond = and i1 %cmp0, %cmp1 + br i1 %cond, label %if, label %exit + +if: + %or1 = or i8 %x, -4 + ret i8 %or1 + +exit: + %or2 = or i8 %x, -4 + ret i8 %or2 +} + +define i8 @test_cond_or_bothways(i8 %x) { +; CHECK-LABEL: @test_cond_or_bothways( +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], 91 +; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i8 [[AND]], 24 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: [[COND:%.*]] = or i1 [[CMP0]], [[CMP1]] +; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] +; CHECK: if: +; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4 +; CHECK-NEXT: ret i8 
[[OR1]] +; CHECK: exit: +; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4 +; CHECK-NEXT: ret i8 [[OR2]] +; + %and = and i8 %x, 91 + %cmp0 = icmp eq i8 %and, 24 + %cmp1 = icmp eq i8 %x, 0 + %cond = or i1 %cmp0, %cmp1 + br i1 %cond, label %if, label %exit + +if: + %or1 = or i8 %x, -4 + ret i8 %or1 + +exit: + %or2 = or i8 %x, -4 + ret i8 %or2 +} + + + define i8 @test_cond_and_commuted(i8 %x, i1 %c1, i1 %c2) { ; CHECK-LABEL: @test_cond_and_commuted( ; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], 3 From 6f9b0a7095cbb7781e1f387f99d5725c950ce79b Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Fri, 23 Feb 2024 12:54:11 -0600 Subject: [PATCH 259/546] [ValueTracking] Compute knownbits for `(and/or cond0, cond1)` on both sides of branch The false branch for `and` and true branch for `or` provide less information (intersection as opposed to union), but still can give some useful information. Closes #82818 --- llvm/lib/Analysis/DomConditionCache.cpp | 5 +---- llvm/lib/Analysis/ValueTracking.cpp | 15 +++++++++++---- llvm/test/Transforms/InstCombine/known-bits.ll | 6 ++---- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Analysis/DomConditionCache.cpp b/llvm/lib/Analysis/DomConditionCache.cpp index 274f3ff44b2a6..da05e02b4b57f 100644 --- a/llvm/lib/Analysis/DomConditionCache.cpp +++ b/llvm/lib/Analysis/DomConditionCache.cpp @@ -34,7 +34,6 @@ static void findAffectedValues(Value *Cond, } }; - bool TopLevelIsAnd = match(Cond, m_LogicalAnd()); SmallVector Worklist; SmallPtrSet Visited; Worklist.push_back(Cond); @@ -45,9 +44,7 @@ static void findAffectedValues(Value *Cond, CmpInst::Predicate Pred; Value *A, *B; - // Only recurse into and/or if it matches the top-level and/or type. - if (TopLevelIsAnd ? 
match(V, m_LogicalAnd(m_Value(A), m_Value(B))) - : match(V, m_LogicalOr(m_Value(A), m_Value(B)))) { + if (match(V, m_LogicalOp(m_Value(A), m_Value(B)))) { Worklist.push_back(A); Worklist.push_back(B); } else if (match(V, m_ICmp(Pred, m_Value(A), m_Constant()))) { diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index de105dfe53ab2..e591ac504e9f0 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -711,10 +711,17 @@ static void computeKnownBitsFromCond(const Value *V, Value *Cond, const SimplifyQuery &SQ, bool Invert) { Value *A, *B; if (Depth < MaxAnalysisRecursionDepth && - (Invert ? match(Cond, m_LogicalOr(m_Value(A), m_Value(B))) - : match(Cond, m_LogicalAnd(m_Value(A), m_Value(B))))) { - computeKnownBitsFromCond(V, A, Known, Depth + 1, SQ, Invert); - computeKnownBitsFromCond(V, B, Known, Depth + 1, SQ, Invert); + match(Cond, m_LogicalOp(m_Value(A), m_Value(B)))) { + KnownBits Known2(Known.getBitWidth()); + KnownBits Known3(Known.getBitWidth()); + computeKnownBitsFromCond(V, A, Known2, Depth + 1, SQ, Invert); + computeKnownBitsFromCond(V, B, Known3, Depth + 1, SQ, Invert); + if (Invert ? 
match(Cond, m_LogicalOr(m_Value(), m_Value())) + : match(Cond, m_LogicalAnd(m_Value(), m_Value()))) + Known2 = Known2.unionWith(Known3); + else + Known2 = Known2.intersectWith(Known3); + Known = Known.unionWith(Known2); } if (auto *Cmp = dyn_cast(Cond)) diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll index 2e39f7f7d627a..b658ee0d2ef4e 100644 --- a/llvm/test/Transforms/InstCombine/known-bits.ll +++ b/llvm/test/Transforms/InstCombine/known-bits.ll @@ -136,8 +136,7 @@ define i8 @test_cond_and_bothways(i8 %x) { ; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4 ; CHECK-NEXT: ret i8 [[OR1]] ; CHECK: exit: -; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4 -; CHECK-NEXT: ret i8 [[OR2]] +; CHECK-NEXT: ret i8 -4 ; %and = and i8 %x, 91 %cmp0 = icmp ne i8 %and, 24 @@ -162,8 +161,7 @@ define i8 @test_cond_or_bothways(i8 %x) { ; CHECK-NEXT: [[COND:%.*]] = or i1 [[CMP0]], [[CMP1]] ; CHECK-NEXT: br i1 [[COND]], label [[IF:%.*]], label [[EXIT:%.*]] ; CHECK: if: -; CHECK-NEXT: [[OR1:%.*]] = or i8 [[X]], -4 -; CHECK-NEXT: ret i8 [[OR1]] +; CHECK-NEXT: ret i8 -4 ; CHECK: exit: ; CHECK-NEXT: [[OR2:%.*]] = or i8 [[X]], -4 ; CHECK-NEXT: ret i8 [[OR2]] From 641d160ad236fb7a472a9eedbda2d62541e7dd0c Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Mon, 26 Feb 2024 03:26:55 +0800 Subject: [PATCH 260/546] [InstCombine] Fold umax(smax)/smin(umin) with non-negative constants (#82929) This patch extends `reassociateMinMaxWithConstants` to fold the following patterns: ``` umax (smax X, nneg C0), nneg C1 --> smax X, (umax C0, C1) smin (umin X, nneg C0), nneg C1 --> umin X, (smin/umin C0, C1) ``` Alive2: https://alive2.llvm.org/ce/z/wfEj-e Address the comment https://github.com/llvm/llvm-project/pull/82472#pullrequestreview-1896922897. 
--- .../InstCombine/InstCombineCalls.cpp | 23 +++- .../Transforms/InstCombine/minmax-fold.ll | 127 +++++++++++++++++- .../Transforms/InstCombine/select_meta.ll | 17 ++- 3 files changed, 150 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 0be1495083ceb..60b8243d6ba66 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1220,10 +1220,11 @@ static Instruction *foldClampRangeOfTwo(IntrinsicInst *II, /// If this min/max has a constant operand and an operand that is a matching /// min/max with a constant operand, constant-fold the 2 constant operands. static Value *reassociateMinMaxWithConstants(IntrinsicInst *II, - IRBuilderBase &Builder) { + IRBuilderBase &Builder, + const SimplifyQuery &SQ) { Intrinsic::ID MinMaxID = II->getIntrinsicID(); - auto *LHS = dyn_cast(II->getArgOperand(0)); - if (!LHS || LHS->getIntrinsicID() != MinMaxID) + auto *LHS = dyn_cast(II->getArgOperand(0)); + if (!LHS) return nullptr; Constant *C0, *C1; @@ -1231,11 +1232,21 @@ static Value *reassociateMinMaxWithConstants(IntrinsicInst *II, !match(II->getArgOperand(1), m_ImmConstant(C1))) return nullptr; - // max (max X, C0), C1 --> max X, (max C0, C1) --> max X, NewC + // max (max X, C0), C1 --> max X, (max C0, C1) + // min (min X, C0), C1 --> min X, (min C0, C1) + // umax (smax X, nneg C0), nneg C1 --> smax X, (umax C0, C1) + // smin (umin X, nneg C0), nneg C1 --> umin X, (smin C0, C1) + Intrinsic::ID InnerMinMaxID = LHS->getIntrinsicID(); + if (InnerMinMaxID != MinMaxID && + !(((MinMaxID == Intrinsic::umax && InnerMinMaxID == Intrinsic::smax) || + (MinMaxID == Intrinsic::smin && InnerMinMaxID == Intrinsic::umin)) && + isKnownNonNegative(C0, SQ) && isKnownNonNegative(C1, SQ))) + return nullptr; + ICmpInst::Predicate Pred = MinMaxIntrinsic::getPredicate(MinMaxID); Value *CondC = Builder.CreateICmp(Pred, C0, C1); 
Value *NewC = Builder.CreateSelect(CondC, C0, C1); - return Builder.CreateIntrinsic(MinMaxID, II->getType(), + return Builder.CreateIntrinsic(InnerMinMaxID, II->getType(), {LHS->getArgOperand(0), NewC}); } @@ -1786,7 +1797,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { if (Instruction *SAdd = matchSAddSubSat(*II)) return SAdd; - if (Value *NewMinMax = reassociateMinMaxWithConstants(II, Builder)) + if (Value *NewMinMax = reassociateMinMaxWithConstants(II, Builder, SQ)) return replaceInstUsesWith(*II, NewMinMax); if (Instruction *R = reassociateMinMaxWithConstantInOperand(II, Builder)) diff --git a/llvm/test/Transforms/InstCombine/minmax-fold.ll b/llvm/test/Transforms/InstCombine/minmax-fold.ll index 1f7837c109b3f..8391fe33eb9b5 100644 --- a/llvm/test/Transforms/InstCombine/minmax-fold.ll +++ b/llvm/test/Transforms/InstCombine/minmax-fold.ll @@ -316,8 +316,7 @@ define i32 @test73(i32 %x) { ; SMAX(SMAX(X, 36), 75) -> SMAX(X, 75) define i32 @test74(i32 %x) { ; CHECK-LABEL: @test74( -; CHECK-NEXT: [[COND:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 36) -; CHECK-NEXT: [[RETVAL:%.*]] = call i32 @llvm.umax.i32(i32 [[COND]], i32 75) +; CHECK-NEXT: [[RETVAL:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 75) ; CHECK-NEXT: ret i32 [[RETVAL]] ; %cmp = icmp slt i32 %x, 36 @@ -1419,3 +1418,127 @@ entry: %r = select i1 %cmp2, i32 %s1, i32 %k1 ret i32 %r } + +define i32 @test_umax_smax1(i32 %x) { +; CHECK-LABEL: @test_umax_smax1( +; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 1) +; CHECK-NEXT: ret i32 [[UMAX]] +; + %smax = call i32 @llvm.smax.i32(i32 %x, i32 0) + %umax = call i32 @llvm.umax.i32(i32 %smax, i32 1) + ret i32 %umax +} + +define i32 @test_umax_smax2(i32 %x) { +; CHECK-LABEL: @test_umax_smax2( +; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 20) +; CHECK-NEXT: ret i32 [[SMAX]] +; + %smax = call i32 @llvm.smax.i32(i32 %x, i32 20) + %umax = call i32 @llvm.umax.i32(i32 %smax, i32 10) + ret i32 
%umax +} + +define <2 x i32> @test_umax_smax_vec(<2 x i32> %x) { +; CHECK-LABEL: @test_umax_smax_vec( +; CHECK-NEXT: [[UMAX:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[X:%.*]], <2 x i32> ) +; CHECK-NEXT: ret <2 x i32> [[UMAX]] +; + %smax = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %x, <2 x i32> ) + %umax = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %smax, <2 x i32> ) + ret <2 x i32> %umax +} + +define i32 @test_smin_umin1(i32 %x) { +; CHECK-LABEL: @test_smin_umin1( +; CHECK-NEXT: [[SMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 10) +; CHECK-NEXT: ret i32 [[SMIN]] +; + %smin = call i32 @llvm.umin.i32(i32 %x, i32 10) + %umin = call i32 @llvm.smin.i32(i32 %smin, i32 20) + ret i32 %umin +} + +define i32 @test_smin_umin2(i32 %x) { +; CHECK-LABEL: @test_smin_umin2( +; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 10) +; CHECK-NEXT: ret i32 [[UMIN]] +; + %smin = call i32 @llvm.umin.i32(i32 %x, i32 20) + %umin = call i32 @llvm.smin.i32(i32 %smin, i32 10) + ret i32 %umin +} + +define <2 x i32> @test_smin_umin_vec(<2 x i32> %x) { +; CHECK-LABEL: @test_smin_umin_vec( +; CHECK-NEXT: [[UMIN:%.*]] = call <2 x i32> @llvm.umin.v2i32(<2 x i32> [[X:%.*]], <2 x i32> ) +; CHECK-NEXT: ret <2 x i32> [[UMIN]] +; + %smin = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %x, <2 x i32> ) + %umin = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %smin, <2 x i32> ) + ret <2 x i32> %umin +} + +; Negative tests + +define i32 @test_umax_smax3(i32 %x) { +; CHECK-LABEL: @test_umax_smax3( +; CHECK-NEXT: ret i32 -1 +; + %smax = call i32 @llvm.smax.i32(i32 %x, i32 0) + %umax = call i32 @llvm.umax.i32(i32 %smax, i32 -1) + ret i32 %umax +} + +define i32 @test_umax_smax4(i32 %x) { +; CHECK-LABEL: @test_umax_smax4( +; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 -20) +; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[SMAX]], i32 10) +; CHECK-NEXT: ret i32 [[UMAX]] +; + %smax = call i32 @llvm.smax.i32(i32 %x, i32 -20) + %umax = call i32 
@llvm.umax.i32(i32 %smax, i32 10) + ret i32 %umax +} + +define i32 @test_smin_umin3(i32 %x) { +; CHECK-LABEL: @test_smin_umin3( +; CHECK-NEXT: ret i32 -20 +; + %smin = call i32 @llvm.umin.i32(i32 %x, i32 10) + %umin = call i32 @llvm.smin.i32(i32 %smin, i32 -20) + ret i32 %umin +} + +define i32 @test_smin_umin4(i32 %x) { +; CHECK-LABEL: @test_smin_umin4( +; CHECK-NEXT: [[SMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 -20) +; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[SMIN]], i32 10) +; CHECK-NEXT: ret i32 [[UMIN]] +; + %smin = call i32 @llvm.umin.i32(i32 %x, i32 -20) + %umin = call i32 @llvm.smin.i32(i32 %smin, i32 10) + ret i32 %umin +} + +define i32 @test_umax_nonminmax(i32 %x) { +; CHECK-LABEL: @test_umax_nonminmax( +; CHECK-NEXT: [[Y:%.*]] = call i32 @llvm.ctpop.i32(i32 [[X:%.*]]), !range [[RNG0:![0-9]+]] +; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[Y]], i32 1) +; CHECK-NEXT: ret i32 [[UMAX]] +; + %y = call i32 @llvm.ctpop.i32(i32 %x) + %umax = call i32 @llvm.umax.i32(i32 %y, i32 1) + ret i32 %umax +} + +define <2 x i32> @test_umax_smax_vec_neg(<2 x i32> %x) { +; CHECK-LABEL: @test_umax_smax_vec_neg( +; CHECK-NEXT: [[SMAX:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[X:%.*]], <2 x i32> ) +; CHECK-NEXT: [[UMAX:%.*]] = call <2 x i32> @llvm.umax.v2i32(<2 x i32> [[SMAX]], <2 x i32> ) +; CHECK-NEXT: ret <2 x i32> [[UMAX]] +; + %smax = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %x, <2 x i32> ) + %umax = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %smax, <2 x i32> ) + ret <2 x i32> %umax +} diff --git a/llvm/test/Transforms/InstCombine/select_meta.ll b/llvm/test/Transforms/InstCombine/select_meta.ll index df1e5a82ad5d1..cd133101736cf 100644 --- a/llvm/test/Transforms/InstCombine/select_meta.ll +++ b/llvm/test/Transforms/InstCombine/select_meta.ll @@ -171,8 +171,7 @@ define i32 @test72(i32 %x) { ; SMAX(SMAX(X, 36), 75) -> SMAX(X, 75) define i32 @test74(i32 %x) { ; CHECK-LABEL: @test74( -; CHECK-NEXT: [[COND:%.*]] = call i32 
@llvm.smax.i32(i32 [[X:%.*]], i32 36) -; CHECK-NEXT: [[RETVAL:%.*]] = call i32 @llvm.umax.i32(i32 [[COND]], i32 75) +; CHECK-NEXT: [[RETVAL:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 75) ; CHECK-NEXT: ret i32 [[RETVAL]] ; %cmp = icmp slt i32 %x, 36 @@ -317,7 +316,7 @@ define <2 x i32> @not_cond_vec_undef(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) define i64 @select_add(i1 %cond, i64 %x, i64 %y) { ; CHECK-LABEL: @select_add( -; CHECK-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i64 [[Y:%.*]], i64 0, !prof [[PROF0]], !unpredictable !2 +; CHECK-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i64 [[Y:%.*]], i64 0, !prof [[PROF0]], !unpredictable [[META2:![0-9]+]] ; CHECK-NEXT: [[RET:%.*]] = add i64 [[OP]], [[X:%.*]] ; CHECK-NEXT: ret i64 [[RET]] ; @@ -328,7 +327,7 @@ define i64 @select_add(i1 %cond, i64 %x, i64 %y) { define <2 x i32> @select_or(<2 x i1> %cond, <2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @select_or( -; CHECK-NEXT: [[OP:%.*]] = select <2 x i1> [[COND:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> zeroinitializer, !prof [[PROF0]], !unpredictable !2 +; CHECK-NEXT: [[OP:%.*]] = select <2 x i1> [[COND:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> zeroinitializer, !prof [[PROF0]], !unpredictable [[META2]] ; CHECK-NEXT: [[RET:%.*]] = or <2 x i32> [[OP]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i32> [[RET]] ; @@ -339,7 +338,7 @@ define <2 x i32> @select_or(<2 x i1> %cond, <2 x i32> %x, <2 x i32> %y) { define i17 @select_sub(i1 %cond, i17 %x, i17 %y) { ; CHECK-LABEL: @select_sub( -; CHECK-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i17 [[Y:%.*]], i17 0, !prof [[PROF0]], !unpredictable !2 +; CHECK-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i17 [[Y:%.*]], i17 0, !prof [[PROF0]], !unpredictable [[META2]] ; CHECK-NEXT: [[RET:%.*]] = sub i17 [[X:%.*]], [[OP]] ; CHECK-NEXT: ret i17 [[RET]] ; @@ -350,7 +349,7 @@ define i17 @select_sub(i1 %cond, i17 %x, i17 %y) { define i128 @select_ashr(i1 %cond, i128 %x, i128 %y) { ; CHECK-LABEL: @select_ashr( -; CHECK-NEXT: [[OP:%.*]] = select i1 
[[COND:%.*]], i128 [[Y:%.*]], i128 0, !prof [[PROF0]], !unpredictable !2 +; CHECK-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i128 [[Y:%.*]], i128 0, !prof [[PROF0]], !unpredictable [[META2]] ; CHECK-NEXT: [[RET:%.*]] = ashr i128 [[X:%.*]], [[OP]] ; CHECK-NEXT: ret i128 [[RET]] ; @@ -361,7 +360,7 @@ define i128 @select_ashr(i1 %cond, i128 %x, i128 %y) { define double @select_fmul(i1 %cond, double %x, double %y) { ; CHECK-LABEL: @select_fmul( -; CHECK-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], double [[Y:%.*]], double 1.000000e+00, !prof [[PROF0]], !unpredictable !2 +; CHECK-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], double [[Y:%.*]], double 1.000000e+00, !prof [[PROF0]], !unpredictable [[META2]] ; CHECK-NEXT: [[RET:%.*]] = fmul double [[OP]], [[X:%.*]] ; CHECK-NEXT: ret double [[RET]] ; @@ -372,7 +371,7 @@ define double @select_fmul(i1 %cond, double %x, double %y) { define <2 x float> @select_fdiv(i1 %cond, <2 x float> %x, <2 x float> %y) { ; CHECK-LABEL: @select_fdiv( -; CHECK-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], <2 x float> [[Y:%.*]], <2 x float> , !prof [[PROF0]], !unpredictable !2 +; CHECK-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], <2 x float> [[Y:%.*]], <2 x float> , !prof [[PROF0]], !unpredictable [[META2]] ; CHECK-NEXT: [[RET:%.*]] = fdiv <2 x float> [[X:%.*]], [[OP]] ; CHECK-NEXT: ret <2 x float> [[RET]] ; @@ -391,5 +390,5 @@ define <2 x float> @select_fdiv(i1 %cond, <2 x float> %x, <2 x float> %y) { ;. ; CHECK: [[PROF0]] = !{!"branch_weights", i32 2, i32 10} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 10, i32 2} -; CHECK: [[META2:![0-9]+]] = !{} +; CHECK: [[META2]] = !{} ;. From 529b5705db2ccefeee2c9b8cb5144e1f5a6420de Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Sun, 25 Feb 2024 21:10:01 +0000 Subject: [PATCH 261/546] Revert "[compiler-rt] simplifying ::ReExec for freebsd. (#79711)" (#82933) This reverts commit 691b12a2dcc12fa43517d23f2a9b6039616eebc8. 
--- .../lib/sanitizer_common/sanitizer_getauxval.h | 14 -------------- .../sanitizer_common/sanitizer_linux_libcdep.cpp | 9 ++++++++- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_getauxval.h b/compiler-rt/lib/sanitizer_common/sanitizer_getauxval.h index a22248d500715..38439e44f611e 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_getauxval.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_getauxval.h @@ -55,20 +55,6 @@ static inline decltype(AuxInfo::a_v) getauxval(decltype(AuxInfo::a_type) type) { return 0; } -#elif SANITIZER_FREEBSD -# include - -# define SANITIZER_USE_GETAUXVAL 1 -# define AT_EXECFN AT_EXECPATH - -static inline unsigned long getauxval(unsigned long type) { - unsigned long buf = 0ul; - - if (elf_aux_info(static_cast(type), &buf, sizeof(buf))) - return 0ul; - return buf; -} - #endif #endif // SANITIZER_GETAUXVAL_H diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp index b3f1fe91f16c5..cccbb4d256df2 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp @@ -934,7 +934,14 @@ u64 MonotonicNanoTime() { void ReExec() { const char *pathname = "/proc/self/exe"; -# if SANITIZER_NETBSD +# if SANITIZER_FREEBSD + for (const auto *aux = __elf_aux_vector; aux->a_type != AT_NULL; aux++) { + if (aux->a_type == AT_EXECPATH) { + pathname = static_cast(aux->a_un.a_ptr); + break; + } + } +# elif SANITIZER_NETBSD static const int name[] = { CTL_KERN, KERN_PROC_ARGS, From 085f9b0d146fc99bbb0e193593aad696fc50a056 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sun, 25 Feb 2024 16:05:38 -0800 Subject: [PATCH 262/546] [clang-format][doc] Update documentation for RemoveSemicolon --- clang/docs/ClangFormatStyleOptions.rst | 3 ++- clang/include/clang/Format/Format.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git 
a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index fdf7bfaeaa4ec..6515b16600191 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -4844,7 +4844,8 @@ the configuration (without a prefix: ``Auto``). .. _RemoveSemicolon: **RemoveSemicolon** (``Boolean``) :versionbadge:`clang-format 16` :ref:`¶ ` - Remove semicolons after the closing brace of a non-empty function. + Remove semicolons after the closing braces of functions and + constructors/destructors. .. warning:: diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index e9b2160a7b924..47923e06d2c2d 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -3771,7 +3771,8 @@ struct FormatStyle { /// \version 17 RemoveParenthesesStyle RemoveParentheses; - /// Remove semicolons after the closing brace of a non-empty function. + /// Remove semicolons after the closing braces of functions and + /// constructors/destructors. /// \warning /// Setting this option to ``true`` could lead to incorrect code formatting /// due to clang-format's lack of complete semantic information. As such, From da092e8808319b572fa9fea7eb74e55e7434a8b2 Mon Sep 17 00:00:00 2001 From: Matteo Franciolini Date: Sun, 25 Feb 2024 16:18:42 -0800 Subject: [PATCH 263/546] Fix bytecode roundtrip of unregistered ops (#82932) When roundtripping to bytecode an unregistered operation name that does not contain any '.' separator, the bytecode writer will emit an op encoding without a proper opName. In this case, the string just becomes a possibly unknown dialect name. At parsing, this dialect name is used as a proper operation name. However, when the unregistered operation name coincidentally matches that of a dialect, the parser would fail. That means we can't roundtrip an unregistered op with a name that matches one of the registered dialect names. 
For example, ``` "index"() : () -> () ``` can be emitted but cannot be parsed, because its name is coincidentally the same as that of the Index dialect. The patch removes such inconsistency. This patch specifically fixes the bytecode roundtrip of `mlir/test/IR/parser.mlir`. --- mlir/lib/Bytecode/Reader/BytecodeReader.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp index d61634062784c..dd1e4abaea166 100644 --- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp +++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp @@ -1854,22 +1854,18 @@ BytecodeReader::Impl::parseOpName(EncodingReader &reader, // Check to see if this operation name has already been resolved. If we // haven't, load the dialect and build the operation name. if (!opName->opName) { - // Load the dialect and its version. - DialectReader dialectReader(attrTypeReader, stringReader, resourceReader, - dialectsMap, reader, version); - if (failed(opName->dialect->load(dialectReader, getContext()))) - return failure(); // If the opName is empty, this is because we use to accept names such as // `foo` without any `.` separator. We shouldn't tolerate this in textual // format anymore but for now we'll be backward compatible. This can only // happen with unregistered dialects. if (opName->name.empty()) { - if (opName->dialect->getLoadedDialect()) - return emitError(fileLoc) << "has an empty opname for dialect '" - << opName->dialect->name << "'\n"; - opName->opName.emplace(opName->dialect->name, getContext()); } else { + // Load the dialect and its version. + DialectReader dialectReader(attrTypeReader, stringReader, resourceReader, + dialectsMap, reader, version); + if (failed(opName->dialect->load(dialectReader, getContext()))) + return failure(); opName->opName.emplace((opName->dialect->name + "." 
+ opName->name).str(), getContext()); } From bc6b5be6a298ab094d1bb41f393ca422feddd298 Mon Sep 17 00:00:00 2001 From: Matteo Franciolini Date: Sun, 25 Feb 2024 16:19:08 -0800 Subject: [PATCH 264/546] Fix TestI64ElementsAttr printer (#82931) This enables to correctly roundtrip the attribute to text or bytecode. --- mlir/test/lib/Dialect/Test/TestAttributes.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/test/lib/Dialect/Test/TestAttributes.cpp b/mlir/test/lib/Dialect/Test/TestAttributes.cpp index 8819f6cbe94e2..d41d495c38e55 100644 --- a/mlir/test/lib/Dialect/Test/TestAttributes.cpp +++ b/mlir/test/lib/Dialect/Test/TestAttributes.cpp @@ -82,7 +82,7 @@ Attribute TestI64ElementsAttr::parse(AsmParser &parser, Type type) { void TestI64ElementsAttr::print(AsmPrinter &printer) const { printer << "<["; llvm::interleaveComma(getElements(), printer); - printer << "] : " << getType() << ">"; + printer << "]>"; } LogicalResult From 1d5e3b2d6559a853c544099e4cf1d46f44f83368 Mon Sep 17 00:00:00 2001 From: tw-ilson <63574793+tw-ilson@users.noreply.github.com> Date: Sun, 25 Feb 2024 19:47:25 -0500 Subject: [PATCH 265/546] [mlir][spirv] Use ODS generated attribute names for op definitions (#81552) Since ODS generates getters functions for SPIRV operations' attribute names, we replace instances of these hardcoded strings in the SPIR-V dialect's op parser/printer with function calls for consistency. 
Fixes https://github.com/llvm/llvm-project/issues/77627 --------- Co-authored-by: Lei Zhang --- mlir/lib/Dialect/SPIRV/IR/AtomicOps.cpp | 30 ++-- mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp | 8 +- mlir/lib/Dialect/SPIRV/IR/GroupOps.cpp | 143 +++++++++++------- .../Dialect/SPIRV/IR/IntegerDotProductOps.cpp | 18 ++- mlir/lib/Dialect/SPIRV/IR/MemoryOps.cpp | 80 +++++++--- mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp | 52 ++++--- .../Dialect/SPIRV/IR/SPIRVParsingUtils.cpp | 29 ---- mlir/lib/Dialect/SPIRV/IR/SPIRVParsingUtils.h | 44 +----- 8 files changed, 223 insertions(+), 181 deletions(-) diff --git a/mlir/lib/Dialect/SPIRV/IR/AtomicOps.cpp b/mlir/lib/Dialect/SPIRV/IR/AtomicOps.cpp index 7e33e91414e0b..948d48980f2e8 100644 --- a/mlir/lib/Dialect/SPIRV/IR/AtomicOps.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/AtomicOps.cpp @@ -33,7 +33,7 @@ StringRef stringifyTypeName() { } // Verifies an atomic update op. -template +template static LogicalResult verifyAtomicUpdateOp(Operation *op) { auto ptrType = llvm::cast(op->getOperand(0).getType()); auto elementType = ptrType.getPointeeType(); @@ -42,8 +42,10 @@ static LogicalResult verifyAtomicUpdateOp(Operation *op) { << stringifyTypeName() << " value, found " << elementType; + StringAttr semanticsAttrName = + AtomicOpTy::getSemanticsAttrName(op->getName()); auto memorySemantics = - op->getAttrOfType(kSemanticsAttrName) + op->getAttrOfType(semanticsAttrName) .getValue(); if (failed(verifyMemorySemantics(op, memorySemantics))) { return failure(); @@ -56,7 +58,7 @@ static LogicalResult verifyAtomicUpdateOp(Operation *op) { //===----------------------------------------------------------------------===// LogicalResult AtomicAndOp::verify() { - return verifyAtomicUpdateOp(getOperation()); + return verifyAtomicUpdateOp(getOperation()); } //===----------------------------------------------------------------------===// @@ -64,7 +66,7 @@ LogicalResult AtomicAndOp::verify() { 
//===----------------------------------------------------------------------===// LogicalResult AtomicIAddOp::verify() { - return verifyAtomicUpdateOp(getOperation()); + return verifyAtomicUpdateOp(getOperation()); } //===----------------------------------------------------------------------===// @@ -72,7 +74,7 @@ LogicalResult AtomicIAddOp::verify() { //===----------------------------------------------------------------------===// LogicalResult EXTAtomicFAddOp::verify() { - return verifyAtomicUpdateOp(getOperation()); + return verifyAtomicUpdateOp(getOperation()); } //===----------------------------------------------------------------------===// @@ -80,7 +82,7 @@ LogicalResult EXTAtomicFAddOp::verify() { //===----------------------------------------------------------------------===// LogicalResult AtomicIDecrementOp::verify() { - return verifyAtomicUpdateOp(getOperation()); + return verifyAtomicUpdateOp(getOperation()); } //===----------------------------------------------------------------------===// @@ -88,7 +90,7 @@ LogicalResult AtomicIDecrementOp::verify() { //===----------------------------------------------------------------------===// LogicalResult AtomicIIncrementOp::verify() { - return verifyAtomicUpdateOp(getOperation()); + return verifyAtomicUpdateOp(getOperation()); } //===----------------------------------------------------------------------===// @@ -96,7 +98,7 @@ LogicalResult AtomicIIncrementOp::verify() { //===----------------------------------------------------------------------===// LogicalResult AtomicISubOp::verify() { - return verifyAtomicUpdateOp(getOperation()); + return verifyAtomicUpdateOp(getOperation()); } //===----------------------------------------------------------------------===// @@ -104,7 +106,7 @@ LogicalResult AtomicISubOp::verify() { //===----------------------------------------------------------------------===// LogicalResult AtomicOrOp::verify() { - return verifyAtomicUpdateOp(getOperation()); + return 
verifyAtomicUpdateOp(getOperation()); } //===----------------------------------------------------------------------===// @@ -112,7 +114,7 @@ LogicalResult AtomicOrOp::verify() { //===----------------------------------------------------------------------===// LogicalResult AtomicSMaxOp::verify() { - return verifyAtomicUpdateOp(getOperation()); + return verifyAtomicUpdateOp(getOperation()); } //===----------------------------------------------------------------------===// @@ -120,7 +122,7 @@ LogicalResult AtomicSMaxOp::verify() { //===----------------------------------------------------------------------===// LogicalResult AtomicSMinOp::verify() { - return verifyAtomicUpdateOp(getOperation()); + return verifyAtomicUpdateOp(getOperation()); } //===----------------------------------------------------------------------===// @@ -128,7 +130,7 @@ LogicalResult AtomicSMinOp::verify() { //===----------------------------------------------------------------------===// LogicalResult AtomicUMaxOp::verify() { - return verifyAtomicUpdateOp(getOperation()); + return verifyAtomicUpdateOp(getOperation()); } //===----------------------------------------------------------------------===// @@ -136,7 +138,7 @@ LogicalResult AtomicUMaxOp::verify() { //===----------------------------------------------------------------------===// LogicalResult AtomicUMinOp::verify() { - return verifyAtomicUpdateOp(getOperation()); + return verifyAtomicUpdateOp(getOperation()); } //===----------------------------------------------------------------------===// @@ -144,7 +146,7 @@ LogicalResult AtomicUMinOp::verify() { //===----------------------------------------------------------------------===// LogicalResult AtomicXorOp::verify() { - return verifyAtomicUpdateOp(getOperation()); + return verifyAtomicUpdateOp(getOperation()); } } // namespace mlir::spirv diff --git a/mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp b/mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp index 7170a899069ee..3e317319b68fc 100644 --- 
a/mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp @@ -87,7 +87,9 @@ ParseResult BranchConditionalOp::parse(OpAsmParser &parser, parser.parseRSquare()) return failure(); - result.addAttribute(kBranchWeightAttrName, + StringAttr branchWeightsAttrName = + BranchConditionalOp::getBranchWeightsAttrName(result.name); + result.addAttribute(branchWeightsAttrName, builder.getArrayAttr({trueWeight, falseWeight})); } @@ -199,11 +201,11 @@ LogicalResult FunctionCallOp::verify() { } CallInterfaceCallable FunctionCallOp::getCallableForCallee() { - return (*this)->getAttrOfType(kCallee); + return (*this)->getAttrOfType(getCalleeAttrName()); } void FunctionCallOp::setCalleeFromCallable(CallInterfaceCallable callee) { - (*this)->setAttr(kCallee, callee.get()); + (*this)->setAttr(getCalleeAttrName(), callee.get()); } Operation::operand_range FunctionCallOp::getArgOperands() { diff --git a/mlir/lib/Dialect/SPIRV/IR/GroupOps.cpp b/mlir/lib/Dialect/SPIRV/IR/GroupOps.cpp index ac29ab0db586c..fcbef2be75f9a 100644 --- a/mlir/lib/Dialect/SPIRV/IR/GroupOps.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/GroupOps.cpp @@ -20,15 +20,18 @@ using namespace mlir::spirv::AttrNames; namespace mlir::spirv { +template static ParseResult parseGroupNonUniformArithmeticOp(OpAsmParser &parser, OperationState &state) { spirv::Scope executionScope; GroupOperation groupOperation; OpAsmParser::UnresolvedOperand valueInfo; - if (spirv::parseEnumStrAttr(executionScope, parser, state, - kExecutionScopeAttrName) || - spirv::parseEnumStrAttr(groupOperation, parser, state, - kGroupOperationAttrName) || + if (spirv::parseEnumStrAttr( + executionScope, parser, state, + OpTy::getExecutionScopeAttrName(state.name)) || + spirv::parseEnumStrAttr( + groupOperation, parser, state, + OpTy::getGroupOperationAttrName(state.name)) || parser.parseOperand(valueInfo)) return failure(); @@ -56,16 +59,23 @@ static ParseResult parseGroupNonUniformArithmeticOp(OpAsmParser &parser, return 
parser.addTypeToList(resultType, state.types); } +template static void printGroupNonUniformArithmeticOp(Operation *groupOp, OpAsmPrinter &printer) { printer << " \"" << stringifyScope( - groupOp->getAttrOfType(kExecutionScopeAttrName) + groupOp + ->getAttrOfType( + GroupNonUniformArithmeticOpTy::getExecutionScopeAttrName( + groupOp->getName())) .getValue()) << "\" \"" << stringifyGroupOperation( - groupOp->getAttrOfType(kGroupOperationAttrName) + groupOp + ->getAttrOfType( + GroupNonUniformArithmeticOpTy::getGroupOperationAttrName( + groupOp->getName())) .getValue()) << "\" " << groupOp->getOperand(0); @@ -74,16 +84,21 @@ static void printGroupNonUniformArithmeticOp(Operation *groupOp, printer << " : " << groupOp->getResult(0).getType(); } +template static LogicalResult verifyGroupNonUniformArithmeticOp(Operation *groupOp) { spirv::Scope scope = - groupOp->getAttrOfType(kExecutionScopeAttrName) + groupOp + ->getAttrOfType( + OpTy::getExecutionScopeAttrName(groupOp->getName())) .getValue(); if (scope != spirv::Scope::Workgroup && scope != spirv::Scope::Subgroup) return groupOp->emitOpError( "execution scope must be 'Workgroup' or 'Subgroup'"); GroupOperation operation = - groupOp->getAttrOfType(kGroupOperationAttrName) + groupOp + ->getAttrOfType( + OpTy::getGroupOperationAttrName(groupOp->getName())) .getValue(); if (operation == GroupOperation::ClusteredReduce && groupOp->getNumOperands() == 1) @@ -206,16 +221,17 @@ LogicalResult GroupNonUniformElectOp::verify() { //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformFAddOp::verify() { - return verifyGroupNonUniformArithmeticOp(*this); + return verifyGroupNonUniformArithmeticOp(*this); } ParseResult GroupNonUniformFAddOp::parse(OpAsmParser &parser, OperationState &result) { - return parseGroupNonUniformArithmeticOp(parser, result); + return parseGroupNonUniformArithmeticOp(parser, + result); } void GroupNonUniformFAddOp::print(OpAsmPrinter &p) { - 
printGroupNonUniformArithmeticOp(*this, p); + printGroupNonUniformArithmeticOp(*this, p); } //===----------------------------------------------------------------------===// @@ -223,16 +239,17 @@ void GroupNonUniformFAddOp::print(OpAsmPrinter &p) { //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformFMaxOp::verify() { - return verifyGroupNonUniformArithmeticOp(*this); + return verifyGroupNonUniformArithmeticOp(*this); } ParseResult GroupNonUniformFMaxOp::parse(OpAsmParser &parser, OperationState &result) { - return parseGroupNonUniformArithmeticOp(parser, result); + return parseGroupNonUniformArithmeticOp(parser, + result); } void GroupNonUniformFMaxOp::print(OpAsmPrinter &p) { - printGroupNonUniformArithmeticOp(*this, p); + printGroupNonUniformArithmeticOp(*this, p); } //===----------------------------------------------------------------------===// @@ -240,16 +257,17 @@ void GroupNonUniformFMaxOp::print(OpAsmPrinter &p) { //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformFMinOp::verify() { - return verifyGroupNonUniformArithmeticOp(*this); + return verifyGroupNonUniformArithmeticOp(*this); } ParseResult GroupNonUniformFMinOp::parse(OpAsmParser &parser, OperationState &result) { - return parseGroupNonUniformArithmeticOp(parser, result); + return parseGroupNonUniformArithmeticOp(parser, + result); } void GroupNonUniformFMinOp::print(OpAsmPrinter &p) { - printGroupNonUniformArithmeticOp(*this, p); + printGroupNonUniformArithmeticOp(*this, p); } //===----------------------------------------------------------------------===// @@ -257,16 +275,17 @@ void GroupNonUniformFMinOp::print(OpAsmPrinter &p) { //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformFMulOp::verify() { - return verifyGroupNonUniformArithmeticOp(*this); + return verifyGroupNonUniformArithmeticOp(*this); } ParseResult 
GroupNonUniformFMulOp::parse(OpAsmParser &parser, OperationState &result) { - return parseGroupNonUniformArithmeticOp(parser, result); + return parseGroupNonUniformArithmeticOp(parser, + result); } void GroupNonUniformFMulOp::print(OpAsmPrinter &p) { - printGroupNonUniformArithmeticOp(*this, p); + printGroupNonUniformArithmeticOp(*this, p); } //===----------------------------------------------------------------------===// @@ -274,16 +293,17 @@ void GroupNonUniformFMulOp::print(OpAsmPrinter &p) { //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformIAddOp::verify() { - return verifyGroupNonUniformArithmeticOp(*this); + return verifyGroupNonUniformArithmeticOp(*this); } ParseResult GroupNonUniformIAddOp::parse(OpAsmParser &parser, OperationState &result) { - return parseGroupNonUniformArithmeticOp(parser, result); + return parseGroupNonUniformArithmeticOp(parser, + result); } void GroupNonUniformIAddOp::print(OpAsmPrinter &p) { - printGroupNonUniformArithmeticOp(*this, p); + printGroupNonUniformArithmeticOp(*this, p); } //===----------------------------------------------------------------------===// @@ -291,16 +311,17 @@ void GroupNonUniformIAddOp::print(OpAsmPrinter &p) { //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformIMulOp::verify() { - return verifyGroupNonUniformArithmeticOp(*this); + return verifyGroupNonUniformArithmeticOp(*this); } ParseResult GroupNonUniformIMulOp::parse(OpAsmParser &parser, OperationState &result) { - return parseGroupNonUniformArithmeticOp(parser, result); + return parseGroupNonUniformArithmeticOp(parser, + result); } void GroupNonUniformIMulOp::print(OpAsmPrinter &p) { - printGroupNonUniformArithmeticOp(*this, p); + printGroupNonUniformArithmeticOp(*this, p); } //===----------------------------------------------------------------------===// @@ -308,16 +329,17 @@ void GroupNonUniformIMulOp::print(OpAsmPrinter &p) { 
//===----------------------------------------------------------------------===// LogicalResult GroupNonUniformSMaxOp::verify() { - return verifyGroupNonUniformArithmeticOp(*this); + return verifyGroupNonUniformArithmeticOp(*this); } ParseResult GroupNonUniformSMaxOp::parse(OpAsmParser &parser, OperationState &result) { - return parseGroupNonUniformArithmeticOp(parser, result); + return parseGroupNonUniformArithmeticOp(parser, + result); } void GroupNonUniformSMaxOp::print(OpAsmPrinter &p) { - printGroupNonUniformArithmeticOp(*this, p); + printGroupNonUniformArithmeticOp(*this, p); } //===----------------------------------------------------------------------===// @@ -325,16 +347,17 @@ void GroupNonUniformSMaxOp::print(OpAsmPrinter &p) { //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformSMinOp::verify() { - return verifyGroupNonUniformArithmeticOp(*this); + return verifyGroupNonUniformArithmeticOp(*this); } ParseResult GroupNonUniformSMinOp::parse(OpAsmParser &parser, OperationState &result) { - return parseGroupNonUniformArithmeticOp(parser, result); + return parseGroupNonUniformArithmeticOp(parser, + result); } void GroupNonUniformSMinOp::print(OpAsmPrinter &p) { - printGroupNonUniformArithmeticOp(*this, p); + printGroupNonUniformArithmeticOp(*this, p); } //===----------------------------------------------------------------------===// @@ -342,16 +365,17 @@ void GroupNonUniformSMinOp::print(OpAsmPrinter &p) { //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformUMaxOp::verify() { - return verifyGroupNonUniformArithmeticOp(*this); + return verifyGroupNonUniformArithmeticOp(*this); } ParseResult GroupNonUniformUMaxOp::parse(OpAsmParser &parser, OperationState &result) { - return parseGroupNonUniformArithmeticOp(parser, result); + return parseGroupNonUniformArithmeticOp(parser, + result); } void GroupNonUniformUMaxOp::print(OpAsmPrinter &p) { - 
printGroupNonUniformArithmeticOp(*this, p); + printGroupNonUniformArithmeticOp(*this, p); } //===----------------------------------------------------------------------===// @@ -359,16 +383,17 @@ void GroupNonUniformUMaxOp::print(OpAsmPrinter &p) { //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformUMinOp::verify() { - return verifyGroupNonUniformArithmeticOp(*this); + return verifyGroupNonUniformArithmeticOp(*this); } ParseResult GroupNonUniformUMinOp::parse(OpAsmParser &parser, OperationState &result) { - return parseGroupNonUniformArithmeticOp(parser, result); + return parseGroupNonUniformArithmeticOp(parser, + result); } void GroupNonUniformUMinOp::print(OpAsmPrinter &p) { - printGroupNonUniformArithmeticOp(*this, p); + printGroupNonUniformArithmeticOp(*this, p); } //===----------------------------------------------------------------------===// @@ -376,16 +401,17 @@ void GroupNonUniformUMinOp::print(OpAsmPrinter &p) { //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformBitwiseAndOp::verify() { - return verifyGroupNonUniformArithmeticOp(*this); + return verifyGroupNonUniformArithmeticOp(*this); } ParseResult GroupNonUniformBitwiseAndOp::parse(OpAsmParser &parser, OperationState &result) { - return parseGroupNonUniformArithmeticOp(parser, result); + return parseGroupNonUniformArithmeticOp(parser, + result); } void GroupNonUniformBitwiseAndOp::print(OpAsmPrinter &p) { - printGroupNonUniformArithmeticOp(*this, p); + printGroupNonUniformArithmeticOp(*this, p); } //===----------------------------------------------------------------------===// @@ -393,16 +419,17 @@ void GroupNonUniformBitwiseAndOp::print(OpAsmPrinter &p) { //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformBitwiseOrOp::verify() { - return verifyGroupNonUniformArithmeticOp(*this); + return verifyGroupNonUniformArithmeticOp(*this); 
} ParseResult GroupNonUniformBitwiseOrOp::parse(OpAsmParser &parser, OperationState &result) { - return parseGroupNonUniformArithmeticOp(parser, result); + return parseGroupNonUniformArithmeticOp(parser, + result); } void GroupNonUniformBitwiseOrOp::print(OpAsmPrinter &p) { - printGroupNonUniformArithmeticOp(*this, p); + printGroupNonUniformArithmeticOp(*this, p); } //===----------------------------------------------------------------------===// @@ -410,16 +437,17 @@ void GroupNonUniformBitwiseOrOp::print(OpAsmPrinter &p) { //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformBitwiseXorOp::verify() { - return verifyGroupNonUniformArithmeticOp(*this); + return verifyGroupNonUniformArithmeticOp(*this); } ParseResult GroupNonUniformBitwiseXorOp::parse(OpAsmParser &parser, OperationState &result) { - return parseGroupNonUniformArithmeticOp(parser, result); + return parseGroupNonUniformArithmeticOp(parser, + result); } void GroupNonUniformBitwiseXorOp::print(OpAsmPrinter &p) { - printGroupNonUniformArithmeticOp(*this, p); + printGroupNonUniformArithmeticOp(*this, p); } //===----------------------------------------------------------------------===// @@ -427,16 +455,17 @@ void GroupNonUniformBitwiseXorOp::print(OpAsmPrinter &p) { //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformLogicalAndOp::verify() { - return verifyGroupNonUniformArithmeticOp(*this); + return verifyGroupNonUniformArithmeticOp(*this); } ParseResult GroupNonUniformLogicalAndOp::parse(OpAsmParser &parser, OperationState &result) { - return parseGroupNonUniformArithmeticOp(parser, result); + return parseGroupNonUniformArithmeticOp(parser, + result); } void GroupNonUniformLogicalAndOp::print(OpAsmPrinter &p) { - printGroupNonUniformArithmeticOp(*this, p); + printGroupNonUniformArithmeticOp(*this, p); } //===----------------------------------------------------------------------===// @@ -444,16 
+473,17 @@ void GroupNonUniformLogicalAndOp::print(OpAsmPrinter &p) { //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformLogicalOrOp::verify() { - return verifyGroupNonUniformArithmeticOp(*this); + return verifyGroupNonUniformArithmeticOp(*this); } ParseResult GroupNonUniformLogicalOrOp::parse(OpAsmParser &parser, OperationState &result) { - return parseGroupNonUniformArithmeticOp(parser, result); + return parseGroupNonUniformArithmeticOp(parser, + result); } void GroupNonUniformLogicalOrOp::print(OpAsmPrinter &p) { - printGroupNonUniformArithmeticOp(*this, p); + printGroupNonUniformArithmeticOp(*this, p); } //===----------------------------------------------------------------------===// @@ -461,16 +491,17 @@ void GroupNonUniformLogicalOrOp::print(OpAsmPrinter &p) { //===----------------------------------------------------------------------===// LogicalResult GroupNonUniformLogicalXorOp::verify() { - return verifyGroupNonUniformArithmeticOp(*this); + return verifyGroupNonUniformArithmeticOp(*this); } ParseResult GroupNonUniformLogicalXorOp::parse(OpAsmParser &parser, OperationState &result) { - return parseGroupNonUniformArithmeticOp(parser, result); + return parseGroupNonUniformArithmeticOp(parser, + result); } void GroupNonUniformLogicalXorOp::print(OpAsmPrinter &p) { - printGroupNonUniformArithmeticOp(*this, p); + printGroupNonUniformArithmeticOp(*this, p); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/SPIRV/IR/IntegerDotProductOps.cpp b/mlir/lib/Dialect/SPIRV/IR/IntegerDotProductOps.cpp index 00fc2acf7f07d..f5676f36a0f5f 100644 --- a/mlir/lib/Dialect/SPIRV/IR/IntegerDotProductOps.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/IntegerDotProductOps.cpp @@ -25,6 +25,7 @@ namespace mlir::spirv { // Integer Dot Product ops //===----------------------------------------------------------------------===// +template static LogicalResult 
verifyIntegerDotProduct(Operation *op) { assert(llvm::is_contained({2u, 3u}, op->getNumOperands()) && "Not an integer dot product op?"); @@ -33,10 +34,12 @@ static LogicalResult verifyIntegerDotProduct(Operation *op) { // ODS enforces that vector 1 and vector 2, and result and the accumulator // have the same types. Type factorTy = op->getOperand(0).getType(); + StringAttr packedVectorFormatAttrName = + IntegerDotProductOpTy::getFormatAttrName(op->getName()); if (auto intTy = llvm::dyn_cast(factorTy)) { auto packedVectorFormat = llvm::dyn_cast_or_null( - op->getAttr(kPackedVectorFormatAttrName)); + op->getAttr(packedVectorFormatAttrName)); if (!packedVectorFormat) return op->emitOpError("requires Packed Vector Format attribute for " "integer vector operands"); @@ -50,7 +53,7 @@ static LogicalResult verifyIntegerDotProduct(Operation *op) { "integer vector operands to be 32-bits wide", packedVectorFormat.getValue())); } else { - if (op->hasAttr(kPackedVectorFormatAttrName)) + if (op->hasAttr(packedVectorFormatAttrName)) return op->emitOpError(llvm::formatv( "with invalid format attribute for vector operands of type '{0}'", factorTy)); @@ -84,6 +87,7 @@ getIntegerDotProductExtensions() { return {extension}; } +template static SmallVector, 1> getIntegerDotProductCapabilities(Operation *op) { // Requires the the DotProduct capability and capabilities that depend on @@ -99,9 +103,11 @@ getIntegerDotProductCapabilities(Operation *op) { SmallVector, 1> capabilities = {dotProductCap}; Type factorTy = op->getOperand(0).getType(); + StringAttr packedVectorFormatAttrName = + IntegerDotProductOpTy::getFormatAttrName(op->getName()); if (auto intTy = llvm::dyn_cast(factorTy)) { auto formatAttr = llvm::cast( - op->getAttr(kPackedVectorFormatAttrName)); + op->getAttr(packedVectorFormatAttrName)); if (formatAttr.getValue() == spirv::PackedVectorFormat::PackedVectorFormat4x8Bit) capabilities.push_back(dotProductInput4x8BitPackedCap); @@ -120,12 +126,14 @@ 
getIntegerDotProductCapabilities(Operation *op) { } #define SPIRV_IMPL_INTEGER_DOT_PRODUCT_OP(OpName) \ - LogicalResult OpName::verify() { return verifyIntegerDotProduct(*this); } \ + LogicalResult OpName::verify() { \ + return verifyIntegerDotProduct(*this); \ + } \ SmallVector, 1> OpName::getExtensions() { \ return getIntegerDotProductExtensions(); \ } \ SmallVector, 1> OpName::getCapabilities() { \ - return getIntegerDotProductCapabilities(*this); \ + return getIntegerDotProductCapabilities(*this); \ } \ std::optional OpName::getMinVersion() { \ return getIntegerDotProductMinVersion(); \ diff --git a/mlir/lib/Dialect/SPIRV/IR/MemoryOps.cpp b/mlir/lib/Dialect/SPIRV/IR/MemoryOps.cpp index 0df59e42218b5..c4c7ff722175d 100644 --- a/mlir/lib/Dialect/SPIRV/IR/MemoryOps.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/MemoryOps.cpp @@ -25,10 +25,48 @@ using namespace mlir::spirv::AttrNames; namespace mlir::spirv { +/// Parses optional memory access (a.k.a. memory operand) attributes attached to +/// a memory access operand/pointer. Specifically, parses the following syntax: +/// (`[` memory-access `]`)? +/// where: +/// memory-access ::= `"None"` | `"Volatile"` | `"Aligned", ` +/// integer-literal | `"NonTemporal"` +template +ParseResult parseMemoryAccessAttributes(OpAsmParser &parser, + OperationState &state) { + // Parse an optional list of attributes staring with '[' + if (parser.parseOptionalLSquare()) { + // Nothing to do + return success(); + } + + spirv::MemoryAccess memoryAccessAttr; + StringAttr memoryAccessAttrName = + MemoryOpTy::getMemoryAccessAttrName(state.name); + if (spirv::parseEnumStrAttr( + memoryAccessAttr, parser, state, memoryAccessAttrName)) + return failure(); + + if (spirv::bitEnumContainsAll(memoryAccessAttr, + spirv::MemoryAccess::Aligned)) { + // Parse integer attribute for alignment. 
+ Attribute alignmentAttr; + StringAttr alignmentAttrName = MemoryOpTy::getAlignmentAttrName(state.name); + Type i32Type = parser.getBuilder().getIntegerType(32); + if (parser.parseComma() || + parser.parseAttribute(alignmentAttr, i32Type, alignmentAttrName, + state.attributes)) { + return failure(); + } + } + return parser.parseRSquare(); +} + // TODO Make sure to merge this and the previous function into one template // parameterized by memory access attribute name and alignment. Doing so now // results in VS2017 in producing an internal error (at the call site) that's // not detailed enough to understand what is happening. +template static ParseResult parseSourceMemoryAccessAttributes(OpAsmParser &parser, OperationState &state) { // Parse an optional list of attributes staring with '[' @@ -38,17 +76,21 @@ static ParseResult parseSourceMemoryAccessAttributes(OpAsmParser &parser, } spirv::MemoryAccess memoryAccessAttr; + StringRef memoryAccessAttrName = + MemoryOpTy::getSourceMemoryAccessAttrName(state.name); if (spirv::parseEnumStrAttr( - memoryAccessAttr, parser, state, kSourceMemoryAccessAttrName)) + memoryAccessAttr, parser, state, memoryAccessAttrName)) return failure(); if (spirv::bitEnumContainsAll(memoryAccessAttr, spirv::MemoryAccess::Aligned)) { // Parse integer attribute for alignment. Attribute alignmentAttr; + StringAttr alignmentAttrName = + MemoryOpTy::getSourceAlignmentAttrName(state.name); Type i32Type = parser.getBuilder().getIntegerType(32); if (parser.parseComma() || - parser.parseAttribute(alignmentAttr, i32Type, kSourceAlignmentAttrName, + parser.parseAttribute(alignmentAttr, i32Type, alignmentAttrName, state.attributes)) { return failure(); } @@ -72,7 +114,7 @@ static void printSourceMemoryAccessAttribute( // Print optional memory access attribute. if (auto memAccess = (memoryAccessAtrrValue ? 
memoryAccessAtrrValue : memoryOp.getMemoryAccess())) { - elidedAttrs.push_back(kSourceMemoryAccessAttrName); + elidedAttrs.push_back(memoryOp.getSourceMemoryAccessAttrName()); printer << " [\"" << stringifyMemoryAccess(*memAccess) << "\""; @@ -80,7 +122,7 @@ static void printSourceMemoryAccessAttribute( // Print integer alignment attribute. if (auto alignment = (alignmentAttrValue ? alignmentAttrValue : memoryOp.getAlignment())) { - elidedAttrs.push_back(kSourceAlignmentAttrName); + elidedAttrs.push_back(memoryOp.getSourceAlignmentAttrName()); printer << ", " << *alignment; } } @@ -98,7 +140,7 @@ static void printMemoryAccessAttribute( // Print optional memory access attribute. if (auto memAccess = (memoryAccessAtrrValue ? memoryAccessAtrrValue : memoryOp.getMemoryAccess())) { - elidedAttrs.push_back(kMemoryAccessAttrName); + elidedAttrs.push_back(memoryOp.getMemoryAccessAttrName()); printer << " [\"" << stringifyMemoryAccess(*memAccess) << "\""; @@ -106,7 +148,7 @@ static void printMemoryAccessAttribute( // Print integer alignment attribute. if (auto alignment = (alignmentAttrValue ? alignmentAttrValue : memoryOp.getAlignment())) { - elidedAttrs.push_back(kAlignmentAttrName); + elidedAttrs.push_back(memoryOp.getAlignmentAttrName()); printer << ", " << *alignment; } } @@ -136,11 +178,11 @@ static LogicalResult verifyMemoryAccessAttribute(MemoryOpTy memoryOp) { // memory-access attribute is Aligned, then the alignment attribute must be // present. auto *op = memoryOp.getOperation(); - auto memAccessAttr = op->getAttr(kMemoryAccessAttrName); + auto memAccessAttr = op->getAttr(memoryOp.getMemoryAccessAttrName()); if (!memAccessAttr) { // Alignment attribute shouldn't be present if memory access attribute is // not present. 
- if (op->getAttr(kAlignmentAttrName)) { + if (op->getAttr(memoryOp.getAlignmentAttrName())) { return memoryOp.emitOpError( "invalid alignment specification without aligned memory access " "specification"); @@ -157,11 +199,11 @@ static LogicalResult verifyMemoryAccessAttribute(MemoryOpTy memoryOp) { if (spirv::bitEnumContainsAll(memAccess.getValue(), spirv::MemoryAccess::Aligned)) { - if (!op->getAttr(kAlignmentAttrName)) { + if (!op->getAttr(memoryOp.getAlignmentAttrName())) { return memoryOp.emitOpError("missing alignment value"); } } else { - if (op->getAttr(kAlignmentAttrName)) { + if (op->getAttr(memoryOp.getAlignmentAttrName())) { return memoryOp.emitOpError( "invalid alignment specification with non-aligned memory access " "specification"); @@ -180,11 +222,11 @@ static LogicalResult verifySourceMemoryAccessAttribute(MemoryOpTy memoryOp) { // memory-access attribute is Aligned, then the alignment attribute must be // present. auto *op = memoryOp.getOperation(); - auto memAccessAttr = op->getAttr(kSourceMemoryAccessAttrName); + auto memAccessAttr = op->getAttr(memoryOp.getSourceMemoryAccessAttrName()); if (!memAccessAttr) { // Alignment attribute shouldn't be present if memory access attribute is // not present. 
- if (op->getAttr(kSourceAlignmentAttrName)) { + if (op->getAttr(memoryOp.getSourceAlignmentAttrName())) { return memoryOp.emitOpError( "invalid alignment specification without aligned memory access " "specification"); @@ -201,11 +243,11 @@ static LogicalResult verifySourceMemoryAccessAttribute(MemoryOpTy memoryOp) { if (spirv::bitEnumContainsAll(memAccess.getValue(), spirv::MemoryAccess::Aligned)) { - if (!op->getAttr(kSourceAlignmentAttrName)) { + if (!op->getAttr(memoryOp.getSourceAlignmentAttrName())) { return memoryOp.emitOpError("missing alignment value"); } } else { - if (op->getAttr(kSourceAlignmentAttrName)) { + if (op->getAttr(memoryOp.getSourceAlignmentAttrName())) { return memoryOp.emitOpError( "invalid alignment specification with non-aligned memory access " "specification"); @@ -376,7 +418,7 @@ ParseResult LoadOp::parse(OpAsmParser &parser, OperationState &result) { OpAsmParser::UnresolvedOperand ptrInfo; Type elementType; if (parseEnumStrAttr(storageClass, parser) || parser.parseOperand(ptrInfo) || - parseMemoryAccessAttributes(parser, result) || + parseMemoryAccessAttributes(parser, result) || parser.parseOptionalAttrDict(result.attributes) || parser.parseColon() || parser.parseType(elementType)) { return failure(); @@ -425,8 +467,8 @@ ParseResult StoreOp::parse(OpAsmParser &parser, OperationState &result) { Type elementType; if (parseEnumStrAttr(storageClass, parser) || parser.parseOperandList(operandInfo, 2) || - parseMemoryAccessAttributes(parser, result) || parser.parseColon() || - parser.parseType(elementType)) { + parseMemoryAccessAttributes(parser, result) || + parser.parseColon() || parser.parseType(elementType)) { return failure(); } @@ -499,13 +541,13 @@ ParseResult CopyMemoryOp::parse(OpAsmParser &parser, OperationState &result) { parser.parseOperand(targetPtrInfo) || parser.parseComma() || parseEnumStrAttr(sourceStorageClass, parser) || parser.parseOperand(sourcePtrInfo) || - parseMemoryAccessAttributes(parser, result)) { + 
parseMemoryAccessAttributes(parser, result)) { return failure(); } if (!parser.parseOptionalComma()) { // Parse 2nd memory access attributes. - if (parseSourceMemoryAccessAttributes(parser, result)) { + if (parseSourceMemoryAccessAttributes(parser, result)) { return failure(); } } diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp index 50035c917137e..38415620fd4f9 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp @@ -457,12 +457,14 @@ ParseResult spirv::CompositeExtractOp::parse(OpAsmParser &parser, OperationState &result) { OpAsmParser::UnresolvedOperand compositeInfo; Attribute indicesAttr; + StringRef indicesAttrName = + spirv::CompositeExtractOp::getIndicesAttrName(result.name); Type compositeType; SMLoc attrLocation; if (parser.parseOperand(compositeInfo) || parser.getCurrentLocation(&attrLocation) || - parser.parseAttribute(indicesAttr, kIndicesAttrName, result.attributes) || + parser.parseAttribute(indicesAttr, indicesAttrName, result.attributes) || parser.parseColonType(compositeType) || parser.resolveOperand(compositeInfo, compositeType, result.operands)) { return failure(); @@ -513,11 +515,13 @@ ParseResult spirv::CompositeInsertOp::parse(OpAsmParser &parser, SmallVector operands; Type objectType, compositeType; Attribute indicesAttr; + StringRef indicesAttrName = + spirv::CompositeInsertOp::getIndicesAttrName(result.name); auto loc = parser.getCurrentLocation(); return failure( parser.parseOperandList(operands, 2) || - parser.parseAttribute(indicesAttr, kIndicesAttrName, result.attributes) || + parser.parseAttribute(indicesAttr, indicesAttrName, result.attributes) || parser.parseColonType(objectType) || parser.parseKeywordType("into", compositeType) || parser.resolveOperands(operands, {objectType, compositeType}, loc, @@ -559,7 +563,8 @@ void spirv::CompositeInsertOp::print(OpAsmPrinter &printer) { ParseResult spirv::ConstantOp::parse(OpAsmParser &parser, 
OperationState &result) { Attribute value; - if (parser.parseAttribute(value, kValueAttrName, result.attributes)) + StringRef valueAttrName = spirv::ConstantOp::getValueAttrName(result.name); + if (parser.parseAttribute(value, valueAttrName, result.attributes)) return failure(); Type type = NoneType::get(parser.getContext()); @@ -822,7 +827,7 @@ ParseResult spirv::EntryPointOp::parse(OpAsmParser &parser, })) return failure(); } - result.addAttribute(kInterfaceAttrName, + result.addAttribute(spirv::EntryPointOp::getInterfaceAttrName(result.name), parser.getBuilder().getArrayAttr(interfaceVars)); return success(); } @@ -875,7 +880,9 @@ ParseResult spirv::ExecutionModeOp::parse(OpAsmParser &parser, } values.push_back(llvm::cast(value).getInt()); } - result.addAttribute(kValuesAttrName, + StringRef valuesAttrName = + spirv::ExecutionModeOp::getValuesAttrName(result.name); + result.addAttribute(valuesAttrName, parser.getBuilder().getI32ArrayAttr(values)); return success(); } @@ -1150,16 +1157,18 @@ ParseResult spirv::GlobalVariableOp::parse(OpAsmParser &parser, OperationState &result) { // Parse variable name. 
StringAttr nameAttr; + StringRef initializerAttrName = + spirv::GlobalVariableOp::getInitializerAttrName(result.name); if (parser.parseSymbolName(nameAttr, SymbolTable::getSymbolAttrName(), result.attributes)) { return failure(); } // Parse optional initializer - if (succeeded(parser.parseOptionalKeyword(kInitializerAttrName))) { + if (succeeded(parser.parseOptionalKeyword(initializerAttrName))) { FlatSymbolRefAttr initSymbol; if (parser.parseLParen() || - parser.parseAttribute(initSymbol, Type(), kInitializerAttrName, + parser.parseAttribute(initSymbol, Type(), initializerAttrName, result.attributes) || parser.parseRParen()) return failure(); @@ -1170,6 +1179,8 @@ ParseResult spirv::GlobalVariableOp::parse(OpAsmParser &parser, } Type type; + StringRef typeAttrName = + spirv::GlobalVariableOp::getTypeAttrName(result.name); auto loc = parser.getCurrentLocation(); if (parser.parseColonType(type)) { return failure(); @@ -1177,7 +1188,7 @@ ParseResult spirv::GlobalVariableOp::parse(OpAsmParser &parser, if (!llvm::isa(type)) { return parser.emitError(loc, "expected spirv.ptr type"); } - result.addAttribute(kTypeAttrName, TypeAttr::get(type)); + result.addAttribute(typeAttrName, TypeAttr::get(type)); return success(); } @@ -1191,15 +1202,17 @@ void spirv::GlobalVariableOp::print(OpAsmPrinter &printer) { printer.printSymbolName(getSymName()); elidedAttrs.push_back(SymbolTable::getSymbolAttrName()); + StringRef initializerAttrName = this->getInitializerAttrName(); // Print optional initializer if (auto initializer = this->getInitializer()) { - printer << " " << kInitializerAttrName << '('; + printer << " " << initializerAttrName << '('; printer.printSymbolName(*initializer); printer << ')'; - elidedAttrs.push_back(kInitializerAttrName); + elidedAttrs.push_back(initializerAttrName); } - elidedAttrs.push_back(kTypeAttrName); + StringRef typeAttrName = this->getTypeAttrName(); + elidedAttrs.push_back(typeAttrName); spirv::printVariableDecorations(*this, printer, elidedAttrs); 
printer << " : " << getType(); } @@ -1219,8 +1232,8 @@ LogicalResult spirv::GlobalVariableOp::verify() { << stringifyStorageClass(storageClass) << "'"; } - if (auto init = - (*this)->getAttrOfType(kInitializerAttrName)) { + if (auto init = (*this)->getAttrOfType( + this->getInitializerAttrName())) { Operation *initOp = SymbolTable::lookupNearestSymbolFrom( (*this)->getParentOp(), init.getAttr()); // TODO: Currently only variable initialization with specialization @@ -1594,6 +1607,8 @@ ParseResult spirv::SpecConstantOp::parse(OpAsmParser &parser, OperationState &result) { StringAttr nameAttr; Attribute valueAttr; + StringRef defaultValueAttrName = + spirv::SpecConstantOp::getDefaultValueAttrName(result.name); if (parser.parseSymbolName(nameAttr, SymbolTable::getSymbolAttrName(), result.attributes)) @@ -1609,8 +1624,7 @@ ParseResult spirv::SpecConstantOp::parse(OpAsmParser &parser, } if (parser.parseEqual() || - parser.parseAttribute(valueAttr, kDefaultValueAttrName, - result.attributes)) + parser.parseAttribute(valueAttr, defaultValueAttrName, result.attributes)) return failure(); return success(); @@ -1783,14 +1797,18 @@ ParseResult spirv::SpecConstantCompositeOp::parse(OpAsmParser &parser, if (parser.parseRParen()) return failure(); - result.addAttribute(kCompositeSpecConstituentsName, + StringAttr compositeSpecConstituentsName = + spirv::SpecConstantCompositeOp::getConstituentsAttrName(result.name); + result.addAttribute(compositeSpecConstituentsName, parser.getBuilder().getArrayAttr(constituents)); Type type; if (parser.parseColonType(type)) return failure(); - result.addAttribute(kTypeAttrName, TypeAttr::get(type)); + StringAttr typeAttrName = + spirv::SpecConstantCompositeOp::getTypeAttrName(result.name); + result.addAttribute(typeAttrName, TypeAttr::get(type)); return success(); } diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVParsingUtils.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVParsingUtils.cpp index 27c373300aee8..726fb7ce9fdb7 100644 --- 
a/mlir/lib/Dialect/SPIRV/IR/SPIRVParsingUtils.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVParsingUtils.cpp @@ -18,35 +18,6 @@ using namespace mlir::spirv::AttrNames; namespace mlir::spirv { -ParseResult parseMemoryAccessAttributes(OpAsmParser &parser, - OperationState &state, - StringRef attrName) { - // Parse an optional list of attributes staring with '[' - if (parser.parseOptionalLSquare()) { - // Nothing to do - return success(); - } - - spirv::MemoryAccess memoryAccessAttr; - if (spirv::parseEnumStrAttr(memoryAccessAttr, parser, - state, attrName)) - return failure(); - - if (spirv::bitEnumContainsAll(memoryAccessAttr, - spirv::MemoryAccess::Aligned)) { - // Parse integer attribute for alignment. - Attribute alignmentAttr; - Type i32Type = parser.getBuilder().getIntegerType(32); - if (parser.parseComma() || - parser.parseAttribute(alignmentAttr, i32Type, - AttrNames::kAlignmentAttrName, - state.attributes)) { - return failure(); - } - } - return parser.parseRSquare(); -} - ParseResult parseVariableDecorations(OpAsmParser &parser, OperationState &state) { auto builtInName = llvm::convertToSnakeFromCamelCase( diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVParsingUtils.h b/mlir/lib/Dialect/SPIRV/IR/SPIRVParsingUtils.h index 625c82f6e8e89..858b94f7be8b0 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVParsingUtils.h +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVParsingUtils.h @@ -20,34 +20,12 @@ namespace mlir::spirv { namespace AttrNames { -// TODO: generate these strings using ODS. 
-inline constexpr char kAlignmentAttrName[] = "alignment"; -inline constexpr char kBranchWeightAttrName[] = "branch_weights"; -inline constexpr char kCallee[] = "callee"; -inline constexpr char kClusterSize[] = "cluster_size"; -inline constexpr char kControl[] = "control"; -inline constexpr char kDefaultValueAttrName[] = "default_value"; -inline constexpr char kEqualSemanticsAttrName[] = "equal_semantics"; -inline constexpr char kExecutionScopeAttrName[] = "execution_scope"; -inline constexpr char kFnNameAttrName[] = "fn"; -inline constexpr char kGroupOperationAttrName[] = "group_operation"; -inline constexpr char kIndicesAttrName[] = "indices"; -inline constexpr char kInitializerAttrName[] = "initializer"; -inline constexpr char kInterfaceAttrName[] = "interface"; -inline constexpr char kKhrCooperativeMatrixLayoutAttrName[] = "matrix_layout"; -inline constexpr char kMemoryAccessAttrName[] = "memory_access"; -inline constexpr char kMemoryOperandAttrName[] = "memory_operand"; -inline constexpr char kMemoryScopeAttrName[] = "memory_scope"; -inline constexpr char kPackedVectorFormatAttrName[] = "format"; -inline constexpr char kSemanticsAttrName[] = "semantics"; -inline constexpr char kSourceAlignmentAttrName[] = "source_alignment"; -inline constexpr char kSourceMemoryAccessAttrName[] = "source_memory_access"; -inline constexpr char kSpecIdAttrName[] = "spec_id"; -inline constexpr char kTypeAttrName[] = "type"; -inline constexpr char kUnequalSemanticsAttrName[] = "unequal_semantics"; -inline constexpr char kValueAttrName[] = "value"; -inline constexpr char kValuesAttrName[] = "values"; -inline constexpr char kCompositeSpecConstituentsName[] = "constituents"; + +inline constexpr char kClusterSize[] = "cluster_size"; // no ODS generation +inline constexpr char kControl[] = "control"; // no ODS generation +inline constexpr char kFnNameAttrName[] = "fn"; // no ODS generation +inline constexpr char kSpecIdAttrName[] = "spec_id"; // no ODS generation + } // namespace 
AttrNames template @@ -143,16 +121,6 @@ parseEnumKeywordAttr(EnumClass &value, OpAsmParser &parser, return success(); } -/// Parses optional memory access (a.k.a. memory operand) attributes attached to -/// a memory access operand/pointer. Specifically, parses the following syntax: -/// (`[` memory-access `]`)? -/// where: -/// memory-access ::= `"None"` | `"Volatile"` | `"Aligned", ` -/// integer-literal | `"NonTemporal"` -ParseResult parseMemoryAccessAttributes( - OpAsmParser &parser, OperationState &state, - StringRef attrName = AttrNames::kMemoryAccessAttrName); - ParseResult parseVariableDecorations(OpAsmParser &parser, OperationState &state); From 44b096682791518a36a4b59bd0f3538b1aa0f666 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 14:29:27 -0700 Subject: [PATCH 266/546] Apply clang-tidy fixes for modernize-use-override in SerializeROCDLTarget.cpp (NFC) --- mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp b/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp index 9ada2dab40ff7..3d9d8175b115e 100644 --- a/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp +++ b/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp @@ -39,7 +39,7 @@ using namespace mlir; class MLIRTargetLLVMROCDL : public ::testing::Test { protected: - virtual void SetUp() { + void SetUp() override { registerBuiltinDialectTranslation(registry); registerLLVMDialectTranslation(registry); registerGPUDialectTranslation(registry); From 97678078442adbd6c12cc54f7a36152f8873afb4 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 14:30:13 -0700 Subject: [PATCH 267/546] Apply clang-tidy fixes for readability-container-size-empty in SerializeROCDLTarget.cpp (NFC) --- mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp 
b/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp index 3d9d8175b115e..e91e69b102043 100644 --- a/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp +++ b/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp @@ -89,7 +89,7 @@ TEST_F(MLIRTargetLLVMROCDL, SKIP_WITHOUT_AMDGPU(SerializeROCDLMToLLVM)) { serializer.serializeToObject(gpuModule, options); // Check that the serializer was successful. ASSERT_TRUE(object != std::nullopt); - ASSERT_TRUE(object->size() > 0); + ASSERT_TRUE(!object->empty()); // Read the serialized module. llvm::MemoryBufferRef buffer(StringRef(object->data(), object->size()), @@ -125,7 +125,7 @@ TEST_F(MLIRTargetLLVMROCDL, SKIP_WITHOUT_AMDGPU(SerializeROCDLToPTX)) { serializer.serializeToObject(gpuModule, options); // Check that the serializer was successful. ASSERT_TRUE(object != std::nullopt); - ASSERT_TRUE(object->size() > 0); + ASSERT_TRUE(!object->empty()); ASSERT_TRUE( StringRef(object->data(), object->size()).contains("rocdl_kernel")); From b1d1f5786fc24fa538b064e7b27f07a1765fce35 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 14:31:31 -0700 Subject: [PATCH 268/546] Apply clang-tidy fixes for modernize-use-override in SerializeToLLVMBitcode.cpp (NFC) --- mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp b/mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp index 69c9d9b2f3202..94bc9b6da6cf6 100644 --- a/mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp +++ b/mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp @@ -32,7 +32,7 @@ using namespace mlir; class MLIRTargetLLVM : public ::testing::Test { protected: - virtual void SetUp() { + void SetUp() override { llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); } From c67a4ae47c86f1f390db7ba0ea9c021abff130f8 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 14:32:02 -0700 Subject: [PATCH 269/546] Apply 
clang-tidy fixes for readability-container-size-empty in SerializeToLLVMBitcode.cpp (NFC) --- mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp b/mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp index 94bc9b6da6cf6..96828828fe3f4 100644 --- a/mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp +++ b/mlir/unittests/Target/LLVM/SerializeToLLVMBitcode.cpp @@ -60,7 +60,7 @@ TEST_F(MLIRTargetLLVM, SKIP_WITHOUT_NATIVE(SerializeToLLVMBitcode)) { ""); std::optional> serializedModule = serializer.run(); ASSERT_TRUE(!!serializedModule); - ASSERT_TRUE(serializedModule->size() > 0); + ASSERT_TRUE(!serializedModule->empty()); // Read the serialized module. llvm::MemoryBufferRef buffer( From 0f02431273faa2cd001c59fd5de767659bc0c976 Mon Sep 17 00:00:00 2001 From: Kai Luo Date: Mon, 26 Feb 2024 09:43:20 +0800 Subject: [PATCH 270/546] [InstCombine] Fold (sub (xor X, (sext C)), (sext C)) => (select C (neg X), X) (#79417) This is useful when computing absdiff. Correctness prove: https://alive2.llvm.org/ce/z/eMbxps, https://alive2.llvm.org/ce/z/SNCWJe. 
--------- Co-authored-by: Yingwei Zheng --- .../InstCombine/InstCombineAddSub.cpp | 15 ++ .../Transforms/InstCombine/sub-xor-cmp.ll | 205 ++++++++++++++++++ 2 files changed, 220 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/sub-xor-cmp.ll diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index cfff5df9ff500..36a5faa5f6743 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -2448,6 +2448,21 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) { } } + { + // (sub (xor X, (sext C)), (sext C)) => (select C, (neg X), X) + // (sub (sext C), (xor X, (sext C))) => (select C, X, (neg X)) + Value *C, *X; + auto m_SubXorCmp = [&C, &X](Value *LHS, Value *RHS) { + return match(LHS, m_OneUse(m_c_Xor(m_Value(X), m_Specific(RHS)))) && + match(RHS, m_SExt(m_Value(C))) && + (C->getType()->getScalarSizeInBits() == 1); + }; + if (m_SubXorCmp(Op0, Op1)) + return SelectInst::Create(C, Builder.CreateNeg(X), X); + if (m_SubXorCmp(Op1, Op0)) + return SelectInst::Create(C, X, Builder.CreateNeg(X)); + } + if (Instruction *R = tryFoldInstWithCtpopWithNot(&I)) return R; diff --git a/llvm/test/Transforms/InstCombine/sub-xor-cmp.ll b/llvm/test/Transforms/InstCombine/sub-xor-cmp.ll new file mode 100644 index 0000000000000..2e1ff0a21a3de --- /dev/null +++ b/llvm/test/Transforms/InstCombine/sub-xor-cmp.ll @@ -0,0 +1,205 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +define i64 @sext_xor_sub(i64 %a, i1 %b) { +; CHECK-LABEL: define i64 @sext_xor_sub( +; CHECK-SAME: i64 [[A:%.*]], i1 [[B:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 0, [[A]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[B]], i64 [[TMP1]], i64 [[A]] +; CHECK-NEXT: ret i64 [[R]] +; + %c = sext i1 %b to i64 + %d = xor i64 %a, %c + %r = sub i64 %d, %c + ret 
i64 %r +} + +define i64 @sext_xor_sub_1(i64 %a, i1 %b) { +; CHECK-LABEL: define i64 @sext_xor_sub_1( +; CHECK-SAME: i64 [[A:%.*]], i1 [[B:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 0, [[A]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[B]], i64 [[TMP1]], i64 [[A]] +; CHECK-NEXT: ret i64 [[R]] +; + %c = sext i1 %b to i64 + %d = xor i64 %c, %a + %r = sub i64 %d, %c + ret i64 %r +} + +define i64 @sext_xor_sub_2(i64 %a, i1 %b) { +; CHECK-LABEL: define i64 @sext_xor_sub_2( +; CHECK-SAME: i64 [[A:%.*]], i1 [[B:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 0, [[A]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[B]], i64 [[A]], i64 [[TMP1]] +; CHECK-NEXT: ret i64 [[R]] +; + %c = sext i1 %b to i64 + %d = xor i64 %a, %c + %r = sub i64 %c, %d + ret i64 %r +} + +define i64 @sext_xor_sub_3(i64 %a, i1 %b) { +; CHECK-LABEL: define i64 @sext_xor_sub_3( +; CHECK-SAME: i64 [[A:%.*]], i1 [[B:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 0, [[A]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[B]], i64 [[A]], i64 [[TMP1]] +; CHECK-NEXT: ret i64 [[R]] +; + %c = sext i1 %b to i64 + %d = xor i64 %c, %a + %r = sub i64 %c, %d + ret i64 %r +} + +; Sext non boolean type. 
+define i64 @sext_non_bool_xor_sub(i64 %a, i8 %b) { +; CHECK-LABEL: define i64 @sext_non_bool_xor_sub( +; CHECK-SAME: i64 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: [[C:%.*]] = sext i8 [[B]] to i64 +; CHECK-NEXT: [[D:%.*]] = xor i64 [[C]], [[A]] +; CHECK-NEXT: [[R:%.*]] = sub i64 [[D]], [[C]] +; CHECK-NEXT: ret i64 [[R]] +; + %c = sext i8 %b to i64 + %d = xor i64 %a, %c + %r = sub i64 %d, %c + ret i64 %r +} + +define i64 @sext_non_bool_xor_sub_1(i64 %a, i8 %b) { +; CHECK-LABEL: define i64 @sext_non_bool_xor_sub_1( +; CHECK-SAME: i64 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: [[C:%.*]] = sext i8 [[B]] to i64 +; CHECK-NEXT: [[D:%.*]] = xor i64 [[C]], [[A]] +; CHECK-NEXT: [[R:%.*]] = sub i64 [[D]], [[C]] +; CHECK-NEXT: ret i64 [[R]] +; + %c = sext i8 %b to i64 + %d = xor i64 %c, %a + %r = sub i64 %d, %c + ret i64 %r +} + +; Different boolean values. +define i64 @sext_diff_i1_xor_sub(i64 %a, i1 %b, i1 %c) { +; CHECK-LABEL: define i64 @sext_diff_i1_xor_sub( +; CHECK-SAME: i64 [[A:%.*]], i1 [[B:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[D:%.*]] = sext i1 [[B]] to i64 +; CHECK-NEXT: [[E_NEG:%.*]] = zext i1 [[C]] to i64 +; CHECK-NEXT: [[R:%.*]] = add nsw i64 [[E_NEG]], [[D]] +; CHECK-NEXT: ret i64 [[R]] +; + %d = sext i1 %b to i64 + %e = sext i1 %c to i64 + %f = xor i64 %a, %d + %r = sub i64 %d, %e + ret i64 %r +} + +define i64 @sext_diff_i1_xor_sub_1(i64 %a, i1 %b, i1 %c) { +; CHECK-LABEL: define i64 @sext_diff_i1_xor_sub_1( +; CHECK-SAME: i64 [[A:%.*]], i1 [[B:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[D:%.*]] = sext i1 [[B]] to i64 +; CHECK-NEXT: [[E_NEG:%.*]] = zext i1 [[C]] to i64 +; CHECK-NEXT: [[R:%.*]] = add nsw i64 [[E_NEG]], [[D]] +; CHECK-NEXT: ret i64 [[R]] +; + %d = sext i1 %b to i64 + %e = sext i1 %c to i64 + %f = xor i64 %d, %a + %r = sub i64 %d, %e + ret i64 %r +} + +; (sext C) has multiple uses. 
+define i64 @sext_multi_uses(i64 %a, i1 %b, i64 %x) { +; CHECK-LABEL: define i64 @sext_multi_uses( +; CHECK-SAME: i64 [[A:%.*]], i1 [[B:%.*]], i64 [[X:%.*]]) { +; CHECK-NEXT: [[C:%.*]] = sext i1 [[B]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 0, [[A]] +; CHECK-NEXT: [[E:%.*]] = select i1 [[B]], i64 [[TMP1]], i64 [[A]] +; CHECK-NEXT: [[F:%.*]] = mul i64 [[C]], [[X]] +; CHECK-NEXT: [[R:%.*]] = add i64 [[F]], [[E]] +; CHECK-NEXT: ret i64 [[R]] +; + %c = sext i1 %b to i64 + %d = xor i64 %a, %c + %e = sub i64 %d, %c + %f = mul i64 %x, %c + %r = add i64 %f, %e + ret i64 %r +} + +; (xor X, (sext C)) has multiple uses. +define i64 @xor_multi_uses(i64 %a, i1 %b, i64 %x) { +; CHECK-LABEL: define i64 @xor_multi_uses( +; CHECK-SAME: i64 [[A:%.*]], i1 [[B:%.*]], i64 [[X:%.*]]) { +; CHECK-NEXT: [[C:%.*]] = sext i1 [[B]] to i64 +; CHECK-NEXT: [[D:%.*]] = xor i64 [[C]], [[A]] +; CHECK-NEXT: [[E:%.*]] = sub i64 [[D]], [[C]] +; CHECK-NEXT: [[F:%.*]] = mul i64 [[D]], [[X]] +; CHECK-NEXT: [[R:%.*]] = add i64 [[F]], [[E]] +; CHECK-NEXT: ret i64 [[R]] +; + %c = sext i1 %b to i64 + %d = xor i64 %a, %c + %e = sub i64 %d, %c + %f = mul i64 %x, %d + %r = add i64 %f, %e + ret i64 %r +} + +define i64 @absdiff(i64 %a, i64 %b) { +; CHECK-LABEL: define i64 @absdiff( +; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) { +; CHECK-NEXT: [[C:%.*]] = icmp ult i64 [[A]], [[B]] +; CHECK-NEXT: [[D:%.*]] = sub i64 [[A]], [[B]] +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 0, [[D]] +; CHECK-NEXT: [[RES:%.*]] = select i1 [[C]], i64 [[TMP1]], i64 [[D]] +; CHECK-NEXT: ret i64 [[RES]] +; + %c = icmp ult i64 %a, %b + %c.ext = sext i1 %c to i64 + %d = sub i64 %a, %b + %may.rev = xor i64 %c.ext, %d + %res = sub i64 %may.rev, %c.ext + ret i64 %res +} + +; Commuted xor operands compared to absdiff. 
+define i64 @absdiff1(i64 %a, i64 %b) { +; CHECK-LABEL: define i64 @absdiff1( +; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) { +; CHECK-NEXT: [[C:%.*]] = icmp ult i64 [[A]], [[B]] +; CHECK-NEXT: [[D:%.*]] = sub i64 [[A]], [[B]] +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 0, [[D]] +; CHECK-NEXT: [[RES:%.*]] = select i1 [[C]], i64 [[TMP1]], i64 [[D]] +; CHECK-NEXT: ret i64 [[RES]] +; + %c = icmp ult i64 %a, %b + %c.ext = sext i1 %c to i64 + %d = sub i64 %a, %b + %may.rev = xor i64 %d, %c.ext + %res = sub i64 %may.rev, %c.ext + ret i64 %res +} + +; Use ugt as compare cond. +define i64 @absdiff2(i64 %a, i64 %b) { +; CHECK-LABEL: define i64 @absdiff2( +; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) { +; CHECK-NEXT: [[C:%.*]] = icmp ugt i64 [[A]], [[B]] +; CHECK-NEXT: [[D:%.*]] = sub i64 [[B]], [[A]] +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 0, [[D]] +; CHECK-NEXT: [[RES:%.*]] = select i1 [[C]], i64 [[TMP1]], i64 [[D]] +; CHECK-NEXT: ret i64 [[RES]] +; + %c = icmp ugt i64 %a, %b + %c.ext = sext i1 %c to i64 + %d = sub i64 %b, %a + %may.rev = xor i64 %d, %c.ext + %res = sub i64 %may.rev, %c.ext + ret i64 %res +} From 8be39b3901e3326ceebeaf0381f8cc57fdc0d464 Mon Sep 17 00:00:00 2001 From: hev Date: Mon, 26 Feb 2024 11:13:21 +0800 Subject: [PATCH 271/546] [LoongArch] Improve pattern matching for AddLike predicate (#82767) This commit updates the pattern matching logic for the `AddLike` predicate in `LoongArchInstrInfo.td` to use the `isBaseWithConstantOffset` function provided by `CurDAG`. This optimization aims to improve the efficiency of pattern matching by identifying cases where the operation can be represented as a base address plus a constant offset, which can lead to more efficient code generation. 
--- llvm/lib/Target/LoongArch/LoongArchInstrInfo.td | 2 +- llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index ae73a8ac74c71..fcfff08d1a9f5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -1048,7 +1048,7 @@ class PatGprImm_32 /// Predicates def AddLike: PatFrags<(ops node:$A, node:$B), [(add node:$A, node:$B), (or node:$A, node:$B)], [{ - return N->getOpcode() == ISD::ADD || isOrEquivalentToAdd(N); + return CurDAG->isBaseWithConstantOffset(SDValue(N, 0)); }]>; /// Simple arithmetic operations diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll index 09453004dcef7..ff845cb1f3dbd 100644 --- a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll +++ b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll @@ -22,8 +22,7 @@ define void @box(ptr noalias nocapture noundef writeonly sret(%Box) align 16 der ; CHECK-NEXT: st.d $a1, $a0, 24 ; CHECK-NEXT: ld.d $a1, $a3, 16 ; CHECK-NEXT: st.d $a1, $a0, 16 -; CHECK-NEXT: ori $a1, $a3, 8 -; CHECK-NEXT: ld.d $a1, $a1, 0 +; CHECK-NEXT: ld.d $a1, $a3, 8 ; CHECK-NEXT: st.d $a1, $a0, 8 ; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret From c087bebb02ef35021547c6ddf0a3fdf1cbc8ad17 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Mon, 26 Feb 2024 13:23:43 +0900 Subject: [PATCH 272/546] Introduce mcdc::TVIdxBuilder (LLVM side, NFC) (#80676) This is a preparation of incoming Clang changes (#82448) and just checks `TVIdx` is calculated correctly. NFC. `TVIdxBuilder` calculates deterministic Indices for each Condition Node. It is used for `clang` to emit `TestVector` indices (aka ID) and for `llvm-cov` to reconstruct `TestVectors`. This includes the unittest `CoverageMappingTest.TVIdxBuilder`. 
See also https://discourse.llvm.org/t/rfc-coverage-new-algorithm-and-file-format-for-mc-dc/76798 --- .../ProfileData/Coverage/CoverageMapping.h | 49 ++++++ .../ProfileData/Coverage/CoverageMapping.cpp | 153 ++++++++++++++++-- .../ProfileData/CoverageMappingTest.cpp | 56 +++++++ 3 files changed, 246 insertions(+), 12 deletions(-) diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h index c5c9740f25c2c..b39f262c3d976 100644 --- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -550,6 +550,55 @@ struct MCDCRecord { } }; +namespace mcdc { +/// Compute TestVector Indices "TVIdx" from the Conds graph. +/// +/// Clang CodeGen handles the bitmap index based on TVIdx. +/// llvm-cov reconstructs conditions from TVIdx. +/// +/// For each leaf "The final decision", +/// - TVIdx should be unique. +/// - TVIdx has the Width. +/// - The width represents the number of possible paths. +/// - The minimum width is 1 "deterministic". +/// - The order of leaves are sorted by Width DESC. It expects +/// latter TVIdx(s) (with Width=1) could be pruned and altered to +/// other simple branch conditions. +/// +class TVIdxBuilder { +public: + struct MCDCNode { + int InCount = 0; /// Reference count; temporary use + int Width; /// Number of accumulated paths (>= 1) + ConditionIDs NextIDs; + }; + +#ifndef NDEBUG + /// This is no longer needed after the assignment. + /// It may be used in assert() for reconfirmation. + SmallVector SavedNodes; +#endif + + /// Output: Index for TestVectors bitmap (These are not CondIDs) + SmallVector> Indices; + + /// Output: The number of test vectors. + /// Error with HardMaxTVs if the number has exploded. 
+ int NumTestVectors; + + /// Hard limit of test vectors + static constexpr auto HardMaxTVs = + std::numeric_limits::max(); + +public: + /// Calculate and assign Indices + /// \param NextIDs The list of {FalseID, TrueID} indexed by ID + /// The first element [0] should be the root node. + /// \param Offset Offset of index to final decisions. + TVIdxBuilder(const SmallVectorImpl &NextIDs, int Offset = 0); +}; +} // namespace mcdc + /// A Counter mapping context is used to connect the counters, expressions /// and the obtained counter values. class CounterMappingContext { diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index 8f9d1eadc2daf..334f5dce879e5 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -223,9 +223,130 @@ Expected CounterMappingContext::evaluate(const Counter &C) const { return LastPoppedValue; } +mcdc::TVIdxBuilder::TVIdxBuilder(const SmallVectorImpl &NextIDs, + int Offset) + : Indices(NextIDs.size()) { + // Construct Nodes and set up each InCount + auto N = NextIDs.size(); + SmallVector Nodes(N); + for (unsigned ID = 0; ID < N; ++ID) { + for (unsigned C = 0; C < 2; ++C) { +#ifndef NDEBUG + Indices[ID][C] = INT_MIN; +#endif + auto NextID = NextIDs[ID][C]; + Nodes[ID].NextIDs[C] = NextID; + if (NextID >= 0) + ++Nodes[NextID].InCount; + } + } + + // Sort key ordered by <-Width, Ord> + SmallVector> + Decisions; + + // Traverse Nodes to assign Idx + SmallVector Q; + assert(Nodes[0].InCount == 0); + Nodes[0].Width = 1; + Q.push_back(0); + + unsigned Ord = 0; + while (!Q.empty()) { + auto IID = Q.begin(); + int ID = *IID; + Q.erase(IID); + auto &Node = Nodes[ID]; + assert(Node.Width > 0); + + for (unsigned I = 0; I < 2; ++I) { + auto NextID = Node.NextIDs[I]; + assert(NextID != 0 && "NextID should not point to the top"); + if (NextID < 0) { + // Decision + Decisions.emplace_back(-Node.Width, Ord++, ID, I); + assert(Ord 
== Decisions.size()); + continue; + } + + // Inter Node + auto &NextNode = Nodes[NextID]; + assert(NextNode.InCount > 0); + + // Assign Idx + assert(Indices[ID][I] == INT_MIN); + Indices[ID][I] = NextNode.Width; + auto NextWidth = int64_t(NextNode.Width) + Node.Width; + if (NextWidth > HardMaxTVs) { + NumTestVectors = HardMaxTVs; // Overflow + return; + } + NextNode.Width = NextWidth; + + // Ready if all incomings are processed. + // Or NextNode.Width hasn't been confirmed yet. + if (--NextNode.InCount == 0) + Q.push_back(NextID); + } + } + + llvm::sort(Decisions); + + // Assign TestVector Indices in Decision Nodes + int64_t CurIdx = 0; + for (auto [NegWidth, Ord, ID, C] : Decisions) { + int Width = -NegWidth; + assert(Nodes[ID].Width == Width); + assert(Nodes[ID].NextIDs[C] < 0); + assert(Indices[ID][C] == INT_MIN); + Indices[ID][C] = Offset + CurIdx; + CurIdx += Width; + if (CurIdx > HardMaxTVs) { + NumTestVectors = HardMaxTVs; // Overflow + return; + } + } + + assert(CurIdx < HardMaxTVs); + NumTestVectors = CurIdx; + +#ifndef NDEBUG + for (const auto &Idxs : Indices) + for (auto Idx : Idxs) + assert(Idx != INT_MIN); + SavedNodes = std::move(Nodes); +#endif +} + namespace { -class MCDCRecordProcessor { +/// Construct this->NextIDs with Branches for TVIdxBuilder to use it +/// before MCDCRecordProcessor(). +class NextIDsBuilder { +protected: + SmallVector NextIDs; + +public: + NextIDsBuilder(const ArrayRef Branches) + : NextIDs(Branches.size()) { +#ifndef NDEBUG + DenseSet SeenIDs; +#endif + for (const auto *Branch : Branches) { + const auto &BranchParams = Branch->getBranchParams(); + assert(BranchParams.ID >= 0 && "CondID isn't set"); + assert(SeenIDs.insert(BranchParams.ID).second && "Duplicate CondID"); + NextIDs[BranchParams.ID] = BranchParams.Conds; + } + assert(SeenIDs.size() == Branches.size()); + } +}; + +class MCDCRecordProcessor : NextIDsBuilder, mcdc::TVIdxBuilder { /// A bitmap representing the executed test vectors for a boolean expression. 
/// Each index of the bitmap corresponds to a possible test vector. An index /// with a bit value of '1' indicates that the corresponding Test Vector @@ -243,9 +364,6 @@ class MCDCRecordProcessor { /// Total number of conditions in the boolean expression. unsigned NumConditions; - /// Mapping of a condition ID to its corresponding branch params. - llvm::DenseMap CondsMap; - /// Vector used to track whether a condition is constant folded. MCDCRecord::BoolVector Folded; @@ -256,13 +374,17 @@ class MCDCRecordProcessor { /// ExecutedTestVectorBitmap. MCDCRecord::TestVectors ExecVectors; +#ifndef NDEBUG + DenseSet TVIdxs; +#endif + public: MCDCRecordProcessor(const BitVector &Bitmap, const CounterMappingRegion &Region, ArrayRef Branches) - : Bitmap(Bitmap), Region(Region), - DecisionParams(Region.getDecisionParams()), Branches(Branches), - NumConditions(DecisionParams.NumConditions), + : NextIDsBuilder(Branches), TVIdxBuilder(this->NextIDs), Bitmap(Bitmap), + Region(Region), DecisionParams(Region.getDecisionParams()), + Branches(Branches), NumConditions(DecisionParams.NumConditions), Folded(NumConditions, false), IndependencePairs(NumConditions) {} private: @@ -270,7 +392,7 @@ class MCDCRecordProcessor { // each node. When a terminal node (ID == 0) is reached, fill in the value in // the truth table. 
void buildTestVector(MCDCRecord::TestVector &TV, mcdc::ConditionID ID, - unsigned Index) { + int TVIdx, unsigned Index) { assert((Index & (1 << ID)) == 0); for (auto MCDCCond : {MCDCRecord::MCDC_False, MCDCRecord::MCDC_True}) { @@ -278,12 +400,17 @@ class MCDCRecordProcessor { static_assert(MCDCRecord::MCDC_True == 1); Index |= MCDCCond << ID; TV[ID] = MCDCCond; - auto NextID = CondsMap[ID][MCDCCond]; + auto NextID = NextIDs[ID][MCDCCond]; + auto NextTVIdx = TVIdx + Indices[ID][MCDCCond]; + assert(NextID == SavedNodes[ID].NextIDs[MCDCCond]); if (NextID >= 0) { - buildTestVector(TV, NextID, Index); + buildTestVector(TV, NextID, NextTVIdx, Index); continue; } + assert(TVIdx < SavedNodes[ID].Width); + assert(TVIdxs.insert(NextTVIdx).second && "Duplicate TVIdx"); + if (!Bitmap[DecisionParams.BitmapIdx * CHAR_BIT + Index]) continue; @@ -304,9 +431,12 @@ class MCDCRecordProcessor { void findExecutedTestVectors() { // Walk the binary decision diagram to enumerate all possible test vectors. // We start at the root node (ID == 0) with all values being DontCare. + // `TVIdx` starts with 0 and is in the traversal. // `Index` encodes the bitmask of true values and is initially 0. MCDCRecord::TestVector TV(NumConditions, MCDCRecord::MCDC_DontCare); - buildTestVector(TV, 0, 0); + buildTestVector(TV, 0, 0, 0); + assert(TVIdxs.size() == unsigned(NumTestVectors) && + "TVIdxs wasn't fulfilled"); } // Find an independence pair for each condition: @@ -367,7 +497,6 @@ class MCDCRecordProcessor { // from being measured. 
for (const auto *B : Branches) { const auto &BranchParams = B->getBranchParams(); - CondsMap[BranchParams.ID] = BranchParams.Conds; PosToID[I] = BranchParams.ID; CondLoc[I] = B->startLoc(); Folded[I++] = (B->Count.isZero() && B->FalseCount.isZero()); diff --git a/llvm/unittests/ProfileData/CoverageMappingTest.cpp b/llvm/unittests/ProfileData/CoverageMappingTest.cpp index 425b3d10510af..f063a33205b30 100644 --- a/llvm/unittests/ProfileData/CoverageMappingTest.cpp +++ b/llvm/unittests/ProfileData/CoverageMappingTest.cpp @@ -16,6 +16,7 @@ #include "llvm/Testing/Support/SupportHelpers.h" #include "gtest/gtest.h" +#include #include #include @@ -1074,4 +1075,59 @@ TEST(CoverageMappingTest, filename_compilation_dir) { } } +TEST(CoverageMappingTest, TVIdxBuilder) { + // ((n0 && n3) || (n2 && n4) || (n1 && n5)) + static const std::array Branches = {{ + {2, 3}, + {-1, 5}, + {1, 4}, + {2, -1}, + {1, -1}, + {-1, -1}, + }}; + int Offset = 1000; + auto TheBuilder = mcdc::TVIdxBuilder( + SmallVector(ArrayRef(Branches)), Offset); + EXPECT_TRUE(TheBuilder.NumTestVectors < TheBuilder.HardMaxTVs); + EXPECT_EQ(TheBuilder.Indices.size(), 6u); + EXPECT_EQ(TheBuilder.NumTestVectors, 15); + + std::map Decisions; + for (unsigned I = 0; I < TheBuilder.Indices.size(); ++I) { + struct Rec { + int Width; + std::array Indices; + }; + static const std::array IndicesRefs = {{ + {1, {0, 0}}, + {4, {1000, 0}}, + {2, {0, 0}}, + {1, {1, 1014}}, + {2, {2, 1012}}, + {4, {1004, 1008}}, + }}; + EXPECT_EQ(TheBuilder.Indices[I], IndicesRefs[I].Indices); + +#ifndef NDEBUG + const auto &Node = TheBuilder.SavedNodes[I]; + EXPECT_EQ(Node.Width, IndicesRefs[I].Width); + for (int C = 0; C < 2; ++C) { + auto Index = TheBuilder.Indices[I][C]; + if (Node.NextIDs[C] < 0) + EXPECT_TRUE(Decisions.insert({Index, Node.Width}).second); + } +#endif + } + +#ifndef NDEBUG + int NextIdx = Offset; + for (const auto [Index, Width] : Decisions) { + EXPECT_EQ(Index, NextIdx); + NextIdx += Width; + } + // The sum of Width(s) is 
NumTVs. + EXPECT_EQ(NextIdx, Offset + TheBuilder.NumTestVectors); +#endif +} + } // end anonymous namespace From ce4da0c4ee6e414ddb771d45b68bc7e31b12970e Mon Sep 17 00:00:00 2001 From: Matteo Franciolini Date: Sun, 25 Feb 2024 20:34:56 -0800 Subject: [PATCH 273/546] Enables textual IR roundtripping through `--verifyRoundtrip` (#82946) The patch enables roundtrip to textual file when running `--verifyRoundtrip`. The verification is successful if both textual and bytecode formats can roundtrip successfully. --- mlir/lib/Tools/mlir-opt/MlirOptMain.cpp | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp index f01c7631decb7..2755a949fb947 100644 --- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp +++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp @@ -276,6 +276,7 @@ static LogicalResult doVerifyRoundTrip(Operation *op, if (!irdlFile.empty() && failed(loadIRDLDialects(irdlFile, roundtripContext))) return failure(); + std::string testType = (useBytecode) ? "bytecode" : "textual"; // Print a first time with custom format (or bytecode) and parse it back to // the roundtripModule. 
{ @@ -289,7 +290,7 @@ static LogicalResult doVerifyRoundTrip(Operation *op, } } else { op->print(ostream, - OpPrintingFlags().printGenericOpForm(false).enableDebugInfo()); + OpPrintingFlags().printGenericOpForm().enableDebugInfo()); } FallbackAsmResourceMap fallbackResourceMap; ParserConfig parseConfig(&roundtripContext, /*verifyAfterParse=*/true, @@ -297,8 +298,8 @@ static LogicalResult doVerifyRoundTrip(Operation *op, roundtripModule = parseSourceString(ostream.str(), parseConfig); if (!roundtripModule) { - op->emitOpError() - << "failed to parse bytecode back, cannot verify round-trip.\n"; + op->emitOpError() << "failed to parse " << testType + << " content back, cannot verify round-trip.\n"; return failure(); } } @@ -317,10 +318,12 @@ static LogicalResult doVerifyRoundTrip(Operation *op, } if (reference != roundtrip) { // TODO implement a diff. - return op->emitOpError() << "roundTrip testing roundtripped module differs " - "from reference:\n<<<<<>>>>roundtripped\n"; + return op->emitOpError() + << testType + << " roundTrip testing roundtripped module differs " + "from reference:\n<<<<<>>>>roundtripped\n"; } return success(); @@ -328,10 +331,9 @@ static LogicalResult doVerifyRoundTrip(Operation *op, static LogicalResult doVerifyRoundTrip(Operation *op, const MlirOptMainConfig &config) { - // Textual round-trip isn't fully robust at the moment (for example implicit - // terminator are losing location informations). 
- - return doVerifyRoundTrip(op, config, /*useBytecode=*/true); + auto txtStatus = doVerifyRoundTrip(op, config, /*useBytecode=*/false); + auto bcStatus = doVerifyRoundTrip(op, config, /*useBytecode=*/true); + return success(succeeded(txtStatus) && succeeded(bcStatus)); } /// Perform the actions on the input file indicated by the command line flags From e5332482c6009699d7b66393617a389d8ae62710 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Sun, 25 Feb 2024 20:47:08 -0800 Subject: [PATCH 274/546] [MLIR][Spirv] Use StringAttr for linkage_name (NFC) (#82953) std::string was used here, likely by mistake. The usual convention for attributes is to use StringAttr. --- mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td | 2 +- mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp | 2 +- mlir/test/Dialect/SPIRV/IR/function-decorations.mlir | 2 +- mlir/test/Dialect/SPIRV/IR/structure-ops.mlir | 4 ++-- mlir/test/Target/SPIRV/decorations.mlir | 2 +- mlir/test/Target/SPIRV/function-decorations.mlir | 2 +- mlir/test/Target/SPIRV/global-variable.mlir | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td index f2c1ee5cfd56e..74d36445e3113 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td @@ -48,7 +48,7 @@ def SPIRV_CapabilityArrayAttr : TypedArrayAttrBase< def SPIRV_LinkageAttributesAttr : SPIRV_Attr<"LinkageAttributes", "linkage_attributes"> { let parameters = (ins - "std::string":$linkage_name, + "StringAttr":$linkage_name, "mlir::spirv::LinkageTypeAttr":$linkage_type ); let assemblyFormat = "`<` struct(params) `>`"; diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp index bfd289eb350b5..57e8d9a772962 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp +++ 
b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp @@ -293,7 +293,7 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef words) { auto linkageTypeAttr = opBuilder.getAttr<::mlir::spirv::LinkageTypeAttr>( static_cast<::mlir::spirv::LinkageType>(words[wordIndex++])); auto linkageAttr = opBuilder.getAttr<::mlir::spirv::LinkageAttributesAttr>( - linkageName, linkageTypeAttr); + StringAttr::get(context, linkageName), linkageTypeAttr); decorations[words[0]].set(symbol, llvm::dyn_cast(linkageAttr)); break; } diff --git a/mlir/test/Dialect/SPIRV/IR/function-decorations.mlir b/mlir/test/Dialect/SPIRV/IR/function-decorations.mlir index d915f8820c4f4..07e187e6a7d68 100644 --- a/mlir/test/Dialect/SPIRV/IR/function-decorations.mlir +++ b/mlir/test/Dialect/SPIRV/IR/function-decorations.mlir @@ -8,7 +8,7 @@ spirv.module Logical GLSL450 requires #spirv.vce { spirv.FunctionCall @outside.func.with.linkage(%uchar_0):(i8) -> () spirv.Return } - // CHECK: linkage_attributes = #spirv.linkage_attributes> + // CHECK: linkage_attributes = #spirv.linkage_attributes> spirv.func @outside.func.with.linkage(%arg0 : i8) -> () "Pure" attributes { linkage_attributes=#spirv.linkage_attributes< linkage_name="outside.func", diff --git a/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir b/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir index 6e4c8688666a5..db0f52dcc40ed 100644 --- a/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir @@ -271,7 +271,7 @@ spirv.func @baz(%arg: i32) "DontInline" attributes { // ----- spirv.module Logical GLSL450 requires #spirv.vce { - // CHECK: linkage_attributes = #spirv.linkage_attributes> + // CHECK: linkage_attributes = #spirv.linkage_attributes> spirv.func @outside.func.with.linkage(%arg0 : i8) -> () "Pure" attributes { linkage_attributes=#spirv.linkage_attributes< linkage_name="outside.func", @@ -396,7 +396,7 @@ module { // ----- spirv.module Logical GLSL450 requires #spirv.vce { - // CHECK: 
linkage_attributes = #spirv.linkage_attributes> + // CHECK: linkage_attributes = #spirv.linkage_attributes> spirv.GlobalVariable @var1 { linkage_attributes=#spirv.linkage_attributes< linkage_name="outSideGlobalVar1", diff --git a/mlir/test/Target/SPIRV/decorations.mlir b/mlir/test/Target/SPIRV/decorations.mlir index d215c33972e7f..e8f75a1edbd08 100644 --- a/mlir/test/Target/SPIRV/decorations.mlir +++ b/mlir/test/Target/SPIRV/decorations.mlir @@ -59,7 +59,7 @@ spirv.module Logical GLSL450 requires #spirv.vce { // ----- spirv.module Logical GLSL450 requires #spirv.vce { - // CHECK: linkage_attributes = #spirv.linkage_attributes> + // CHECK: linkage_attributes = #spirv.linkage_attributes> spirv.GlobalVariable @var1 { linkage_attributes=#spirv.linkage_attributes< linkage_name="outSideGlobalVar1", diff --git a/mlir/test/Target/SPIRV/function-decorations.mlir b/mlir/test/Target/SPIRV/function-decorations.mlir index 117d4ca628f76..ef627145aac3e 100644 --- a/mlir/test/Target/SPIRV/function-decorations.mlir +++ b/mlir/test/Target/SPIRV/function-decorations.mlir @@ -8,7 +8,7 @@ spirv.module Logical GLSL450 requires #spirv.vce { spirv.FunctionCall @outside.func.with.linkage(%uchar_0):(i8) -> () spirv.Return } - // CHECK: linkage_attributes = #spirv.linkage_attributes> + // CHECK: linkage_attributes = #spirv.linkage_attributes> spirv.func @outside.func.with.linkage(%arg0 : i8) -> () "Pure" attributes { linkage_attributes=#spirv.linkage_attributes< linkage_name="outside.func", diff --git a/mlir/test/Target/SPIRV/global-variable.mlir b/mlir/test/Target/SPIRV/global-variable.mlir index f22d2a9b3d14d..28b2706d3d163 100644 --- a/mlir/test/Target/SPIRV/global-variable.mlir +++ b/mlir/test/Target/SPIRV/global-variable.mlir @@ -62,7 +62,7 @@ spirv.module Logical GLSL450 requires #spirv.vce { // ----- spirv.module Logical GLSL450 requires #spirv.vce { - // CHECK: linkage_attributes = #spirv.linkage_attributes> + // CHECK: linkage_attributes = #spirv.linkage_attributes> 
spirv.GlobalVariable @var1 { linkage_attributes=#spirv.linkage_attributes< linkage_name="outSideGlobalVar1", From f75c6ed93e785c09884a317ce2bfd440e7f8f573 Mon Sep 17 00:00:00 2001 From: Jason Eckhardt Date: Sun, 25 Feb 2024 22:58:17 -0600 Subject: [PATCH 275/546] [TableGen] Efficiency improvements for encoding HwMode collection. (#82902) Currently the DecoderEmitter spends a fair amount of cycles performing repeated linear walks over the entire instruction list. This patch eliminates one such walk during HwMode collection for EncodingInfos. The eliminated traversal visits every instruction and then every EncodingInfos entry for that instruction merely to collect all referenced HwModes. That information already happens to be present in the HwModeSelects created during the one-time construction of CodeGenHwModes. We instead traverse the HwModeSelects, collecting each one referenced as an encoding select. This set is a small constant in size and does not generally grow with the size of the instruction set. 
--- llvm/utils/TableGen/CodeGenHwModes.h | 3 ++ llvm/utils/TableGen/DecoderEmitter.cpp | 45 ++++++++++++++------------ 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/llvm/utils/TableGen/CodeGenHwModes.h b/llvm/utils/TableGen/CodeGenHwModes.h index 9a5b7a8c2c1c9..56639f741ede1 100644 --- a/llvm/utils/TableGen/CodeGenHwModes.h +++ b/llvm/utils/TableGen/CodeGenHwModes.h @@ -53,6 +53,9 @@ struct CodeGenHwModes { return Modes[Id - 1]; } const HwModeSelect &getHwModeSelect(Record *R) const; + const std::map &getHwModeSelects() const { + return ModeSelects; + } unsigned getNumModeIds() const { return Modes.size() + 1; } void dump() const; diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index 36f437f02cf51..4ce5a73d77566 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/CachedHashString.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" @@ -2448,6 +2449,22 @@ static void emitCheck(formatted_raw_ostream &OS) { << "}\n\n"; } +// Collect all HwModes referenced by the target for encoding purposes, +// returning a vector of corresponding names. +static void +collectHwModesReferencedForEncodings(const CodeGenHwModes &HWM, + std::vector &Names) { + SmallBitVector BV(HWM.getNumModeIds()); + for (const auto &MS : HWM.getHwModeSelects()) { + for (const HwModeSelect::PairType &P : MS.second.Items) { + if (P.second->isSubClassOf("InstructionEncoding")) + BV.set(P.first); + } + } + transform(BV.set_bits(), std::back_inserter(Names), + [&HWM](const int &M) { return HWM.getMode(M).Name; }); +} + // Emits disassembler code for instruction decoding. 
void DecoderEmitter::run(raw_ostream &o) { formatted_raw_ostream OS(o); @@ -2469,37 +2486,25 @@ void DecoderEmitter::run(raw_ostream &o) { Target.reverseBitsForLittleEndianEncoding(); // Parameterize the decoders based on namespace and instruction width. - std::set HwModeNames; - const auto &NumberedInstructions = Target.getInstructionsByEnumValue(); - NumberedEncodings.reserve(NumberedInstructions.size()); - // First, collect all HwModes referenced by the target. - for (const auto &NumberedInstruction : NumberedInstructions) { - if (const RecordVal *RV = - NumberedInstruction->TheDef->getValue("EncodingInfos")) { - if (auto *DI = dyn_cast_or_null(RV->getValue())) { - const CodeGenHwModes &HWM = Target.getHwModes(); - EncodingInfoByHwMode EBM(DI->getDef(), HWM); - for (auto &KV : EBM) - HwModeNames.insert(HWM.getMode(KV.first).Name); - } - } - } + // First, collect all encoding-related HwModes referenced by the target. // If HwModeNames is empty, add the empty string so we always have one HwMode. 
+ const CodeGenHwModes &HWM = Target.getHwModes(); + std::vector HwModeNames; + collectHwModesReferencedForEncodings(HWM, HwModeNames); if (HwModeNames.empty()) - HwModeNames.insert(""); + HwModeNames.push_back(""); + const auto &NumberedInstructions = Target.getInstructionsByEnumValue(); + NumberedEncodings.reserve(NumberedInstructions.size()); for (const auto &NumberedInstruction : NumberedInstructions) { if (const RecordVal *RV = NumberedInstruction->TheDef->getValue("EncodingInfos")) { if (DefInit *DI = dyn_cast_or_null(RV->getValue())) { - const CodeGenHwModes &HWM = Target.getHwModes(); EncodingInfoByHwMode EBM(DI->getDef(), HWM); - for (auto &KV : EBM) { + for (auto &KV : EBM) NumberedEncodings.emplace_back(KV.second, NumberedInstruction, HWM.getMode(KV.first).Name); - HwModeNames.insert(HWM.getMode(KV.first).Name); - } continue; } } From 8c5e9cf737138aba22a4a8f64ef2c5efc80dd7f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Sun, 25 Feb 2024 14:03:47 +0100 Subject: [PATCH 276/546] [clang][Interp] Implement nullability argument checking Implement constexpr checking for null pointers being passed to arguments annotated as nonnull. 
--- clang/lib/AST/CMakeLists.txt | 1 + clang/lib/AST/Interp/ByteCodeExprGen.cpp | 26 ++++++-- clang/lib/AST/Interp/Function.h | 5 +- clang/lib/AST/Interp/Interp.cpp | 29 ++++++++- clang/lib/AST/Interp/Interp.h | 26 +++++++- clang/lib/AST/Interp/InterpShared.cpp | 42 +++++++++++++ clang/lib/AST/Interp/InterpShared.h | 26 ++++++++ clang/lib/AST/Interp/Opcodes.td | 7 ++- clang/test/AST/Interp/nullable.cpp | 77 ++++++++++++++++++++++++ clang/test/Sema/attr-nonnull.c | 1 + clang/test/SemaCXX/attr-nonnull.cpp | 3 +- 11 files changed, 231 insertions(+), 12 deletions(-) create mode 100644 clang/lib/AST/Interp/InterpShared.cpp create mode 100644 clang/lib/AST/Interp/InterpShared.h create mode 100644 clang/test/AST/Interp/nullable.cpp diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt index d793c3ed0410a..6ea1ca3e76cf3 100644 --- a/clang/lib/AST/CMakeLists.txt +++ b/clang/lib/AST/CMakeLists.txt @@ -88,6 +88,7 @@ add_clang_library(clangAST Interp/Record.cpp Interp/Source.cpp Interp/State.cpp + Interp/InterpShared.cpp ItaniumCXXABI.cpp ItaniumMangle.cpp JSONNodeDumper.cpp diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 7f97d8ce9fb80..eb5a1b536b779 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -13,8 +13,10 @@ #include "Context.h" #include "Floating.h" #include "Function.h" +#include "InterpShared.h" #include "PrimType.h" #include "Program.h" +#include "clang/AST/Attr.h" using namespace clang; using namespace clang::interp; @@ -2656,6 +2658,7 @@ bool ByteCodeExprGen::VisitCallExpr(const CallExpr *E) { QualType ReturnType = E->getCallReturnType(Ctx.getASTContext()); std::optional T = classify(ReturnType); bool HasRVO = !ReturnType->isVoidType() && !T; + const FunctionDecl *FuncDecl = E->getDirectCallee(); if (HasRVO) { if (DiscardResult) { @@ -2673,17 +2676,16 @@ bool ByteCodeExprGen::VisitCallExpr(const CallExpr *E) { } } - auto Args = 
E->arguments(); + auto Args = llvm::ArrayRef(E->getArgs(), E->getNumArgs()); // Calling a static operator will still // pass the instance, but we don't need it. // Discard it here. if (isa(E)) { - if (const auto *MD = - dyn_cast_if_present(E->getDirectCallee()); + if (const auto *MD = dyn_cast_if_present(FuncDecl); MD && MD->isStatic()) { if (!this->discard(E->getArg(0))) return false; - Args = drop_begin(Args, 1); + Args = Args.drop_front(); } } @@ -2693,13 +2695,25 @@ bool ByteCodeExprGen::VisitCallExpr(const CallExpr *E) { return false; } + llvm::BitVector NonNullArgs = collectNonNullArgs(FuncDecl, Args); // Put arguments on the stack. + unsigned ArgIndex = 0; for (const auto *Arg : Args) { if (!this->visit(Arg)) return false; + + // If we know the callee already, check the known parametrs for nullability. + if (FuncDecl && NonNullArgs[ArgIndex]) { + PrimType ArgT = classify(Arg).value_or(PT_Ptr); + if (ArgT == PT_Ptr || ArgT == PT_FnPtr) { + if (!this->emitCheckNonNullArg(ArgT, Arg)) + return false; + } + } + ++ArgIndex; } - if (const FunctionDecl *FuncDecl = E->getDirectCallee()) { + if (FuncDecl) { const Function *Func = getFunction(FuncDecl); if (!Func) return false; @@ -2748,7 +2762,7 @@ bool ByteCodeExprGen::VisitCallExpr(const CallExpr *E) { if (!this->visit(E->getCallee())) return false; - if (!this->emitCallPtr(ArgSize, E)) + if (!this->emitCallPtr(ArgSize, E, E)) return false; } diff --git a/clang/lib/AST/Interp/Function.h b/clang/lib/AST/Interp/Function.h index b19d64f9371e3..0be4564e1e9ec 100644 --- a/clang/lib/AST/Interp/Function.h +++ b/clang/lib/AST/Interp/Function.h @@ -15,9 +15,10 @@ #ifndef LLVM_CLANG_AST_INTERP_FUNCTION_H #define LLVM_CLANG_AST_INTERP_FUNCTION_H -#include "Source.h" #include "Descriptor.h" +#include "Source.h" #include "clang/AST/ASTLambda.h" +#include "clang/AST/Attr.h" #include "clang/AST/Decl.h" #include "llvm/Support/raw_ostream.h" @@ -108,6 +109,8 @@ class Function final { /// Checks if the first argument is a RVO 
pointer. bool hasRVO() const { return HasRVO; } + bool hasNonNullAttr() const { return getDecl()->hasAttr(); } + /// Range over the scope blocks. llvm::iterator_range::const_iterator> scopes() const { diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index b2fe70dc14f9d..5670888c245eb 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -7,10 +7,9 @@ //===----------------------------------------------------------------------===// #include "Interp.h" -#include -#include #include "Function.h" #include "InterpFrame.h" +#include "InterpShared.h" #include "InterpStack.h" #include "Opcode.h" #include "PrimType.h" @@ -22,6 +21,10 @@ #include "clang/AST/Expr.h" #include "clang/AST/ExprCXX.h" #include "llvm/ADT/APSInt.h" +#include +#include + +using namespace clang; using namespace clang; using namespace clang::interp; @@ -622,6 +625,28 @@ bool CheckDeclRef(InterpState &S, CodePtr OpPC, const DeclRefExpr *DR) { return false; } +bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F, + const CallExpr *CE, unsigned ArgSize) { + auto Args = llvm::ArrayRef(CE->getArgs(), CE->getNumArgs()); + auto NonNullArgs = collectNonNullArgs(F->getDecl(), Args); + unsigned Offset = 0; + unsigned Index = 0; + for (const Expr *Arg : Args) { + if (NonNullArgs[Index] && Arg->getType()->isPointerType()) { + const Pointer &ArgPtr = S.Stk.peek(ArgSize - Offset); + if (ArgPtr.isZero()) { + const SourceLocation &Loc = S.Current->getLocation(OpPC); + S.CCEDiag(Loc, diag::note_non_null_attribute_failed); + return false; + } + } + + Offset += align(primSize(S.Ctx.classify(Arg).value_or(PT_Ptr))); + ++Index; + } + return true; +} + bool Interpret(InterpState &S, APValue &Result) { // The current stack frame when we started Interpret(). 
// This is being used by the ops to determine wheter diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index d885d19ce7064..7994550cc7b97 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -113,6 +113,10 @@ bool CheckThis(InterpState &S, CodePtr OpPC, const Pointer &This); /// Checks if a method is pure virtual. bool CheckPure(InterpState &S, CodePtr OpPC, const CXXMethodDecl *MD); +/// Checks if all the arguments annotated as 'nonnull' are in fact not null. +bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F, + const CallExpr *CE, unsigned ArgSize); + /// Sets the given integral value to the pointer, which is of /// a std::{weak,partial,strong}_ordering type. bool SetThreeWayComparisonField(InterpState &S, CodePtr OpPC, @@ -1980,6 +1984,7 @@ inline bool CallVar(InterpState &S, CodePtr OpPC, const Function *Func, return false; } + inline bool Call(InterpState &S, CodePtr OpPC, const Function *Func, uint32_t VarArgSize) { if (Func->hasThisPointer()) { @@ -2083,7 +2088,8 @@ inline bool CallBI(InterpState &S, CodePtr &PC, const Function *Func, return false; } -inline bool CallPtr(InterpState &S, CodePtr OpPC, uint32_t ArgSize) { +inline bool CallPtr(InterpState &S, CodePtr OpPC, uint32_t ArgSize, + const CallExpr *CE) { const FunctionPointer &FuncPtr = S.Stk.pop(); const Function *F = FuncPtr.getFunction(); @@ -2095,6 +2101,12 @@ inline bool CallPtr(InterpState &S, CodePtr OpPC, uint32_t ArgSize) { } assert(F); + // Check argument nullability state. 
+ if (F->hasNonNullAttr()) { + if (!CheckNonNullArgs(S, OpPC, F, CE, ArgSize)) + return false; + } + assert(ArgSize >= F->getWrittenArgSize()); uint32_t VarArgSize = ArgSize - F->getWrittenArgSize(); @@ -2151,6 +2163,18 @@ inline bool OffsetOf(InterpState &S, CodePtr OpPC, const OffsetOfExpr *E) { return true; } +template ::T> +inline bool CheckNonNullArg(InterpState &S, CodePtr OpPC) { + const T &Arg = S.Stk.peek(); + if (!Arg.isZero()) + return true; + + const SourceLocation &Loc = S.Current->getLocation(OpPC); + S.CCEDiag(Loc, diag::note_non_null_attribute_failed); + + return false; +} + //===----------------------------------------------------------------------===// // Read opcode arguments //===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/Interp/InterpShared.cpp b/clang/lib/AST/Interp/InterpShared.cpp new file mode 100644 index 0000000000000..6af03691f1b20 --- /dev/null +++ b/clang/lib/AST/Interp/InterpShared.cpp @@ -0,0 +1,42 @@ +//===--- InterpShared.cpp ---------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "InterpShared.h" +#include "clang/AST/Attr.h" +#include "llvm/ADT/BitVector.h" + +namespace clang { +namespace interp { + +llvm::BitVector collectNonNullArgs(const FunctionDecl *F, + const llvm::ArrayRef &Args) { + llvm::BitVector NonNullArgs; + if (!F) + return NonNullArgs; + + assert(F); + NonNullArgs.resize(Args.size()); + + for (const auto *Attr : F->specific_attrs()) { + if (!Attr->args_size()) { + NonNullArgs.set(); + break; + } else + for (auto Idx : Attr->args()) { + unsigned ASTIdx = Idx.getASTIndex(); + if (ASTIdx >= Args.size()) + continue; + NonNullArgs[ASTIdx] = true; + } + } + + return NonNullArgs; +} + +} // namespace interp +} // namespace clang diff --git a/clang/lib/AST/Interp/InterpShared.h b/clang/lib/AST/Interp/InterpShared.h new file mode 100644 index 0000000000000..8c5e0bee22c92 --- /dev/null +++ b/clang/lib/AST/Interp/InterpShared.h @@ -0,0 +1,26 @@ +//===--- InterpShared.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LIB_AST_INTERP_SHARED_H +#define LLVM_CLANG_LIB_AST_INTERP_SHARED_H + +#include "llvm/ADT/BitVector.h" + +namespace clang { +class FunctionDecl; +class Expr; + +namespace interp { + +llvm::BitVector collectNonNullArgs(const FunctionDecl *F, + const llvm::ArrayRef &Args); + +} // namespace interp +} // namespace clang + +#endif diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td index 5add723842d2b..e36c42d450fc9 100644 --- a/clang/lib/AST/Interp/Opcodes.td +++ b/clang/lib/AST/Interp/Opcodes.td @@ -206,7 +206,7 @@ def CallBI : Opcode { } def CallPtr : Opcode { - let Args = [ArgUint32]; + let Args = [ArgUint32, ArgCallExpr]; let Types = []; } @@ -706,3 +706,8 @@ def InvalidDeclRef : Opcode { } def ArrayDecay : Opcode; + +def CheckNonNullArg : Opcode { + let Types = [PtrTypeClass]; + let HasGroup = 1; +} diff --git a/clang/test/AST/Interp/nullable.cpp b/clang/test/AST/Interp/nullable.cpp new file mode 100644 index 0000000000000..3bc2595fb8f00 --- /dev/null +++ b/clang/test/AST/Interp/nullable.cpp @@ -0,0 +1,77 @@ +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s +// RUN: %clang_cc1 -verify=ref,both %s + + +constexpr int dummy = 1; +constexpr const int *null = nullptr; + +namespace simple { + __attribute__((nonnull)) + constexpr int simple1(const int*) { + return 1; + } + static_assert(simple1(&dummy) == 1, ""); + static_assert(simple1(nullptr) == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{null passed to a callee}} + static_assert(simple1(null) == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{null passed to a callee}} + + __attribute__((nonnull)) // both-warning {{applied to function with no pointer arguments}} + constexpr int simple2(const int &a) { + return 12; + } + 
static_assert(simple2(1) == 12, ""); +} + +namespace methods { + struct S { + __attribute__((nonnull(2))) // both-warning {{only applies to pointer arguments}} + __attribute__((nonnull(3))) + constexpr int foo(int a, const void *p) const { + return 12; + } + + __attribute__((nonnull(3))) + constexpr int foo2(...) const { + return 12; + } + + __attribute__((nonnull)) + constexpr int foo3(...) const { + return 12; + } + }; + + constexpr S s{}; + static_assert(s.foo(8, &dummy) == 12, ""); + + static_assert(s.foo2(nullptr) == 12, ""); + static_assert(s.foo2(1, nullptr) == 12, ""); // both-error {{not an integral constant expression}} \ + // both-note {{null passed to a callee}} + + constexpr S *s2 = nullptr; + static_assert(s2->foo3() == 12, ""); // both-error {{not an integral constant expression}} \ + // both-note {{member call on dereferenced null pointer}} +} + +namespace fnptrs { + __attribute__((nonnull)) + constexpr int add(int a, const void *p) { + return a + 1; + } + __attribute__((nonnull(3))) + constexpr int applyBinOp(int a, int b, int (*op)(int, const void *)) { + return op(a, nullptr); // both-note {{null passed to a callee}} + } + static_assert(applyBinOp(10, 20, add) == 11, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} + + static_assert(applyBinOp(10, 20, nullptr) == 11, ""); // both-error {{not an integral constant expression}} \ + // both-note {{null passed to a callee}} +} + +namespace lambdas { + auto lstatic = [](const void *P) __attribute__((nonnull)) { return 3; }; + static_assert(lstatic(nullptr) == 3, ""); // both-error {{not an integral constant expression}} \ + // both-note {{null passed to a callee}} +} diff --git a/clang/test/Sema/attr-nonnull.c b/clang/test/Sema/attr-nonnull.c index f8de31716a80c..865348daef10e 100644 --- a/clang/test/Sema/attr-nonnull.c +++ b/clang/test/Sema/attr-nonnull.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -verify -fsyntax-only +// RUN: %clang_cc1 %s -verify -fsyntax-only 
-fexperimental-new-constant-interpreter void f1(int *a1, int *a2, int *a3, int *a4, int *a5, int *a6, int *a7, int *a8, int *a9, int *a10, int *a11, int *a12, int *a13, int *a14, diff --git a/clang/test/SemaCXX/attr-nonnull.cpp b/clang/test/SemaCXX/attr-nonnull.cpp index 21eedcf376d5b..6f9119b519d09 100644 --- a/clang/test/SemaCXX/attr-nonnull.cpp +++ b/clang/test/SemaCXX/attr-nonnull.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter struct S { S(const char *) __attribute__((nonnull(2))); @@ -84,4 +85,4 @@ constexpr int i4 = f4(&c, 0, 0); //expected-error {{constant expression}} expect constexpr int i42 = f4(0, &c, 1); //expected-error {{constant expression}} expected-note {{null passed}} constexpr int i43 = f4(&c, &c, 0); -} \ No newline at end of file +} From 16d0592dda89495afdbefa5c57eb006559a59465 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Sun, 25 Feb 2024 23:06:59 -0800 Subject: [PATCH 277/546] [llvm-exegesis] Remove exegesis prefix in exegesis namespace (#82871) This patch removes the exegesis:: prefix within the exegesis namespace in llvm-exegesis.cpp as it isn't necessary due to the code already being wrapped in the namespace. --- llvm/tools/llvm-exegesis/llvm-exegesis.cpp | 111 +++++++++------------ 1 file changed, 49 insertions(+), 62 deletions(-) diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp index 66387bdec5a5a..782d44422791c 100644 --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -72,79 +72,68 @@ static cl::opt "results. 
“-” uses stdin/stdout."), cl::cat(Options), cl::init("")); -static cl::opt BenchmarkMode( +static cl::opt BenchmarkMode( "mode", cl::desc("the mode to run"), cl::cat(Options), - cl::values(clEnumValN(exegesis::Benchmark::Latency, "latency", - "Instruction Latency"), - clEnumValN(exegesis::Benchmark::InverseThroughput, - "inverse_throughput", + cl::values(clEnumValN(Benchmark::Latency, "latency", "Instruction Latency"), + clEnumValN(Benchmark::InverseThroughput, "inverse_throughput", "Instruction Inverse Throughput"), - clEnumValN(exegesis::Benchmark::Uops, "uops", - "Uop Decomposition"), + clEnumValN(Benchmark::Uops, "uops", "Uop Decomposition"), // When not asking for a specific benchmark mode, // we'll analyse the results. - clEnumValN(exegesis::Benchmark::Unknown, "analysis", - "Analysis"))); - -static cl::opt - ResultAggMode( - "result-aggregation-mode", - cl::desc("How to aggregate multi-values result"), - cl::cat(BenchmarkOptions), - cl::values(clEnumValN(exegesis::Benchmark::Min, "min", - "Keep min reading"), - clEnumValN(exegesis::Benchmark::Max, "max", - "Keep max reading"), - clEnumValN(exegesis::Benchmark::Mean, "mean", - "Compute mean of all readings"), - clEnumValN(exegesis::Benchmark::MinVariance, - "min-variance", - "Keep readings set with min-variance")), - cl::init(exegesis::Benchmark::Min)); - -static cl::opt RepetitionMode( + clEnumValN(Benchmark::Unknown, "analysis", "Analysis"))); + +static cl::opt ResultAggMode( + "result-aggregation-mode", cl::desc("How to aggregate multi-values result"), + cl::cat(BenchmarkOptions), + cl::values(clEnumValN(Benchmark::Min, "min", "Keep min reading"), + clEnumValN(Benchmark::Max, "max", "Keep max reading"), + clEnumValN(Benchmark::Mean, "mean", + "Compute mean of all readings"), + clEnumValN(Benchmark::MinVariance, "min-variance", + "Keep readings set with min-variance")), + cl::init(Benchmark::Min)); + +static cl::opt RepetitionMode( "repetition-mode", cl::desc("how to repeat the instruction snippet"), 
cl::cat(BenchmarkOptions), cl::values( - clEnumValN(exegesis::Benchmark::Duplicate, "duplicate", - "Duplicate the snippet"), - clEnumValN(exegesis::Benchmark::Loop, "loop", "Loop over the snippet"), - clEnumValN(exegesis::Benchmark::AggregateMin, "min", + clEnumValN(Benchmark::Duplicate, "duplicate", "Duplicate the snippet"), + clEnumValN(Benchmark::Loop, "loop", "Loop over the snippet"), + clEnumValN(Benchmark::AggregateMin, "min", "All of the above and take the minimum of measurements"), - clEnumValN(exegesis::Benchmark::MiddleHalfDuplicate, - "middle-half-duplicate", "Middle half duplicate mode"), - clEnumValN(exegesis::Benchmark::MiddleHalfLoop, "middle-half-loop", + clEnumValN(Benchmark::MiddleHalfDuplicate, "middle-half-duplicate", + "Middle half duplicate mode"), + clEnumValN(Benchmark::MiddleHalfLoop, "middle-half-loop", "Middle half loop mode")), - cl::init(exegesis::Benchmark::Duplicate)); + cl::init(Benchmark::Duplicate)); static cl::opt BenchmarkMeasurementsPrintProgress( "measurements-print-progress", cl::desc("Produce progress indicator when performing measurements"), cl::cat(BenchmarkOptions), cl::init(false)); -static cl::opt BenchmarkPhaseSelector( +static cl::opt BenchmarkPhaseSelector( "benchmark-phase", cl::desc( "it is possible to stop the benchmarking process after some phase"), cl::cat(BenchmarkOptions), cl::values( - clEnumValN(exegesis::BenchmarkPhaseSelectorE::PrepareSnippet, - "prepare-snippet", + clEnumValN(BenchmarkPhaseSelectorE::PrepareSnippet, "prepare-snippet", "Only generate the minimal instruction sequence"), - clEnumValN(exegesis::BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet, + clEnumValN(BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet, "prepare-and-assemble-snippet", "Same as prepare-snippet, but also dumps an excerpt of the " "sequence (hex encoded)"), - clEnumValN(exegesis::BenchmarkPhaseSelectorE::AssembleMeasuredCode, + clEnumValN(BenchmarkPhaseSelectorE::AssembleMeasuredCode, "assemble-measured-code", "Same as 
prepare-and-assemble-snippet, but also creates the " "full sequence " "that can be dumped to a file using --dump-object-to-disk"), clEnumValN( - exegesis::BenchmarkPhaseSelectorE::Measure, "measure", + BenchmarkPhaseSelectorE::Measure, "measure", "Same as prepare-measured-code, but also runs the measurement " "(default)")), - cl::init(exegesis::BenchmarkPhaseSelectorE::Measure)); + cl::init(BenchmarkPhaseSelectorE::Measure)); static cl::opt UseDummyPerfCounters("use-dummy-perf-counters", @@ -176,27 +165,26 @@ static cl::opt IgnoreInvalidSchedClass( cl::desc("ignore instructions that do not define a sched class"), cl::cat(BenchmarkOptions), cl::init(false)); -static cl::opt AnalysisSnippetFilter( +static cl::opt AnalysisSnippetFilter( "analysis-filter", cl::desc("Filter the benchmarks before analysing them"), cl::cat(BenchmarkOptions), cl::values( - clEnumValN(exegesis::BenchmarkFilter::All, "all", + clEnumValN(BenchmarkFilter::All, "all", "Keep all benchmarks (default)"), - clEnumValN(exegesis::BenchmarkFilter::RegOnly, "reg-only", + clEnumValN(BenchmarkFilter::RegOnly, "reg-only", "Keep only those benchmarks that do *NOT* involve memory"), - clEnumValN(exegesis::BenchmarkFilter::WithMem, "mem-only", + clEnumValN(BenchmarkFilter::WithMem, "mem-only", "Keep only the benchmarks that *DO* involve memory")), - cl::init(exegesis::BenchmarkFilter::All)); - -static cl::opt - AnalysisClusteringAlgorithm( - "analysis-clustering", cl::desc("the clustering algorithm to use"), - cl::cat(AnalysisOptions), - cl::values(clEnumValN(exegesis::BenchmarkClustering::Dbscan, - "dbscan", "use DBSCAN/OPTICS algorithm"), - clEnumValN(exegesis::BenchmarkClustering::Naive, - "naive", "one cluster per opcode")), - cl::init(exegesis::BenchmarkClustering::Dbscan)); + cl::init(BenchmarkFilter::All)); + +static cl::opt AnalysisClusteringAlgorithm( + "analysis-clustering", cl::desc("the clustering algorithm to use"), + cl::cat(AnalysisOptions), + cl::values(clEnumValN(BenchmarkClustering::Dbscan, 
"dbscan", + "use DBSCAN/OPTICS algorithm"), + clEnumValN(BenchmarkClustering::Naive, "naive", + "one cluster per opcode")), + cl::init(BenchmarkClustering::Dbscan)); static cl::opt AnalysisDbscanNumPoints( "analysis-numpoints", @@ -478,7 +466,7 @@ void benchmarkMain() { "--use-dummy-perf-counters to not query the kernel for real event " "counts."); #else - if (exegesis::pfm::pfmInitialize()) + if (pfm::pfmInitialize()) ExitWithError("cannot initialize libpfm"); #endif } @@ -571,7 +559,7 @@ void benchmarkMain() { if (!Configurations.empty()) runBenchmarkConfigurations(State, Configurations, Repetitors, *Runner); - exegesis::pfm::pfmTerminate(); + pfm::pfmTerminate(); } // Prints the results of running analysis pass `Pass` to file `OutputFilename` @@ -596,11 +584,10 @@ static void maybeRunAnalysis(const Analysis &Analyzer, const std::string &Name, static void filterPoints(MutableArrayRef Points, const MCInstrInfo &MCII) { - if (AnalysisSnippetFilter == exegesis::BenchmarkFilter::All) + if (AnalysisSnippetFilter == BenchmarkFilter::All) return; - bool WantPointsWithMemOps = - AnalysisSnippetFilter == exegesis::BenchmarkFilter::WithMem; + bool WantPointsWithMemOps = AnalysisSnippetFilter == BenchmarkFilter::WithMem; for (Benchmark &Point : Points) { if (!Point.Error.empty()) continue; From cace477c0b6c3d9494ead66eb725c6e72f27b767 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20W=C3=B3jt?= Date: Mon, 26 Feb 2024 08:09:33 +0100 Subject: [PATCH 278/546] [clang][AArch64] multilib: fix deduction of "-march=" option (#81474) The deduced "-march=" option always started with aarch64, which is not a valid value. There was also no way to distinguish between armv8-r and armv8-a. After this commit, the deduced "-march=" option will start with greatest available "armv*-a" value or "armv8-r". 
--- clang/lib/Driver/ToolChain.cpp | 7 ++++++- clang/test/Driver/print-multi-selection-flags.c | 5 +++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index f8c13c86daf9b..bd854aae35d4c 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -200,7 +200,12 @@ static void getAArch64MultilibFlags(const Driver &D, for (const auto &Ext : AArch64::Extensions) if (FeatureSet.contains(Ext.NegFeature)) MArch.push_back(("no" + Ext.Name).str()); - MArch.insert(MArch.begin(), ("-march=" + Triple.getArchName()).str()); + StringRef ArchName; + for (const auto &ArchInfo : AArch64::ArchInfos) + if (FeatureSet.contains(ArchInfo->ArchFeature)) + ArchName = ArchInfo->Name; + assert(!ArchName.empty() && "at least one architecture should be found"); + MArch.insert(MArch.begin(), ("-march=" + ArchName).str()); Result.push_back(llvm::join(MArch, "+")); } diff --git a/clang/test/Driver/print-multi-selection-flags.c b/clang/test/Driver/print-multi-selection-flags.c index 248d9a3cdf49b..06a12db9d9779 100644 --- a/clang/test/Driver/print-multi-selection-flags.c +++ b/clang/test/Driver/print-multi-selection-flags.c @@ -51,9 +51,10 @@ // CHECK-M85_NO_FP_DP: -mfpu=fp-armv8-fullfp16-sp-d16 // RUN: %clang -print-multi-flags-experimental --target=aarch64-none-elf -march=armv8-a+lse | FileCheck --check-prefix=CHECK-LSE %s -// CHECK-LSE: -march=aarch64{{.*}}+lse{{.*}} +// CHECK-LSE: --target=aarch64-none-unknown-elf +// CHECK-LSE: -march=armv8-a{{.*}}+lse{{.*}} // RUN: %clang -print-multi-flags-experimental --target=aarch64-none-elf -march=armv8.5-a+sve+sve2 | FileCheck --check-prefix=CHECK-SVE2 %s // RUN: %clang -print-multi-flags-experimental --target=aarch64-none-elf -march=armv9-a | FileCheck --check-prefix=CHECK-SVE2 %s // CHECK-SVE2: --target=aarch64-none-unknown-elf -// CHECK-SVE2: -march=aarch64{{.*}}+simd{{.*}}+sve{{.*}}+sve2{{.*}} +// CHECK-SVE2: 
-march=armv{{.*}}-a{{.*}}+simd{{.*}}+sve{{.*}}+sve2{{.*}} From 0c7a605ada6cb392e6e8c16dbccf2b7e59017399 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Mon, 26 Feb 2024 15:47:03 +0900 Subject: [PATCH 279/546] clangCodeGen: [MC/DC] Refactor CoverageGen. - Introduce `createDecision(E)` for the root node of `VisitBin`. - Handle `mcdc::DecisionParameters` for each Decision method. --- clang/lib/CodeGen/CoverageMappingGen.cpp | 48 ++++++++++++++---------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp index d98ab79eac3ae..e25a92758f32b 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.cpp +++ b/clang/lib/CodeGen/CoverageMappingGen.cpp @@ -852,10 +852,6 @@ struct CounterCoverageMappingBuilder return Counter::getCounter(CounterMap[S]); } - auto getBitmapIdx(const Stmt *S) { - return MCDCState.DecisionByStmt[S].BitmapIdx; - } - /// Push a region onto the stack. /// /// Returns the index on the stack where the region was pushed. This can be @@ -888,12 +884,11 @@ struct CounterCoverageMappingBuilder return RegionStack.size() - 1; } - size_t pushRegion(unsigned BitmapIdx, uint16_t Conditions, + size_t pushRegion(const mcdc::DecisionParameters &DecisionParams, std::optional StartLoc = std::nullopt, std::optional EndLoc = std::nullopt) { - RegionStack.emplace_back(mcdc::DecisionParameters{BitmapIdx, Conditions}, - StartLoc, EndLoc); + RegionStack.emplace_back(DecisionParams, StartLoc, EndLoc); return RegionStack.size() - 1; } @@ -1059,8 +1054,9 @@ struct CounterCoverageMappingBuilder /// Create a Decision Region with a BitmapIdx and number of Conditions. This /// type of region "contains" branch regions, one for each of the conditions. /// The visualization tool will group everything together. 
- void createDecisionRegion(const Expr *C, unsigned BitmapIdx, unsigned Conds) { - popRegions(pushRegion(BitmapIdx, Conds, getStart(C), getEnd(C))); + void createDecisionRegion(const Expr *C, + const mcdc::DecisionParameters &DecisionParams) { + popRegions(pushRegion(DecisionParams, getStart(C), getEnd(C))); } /// Create a Branch Region around a SwitchCase for code coverage @@ -1312,7 +1308,7 @@ struct CounterCoverageMappingBuilder return; assert(SpellingRegion(SM, NewStartLoc, EndLoc).isInSourceOrder()); handleFileExit(NewStartLoc); - size_t Index = pushRegion({}, NewStartLoc, EndLoc); + size_t Index = pushRegion(Counter{}, NewStartLoc, EndLoc); getRegion().setSkipped(true); handleFileExit(EndLoc); popRegions(Index); @@ -1962,6 +1958,20 @@ struct CounterCoverageMappingBuilder subtractCounters(ParentCount, TrueCount)); } + void createDecision(const BinaryOperator *E) { + unsigned NumConds = MCDCBuilder.getTotalConditionsAndReset(E); + if (NumConds == 0) + return; + + auto DecisionParams = mcdc::DecisionParameters{ + MCDCState.DecisionByStmt[E].BitmapIdx, + NumConds, + }; + + // Create MCDC Decision Region. + createDecisionRegion(E, DecisionParams); + } + void VisitBinLAnd(const BinaryOperator *E) { bool IsRootNode = MCDCBuilder.isIdle(); @@ -1982,11 +1992,6 @@ struct CounterCoverageMappingBuilder // Track RHS True/False Decision. const auto DecisionRHS = MCDCBuilder.back(); - // Create MCDC Decision Region if at top-level (root). - unsigned NumConds = 0; - if (IsRootNode && (NumConds = MCDCBuilder.getTotalConditionsAndReset(E))) - createDecisionRegion(E, getBitmapIdx(E), NumConds); - // Extract the RHS's Execution Counter. Counter RHSExecCnt = getRegionCounter(E); @@ -2003,6 +2008,10 @@ struct CounterCoverageMappingBuilder // Create Branch Region around RHS condition. createBranchRegion(E->getRHS(), RHSTrueCnt, subtractCounters(RHSExecCnt, RHSTrueCnt), DecisionRHS); + + // Create MCDC Decision Region if at top-level (root). 
+ if (IsRootNode) + createDecision(E); } // Determine whether the right side of OR operation need to be visited. @@ -2035,11 +2044,6 @@ struct CounterCoverageMappingBuilder // Track RHS True/False Decision. const auto DecisionRHS = MCDCBuilder.back(); - // Create MCDC Decision Region if at top-level (root). - unsigned NumConds = 0; - if (IsRootNode && (NumConds = MCDCBuilder.getTotalConditionsAndReset(E))) - createDecisionRegion(E, getBitmapIdx(E), NumConds); - // Extract the RHS's Execution Counter. Counter RHSExecCnt = getRegionCounter(E); @@ -2060,6 +2064,10 @@ struct CounterCoverageMappingBuilder // Create Branch Region around RHS condition. createBranchRegion(E->getRHS(), subtractCounters(RHSExecCnt, RHSFalseCnt), RHSFalseCnt, DecisionRHS); + + // Create MCDC Decision Region if at top-level (root). + if (IsRootNode) + createDecision(E); } void VisitLambdaExpr(const LambdaExpr *LE) { From 892b4beeac50920e630f10905b2916295e2eb6d8 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Mon, 26 Feb 2024 15:55:56 +0800 Subject: [PATCH 280/546] [GVN] Drop nsw/nuw flags when replacing the result of a with.overflow intrinsic with a overflowing binary operator (#82935) Alive2: https://alive2.llvm.org/ce/z/gyL7mn Fixes https://github.com/llvm/llvm-project/issues/82884. --- llvm/lib/Transforms/Utils/Local.cpp | 8 +++++++- llvm/test/Transforms/GVN/pr82884.ll | 21 +++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/GVN/pr82884.ll diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 1373f5f7f4490..075eeb5b19fd2 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3369,11 +3369,17 @@ void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) { // Patch the replacement so that it is not more restrictive than the value // being replaced. 
+ WithOverflowInst *UnusedWO; + // When replacing the result of a llvm.*.with.overflow intrinsic with a + // overflowing binary operator, nuw/nsw flags may no longer hold. + if (isa(ReplInst) && + match(I, m_ExtractValue<0>(m_WithOverflowInst(UnusedWO)))) + ReplInst->dropPoisonGeneratingFlags(); // Note that if 'I' is a load being replaced by some operation, // for example, by an arithmetic operation, then andIRFlags() // would just erase all math flags from the original arithmetic // operation, which is clearly not wanted and not needed. - if (!isa(I)) + else if (!isa(I)) ReplInst->andIRFlags(I); // FIXME: If both the original and replacement value are part of the diff --git a/llvm/test/Transforms/GVN/pr82884.ll b/llvm/test/Transforms/GVN/pr82884.ll new file mode 100644 index 0000000000000..71abafda60d93 --- /dev/null +++ b/llvm/test/Transforms/GVN/pr82884.ll @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=gvn < %s | FileCheck %s + +; Make sure nsw/nuw flags are dropped. + +define i32 @pr82884(i32 %x) { +; CHECK-LABEL: define i32 @pr82884( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[X]], [[X]] +; CHECK-NEXT: call void @use(i32 [[MUL]]) +; CHECK-NEXT: [[MUL2:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[X]], i32 [[X]]) +; CHECK-NEXT: ret i32 [[MUL]] +; + %mul = mul nsw nuw i32 %x, %x + call void @use(i32 %mul) + %mul2 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %x, i32 %x) + %ret = extractvalue { i32, i1 } %mul2, 0 + ret i32 %ret +} + +declare void @use(i32) From 15426017bda54fb8d9a62cb887edae754e8b7733 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 26 Feb 2024 08:24:19 +0100 Subject: [PATCH 281/546] [clang][Interp] Handle non-complex operands in complex bin ops Either LHS or RHS might be non-complex, but not both. 
--- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 103 +++++++++++++++-------- clang/test/AST/Interp/complex.cpp | 25 ++++++ 2 files changed, 93 insertions(+), 35 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index eb5a1b536b779..49ba8e95f1799 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -660,19 +660,16 @@ bool ByteCodeExprGen::VisitComplexBinOp(const BinaryOperator *E) { return false; } + // Both LHS and RHS might _not_ be of complex type, but one of them + // needs to be. const Expr *LHS = E->getLHS(); const Expr *RHS = E->getRHS(); - PrimType LHSElemT = this->classifyComplexElementType(LHS->getType()); - PrimType RHSElemT = this->classifyComplexElementType(RHS->getType()); - unsigned LHSOffset = this->allocateLocalPrimitive(LHS, PT_Ptr, true, false); - unsigned RHSOffset = this->allocateLocalPrimitive(RHS, PT_Ptr, true, false); + PrimType ResultElemT = this->classifyComplexElementType(E->getType()); unsigned ResultOffset = ~0u; - if (!this->DiscardResult) + if (!DiscardResult) ResultOffset = this->allocateLocalPrimitive(E, PT_Ptr, true, false); - assert(LHSElemT == RHSElemT); - // Save result pointer in ResultOffset if (!this->DiscardResult) { if (!this->emitDupPtr(E)) @@ -682,16 +679,64 @@ bool ByteCodeExprGen::VisitComplexBinOp(const BinaryOperator *E) { } // Evaluate LHS and save value to LHSOffset. 
- if (!this->visit(LHS)) - return false; - if (!this->emitSetLocal(PT_Ptr, LHSOffset, E)) - return false; + bool LHSIsComplex; + unsigned LHSOffset; + if (LHS->getType()->isAnyComplexType()) { + LHSIsComplex = true; + LHSOffset = this->allocateLocalPrimitive(LHS, PT_Ptr, true, false); + if (!this->visit(LHS)) + return false; + if (!this->emitSetLocal(PT_Ptr, LHSOffset, E)) + return false; + } else { + LHSIsComplex = false; + PrimType LHST = classifyPrim(LHS->getType()); + LHSOffset = this->allocateLocalPrimitive(LHS, LHST, true, false); + if (!this->visit(LHS)) + return false; + if (!this->emitSetLocal(LHST, LHSOffset, E)) + return false; + } // Same with RHS. - if (!this->visit(RHS)) - return false; - if (!this->emitSetLocal(PT_Ptr, RHSOffset, E)) - return false; + bool RHSIsComplex; + unsigned RHSOffset; + if (RHS->getType()->isAnyComplexType()) { + RHSIsComplex = true; + RHSOffset = this->allocateLocalPrimitive(RHS, PT_Ptr, true, false); + if (!this->visit(RHS)) + return false; + if (!this->emitSetLocal(PT_Ptr, RHSOffset, E)) + return false; + } else { + RHSIsComplex = false; + PrimType RHST = classifyPrim(RHS->getType()); + RHSOffset = this->allocateLocalPrimitive(RHS, RHST, true, false); + if (!this->visit(RHS)) + return false; + if (!this->emitSetLocal(RHST, RHSOffset, E)) + return false; + } + + // For both LHS and RHS, either load the value from the complex pointer, or + // directly from the local variable. For index 1 (i.e. the imaginary part), + // just load 0 and do the operation anyway. 
+ auto loadComplexValue = [this](bool IsComplex, unsigned ElemIndex, + unsigned Offset, const Expr *E) -> bool { + if (IsComplex) { + if (!this->emitGetLocal(PT_Ptr, Offset, E)) + return false; + if (!this->emitConstUint8(ElemIndex, E)) + return false; + if (!this->emitArrayElemPtrPopUint8(E)) + return false; + return this->emitLoadPop(classifyComplexElementType(E->getType()), E); + } + if (ElemIndex == 0) + return this->emitGetLocal(classifyPrim(E->getType()), Offset, E); + return this->visitZeroInitializer(classifyPrim(E->getType()), E->getType(), + E); + }; // Now we can get pointers to the LHS and RHS from the offsets above. BinaryOperatorKind Op = E->getOpcode(); @@ -702,41 +747,29 @@ bool ByteCodeExprGen::VisitComplexBinOp(const BinaryOperator *E) { return false; } - if (!this->emitGetLocal(PT_Ptr, LHSOffset, E)) - return false; - if (!this->emitConstUint8(ElemIndex, E)) - return false; - if (!this->emitArrayElemPtrPopUint8(E)) - return false; - if (!this->emitLoadPop(LHSElemT, E)) + if (!loadComplexValue(LHSIsComplex, ElemIndex, LHSOffset, LHS)) return false; - if (!this->emitGetLocal(PT_Ptr, RHSOffset, E)) - return false; - if (!this->emitConstUint8(ElemIndex, E)) - return false; - if (!this->emitArrayElemPtrPopUint8(E)) - return false; - if (!this->emitLoadPop(RHSElemT, E)) + if (!loadComplexValue(RHSIsComplex, ElemIndex, RHSOffset, RHS)) return false; // The actual operation. 
switch (Op) { case BO_Add: - if (LHSElemT == PT_Float) { + if (ResultElemT == PT_Float) { if (!this->emitAddf(getRoundingMode(E), E)) return false; } else { - if (!this->emitAdd(LHSElemT, E)) + if (!this->emitAdd(ResultElemT, E)) return false; } break; case BO_Sub: - if (LHSElemT == PT_Float) { + if (ResultElemT == PT_Float) { if (!this->emitSubf(getRoundingMode(E), E)) return false; } else { - if (!this->emitSub(LHSElemT, E)) + if (!this->emitSub(ResultElemT, E)) return false; } break; @@ -747,10 +780,10 @@ bool ByteCodeExprGen::VisitComplexBinOp(const BinaryOperator *E) { if (!this->DiscardResult) { // Initialize array element with the value we just computed. - if (!this->emitInitElemPop(LHSElemT, ElemIndex, E)) + if (!this->emitInitElemPop(ResultElemT, ElemIndex, E)) return false; } else { - if (!this->emitPop(LHSElemT, E)) + if (!this->emitPop(ResultElemT, E)) return false; } } diff --git a/clang/test/AST/Interp/complex.cpp b/clang/test/AST/Interp/complex.cpp index 612a94b31171b..2b65ccf9946e7 100644 --- a/clang/test/AST/Interp/complex.cpp +++ b/clang/test/AST/Interp/complex.cpp @@ -164,6 +164,20 @@ namespace Add { constexpr _Complex unsigned int I3 = I1 + I2; static_assert(__real(I3) == 45, ""); static_assert(__imag(I3) == 12, ""); + + static_assert(__real(A + 2.0) == 15, ""); + static_assert(__imag(A + 2.0) == 2, ""); + static_assert(__real(2.0 + A) == 15, ""); + static_assert(__imag(2.0 + A) == 2, ""); + + static_assert(__real(D + 1) == 16, ""); + static_assert(__real(D + 1.0) == 16, ""); + constexpr _Complex double D2 = D + 3.0; + static_assert(__real(D2) == 18.0, ""); + static_assert(__imag(D2) == 3.0, ""); + constexpr _Complex double D3 = 3.0 + D; + static_assert(__real(D3) == 18.0, ""); + static_assert(__imag(D3) == 3.0, ""); } namespace Sub { @@ -172,6 +186,8 @@ namespace Sub { constexpr _Complex float C = A - B; static_assert(__real(C) == 11.0, ""); static_assert(__imag(C) == 1.0, ""); + static_assert(__real(A - 2.0) == 11, ""); + 
static_assert(__real(2.0 - A) == -11, ""); constexpr _Complex float D = B - A; static_assert(__real(D) == -11.0, ""); @@ -189,6 +205,15 @@ namespace Sub { constexpr _Complex float D_ = A_ - B_; static_assert(__real(D_) == 11.0, ""); static_assert(__imag(D_) == 1.0, ""); + + static_assert(__real(D - 1) == -12, ""); + static_assert(__real(D - 1.0) == -12, ""); + constexpr _Complex double D2 = D - 3.0; + static_assert(__real(D2) == -14.0, ""); + static_assert(__imag(D2) == -1.0, ""); + constexpr _Complex double D3 = 3.0 - D; + static_assert(__real(D3) == 14.0, ""); + static_assert(__imag(D3) == 1.0, ""); } } From 4216a300e324d58bb6f0efcc8b2490e6c6983ae8 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Mon, 26 Feb 2024 16:18:32 +0800 Subject: [PATCH 282/546] [RISCV][NFC] Use sub to construct RVV registers without V0 (#82962) This reduces some lines. --- llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 193b85e281860..381e0082c49b0 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -512,25 +512,21 @@ def VR : VReg; -def VRNoV0 : VReg; +def VRNoV0 : VReg; def VRM2 : VReg; -def VRM2NoV0 : VReg; +def VRM2NoV0 : VReg; -def VRM4 : VReg; +def VRM4 : VReg; -def VRM4NoV0 : VReg; +def VRM4NoV0 : VReg; def VRM8 : VReg; -def VRM8NoV0 : VReg; +def VRM8NoV0 : VReg; def VMV0 : RegisterClass<"RISCV", VMaskVTs, 64, (add V0)> { let Size = 64; From e510fc77539022c195cc83b5dceb1c0b493dafcb Mon Sep 17 00:00:00 2001 From: Yeting Kuo <46629943+yetingk@users.noreply.github.com> Date: Mon, 26 Feb 2024 16:37:41 +0800 Subject: [PATCH 283/546] [VP][RISCV] Introduce vp.lrint/llrint and RISC-V support. (#82627) RISC-V implements vector lrint/llrint by vfcvt.x.f.v. 
--- llvm/docs/LangRef.rst | 98 ++++++++ llvm/include/llvm/IR/Intrinsics.td | 8 + llvm/include/llvm/IR/VPIntrinsics.def | 12 + .../SelectionDAG/LegalizeVectorTypes.cpp | 14 +- llvm/lib/IR/IntrinsicInst.cpp | 2 + llvm/lib/IR/Verifier.cpp | 4 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 10 +- .../RISCV/rvv/fixed-vectors-llrint-vp.ll | 187 ++++++++++++++ .../RISCV/rvv/fixed-vectors-lrint-vp.ll | 233 ++++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll | 125 ++++++++++ llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll | 209 ++++++++++++++++ llvm/unittests/IR/VPIntrinsicTest.cpp | 4 + 12 files changed, 903 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 19ca9f6ae3fe3..570a058bde8da 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -16021,6 +16021,8 @@ functions would, but without setting errno. If the rounded value is too large to be stored in the result type, the return value is a non-deterministic value (equivalent to `freeze poison`). +.. _int_lrint: + '``llvm.lrint.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -16066,6 +16068,8 @@ would, but without setting errno. If the rounded value is too large to be stored in the result type, the return value is a non-deterministic value (equivalent to `freeze poison`). +.. _int_llrint: + '``llvm.llrint.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -23382,6 +23386,100 @@ Examples: %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a) %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> poison +.. _int_vp_lrint: + +'``llvm.vp.lrint.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. 
+ +:: + + declare <16 x i32> @llvm.vp.lrint.v16i32.v16f32(<16 x float> , <16 x i1> , i32 ) + declare @llvm.vp.lrint.nxv4i32.nxv4f32( , , i32 ) + declare <256 x i64> @llvm.vp.lrint.v256i64.v256f64(<256 x double> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated lrint of a vector of floating-point values. + + +Arguments: +"""""""""" + +The result is an integer vector and the first operand is a vector of :ref:`floating-point ` +type with the same number of elements as the result vector type. The second +operand is the vector mask and has the same number of elements as the result +vector type. The third operand is the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.lrint``' intrinsic performs lrint (:ref:`lrint `) of +the first vector operand on each enabled lane. The result on disabled lanes is a +:ref:`poison value `. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.lrint.v4i32.v4f32(<4 x float> %a, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = call <4 x i32> @llvm.lrint.v4f32(<4 x float> %a) + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison + +.. _int_vp_llrint: + +'``llvm.vp.llrint.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.llrint.v16i32.v16f32(<16 x float> , <16 x i1> , i32 ) + declare @llvm.vp.llrint.nxv4i32.nxv4f32( , , i32 ) + declare <256 x i64> @llvm.vp.llrint.v256i64.v256f64(<256 x double> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated llrint of a vector of floating-point values. + + +Arguments: +"""""""""" +The result is an integer vector and the first operand is a vector of :ref:`floating-point ` +type with the same number of elements as the result vector type. The second +operand is the vector mask and has the same number of elements as the result +vector type. 
The third operand is the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.llrint``' intrinsic performs lrint (:ref:`llrint `) of +the first vector operand on each enabled lane. The result on disabled lanes is a +:ref:`poison value `. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.llrint.v4i32.v4f32(<4 x float> %a, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = call <4 x i32> @llvm.llrint.v4f32(<4 x float> %a) + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison + + .. _int_vp_bitreverse: '``llvm.vp.bitreverse.*``' Intrinsics diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index d7c1ce153a6c8..0f13d25eb30eb 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2057,6 +2057,14 @@ let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in { [ LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i32_ty]>; + def int_vp_lrint : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_llrint : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], + [ llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; // Casts def int_vp_trunc : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def index 4089acf9ec3f0..1c2708a9e8543 100644 --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -461,6 +461,18 @@ VP_PROPERTY_FUNCTIONAL_INTRINSIC(nearbyint) VP_PROPERTY_FUNCTIONAL_SDOPC(FNEARBYINT) END_REGISTER_VP(vp_nearbyint, VP_FNEARBYINT) +// llvm.vp.lrint(x,mask,vlen) +BEGIN_REGISTER_VP(vp_lrint, 1, 2, VP_LRINT, 0) +VP_PROPERTY_FUNCTIONAL_INTRINSIC(lrint) +VP_PROPERTY_FUNCTIONAL_SDOPC(LRINT) +END_REGISTER_VP(vp_lrint, VP_LRINT) + +// 
llvm.vp.llrint(x,mask,vlen) +BEGIN_REGISTER_VP(vp_llrint, 1, 2, VP_LLRINT, 0) +VP_PROPERTY_FUNCTIONAL_INTRINSIC(llrint) +VP_PROPERTY_FUNCTIONAL_SDOPC(LLRINT) +END_REGISTER_VP(vp_llrint, VP_LLRINT) + ///// } Floating-Point Arithmetic ///// Type Casts { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 90cda2a1155b6..5fb9d8d07d151 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1102,7 +1102,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::FRINT: case ISD::VP_FRINT: case ISD::LRINT: + case ISD::VP_LRINT: case ISD::LLRINT: + case ISD::VP_LLRINT: case ISD::FROUND: case ISD::VP_FROUND: case ISD::FROUNDEVEN: @@ -4263,6 +4265,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::LRINT: case ISD::LLRINT: + case ISD::VP_LRINT: + case ISD::VP_LLRINT: Res = WidenVecRes_XRINT(N); break; @@ -4869,7 +4873,15 @@ SDValue DAGTypeLegalizer::WidenVecRes_XRINT(SDNode *N) { if (WidenNumElts != SrcVT.getVectorElementCount()) return DAG.UnrollVectorOp(N, WidenNumElts.getKnownMinValue()); - return DAG.getNode(N->getOpcode(), dl, WidenVT, Src); + if (N->getNumOperands() == 1) + return DAG.getNode(N->getOpcode(), dl, WidenVT, Src); + + assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); + assert(N->isVPOpcode() && "Expected VP opcode"); + + SDValue Mask = + GetWidenedMask(N->getOperand(1), WidenVT.getVectorElementCount()); + return DAG.getNode(N->getOpcode(), dl, WidenVT, Src, Mask, N->getOperand(2)); } SDValue DAGTypeLegalizer::WidenVecRes_Convert_StrictFP(SDNode *N) { diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 5050091836b7f..89403e1d7fcb4 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -666,6 +666,8 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID 
VPID, case Intrinsic::vp_fpext: case Intrinsic::vp_ptrtoint: case Intrinsic::vp_inttoptr: + case Intrinsic::vp_lrint: + case Intrinsic::vp_llrint: VPFunc = Intrinsic::getDeclaration(M, VPID, {ReturnType, Params[0]->getType()}); break; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index b04d39c700a8f..4f321bc516cc3 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6183,9 +6183,11 @@ void Verifier::visitVPIntrinsic(VPIntrinsic &VPI) { break; case Intrinsic::vp_fptoui: case Intrinsic::vp_fptosi: + case Intrinsic::vp_lrint: + case Intrinsic::vp_llrint: Check( RetTy->isIntOrIntVectorTy() && ValTy->isFPOrFPVectorTy(), - "llvm.vp.fptoui or llvm.vp.fptosi intrinsic first argument element " + "llvm.vp.fptoui, llvm.vp.fptosi, llvm.vp.lrint or llvm.vp.llrint" "intrinsic first argument element " "type must be floating-point and result element type must be integer", *VPCast); break; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 540c2e7476dc1..0c98642748d4e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -706,7 +706,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_FCEIL, ISD::VP_FFLOOR, ISD::VP_FROUND, ISD::VP_FROUNDEVEN, ISD::VP_FCOPYSIGN, ISD::VP_FROUNDTOZERO, ISD::VP_FRINT, ISD::VP_FNEARBYINT, ISD::VP_IS_FPCLASS, - ISD::VP_FMINIMUM, ISD::VP_FMAXIMUM, ISD::EXPERIMENTAL_VP_REVERSE, + ISD::VP_FMINIMUM, ISD::VP_FMAXIMUM, ISD::VP_LRINT, + ISD::VP_LLRINT, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE}; static const unsigned IntegerVecReduceOps[] = { @@ -5811,6 +5812,11 @@ static unsigned getRISCVVLOp(SDValue Op) { case ISD::FMAXNUM: case ISD::VP_FMAXNUM: return RISCVISD::VFMAX_VL; + case ISD::LRINT: + case ISD::VP_LRINT: + case ISD::LLRINT: + case ISD::VP_LLRINT: + return RISCVISD::VFCVT_X_F_VL; } // clang-format on #undef OP_CASE @@ -6801,6 +6807,8 @@ SDValue 
RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::VP_USUBSAT: case ISD::VP_SADDSAT: case ISD::VP_SSUBSAT: + case ISD::VP_LRINT: + case ISD::VP_LLRINT: return lowerVPOp(Op, DAG); case ISD::VP_AND: case ISD::VP_OR: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint-vp.ll new file mode 100644 index 0000000000000..8282b8884aed6 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint-vp.ll @@ -0,0 +1,187 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v,+f,+d -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v,+f,+d -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64 + +define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x, <1 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: llrint_v1i64_v1f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV32-NEXT: vfwcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: llrint_v1i64_v1f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV64-NEXT: vfwcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret + %a = call <1 x i64> @llvm.vp.llrint.v1i64.v1f32(<1 x float> %x, <1 x i1> %m, i32 %evl) + ret <1 x i64> %a +} +declare <1 x i64> @llvm.vp.llrint.v1i64.v1f32(<1 x float>, <1 x i1>, i32) + +define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: llrint_v2i64_v2f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV32-NEXT: vfwcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: llrint_v2i64_v2f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV64-NEXT: vfwcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret + %a = call <2 x i64> 
@llvm.vp.llrint.v2i64.v2f32(<2 x float> %x, <2 x i1> %m, i32 %evl) + ret <2 x i64> %a +} +declare <2 x i64> @llvm.vp.llrint.v2i64.v2f32(<2 x float>, <2 x i1>, i32) + +define <3 x i64> @llrint_v3i64_v3f32(<3 x float> %x, <3 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: llrint_v3i64_v3f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV32-NEXT: vfwcvt.x.f.v v10, v8, v0.t +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: llrint_v3i64_v3f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV64-NEXT: vfwcvt.x.f.v v10, v8, v0.t +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret + %a = call <3 x i64> @llvm.vp.llrint.v3i64.v3f32(<3 x float> %x, <3 x i1> %m, i32 %evl) + ret <3 x i64> %a +} +declare <3 x i64> @llvm.vp.llrint.v3i64.v3f32(<3 x float>, <3 x i1>, i32) + +define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: llrint_v4i64_v4f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV32-NEXT: vfwcvt.x.f.v v10, v8, v0.t +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: llrint_v4i64_v4f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV64-NEXT: vfwcvt.x.f.v v10, v8, v0.t +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret + %a = call <4 x i64> @llvm.vp.llrint.v4i64.v4f32(<4 x float> %x, <4 x i1> %m, i32 %evl) + ret <4 x i64> %a +} +declare <4 x i64> @llvm.vp.llrint.v4i64.v4f32(<4 x float>, <4 x i1>, i32) + +define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: llrint_v8i64_v8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV32-NEXT: vfwcvt.x.f.v v12, v8, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: llrint_v8i64_v8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV64-NEXT: vfwcvt.x.f.v v12, v8, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %a = call <8 x i64> 
@llvm.vp.llrint.v8i64.v8f32(<8 x float> %x, <8 x i1> %m, i32 %evl) + ret <8 x i64> %a +} +declare <8 x i64> @llvm.vp.llrint.v8i64.v8f32(<8 x float>, <8 x i1>, i32) + +define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x, <16 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: llrint_v16i64_v16f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV32-NEXT: vfwcvt.x.f.v v16, v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: llrint_v16i64_v16f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV64-NEXT: vfwcvt.x.f.v v16, v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %a = call <16 x i64> @llvm.vp.llrint.v16i64.v16f32(<16 x float> %x, <16 x i1> %m, i32 %evl) + ret <16 x i64> %a +} +declare <16 x i64> @llvm.vp.llrint.v16i64.v16f32(<16 x float>, <16 x i1>, i32) + +define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x, <1 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: llrint_v1i64_v1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: llrint_v1i64_v1f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-NEXT: ret + %a = call <1 x i64> @llvm.vp.llrint.v1i64.v1f64(<1 x double> %x, <1 x i1> %m, i32 %evl) + ret <1 x i64> %a +} +declare <1 x i64> @llvm.vp.llrint.v1i64.v1f64(<1 x double>, <1 x i1>, i32) + +define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: llrint_v2i64_v2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: llrint_v2i64_v2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-NEXT: ret + %a = call <2 x i64> @llvm.vp.llrint.v2i64.v2f64(<2 x double> %x, <2 x i1> %m, i32 %evl) + ret <2 x i64> %a +} +declare <2 x i64> 
@llvm.vp.llrint.v2i64.v2f64(<2 x double>, <2 x i1>, i32) + +define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: llrint_v4i64_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: llrint_v4i64_v4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-NEXT: ret + %a = call <4 x i64> @llvm.vp.llrint.v4i64.v4f64(<4 x double> %x, <4 x i1> %m, i32 %evl) + ret <4 x i64> %a +} +declare <4 x i64> @llvm.vp.llrint.v4i64.v4f64(<4 x double>, <4 x i1>, i32) + +define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: llrint_v8i64_v8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: llrint_v8i64_v8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-NEXT: ret + %a = call <8 x i64> @llvm.vp.llrint.v8i64.v8f64(<8 x double> %x, <8 x i1> %m, i32 %evl) + ret <8 x i64> %a +} +declare <8 x i64> @llvm.vp.llrint.v8i64.v8f64(<8 x double>, <8 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint-vp.ll new file mode 100644 index 0000000000000..08dd1c79f24c9 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint-vp.ll @@ -0,0 +1,233 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+f,+d \ +; RUN: -target-abi=ilp32d -verify-machineinstrs | FileCheck %s --check-prefix=RV32 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv64 -mattr=+v,+f,+d \ +; RUN: -target-abi=lp64d -verify-machineinstrs | FileCheck %s --check-prefix=RV64-i32 +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+f,+d \ +; RUN: 
-target-abi=lp64d -verify-machineinstrs | FileCheck %s --check-prefix=RV64-i64 + +define <1 x iXLen> @lrint_v1f32(<1 x float> %x, <1 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_v1f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV32-NEXT: ret +; +; RV64-i32-LABEL: lrint_v1f32: +; RV64-i32: # %bb.0: +; RV64-i32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV64-i32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_v1f32: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV64-i64-NEXT: vfwcvt.x.f.v v9, v8, v0.t +; RV64-i64-NEXT: vmv1r.v v8, v9 +; RV64-i64-NEXT: ret + %a = call <1 x iXLen> @llvm.vp.lrint.v1iXLen.v1f32(<1 x float> %x, <1 x i1> %m, i32 %evl) + ret <1 x iXLen> %a +} +declare <1 x iXLen> @llvm.vp.lrint.v1iXLen.v1f32(<1 x float>, <1 x i1>, i32) + +define <2 x iXLen> @lrint_v2f32(<2 x float> %x, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_v2f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV32-NEXT: ret +; +; RV64-i32-LABEL: lrint_v2f32: +; RV64-i32: # %bb.0: +; RV64-i32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV64-i32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_v2f32: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV64-i64-NEXT: vfwcvt.x.f.v v9, v8, v0.t +; RV64-i64-NEXT: vmv1r.v v8, v9 +; RV64-i64-NEXT: ret + %a = call <2 x iXLen> @llvm.vp.lrint.v2iXLen.v2f32(<2 x float> %x, <2 x i1> %m, i32 %evl) + ret <2 x iXLen> %a +} +declare <2 x iXLen> @llvm.vp.lrint.v2iXLen.v2f32(<2 x float>, <2 x i1>, i32) + +define <3 x iXLen> @lrint_v3f32(<3 x float> %x, <3 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_v3f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV32-NEXT: ret +; +; RV64-i32-LABEL: lrint_v3f32: +; RV64-i32: # %bb.0: 
+; RV64-i32-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV64-i32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_v3f32: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV64-i64-NEXT: vfwcvt.x.f.v v10, v8, v0.t +; RV64-i64-NEXT: vmv2r.v v8, v10 +; RV64-i64-NEXT: ret + %a = call <3 x iXLen> @llvm.vp.lrint.v3iXLen.v3f32(<3 x float> %x, <3 x i1> %m, i32 %evl) + ret <3 x iXLen> %a +} +declare <3 x iXLen> @llvm.vp.lrint.v3iXLen.v3f32(<3 x float>, <3 x i1>, i32) + +define <4 x iXLen> @lrint_v4f32(<4 x float> %x, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_v4f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV32-NEXT: ret +; +; RV64-i32-LABEL: lrint_v4f32: +; RV64-i32: # %bb.0: +; RV64-i32-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV64-i32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_v4f32: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV64-i64-NEXT: vfwcvt.x.f.v v10, v8, v0.t +; RV64-i64-NEXT: vmv2r.v v8, v10 +; RV64-i64-NEXT: ret + %a = call <4 x iXLen> @llvm.vp.lrint.v4iXLen.v4f32(<4 x float> %x, <4 x i1> %m, i32 %evl) + ret <4 x iXLen> %a +} +declare <4 x iXLen> @llvm.vp.lrint.v4iXLen.v4f32(<4 x float>, <4 x i1>, i32) + +define <8 x iXLen> @lrint_v8f32(<8 x float> %x, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_v8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV32-NEXT: ret +; +; RV64-i32-LABEL: lrint_v8f32: +; RV64-i32: # %bb.0: +; RV64-i32-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV64-i32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_v8f32: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV64-i64-NEXT: vfwcvt.x.f.v v12, v8, v0.t +; RV64-i64-NEXT: vmv4r.v v8, v12 +; RV64-i64-NEXT: ret + %a = call <8 x iXLen> 
@llvm.vp.lrint.v8iXLen.v8f32(<8 x float> %x, <8 x i1> %m, i32 %evl) + ret <8 x iXLen> %a +} +declare <8 x iXLen> @llvm.vp.lrint.v8iXLen.v8f32(<8 x float>, <8 x i1>, i32) + +define <16 x iXLen> @lrint_v16iXLen_v16f32(<16 x float> %x, <16 x i1> %m, i32 zeroext %evl) { + %a = call <16 x iXLen> @llvm.vp.lrint.v16iXLen.v16f32(<16 x float> %x, <16 x i1> %m, i32 %evl) + ret <16 x iXLen> %a +} +declare <16 x iXLen> @llvm.vp.lrint.v16iXLen.v16f32(<16 x float>, <16 x i1>, i32) + +define <1 x iXLen> @lrint_v1f64(<1 x double> %x, <1 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_v1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV32-NEXT: vfncvt.x.f.w v9, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-i32-LABEL: lrint_v1f64: +; RV64-i32: # %bb.0: +; RV64-i32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV64-i32-NEXT: vfncvt.x.f.w v9, v8, v0.t +; RV64-i32-NEXT: vmv1r.v v8, v9 +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_v1f64: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-i64-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i64-NEXT: ret + %a = call <1 x iXLen> @llvm.vp.lrint.v1iXLen.v1f64(<1 x double> %x, <1 x i1> %m, i32 %evl) + ret <1 x iXLen> %a +} +declare <1 x iXLen> @llvm.vp.lrint.v1iXLen.v1f64(<1 x double>, <1 x i1>, i32) + +define <2 x iXLen> @lrint_v2f64(<2 x double> %x, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_v2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV32-NEXT: vfncvt.x.f.w v9, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-i32-LABEL: lrint_v2f64: +; RV64-i32: # %bb.0: +; RV64-i32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV64-i32-NEXT: vfncvt.x.f.w v9, v8, v0.t +; RV64-i32-NEXT: vmv1r.v v8, v9 +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_v2f64: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-i64-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i64-NEXT: ret + %a = call <2 x iXLen> 
@llvm.vp.lrint.v2iXLen.v2f64(<2 x double> %x, <2 x i1> %m, i32 %evl) + ret <2 x iXLen> %a +} +declare <2 x iXLen> @llvm.vp.lrint.v2iXLen.v2f64(<2 x double>, <2 x i1>, i32) + +define <4 x iXLen> @lrint_v4f64(<4 x double> %x, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV32-NEXT: vfncvt.x.f.w v10, v8, v0.t +; RV32-NEXT: vmv.v.v v8, v10 +; RV32-NEXT: ret +; +; RV64-i32-LABEL: lrint_v4f64: +; RV64-i32: # %bb.0: +; RV64-i32-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV64-i32-NEXT: vfncvt.x.f.w v10, v8, v0.t +; RV64-i32-NEXT: vmv.v.v v8, v10 +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_v4f64: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64-i64-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i64-NEXT: ret + %a = call <4 x iXLen> @llvm.vp.lrint.v4iXLen.v4f64(<4 x double> %x, <4 x i1> %m, i32 %evl) + ret <4 x iXLen> %a +} +declare <4 x iXLen> @llvm.vp.lrint.v4iXLen.v4f64(<4 x double>, <4 x i1>, i32) + +define <8 x iXLen> @lrint_v8f64(<8 x double> %x, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_v8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV32-NEXT: vfncvt.x.f.w v12, v8, v0.t +; RV32-NEXT: vmv.v.v v8, v12 +; RV32-NEXT: ret +; +; RV64-i32-LABEL: lrint_v8f64: +; RV64-i32: # %bb.0: +; RV64-i32-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV64-i32-NEXT: vfncvt.x.f.w v12, v8, v0.t +; RV64-i32-NEXT: vmv.v.v v8, v12 +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_v8f64: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64-i64-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i64-NEXT: ret + %a = call <8 x iXLen> @llvm.vp.lrint.v8iXLen.v8f64(<8 x double> %x, <8 x i1> %m, i32 %evl) + ret <8 x iXLen> %a +} +declare <8 x iXLen> @llvm.vp.lrint.v8iXLen.v8f64(<8 x double>, <8 x i1>, i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll new file mode 100644 index 
0000000000000..6d8763d34ec1b --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll @@ -0,0 +1,125 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v,+f,+d -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+v,+f,+d -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s + +define @llrint_nxv1i64_nxv1f32( %x, %m, i32 zeroext %evl) { +; CHECK-LABEL: llrint_nxv1i64_nxv1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.x.f.v v9, v8, v0.t +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret + %a = call @llvm.vp.llrint.nxv1i64.nxv1f32( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.llrint.nxv1i64.nxv1f32(, , i32) + +define @llrint_nxv2i64_nxv2f32( %x, %m, i32 zeroext %evl) { +; CHECK-LABEL: llrint_nxv2i64_nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfwcvt.x.f.v v10, v8, v0.t +; CHECK-NEXT: vmv2r.v v8, v10 +; CHECK-NEXT: ret + %a = call @llvm.vp.llrint.nxv2i64.nxv2f32( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.llrint.nxv2i64.nxv2f32(, , i32) + +define @llrint_nxv4i64_nxv4f32( %x, %m, i32 zeroext %evl) { +; CHECK-LABEL: llrint_nxv4i64_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfwcvt.x.f.v v12, v8, v0.t +; CHECK-NEXT: vmv4r.v v8, v12 +; CHECK-NEXT: ret + %a = call @llvm.vp.llrint.nxv4i64.nxv4f32( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.llrint.nxv4i64.nxv4f32(, , i32) + +define @llrint_nxv8i64_nxv8f32( %x, %m, i32 zeroext %evl) { +; CHECK-LABEL: llrint_nxv8i64_nxv8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfwcvt.x.f.v v16, v8, v0.t +; CHECK-NEXT: vmv8r.v v8, v16 +; CHECK-NEXT: ret + %a = call @llvm.vp.llrint.nxv8i64.nxv8f32( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.llrint.nxv8i64.nxv8f32(, , i32) + +define 
@llrint_nxv16i64_nxv16f32( %x, %m, i32 zeroext %evl) { +; CHECK-LABEL: llrint_nxv16i64_nxv16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 3 +; CHECK-NEXT: vsetvli a3, zero, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; CHECK-NEXT: vfwcvt.x.f.v v16, v12, v0.t +; CHECK-NEXT: bltu a0, a1, .LBB4_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vfwcvt.x.f.v v24, v8, v0.t +; CHECK-NEXT: vmv8r.v v8, v24 +; CHECK-NEXT: ret + %a = call @llvm.vp.llrint.nxv16i64.nxv16f32( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.llrint.nxv16i64.nxv16f32(, , i32) + +define @llrint_nxv1i64_nxv1f64( %x, %m, i32 zeroext %evl) { +; CHECK-LABEL: llrint_nxv1i64_nxv1f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v8, v0.t +; CHECK-NEXT: ret + %a = call @llvm.vp.llrint.nxv1i64.nxv1f64( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.llrint.nxv1i64.nxv1f64(, , i32) + +define @llrint_nxv2i64_nxv2f64( %x, %m, i32 zeroext %evl) { +; CHECK-LABEL: llrint_nxv2i64_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v8, v0.t +; CHECK-NEXT: ret + %a = call @llvm.vp.llrint.nxv2i64.nxv2f64( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.llrint.nxv2i64.nxv2f64(, , i32) + +define @llrint_nxv4i64_nxv4f64( %x, %m, i32 zeroext %evl) { +; CHECK-LABEL: llrint_nxv4i64_nxv4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v8, v0.t +; CHECK-NEXT: ret + %a = call @llvm.vp.llrint.nxv4i64.nxv4f64( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.llrint.nxv4i64.nxv4f64(, , i32) + +define 
@llrint_nxv8i64_nxv8f64( %x, %m, i32 zeroext %evl) { +; CHECK-LABEL: llrint_nxv8i64_nxv8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vfcvt.x.f.v v8, v8, v0.t +; CHECK-NEXT: ret + %a = call @llvm.vp.llrint.nxv8i64.nxv8f64( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.llrint.nxv8i64.nxv8f64(, , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll new file mode 100644 index 0000000000000..8a826fb3ac1ea --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll @@ -0,0 +1,209 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+f,+d \ +; RUN: -target-abi=ilp32d -verify-machineinstrs | FileCheck %s --check-prefix=RV32 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv64 -mattr=+v,+f,+d \ +; RUN: -target-abi=lp64d -verify-machineinstrs | FileCheck %s --check-prefix=RV64-i32 +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+f,+d \ +; RUN: -target-abi=lp64d -verify-machineinstrs | FileCheck %s --check-prefix=RV64-i64 + +define @lrint_nxv1f32( %x, %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_nxv1f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV32-NEXT: ret +; +; RV64-i32-LABEL: lrint_nxv1f32: +; RV64-i32: # %bb.0: +; RV64-i32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV64-i32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_nxv1f32: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV64-i64-NEXT: vfwcvt.x.f.v v9, v8, v0.t +; RV64-i64-NEXT: vmv1r.v v8, v9 +; RV64-i64-NEXT: ret + %a = call @llvm.vp.lrint.nxv1iXLen.nxv1f32( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.lrint.nxv1iXLen.nxv1f32(, , i32) + +define @lrint_nxv2f32( %x, %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_nxv2f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m1, ta, ma 
+; RV32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV32-NEXT: ret +; +; RV64-i32-LABEL: lrint_nxv2f32: +; RV64-i32: # %bb.0: +; RV64-i32-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV64-i32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_nxv2f32: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV64-i64-NEXT: vfwcvt.x.f.v v10, v8, v0.t +; RV64-i64-NEXT: vmv2r.v v8, v10 +; RV64-i64-NEXT: ret + %a = call @llvm.vp.lrint.nxv2iXLen.nxv2f32( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.lrint.nxv2iXLen.nxv2f32(, , i32) + +define @lrint_nxv4f32( %x, %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_nxv4f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV32-NEXT: ret +; +; RV64-i32-LABEL: lrint_nxv4f32: +; RV64-i32: # %bb.0: +; RV64-i32-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV64-i32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_nxv4f32: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV64-i64-NEXT: vfwcvt.x.f.v v12, v8, v0.t +; RV64-i64-NEXT: vmv4r.v v8, v12 +; RV64-i64-NEXT: ret + %a = call @llvm.vp.lrint.nxv4iXLen.nxv4f32( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.lrint.nxv4iXLen.nxv4f32(, , i32) + +define @lrint_nxv8f32( %x, %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV32-NEXT: ret +; +; RV64-i32-LABEL: lrint_nxv8f32: +; RV64-i32: # %bb.0: +; RV64-i32-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV64-i32-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_nxv8f32: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV64-i64-NEXT: vfwcvt.x.f.v v16, v8, v0.t +; RV64-i64-NEXT: vmv8r.v v8, v16 +; RV64-i64-NEXT: ret + %a = call @llvm.vp.lrint.nxv8iXLen.nxv8f32( %x, %m, i32 %evl) + ret %a +} +declare 
@llvm.vp.lrint.nxv8iXLen.nxv8f32(, , i32) + +define @lrint_nxv16iXLen_nxv16f32( %x, %m, i32 zeroext %evl) { + %a = call @llvm.vp.lrint.nxv16iXLen.nxv16f32( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.lrint.nxv16iXLen.nxv16f32(, , i32) + +define @lrint_nxv1f64( %x, %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_nxv1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV32-NEXT: vfncvt.x.f.w v9, v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-i32-LABEL: lrint_nxv1f64: +; RV64-i32: # %bb.0: +; RV64-i32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV64-i32-NEXT: vfncvt.x.f.w v9, v8, v0.t +; RV64-i32-NEXT: vmv1r.v v8, v9 +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_nxv1f64: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-i64-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i64-NEXT: ret + %a = call @llvm.vp.lrint.nxv1iXLen.nxv1f64( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.lrint.nxv1iXLen.nxv1f64(, , i32) + +define @lrint_nxv2f64( %x, %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_nxv2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV32-NEXT: vfncvt.x.f.w v10, v8, v0.t +; RV32-NEXT: vmv.v.v v8, v10 +; RV32-NEXT: ret +; +; RV64-i32-LABEL: lrint_nxv2f64: +; RV64-i32: # %bb.0: +; RV64-i32-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV64-i32-NEXT: vfncvt.x.f.w v10, v8, v0.t +; RV64-i32-NEXT: vmv.v.v v8, v10 +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_nxv2f64: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64-i64-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i64-NEXT: ret + %a = call @llvm.vp.lrint.nxv2iXLen.nxv2f64( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.lrint.nxv2iXLen.nxv2f64(, , i32) + +define @lrint_nxv4f64( %x, %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_nxv4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV32-NEXT: vfncvt.x.f.w v12, v8, v0.t +; RV32-NEXT: vmv.v.v v8, v12 +; RV32-NEXT: ret +; +; 
RV64-i32-LABEL: lrint_nxv4f64: +; RV64-i32: # %bb.0: +; RV64-i32-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV64-i32-NEXT: vfncvt.x.f.w v12, v8, v0.t +; RV64-i32-NEXT: vmv.v.v v8, v12 +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_nxv4f64: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64-i64-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i64-NEXT: ret + %a = call @llvm.vp.lrint.nxv4iXLen.nxv4f64( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.lrint.nxv4iXLen.nxv4f64(, , i32) + +define @lrint_nxv8f64( %x, %m, i32 zeroext %evl) { +; RV32-LABEL: lrint_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV32-NEXT: vfncvt.x.f.w v16, v8, v0.t +; RV32-NEXT: vmv.v.v v8, v16 +; RV32-NEXT: ret +; +; RV64-i32-LABEL: lrint_nxv8f64: +; RV64-i32: # %bb.0: +; RV64-i32-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV64-i32-NEXT: vfncvt.x.f.w v16, v8, v0.t +; RV64-i32-NEXT: vmv.v.v v8, v16 +; RV64-i32-NEXT: ret +; +; RV64-i64-LABEL: lrint_nxv8f64: +; RV64-i64: # %bb.0: +; RV64-i64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-i64-NEXT: vfcvt.x.f.v v8, v8, v0.t +; RV64-i64-NEXT: ret + %a = call @llvm.vp.lrint.nxv8iXLen.nxv8f64( %x, %m, i32 %evl) + ret %a +} +declare @llvm.vp.lrint.nxv8iXLen.nxv8f64(, , i32) diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp index e3462f0f33f11..fd010ef2208c4 100644 --- a/llvm/unittests/IR/VPIntrinsicTest.cpp +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -72,6 +72,10 @@ class VPIntrinsicTest : public testing::Test { "i32)"; Str << " declare <8 x float> @llvm.vp.ceil.v8f32(<8 x float>, <8 x i1>, " "i32)"; + Str << " declare <8 x i32> @llvm.vp.lrint.v8i32.v8f32(<8 x float>, " + "<8 x i1>, i32)"; + Str << " declare <8 x i64> @llvm.vp.llrint.v8i64.v8f32(<8 x float>, " + "<8 x i1>, i32)"; Str << " declare <8 x float> @llvm.vp.fneg.v8f32(<8 x float>, <8 x i1>, " "i32)"; Str << " declare <8 x float> @llvm.vp.fabs.v8f32(<8 x float>, <8 x i1>, " From 
56b63e0886ba369a53df5e1d429cde2e4a2d4a34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 26 Feb 2024 09:07:53 +0100 Subject: [PATCH 284/546] [clang][Interp] Get <=> value info from weak result This is also what the current interpreter does and a couple of test cases expect that. --- clang/lib/AST/Interp/Interp.h | 3 ++- clang/test/SemaCXX/compare-modules-cxx2a.cpp | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 7994550cc7b97..aecad1598ad2b 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -808,7 +808,8 @@ bool CMP3(InterpState &S, CodePtr OpPC, const ComparisonCategoryInfo *CmpInfo) { } assert(CmpInfo); - const auto *CmpValueInfo = CmpInfo->getValueInfo(CmpResult); + const auto *CmpValueInfo = + CmpInfo->getValueInfo(CmpInfo->makeWeakResult(CmpResult)); assert(CmpValueInfo); assert(CmpValueInfo->hasValidIntValue()); const APSInt &IntValue = CmpValueInfo->getIntValue(); diff --git a/clang/test/SemaCXX/compare-modules-cxx2a.cpp b/clang/test/SemaCXX/compare-modules-cxx2a.cpp index 470464427efd9..19093b89f60cf 100644 --- a/clang/test/SemaCXX/compare-modules-cxx2a.cpp +++ b/clang/test/SemaCXX/compare-modules-cxx2a.cpp @@ -2,6 +2,7 @@ // RUN: mkdir -p %t // RUN: %clang_cc1 -triple x86_64-apple-darwin -fcxx-exceptions -verify -std=c++2a -fmodules -fmodules-cache-path=%t.mcp -I%S/Inputs %s -fno-modules-error-recovery -fmodule-map-file=%S/Inputs/compare.modulemap +// RUN: %clang_cc1 -triple x86_64-apple-darwin -fcxx-exceptions -verify -std=c++2a -fmodules -fmodules-cache-path=%t.mcp -I%S/Inputs %s -fno-modules-error-recovery -fmodule-map-file=%S/Inputs/compare.modulemap -fexperimental-new-constant-interpreter struct CC { CC(...); }; From 94ca854d3c874322b1d4b5606c5762adcd3b8e05 Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Mon, 26 Feb 2024 09:55:07 +0100 Subject: [PATCH 285/546] [clang-tidy] Add support for determining 
constness of more expressions. (#82617) This uses a more systematic approach for determining whcich `DeclRefExpr`s mutate the underlying object: Instead of using a few matchers, we walk up the AST until we find a parent that we can prove cannot change the underlying object. This allows us to handle most address taking and dereference, bindings to value and const& variables, and track constness of pointee (see changes in DeclRefExprUtilsTest.cpp). This allows supporting more patterns in `performance-unnecessary-copy-initialization`. Those two patterns are relatively common: ``` const auto e = (*vector_ptr)[i] ``` and ``` const auto e = vector_ptr->at(i); ``` In our codebase, we have around 25% additional findings from `performance-unnecessary-copy-initialization` with this change. I did not see any additional false positives. --- .../UnnecessaryCopyInitialization.cpp | 27 +- .../clang-tidy/utils/DeclRefExprUtils.cpp | 210 +++++++++++--- .../clang-tidy/utils/DeclRefExprUtils.h | 33 ++- clang-tools-extra/docs/ReleaseNotes.rst | 6 + .../unnecessary-copy-initialization.cpp | 27 +- .../clang-tidy/DeclRefExprUtilsTest.cpp | 274 +++++++++++------- 6 files changed, 394 insertions(+), 183 deletions(-) diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp index dfe12c5b6007d..9beb185cba929 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryCopyInitialization.cpp @@ -86,16 +86,22 @@ AST_MATCHER_FUNCTION_P(StatementMatcher, isConstRefReturningMethodCall, const auto MethodDecl = cxxMethodDecl(returns(hasCanonicalType(matchers::isReferenceToConst()))) .bind(MethodDeclId); - const auto ReceiverExpr = declRefExpr(to(varDecl().bind(ObjectArgId))); + const auto ReceiverExpr = + ignoringParenImpCasts(declRefExpr(to(varDecl().bind(ObjectArgId)))); + const auto OnExpr = anyOf( + // 
Direct reference to `*this`: `a.f()` or `a->f()`. + ReceiverExpr, + // Access through dereference, typically used for `operator[]`: `(*a)[3]`. + unaryOperator(hasOperatorName("*"), hasUnaryOperand(ReceiverExpr))); const auto ReceiverType = hasCanonicalType(recordType(hasDeclaration(namedDecl( unless(matchers::matchesAnyListedName(ExcludedContainerTypes)))))); - return expr(anyOf( - cxxMemberCallExpr(callee(MethodDecl), on(ReceiverExpr), - thisPointerType(ReceiverType)), - cxxOperatorCallExpr(callee(MethodDecl), hasArgument(0, ReceiverExpr), - hasArgument(0, hasType(ReceiverType))))); + return expr( + anyOf(cxxMemberCallExpr(callee(MethodDecl), on(OnExpr), + thisPointerType(ReceiverType)), + cxxOperatorCallExpr(callee(MethodDecl), hasArgument(0, OnExpr), + hasArgument(0, hasType(ReceiverType))))); } AST_MATCHER_FUNCTION(StatementMatcher, isConstRefReturningFunctionCall) { @@ -136,10 +142,11 @@ AST_MATCHER_FUNCTION_P(StatementMatcher, initializerReturnsReferenceToConst, static bool isInitializingVariableImmutable( const VarDecl &InitializingVar, const Stmt &BlockStmt, ASTContext &Context, const std::vector &ExcludedContainerTypes) { - if (!isOnlyUsedAsConst(InitializingVar, BlockStmt, Context)) + QualType T = InitializingVar.getType().getCanonicalType(); + if (!isOnlyUsedAsConst(InitializingVar, BlockStmt, Context, + T->isPointerType() ? 1 : 0)) return false; - QualType T = InitializingVar.getType().getCanonicalType(); // The variable is a value type and we know it is only used as const. Safe // to reference it and avoid the copy. if (!isa(T)) @@ -273,7 +280,9 @@ void UnnecessaryCopyInitialization::check( VarDeclStmt.isSingleDecl() && !NewVar.getLocation().isMacroID(); const bool IsVarUnused = isVariableUnused(NewVar, BlockStmt, *Result.Context); const bool IsVarOnlyUsedAsConst = - isOnlyUsedAsConst(NewVar, BlockStmt, *Result.Context); + isOnlyUsedAsConst(NewVar, BlockStmt, *Result.Context, + // `NewVar` is always of non-pointer type. 
+ 0); const CheckContext Context{ NewVar, BlockStmt, VarDeclStmt, *Result.Context, IssueFix, IsVarUnused, IsVarOnlyUsedAsConst}; diff --git a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp index 2d73179150e8b..f0ffa517047b2 100644 --- a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.cpp @@ -10,7 +10,9 @@ #include "Matchers.h" #include "clang/AST/ASTContext.h" #include "clang/AST/DeclCXX.h" +#include "clang/AST/ExprCXX.h" #include "clang/ASTMatchers/ASTMatchFinder.h" +#include namespace clang::tidy::utils::decl_ref_expr { @@ -34,69 +36,183 @@ void extractNodesByIdTo(ArrayRef Matches, StringRef ID, Nodes.insert(Match.getNodeAs(ID)); } +// A matcher that matches DeclRefExprs that are used in ways such that the +// underlying declaration is not modified. +// If the declaration is of pointer type, `Indirections` specifies the level +// of indirection of the object whose mutations we are tracking. +// +// For example, given: +// ``` +// int i; +// int* p; +// p = &i; // (A) +// *p = 3; // (B) +// ``` +// +// `declRefExpr(to(varDecl(hasName("p"))), doesNotMutateObject(0))` matches +// (B), but `declRefExpr(to(varDecl(hasName("p"))), doesNotMutateObject(1))` +// matches (A). +// +AST_MATCHER_P(DeclRefExpr, doesNotMutateObject, int, Indirections) { + // We walk up the parents of the DeclRefExpr recursively until we end up on a + // parent that cannot modify the underlying object. There are a few kinds of + // expressions: + // - Those that cannot be used to mutate the underlying object. We can stop + // recursion there. + // - Those that can be used to mutate the underlying object in analyzable + // ways (such as taking the address or accessing a subobject). We have to + // examine the parents. + // - Those that we don't know how to analyze. In that case we stop there and + // we assume that they can mutate the underlying expression. 
+ + struct StackEntry { + StackEntry(const Expr *E, int Indirections) + : E(E), Indirections(Indirections) {} + // The expression to analyze. + const Expr *E; + // The number of pointer indirections of the object being tracked (how + // many times an address was taken). + int Indirections; + }; + + llvm::SmallVector Stack; + Stack.emplace_back(&Node, Indirections); + ASTContext &Ctx = Finder->getASTContext(); + + while (!Stack.empty()) { + const StackEntry Entry = Stack.back(); + Stack.pop_back(); + + // If the expression type is const-qualified at the appropriate indirection + // level then we can not mutate the object. + QualType Ty = Entry.E->getType().getCanonicalType(); + for (int I = 0; I < Entry.Indirections; ++I) { + assert(Ty->isPointerType()); + Ty = Ty->getPointeeType().getCanonicalType(); + } + if (Ty.isConstQualified()) + continue; + + // Otherwise we have to look at the parents to see how the expression is + // used. + const DynTypedNodeList Parents = Ctx.getParents(*Entry.E); + // Note: most nodes have a single parents, but there exist nodes that have + // several parents, such as `InitListExpr` that have semantic and syntactic + // forms. + for (const auto &Parent : Parents) { + if (Parent.get()) { + // Unused block-scope statement. + continue; + } + const Expr *const P = Parent.get(); + if (P == nullptr) { + // `Parent` is not an expr (e.g. a `VarDecl`). + // The case of binding to a `const&` or `const*` variable is handled by + // the fact that there is going to be a `NoOp` cast to const below the + // `VarDecl`, so we're not even going to get there. + // The case of copying into a value-typed variable is handled by the + // rvalue cast. + // This triggers only when binding to a mutable reference/ptr variable. + // FIXME: When we take a mutable reference we could keep checking the + // new variable for const usage only. + return false; + } + // Cosmetic nodes. 
+ if (isa(P) || isa(P)) { + Stack.emplace_back(P, Entry.Indirections); + continue; + } + if (const auto *const Cast = dyn_cast(P)) { + switch (Cast->getCastKind()) { + // NoOp casts are used to add `const`. We'll check whether adding that + // const prevents modification when we process the cast. + case CK_NoOp: + // These do nothing w.r.t. to mutability. + case CK_BaseToDerived: + case CK_DerivedToBase: + case CK_UncheckedDerivedToBase: + case CK_Dynamic: + case CK_BaseToDerivedMemberPointer: + case CK_DerivedToBaseMemberPointer: + Stack.emplace_back(Cast, Entry.Indirections); + continue; + case CK_ToVoid: + case CK_PointerToBoolean: + // These do not mutate the underlying variable. + continue; + case CK_LValueToRValue: { + // An rvalue is immutable. + if (Entry.Indirections == 0) + continue; + Stack.emplace_back(Cast, Entry.Indirections); + continue; + } + default: + // Bail out on casts that we cannot analyze. + return false; + } + } + if (const auto *const Member = dyn_cast(P)) { + if (const auto *const Method = + dyn_cast(Member->getMemberDecl())) { + if (!Method->isConst()) { + // The method can mutate our variable. + return false; + } + continue; + } + Stack.emplace_back(Member, 0); + continue; + } + if (const auto *const Op = dyn_cast(P)) { + switch (Op->getOpcode()) { + case UO_AddrOf: + Stack.emplace_back(Op, Entry.Indirections + 1); + continue; + case UO_Deref: + assert(Entry.Indirections > 0); + Stack.emplace_back(Op, Entry.Indirections - 1); + continue; + default: + // Bail out on unary operators that we cannot analyze. + return false; + } + } + + // Assume any other expression can modify the underlying variable. + return false; + } + } + + // No parent can modify the variable. + return true; +} + } // namespace -// Finds all DeclRefExprs where a const method is called on VarDecl or VarDecl -// is the a const reference or value argument to a CallExpr or CXXConstructExpr. 
SmallPtrSet constReferenceDeclRefExprs(const VarDecl &VarDecl, const Stmt &Stmt, - ASTContext &Context) { - auto DeclRefToVar = - declRefExpr(to(varDecl(equalsNode(&VarDecl)))).bind("declRef"); - auto MemberExprOfVar = memberExpr(hasObjectExpression(DeclRefToVar)); - auto DeclRefToVarOrMemberExprOfVar = - stmt(anyOf(DeclRefToVar, MemberExprOfVar)); - auto ConstMethodCallee = callee(cxxMethodDecl(isConst())); - // Match method call expressions where the variable is referenced as the this - // implicit object argument and operator call expression for member operators - // where the variable is the 0-th argument. - auto Matches = match( - findAll(expr(anyOf( - cxxMemberCallExpr(ConstMethodCallee, - on(DeclRefToVarOrMemberExprOfVar)), - cxxOperatorCallExpr(ConstMethodCallee, - hasArgument(0, DeclRefToVarOrMemberExprOfVar))))), - Stmt, Context); + ASTContext &Context, int Indirections) { + auto Matches = match(findAll(declRefExpr(to(varDecl(equalsNode(&VarDecl))), + doesNotMutateObject(Indirections)) + .bind("declRef")), + Stmt, Context); SmallPtrSet DeclRefs; extractNodesByIdTo(Matches, "declRef", DeclRefs); - auto ConstReferenceOrValue = - qualType(anyOf(matchers::isReferenceToConst(), - unless(anyOf(referenceType(), pointerType(), - substTemplateTypeParmType())))); - auto ConstReferenceOrValueOrReplaced = qualType(anyOf( - ConstReferenceOrValue, - substTemplateTypeParmType(hasReplacementType(ConstReferenceOrValue)))); - auto UsedAsConstRefOrValueArg = forEachArgumentWithParam( - DeclRefToVarOrMemberExprOfVar, - parmVarDecl(hasType(ConstReferenceOrValueOrReplaced))); - Matches = match(findAll(invocation(UsedAsConstRefOrValueArg)), Stmt, Context); - extractNodesByIdTo(Matches, "declRef", DeclRefs); - // References and pointers to const assignments. 
- Matches = match( - findAll(declStmt(has(varDecl( - hasType(qualType(matchers::isReferenceToConst())), - hasInitializer(ignoringImpCasts(DeclRefToVarOrMemberExprOfVar)))))), - Stmt, Context); - extractNodesByIdTo(Matches, "declRef", DeclRefs); - Matches = match(findAll(declStmt(has(varDecl( - hasType(qualType(matchers::isPointerToConst())), - hasInitializer(ignoringImpCasts(unaryOperator( - hasOperatorName("&"), - hasUnaryOperand(DeclRefToVarOrMemberExprOfVar)))))))), - Stmt, Context); - extractNodesByIdTo(Matches, "declRef", DeclRefs); + return DeclRefs; } bool isOnlyUsedAsConst(const VarDecl &Var, const Stmt &Stmt, - ASTContext &Context) { + ASTContext &Context, int Indirections) { // Collect all DeclRefExprs to the loop variable and all CallExprs and // CXXConstructExprs where the loop variable is used as argument to a const // reference parameter. // If the difference is empty it is safe for the loop variable to be a const // reference. auto AllDeclRefs = allDeclRefExprs(Var, Stmt, Context); - auto ConstReferenceDeclRefs = constReferenceDeclRefExprs(Var, Stmt, Context); + auto ConstReferenceDeclRefs = + constReferenceDeclRefExprs(Var, Stmt, Context, Indirections); return isSetDifferenceEmpty(AllDeclRefs, ConstReferenceDeclRefs); } diff --git a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.h b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.h index 2c16d95408cf6..8361b9d89ed26 100644 --- a/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.h +++ b/clang-tools-extra/clang-tidy/utils/DeclRefExprUtils.h @@ -15,15 +15,6 @@ namespace clang::tidy::utils::decl_ref_expr { -/// Returns true if all ``DeclRefExpr`` to the variable within ``Stmt`` -/// do not modify it. -/// -/// Returns ``true`` if only const methods or operators are called on the -/// variable or the variable is a const reference or value argument to a -/// ``callExpr()``. 
-bool isOnlyUsedAsConst(const VarDecl &Var, const Stmt &Stmt, - ASTContext &Context); - /// Returns set of all ``DeclRefExprs`` to ``VarDecl`` within ``Stmt``. llvm::SmallPtrSet allDeclRefExprs(const VarDecl &VarDecl, const Stmt &Stmt, ASTContext &Context); @@ -34,9 +25,31 @@ allDeclRefExprs(const VarDecl &VarDecl, const Decl &Decl, ASTContext &Context); /// Returns set of all ``DeclRefExprs`` to ``VarDecl`` within ``Stmt`` where /// ``VarDecl`` is guaranteed to be accessed in a const fashion. +/// +/// If ``VarDecl`` is of pointer type, ``Indirections`` specifies the level +/// of indirection of the object whose mutations we are tracking. +/// +/// For example, given: +/// ``` +/// int i; +/// int* p; +/// p = &i; // (A) +/// *p = 3; // (B) +/// ``` +/// +/// - `constReferenceDeclRefExprs(P, Stmt, Context, 0)` returns the reference +// to `p` in (B): the pointee is modified, but the pointer is not; +/// - `constReferenceDeclRefExprs(P, Stmt, Context, 1)` returns the reference +// to `p` in (A): the pointee is modified, but the pointer is not; llvm::SmallPtrSet constReferenceDeclRefExprs(const VarDecl &VarDecl, const Stmt &Stmt, - ASTContext &Context); + ASTContext &Context, int Indirections); + +/// Returns true if all ``DeclRefExpr`` to the variable within ``Stmt`` +/// do not modify it. +/// See `constReferenceDeclRefExprs` for the meaning of ``Indirections``. +bool isOnlyUsedAsConst(const VarDecl &Var, const Stmt &Stmt, + ASTContext &Context, int Indirections); /// Returns ``true`` if ``DeclRefExpr`` is the argument of a copy-constructor /// call expression within ``Decl``. diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 6fd01ed9d471c..69537964f9bce 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -183,6 +183,12 @@ Changes in existing checks ` check to also remove any trailing whitespace when deleting the ``virtual`` keyword. 
+- Improved :doc:`performance-unnecessary-copy-initialization + ` check by + detecting more cases of constant access. In particular, pointers can be + analyzed, se the check now handles the common patterns + `const auto e = (*vector_ptr)[i]` and `const auto e = vector_ptr->at(i);`. + - Improved :doc:`readability-implicit-bool-conversion ` check to provide valid fix suggestions for ``static_cast`` without a preceding space and diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp index 049ba64d6aede..92625cc1332e2 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp @@ -41,6 +41,10 @@ struct Container { ConstIterator end() const; void nonConstMethod(); bool constMethod() const; + + const T& at(int) const; + T& at(int); + }; using ExpensiveToCopyContainerAlias = Container; @@ -225,25 +229,25 @@ void PositiveOperatorCallConstValueParamAlias(const ExpensiveToCopyContainerAlia VarCopyConstructed.constMethod(); } -void PositiveOperatorCallConstValueParam(const Container* C) { +void PositiveOperatorCallConstPtrParam(const Container* C) { const auto AutoAssigned = (*C)[42]; - // TODO-MESSAGES: [[@LINE-1]]:14: warning: the const qualified variable 'AutoAssigned' - // TODO-FIXES: const auto& AutoAssigned = (*C)[42]; + // CHECK-MESSAGES: [[@LINE-1]]:14: warning: the const qualified variable 'AutoAssigned' + // CHECK-FIXES: const auto& AutoAssigned = (*C)[42]; AutoAssigned.constMethod(); const auto AutoCopyConstructed((*C)[42]); - // TODO-MESSAGES: [[@LINE-1]]:14: warning: the const qualified variable 'AutoCopyConstructed' - // TODO-FIXES: const auto& AutoCopyConstructed((*C)[42]); + // CHECK-MESSAGES: [[@LINE-1]]:14: warning: the const qualified variable 'AutoCopyConstructed' + // 
CHECK-FIXES: const auto& AutoCopyConstructed((*C)[42]); AutoCopyConstructed.constMethod(); - const ExpensiveToCopyType VarAssigned = C->operator[](42); - // TODO-MESSAGES: [[@LINE-1]]:29: warning: the const qualified variable 'VarAssigned' - // TODO-FIXES: const ExpensiveToCopyType& VarAssigned = C->operator[](42); + const ExpensiveToCopyType VarAssigned = C->at(42); + // CHECK-MESSAGES: [[@LINE-1]]:29: warning: the const qualified variable 'VarAssigned' + // CHECK-FIXES: const ExpensiveToCopyType& VarAssigned = C->at(42); VarAssigned.constMethod(); - const ExpensiveToCopyType VarCopyConstructed(C->operator[](42)); - // TODO-MESSAGES: [[@LINE-1]]:29: warning: the const qualified variable 'VarCopyConstructed' - // TODO-FIXES: const ExpensiveToCopyType& VarCopyConstructed(C->operator[](42)); + const ExpensiveToCopyType VarCopyConstructed(C->at(42)); + // CHECK-MESSAGES: [[@LINE-1]]:29: warning: the const qualified variable 'VarCopyConstructed' + // CHECK-FIXES: const ExpensiveToCopyType& VarCopyConstructed(C->at(42)); VarCopyConstructed.constMethod(); } @@ -876,3 +880,4 @@ void negativeNonConstMemberExpr() { mutate(&Copy.Member); } } + diff --git a/clang-tools-extra/unittests/clang-tidy/DeclRefExprUtilsTest.cpp b/clang-tools-extra/unittests/clang-tidy/DeclRefExprUtilsTest.cpp index a653b11faad28..4c9e81ea0f61a 100644 --- a/clang-tools-extra/unittests/clang-tidy/DeclRefExprUtilsTest.cpp +++ b/clang-tools-extra/unittests/clang-tidy/DeclRefExprUtilsTest.cpp @@ -12,6 +12,7 @@ namespace tidy { namespace { using namespace clang::ast_matchers; +template class ConstReferenceDeclRefExprsTransform : public ClangTidyCheck { public: ConstReferenceDeclRefExprsTransform(StringRef CheckName, @@ -27,7 +28,7 @@ class ConstReferenceDeclRefExprsTransform : public ClangTidyCheck { using utils::decl_ref_expr::constReferenceDeclRefExprs; const auto const_decrefexprs = constReferenceDeclRefExprs( *D, *cast(D->getDeclContext())->getBody(), - *Result.Context); + *Result.Context, 
Indirections); for (const DeclRefExpr *const Expr : const_decrefexprs) { assert(Expr); @@ -40,7 +41,7 @@ class ConstReferenceDeclRefExprsTransform : public ClangTidyCheck { namespace test { -void RunTest(StringRef Snippet) { +template void RunTest(StringRef Snippet) { StringRef CommonCode = R"( struct ConstTag{}; @@ -59,6 +60,9 @@ void RunTest(StringRef Snippet) { bool operator==(const S&) const; int int_member; + // We consider a mutation of the `*ptr_member` to be a const use of + // `*this`. This is consistent with the semantics of `const`-qualified + // methods, which prevent modifying `ptr_member` but not `*ptr_member`. int* ptr_member; }; @@ -92,69 +96,88 @@ void RunTest(StringRef Snippet) { llvm::SmallVector Parts; StringRef(Code).split(Parts, "/*const*/"); - EXPECT_EQ(Code, runCheckOnCode( - join(Parts, ""))); + EXPECT_EQ(Code, + runCheckOnCode>( + join(Parts, ""))); } TEST(ConstReferenceDeclRefExprsTest, ConstValueVar) { - RunTest(R"( + RunTest<0>(R"( void f(const S target) { useVal(/*const*/target); useConstRef(/*const*/target); - useConstPtr(&target); - useConstPtrConstRef(&target); + useConstPtr(&/*const*/target); + useConstPtrConstRef(&/*const*/target); /*const*/target.constMethod(); /*const*/target(ConstTag{}); /*const*/target[42]; useConstRef((/*const*/target)); (/*const*/target).constMethod(); (void)(/*const*/target == /*const*/target); - (void)target; - (void)⌖ - (void)*⌖ + (void)/*const*/target; + (void)&/*const*/target; + (void)*&/*const*/target; + /*const*/target; S copy1 = /*const*/target; S copy2(/*const*/target); + /*const*/target.int_member; useInt(/*const*/target.int_member); useIntConstRef(/*const*/target.int_member); - useIntPtr(target.ptr_member); - useIntConstPtr(&target.int_member); + useIntPtr(/*const*/target.ptr_member); + useIntConstPtr(&/*const*/target.int_member); + + const S& const_target_ref = /*const*/target; + const S* const_target_ptr = &/*const*/target; } )"); } TEST(ConstReferenceDeclRefExprsTest, ConstRefVar) { - 
RunTest(R"( + RunTest<0>(R"( void f(const S& target) { useVal(/*const*/target); useConstRef(/*const*/target); - useConstPtr(&target); - useConstPtrConstRef(&target); + useConstPtr(&/*const*/target); + useConstPtrConstRef(&/*const*/target); /*const*/target.constMethod(); /*const*/target(ConstTag{}); /*const*/target[42]; useConstRef((/*const*/target)); (/*const*/target).constMethod(); (void)(/*const*/target == /*const*/target); - (void)target; - (void)⌖ - (void)*⌖ + (void)/*const*/target; + (void)&/*const*/target; + (void)*&/*const*/target; + /*const*/target; S copy1 = /*const*/target; S copy2(/*const*/target); + /*const*/target.int_member; useInt(/*const*/target.int_member); useIntConstRef(/*const*/target.int_member); - useIntPtr(target.ptr_member); - useIntConstPtr(&target.int_member); + useIntPtr(/*const*/target.ptr_member); + useIntConstPtr(&/*const*/target.int_member); + + const S& const_target_ref = /*const*/target; + const S* const_target_ptr = &/*const*/target; + } +)"); +} + +TEST(ConstReferenceDeclRefExprsTest, DEBUGREMOVEME) { + RunTest<0>(R"( + void f(S target, const S& other) { + S* target_ptr = ⌖ } )"); } TEST(ConstReferenceDeclRefExprsTest, ValueVar) { - RunTest(R"( + RunTest<0>(R"( void f(S target, const S& other) { useConstRef(/*const*/target); useVal(/*const*/target); - useConstPtr(&target); - useConstPtrConstRef(&target); + useConstPtr(&/*const*/target); + useConstPtrConstRef(&/*const*/target); /*const*/target.constMethod(); target.nonConstMethod(); /*const*/target(ConstTag{}); @@ -167,26 +190,33 @@ TEST(ConstReferenceDeclRefExprsTest, ValueVar) { (/*const*/target).constMethod(); (void)(/*const*/target == /*const*/target); (void)(/*const*/target == other); - (void)target; - (void)⌖ - (void)*⌖ + (void)/*const*/target; + (void)&/*const*/target; + (void)*&/*const*/target; + /*const*/target; S copy1 = /*const*/target; S copy2(/*const*/target); + /*const*/target.int_member; useInt(/*const*/target.int_member); useIntConstRef(/*const*/target.int_member); 
- useIntPtr(target.ptr_member); - useIntConstPtr(&target.int_member); + useIntPtr(/*const*/target.ptr_member); + useIntConstPtr(&/*const*/target.int_member); + + const S& const_target_ref = /*const*/target; + const S* const_target_ptr = &/*const*/target; + S* target_ptr = ⌖ } )"); } TEST(ConstReferenceDeclRefExprsTest, RefVar) { - RunTest(R"( + RunTest<0>(R"( void f(S& target) { useVal(/*const*/target); + usePtr(&target); useConstRef(/*const*/target); - useConstPtr(&target); - useConstPtrConstRef(&target); + useConstPtr(&/*const*/target); + useConstPtrConstRef(&/*const*/target); /*const*/target.constMethod(); target.nonConstMethod(); /*const*/target(ConstTag{}); @@ -194,118 +224,150 @@ TEST(ConstReferenceDeclRefExprsTest, RefVar) { useConstRef((/*const*/target)); (/*const*/target).constMethod(); (void)(/*const*/target == /*const*/target); - (void)target; - (void)⌖ - (void)*⌖ + (void)/*const*/target; + (void)&/*const*/target; + (void)*&/*const*/target; + /*const*/target; S copy1 = /*const*/target; S copy2(/*const*/target); + /*const*/target.int_member; useInt(/*const*/target.int_member); useIntConstRef(/*const*/target.int_member); - useIntPtr(target.ptr_member); - useIntConstPtr(&target.int_member); + useIntPtr(/*const*/target.ptr_member); + useIntConstPtr(&/*const*/target.int_member); + + (void)(&/*const*/target)->int_member; + useIntRef((&target)->int_member); + + const S& const_target_ref = /*const*/target; + const S* const_target_ptr = &/*const*/target; + S* target_ptr = ⌖ } )"); } TEST(ConstReferenceDeclRefExprsTest, PtrVar) { - RunTest(R"( + RunTest<1>(R"( void f(S* target) { - useVal(*target); - useConstRef(*target); - useConstPtr(target); + useVal(*/*const*/target); + usePtr(target); + useConstRef(*/*const*/target); + useConstPtr(/*const*/target); useConstPtrConstRef(/*const*/target); + usePtrConstPtr(&target); /*const*/target->constMethod(); target->nonConstMethod(); - (*target)(ConstTag{}); + (*/*const*/target)(ConstTag{}); (*target)[42]; 
target->operator[](42); - useConstRef((*target)); + useConstRef((*/*const*/target)); (/*const*/target)->constMethod(); - (void)(*target == *target); - (void)*target; - (void)target; - S copy1 = *target; - S copy2(*target); - useInt(target->int_member); - useIntConstRef(target->int_member); - useIntPtr(target->ptr_member); - useIntConstPtr(&target->int_member); + (void)(*/*const*/target == */*const*/target); + (void)*/*const*/target; + (void)/*const*/target; + /*const*/target; + S copy1 = */*const*/target; + S copy2(*/*const*/target); + /*const*/target->int_member; + useInt(/*const*/target->int_member); + useIntConstRef(/*const*/target->int_member); + useIntPtr(/*const*/target->ptr_member); + useIntConstPtr(&/*const*/target->int_member); + + const S& const_target_ref = */*const*/target; + const S* const_target_ptr = /*const*/target; + S* target_ptr = target; // FIXME: we could chect const usage of `target_ptr`. } )"); } TEST(ConstReferenceDeclRefExprsTest, ConstPtrVar) { - RunTest(R"( + RunTest<1>(R"( void f(const S* target) { - useVal(*target); - useConstRef(*target); - useConstPtr(target); - useConstPtrRef(target); - useConstPtrPtr(&target); - useConstPtrConstPtr(&target); + useVal(*/*const*/target); + useConstRef(*/*const*/target); + useConstPtr(/*const*/target); + useConstPtrRef(/*const*/target); + useConstPtrPtr(&/*const*/target); + useConstPtrConstPtr(&/*const*/target); useConstPtrConstRef(/*const*/target); /*const*/target->constMethod(); - (*target)(ConstTag{}); - (*target)[42]; + (*/*const*/target)(ConstTag{}); + (*/*const*/target)[42]; /*const*/target->operator[](42); - (void)(*target == *target); - (void)target; - (void)*target; - if(target) {} - S copy1 = *target; - S copy2(*target); - useInt(target->int_member); - useIntConstRef(target->int_member); - useIntPtr(target->ptr_member); - useIntConstPtr(&target->int_member); + (void)(*/*const*/target == */*const*/target); + (void)/*const*/target; + (void)*/*const*/target; + /*const*/target; + 
if(/*const*/target) {} + S copy1 = */*const*/target; + S copy2(*/*const*/target); + /*const*/target->int_member; + useInt(/*const*/target->int_member); + useIntConstRef(/*const*/target->int_member); + useIntPtr(/*const*/target->ptr_member); + useIntConstPtr(&/*const*/target->int_member); + + const S& const_target_ref = */*const*/target; + const S* const_target_ptr = /*const*/target; } )"); } TEST(ConstReferenceDeclRefExprsTest, ConstPtrPtrVar) { - RunTest(R"( + RunTest<2>(R"( void f(const S** target) { - useVal(**target); - useConstRef(**target); - useConstPtr(*target); - useConstPtrRef(*target); - useConstPtrPtr(target); - useConstPtrConstPtr(target); - useConstPtrConstRef(*target); - (void)target; - (void)*target; - (void)**target; - if(target) {} - if(*target) {} - S copy1 = **target; - S copy2(**target); - useInt((*target)->int_member); - useIntConstRef((*target)->int_member); - useIntPtr((*target)->ptr_member); - useIntConstPtr(&(*target)->int_member); + useVal(**/*const*/target); + useConstRef(**/*const*/target); + useConstPtr(*/*const*/target); + useConstPtrRef(*/*const*/target); + useConstPtrPtr(/*const*/target); + useConstPtrConstPtr(/*const*/target); + useConstPtrConstRef(*/*const*/target); + (void)/*const*/target; + (void)*/*const*/target; + (void)**/*const*/target; + /*const*/target; + if(/*const*/target) {} + if(*/*const*/target) {} + S copy1 = **/*const*/target; + S copy2(**/*const*/target); + (*/*const*/target)->int_member; + useInt((*/*const*/target)->int_member); + useIntConstRef((*/*const*/target)->int_member); + useIntPtr((*/*const*/target)->ptr_member); + useIntConstPtr(&(*/*const*/target)->int_member); + + const S& const_target_ref = **/*const*/target; + const S* const_target_ptr = */*const*/target; } )"); } TEST(ConstReferenceDeclRefExprsTest, ConstPtrConstPtrVar) { - RunTest(R"( + RunTest<2>(R"( void f(const S* const* target) { - useVal(**target); - useConstRef(**target); - useConstPtr(*target); - useConstPtrConstPtr(target); - 
useConstPtrConstRef(*target); - (void)target; - (void)target; - (void)**target; - if(target) {} - if(*target) {} - S copy1 = **target; - S copy2(**target); - useInt((*target)->int_member); - useIntConstRef((*target)->int_member); - useIntPtr((*target)->ptr_member); - useIntConstPtr(&(*target)->int_member); + useVal(**/*const*/target); + useConstRef(**/*const*/target); + useConstPtr(*/*const*/target); + useConstPtrConstPtr(/*const*/target); + useConstPtrConstRef(*/*const*/target); + (void)/*const*/target; + (void)*/*const*/target; + (void)**/*const*/target; + /*const*/target; + if(/*const*/target) {} + if(*/*const*/target) {} + S copy1 = **/*const*/target; + S copy2(**/*const*/target); + (*/*const*/target)->int_member; + useInt((*/*const*/target)->int_member); + useIntConstRef((*/*const*/target)->int_member); + useIntPtr((*/*const*/target)->ptr_member); + useIntConstPtr(&(*/*const*/target)->int_member); + + const S& const_target_ref = **/*const*/target; + const S* const_target_ptr = */*const*/target; } )"); } From 60677110a2e0b7ba9f5bf2fcfc1b3f8888eb7f58 Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Sat, 24 Feb 2024 10:09:38 +0000 Subject: [PATCH 286/546] [RemoveDIs] Fix DPLabel crash reported in #82854 --- llvm/include/llvm/IR/DebugInfo.h | 6 ++---- llvm/lib/IR/DebugInfo.cpp | 11 ++++------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h index 022df64880aeb..6673908d3ed35 100644 --- a/llvm/include/llvm/IR/DebugInfo.h +++ b/llvm/include/llvm/IR/DebugInfo.h @@ -109,10 +109,8 @@ class DebugInfoFinder { void processVariable(const Module &M, const DILocalVariable *DVI); /// Process debug info location. void processLocation(const Module &M, const DILocation *Loc); - // Process a DPValue, much like a DbgVariableIntrinsic. - void processDPValue(const Module &M, const DPValue &DPV); - /// Dispatch to DbgRecord subclasses handlers. 
- void processDbgRecord(const Module &M, const DbgRecord &DPE); + /// Process a DbgRecord (e.g, treat a DPValue like a DbgVariableIntrinsic). + void processDbgRecord(const Module &M, const DbgRecord &DR); /// Process subprogram. void processSubprogram(DISubprogram *SP); diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index a4fec60190850..e044ab3230c5f 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -251,13 +251,10 @@ void DebugInfoFinder::processLocation(const Module &M, const DILocation *Loc) { processLocation(M, Loc->getInlinedAt()); } -void DebugInfoFinder::processDPValue(const Module &M, const DPValue &DPV) { - processVariable(M, DPV.getVariable()); - processLocation(M, DPV.getDebugLoc().get()); -} - -void DebugInfoFinder::processDbgRecord(const Module &M, const DbgRecord &DPR) { - processDPValue(M, cast(DPR)); +void DebugInfoFinder::processDbgRecord(const Module &M, const DbgRecord &DR) { + if (const DPValue *DPV = dyn_cast(&DR)) + processVariable(M, DPV->getVariable()); + processLocation(M, DR.getDebugLoc().get()); } void DebugInfoFinder::processType(DIType *DT) { From 3356818eed3224c50012f8ed2bfa046f2bc8e154 Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Fri, 23 Feb 2024 13:46:57 +0000 Subject: [PATCH 287/546] Reapply [RemoveDIs] Enable DPLabels conversion [3b/3] (#82639) Enables conversion between llvm.dbg.label and DPLabel. 
--- .../include/llvm/IR/DebugProgramInstruction.h | 10 ++++++++ llvm/lib/IR/BasicBlock.cpp | 18 +++++++------ llvm/lib/IR/DebugProgramInstruction.cpp | 25 +++++++++++++++++++ 3 files changed, 45 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h index 84b0f743d3c9b..97089098ee535 100644 --- a/llvm/include/llvm/IR/DebugProgramInstruction.h +++ b/llvm/include/llvm/IR/DebugProgramInstruction.h @@ -62,6 +62,8 @@ class BasicBlock; class MDNode; class Module; class DbgVariableIntrinsic; +class DbgInfoIntrinsic; +class DbgLabelInst; class DIAssignID; class DPMarker; class DPValue; @@ -80,6 +82,7 @@ class raw_ostream; /// clone /// isIdenticalToWhenDefined /// both print methods +/// createDebugIntrinsic class DbgRecord : public ilist_node { public: /// Marker that this DbgRecord is linked into. @@ -103,6 +106,11 @@ class DbgRecord : public ilist_node { void print(raw_ostream &O, bool IsForDebug = false) const; void print(raw_ostream &O, ModuleSlotTracker &MST, bool IsForDebug) const; bool isIdenticalToWhenDefined(const DbgRecord &R) const; + /// Convert this DbgRecord back into an appropriate llvm.dbg.* intrinsic. + /// \p InsertBefore Optional position to insert this intrinsic. + /// \returns A new llvm.dbg.* intrinsic representiung this DbgRecord. + DbgInfoIntrinsic *createDebugIntrinsic(Module *M, + Instruction *InsertBefore) const; ///@} /// Same as isIdenticalToWhenDefined but checks DebugLoc too. 
@@ -177,6 +185,8 @@ class DPLabel : public DbgRecord { DPLabel *clone() const; void print(raw_ostream &O, bool IsForDebug = false) const; void print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const; + DbgLabelInst *createDebugIntrinsic(Module *M, + Instruction *InsertBefore) const; void setLabel(DILabel *NewLabel) { Label = NewLabel; } DILabel *getLabel() const { return Label; } diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 0680754444f17..6ea876fde5ec6 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -81,6 +81,12 @@ void BasicBlock::convertToNewDbgValues() { continue; } + if (DbgLabelInst *DLI = dyn_cast(&I)) { + DPVals.push_back(new DPLabel(DLI->getLabel(), DLI->getDebugLoc())); + DLI->eraseFromParent(); + continue; + } + if (DPVals.empty()) continue; @@ -107,16 +113,12 @@ void BasicBlock::convertFromNewDbgValues() { continue; DPMarker &Marker = *Inst.DbgMarker; - for (DbgRecord &DR : Marker.getDbgValueRange()) { - if (auto *DPV = dyn_cast(&DR)) - InstList.insert(Inst.getIterator(), - DPV->createDebugIntrinsic(getModule(), nullptr)); - else - llvm_unreachable("unsupported DbgRecord kind"); - } + for (DbgRecord &DR : Marker.getDbgValueRange()) + InstList.insert(Inst.getIterator(), + DR.createDebugIntrinsic(getModule(), nullptr)); Marker.eraseFromParent(); - }; + } // Assume no trailing DPValues: we could technically create them at the end // of the block, after a terminator, but this would be non-cannonical and diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp index 2ca4533afa96c..389bac4de6a1c 100644 --- a/llvm/lib/IR/DebugProgramInstruction.cpp +++ b/llvm/lib/IR/DebugProgramInstruction.cpp @@ -112,6 +112,17 @@ bool DbgRecord::isEquivalentTo(const DbgRecord &R) const { return getDebugLoc() == R.getDebugLoc() && isIdenticalToWhenDefined(R); } +DbgInfoIntrinsic * +DbgRecord::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const { + switch 
(RecordKind) { + case ValueKind: + return cast(this)->createDebugIntrinsic(M, InsertBefore); + case LabelKind: + return cast(this)->createDebugIntrinsic(M, InsertBefore); + }; + llvm_unreachable("unsupported DbgRecord kind"); +} + DPValue *DPValue::createDPValue(Value *Location, DILocalVariable *DV, DIExpression *Expr, const DILocation *DI) { return new DPValue(ValueAsMetadata::get(Location), DV, Expr, DI, @@ -377,6 +388,20 @@ DPValue::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const { return DVI; } +DbgLabelInst *DPLabel::createDebugIntrinsic(Module *M, + Instruction *InsertBefore) const { + auto *LabelFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_label); + Value *Args[] = { + MetadataAsValue::get(getDebugLoc()->getContext(), getLabel())}; + DbgLabelInst *DbgLabel = cast( + CallInst::Create(LabelFn->getFunctionType(), LabelFn, Args)); + DbgLabel->setTailCall(); + DbgLabel->setDebugLoc(getDebugLoc()); + if (InsertBefore) + DbgLabel->insertBefore(InsertBefore); + return DbgLabel; +} + Value *DPValue::getAddress() const { auto *MD = getRawAddress(); if (auto *V = dyn_cast(MD)) From cb2dd0282cf2f5dfc58d5a060dd2aa73c3b4c08e Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Mon, 26 Feb 2024 04:44:03 -0500 Subject: [PATCH 288/546] [clang][NFC] constify or staticify some CGRecordLowering fns (#82874) Some CGRecordLowering functions either do not need the object or do not mutate it. Thus marking static or const as appropriate. --- clang/lib/CodeGen/CGRecordLayoutBuilder.cpp | 28 ++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp index 868ef810f3c4e..7822903b89ce4 100644 --- a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp +++ b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp @@ -95,7 +95,7 @@ struct CGRecordLowering { CGRecordLowering(CodeGenTypes &Types, const RecordDecl *D, bool Packed); // Short helper routines. 
/// Constructs a MemberInfo instance from an offset and llvm::Type *. - MemberInfo StorageInfo(CharUnits Offset, llvm::Type *Data) { + static MemberInfo StorageInfo(CharUnits Offset, llvm::Type *Data) { return MemberInfo(Offset, MemberInfo::Field, Data); } @@ -104,7 +104,7 @@ struct CGRecordLowering { /// fields of the same formal type. We want to emit a layout with /// these discrete storage units instead of combining them into a /// continuous run. - bool isDiscreteBitFieldABI() { + bool isDiscreteBitFieldABI() const { return Context.getTargetInfo().getCXXABI().isMicrosoft() || D->isMsStruct(Context); } @@ -121,22 +121,22 @@ struct CGRecordLowering { /// other bases, which complicates layout in specific ways. /// /// Note specifically that the ms_struct attribute doesn't change this. - bool isOverlappingVBaseABI() { + bool isOverlappingVBaseABI() const { return !Context.getTargetInfo().getCXXABI().isMicrosoft(); } /// Wraps llvm::Type::getIntNTy with some implicit arguments. - llvm::Type *getIntNType(uint64_t NumBits) { + llvm::Type *getIntNType(uint64_t NumBits) const { unsigned AlignedBits = llvm::alignTo(NumBits, Context.getCharWidth()); return llvm::Type::getIntNTy(Types.getLLVMContext(), AlignedBits); } /// Get the LLVM type sized as one character unit. - llvm::Type *getCharType() { + llvm::Type *getCharType() const { return llvm::Type::getIntNTy(Types.getLLVMContext(), Context.getCharWidth()); } /// Gets an llvm type of size NumChars and alignment 1. - llvm::Type *getByteArrayType(CharUnits NumChars) { + llvm::Type *getByteArrayType(CharUnits NumChars) const { assert(!NumChars.isZero() && "Empty byte arrays aren't allowed."); llvm::Type *Type = getCharType(); return NumChars == CharUnits::One() ? Type : @@ -144,7 +144,7 @@ struct CGRecordLowering { } /// Gets the storage type for a field decl and handles storage /// for itanium bitfields that are smaller than their declared type. 
- llvm::Type *getStorageType(const FieldDecl *FD) { + llvm::Type *getStorageType(const FieldDecl *FD) const { llvm::Type *Type = Types.ConvertTypeForMem(FD->getType()); if (!FD->isBitField()) return Type; if (isDiscreteBitFieldABI()) return Type; @@ -152,29 +152,29 @@ struct CGRecordLowering { (unsigned)Context.toBits(getSize(Type)))); } /// Gets the llvm Basesubobject type from a CXXRecordDecl. - llvm::Type *getStorageType(const CXXRecordDecl *RD) { + llvm::Type *getStorageType(const CXXRecordDecl *RD) const { return Types.getCGRecordLayout(RD).getBaseSubobjectLLVMType(); } - CharUnits bitsToCharUnits(uint64_t BitOffset) { + CharUnits bitsToCharUnits(uint64_t BitOffset) const { return Context.toCharUnitsFromBits(BitOffset); } - CharUnits getSize(llvm::Type *Type) { + CharUnits getSize(llvm::Type *Type) const { return CharUnits::fromQuantity(DataLayout.getTypeAllocSize(Type)); } - CharUnits getAlignment(llvm::Type *Type) { + CharUnits getAlignment(llvm::Type *Type) const { return CharUnits::fromQuantity(DataLayout.getABITypeAlign(Type)); } - bool isZeroInitializable(const FieldDecl *FD) { + bool isZeroInitializable(const FieldDecl *FD) const { return Types.isZeroInitializable(FD->getType()); } - bool isZeroInitializable(const RecordDecl *RD) { + bool isZeroInitializable(const RecordDecl *RD) const { return Types.isZeroInitializable(RD); } void appendPaddingBytes(CharUnits Size) { if (!Size.isZero()) FieldTypes.push_back(getByteArrayType(Size)); } - uint64_t getFieldBitOffset(const FieldDecl *FD) { + uint64_t getFieldBitOffset(const FieldDecl *FD) const { return Layout.getFieldOffset(FD->getFieldIndex()); } // Layout routines. From 8cfb71613c452dd45a84a74affe8464bfd33de02 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Mon, 26 Feb 2024 09:44:26 +0000 Subject: [PATCH 289/546] [mlir][ArmSME] Replace use of `isa` with `isa_and_present` (#82798) `op` can be null here, in which case this should just return a null value back. 
--- .../Dialect/ArmSME/Transforms/VectorLegalization.cpp | 2 +- mlir/test/Dialect/ArmSME/vector-legalization.mlir | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp index 26dfb38263372..11f8bc04b2184 100644 --- a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp +++ b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp @@ -459,7 +459,7 @@ struct LiftIllegalVectorTransposeToMemory } static Value getExtensionSource(Operation *op) { - if (isa(op)) + if (isa_and_present(op)) return op->getOperand(0); return {}; } diff --git a/mlir/test/Dialect/ArmSME/vector-legalization.mlir b/mlir/test/Dialect/ArmSME/vector-legalization.mlir index 11888c675f0b0..bf0b58ff4cf07 100644 --- a/mlir/test/Dialect/ArmSME/vector-legalization.mlir +++ b/mlir/test/Dialect/ArmSME/vector-legalization.mlir @@ -377,3 +377,14 @@ func.func @lift_illegal_transpose_to_memory_with_in_bounds_attr(%a: index, %b: i %legalType = vector.transpose %illegalRead, [1, 0] : vector<[8]x4xf32> to vector<4x[8]xf32> return %legalType : vector<4x[8]xf32> } + +// ----- + +// The pass should do nothing (and not crash). 
+// CHECK-LABEL: @illegal_transpose_no_defining_source_op +func.func @illegal_transpose_no_defining_source_op(%vec: vector<[4]x1xf32>) -> vector<1x[4]xf32> +{ + // CHECK: vector.transpose + %0 = vector.transpose %vec, [1, 0] : vector<[4]x1xf32> to vector<1x[4]xf32> + return %0 : vector<1x[4]xf32> +} From e521752c04a479e3751003645a728667f3199d24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Halkenh=C3=A4user?= Date: Mon, 26 Feb 2024 11:16:25 +0100 Subject: [PATCH 290/546] [OpenMP][OMPT] Add OMPT callback for device data exchange 'Device-to-Device' (#81991) Since there's no `ompt_target_data_transfer_tofrom_device` (within ompt_target_data_op_t enum) or something other that conveys the meaning of inter-device data exchange we decided to indicate a Device-to-Device transfer by using: optype == ompt_target_data_transfer_from_device (=3) Hence, a device transfer may be identified e.g. by checking for: (optype == 3) && (src_device_num < omp_get_num_devices()) && (dest_device_num < omp_get_num_devices()) Fixes: #66478 --- .../include/OpenMP/OMPT/Interface.h | 20 +++-- .../libomptarget/src/OpenMP/OMPT/Callback.cpp | 50 +++++------ openmp/libomptarget/src/device.cpp | 15 +++- openmp/libomptarget/test/ompt/callbacks.h | 5 +- openmp/libomptarget/test/ompt/target_memcpy.c | 21 ++++- .../test/ompt/target_memcpy_emi.c | 85 +++++++++++++++++++ 6 files changed, 153 insertions(+), 43 deletions(-) create mode 100644 openmp/libomptarget/test/ompt/target_memcpy_emi.c diff --git a/openmp/libomptarget/include/OpenMP/OMPT/Interface.h b/openmp/libomptarget/include/OpenMP/OMPT/Interface.h index 13eca730a9295..327fadfcd4acd 100644 --- a/openmp/libomptarget/include/OpenMP/OMPT/Interface.h +++ b/openmp/libomptarget/include/OpenMP/OMPT/Interface.h @@ -54,12 +54,14 @@ class Interface { void **TgtPtrBegin, size_t Size, void *Code); /// Top-level function for invoking callback before data submit - void beginTargetDataSubmit(int64_t DeviceId, void *HstPtrBegin, - void *TgtPtrBegin, size_t 
Size, void *Code); + void beginTargetDataSubmit(int64_t SrcDeviceId, void *SrcPtrBegin, + int64_t DstDeviceId, void *DstPtrBegin, + size_t Size, void *Code); /// Top-level function for invoking callback after data submit - void endTargetDataSubmit(int64_t DeviceId, void *HstPtrBegin, - void *TgtPtrBegin, size_t Size, void *Code); + void endTargetDataSubmit(int64_t SrcDeviceId, void *SrcPtrBegin, + int64_t DstDeviceId, void *DstPtrBegin, size_t Size, + void *Code); /// Top-level function for invoking callback before device data deallocation void beginTargetDataDelete(int64_t DeviceId, void *TgtPtrBegin, void *Code); @@ -68,12 +70,14 @@ class Interface { void endTargetDataDelete(int64_t DeviceId, void *TgtPtrBegin, void *Code); /// Top-level function for invoking callback before data retrieve - void beginTargetDataRetrieve(int64_t DeviceId, void *HstPtrBegin, - void *TgtPtrBegin, size_t Size, void *Code); + void beginTargetDataRetrieve(int64_t SrcDeviceId, void *SrcPtrBegin, + int64_t DstDeviceId, void *DstPtrBegin, + size_t Size, void *Code); /// Top-level function for invoking callback after data retrieve - void endTargetDataRetrieve(int64_t DeviceId, void *HstPtrBegin, - void *TgtPtrBegin, size_t Size, void *Code); + void endTargetDataRetrieve(int64_t SrcDeviceId, void *SrcPtrBegin, + int64_t DstDeviceId, void *DstPtrBegin, + size_t Size, void *Code); /// Top-level function for invoking callback before kernel dispatch void beginTargetSubmit(unsigned int NumTeams = 1); diff --git a/openmp/libomptarget/src/OpenMP/OMPT/Callback.cpp b/openmp/libomptarget/src/OpenMP/OMPT/Callback.cpp index 66435d2a4fe64..f285843e39f38 100644 --- a/openmp/libomptarget/src/OpenMP/OMPT/Callback.cpp +++ b/openmp/libomptarget/src/OpenMP/OMPT/Callback.cpp @@ -119,41 +119,38 @@ void Interface::endTargetDataAlloc(int64_t DeviceId, void *HstPtrBegin, endTargetDataOperation(); } -void Interface::beginTargetDataSubmit(int64_t DeviceId, void *TgtPtrBegin, - void *HstPtrBegin, size_t Size, - void 
*Code) { +void Interface::beginTargetDataSubmit(int64_t SrcDeviceId, void *SrcPtrBegin, + int64_t DstDeviceId, void *DstPtrBegin, + size_t Size, void *Code) { beginTargetDataOperation(); if (ompt_callback_target_data_op_emi_fn) { // HostOpId will be set by the tool. Invoke the tool supplied data op EMI // callback ompt_callback_target_data_op_emi_fn( ompt_scope_begin, TargetTaskData, &TargetData, &HostOpId, - ompt_target_data_transfer_to_device, HstPtrBegin, - /*SrcDeviceNum=*/omp_get_initial_device(), TgtPtrBegin, DeviceId, Size, - Code); + ompt_target_data_transfer_to_device, SrcPtrBegin, SrcDeviceId, + DstPtrBegin, DstDeviceId, Size, Code); } else if (ompt_callback_target_data_op_fn) { // HostOpId is set by the runtime HostOpId = createOpId(); // Invoke the tool supplied data op callback ompt_callback_target_data_op_fn( TargetData.value, HostOpId, ompt_target_data_transfer_to_device, - HstPtrBegin, /*SrcDeviceNum=*/omp_get_initial_device(), TgtPtrBegin, - DeviceId, Size, Code); + SrcPtrBegin, SrcDeviceId, DstPtrBegin, DstDeviceId, Size, Code); } } -void Interface::endTargetDataSubmit(int64_t DeviceId, void *TgtPtrBegin, - void *HstPtrBegin, size_t Size, - void *Code) { +void Interface::endTargetDataSubmit(int64_t SrcDeviceId, void *SrcPtrBegin, + int64_t DstDeviceId, void *DstPtrBegin, + size_t Size, void *Code) { // Only EMI callback handles end scope if (ompt_callback_target_data_op_emi_fn) { // HostOpId will be set by the tool. 
Invoke the tool supplied data op EMI // callback ompt_callback_target_data_op_emi_fn( ompt_scope_end, TargetTaskData, &TargetData, &HostOpId, - ompt_target_data_transfer_to_device, HstPtrBegin, - /*SrcDeviceNum=*/omp_get_initial_device(), TgtPtrBegin, DeviceId, Size, - Code); + ompt_target_data_transfer_to_device, SrcPtrBegin, SrcDeviceId, + DstPtrBegin, DstDeviceId, Size, Code); } endTargetDataOperation(); } @@ -193,41 +190,38 @@ void Interface::endTargetDataDelete(int64_t DeviceId, void *TgtPtrBegin, endTargetDataOperation(); } -void Interface::beginTargetDataRetrieve(int64_t DeviceId, void *HstPtrBegin, - void *TgtPtrBegin, size_t Size, - void *Code) { +void Interface::beginTargetDataRetrieve(int64_t SrcDeviceId, void *SrcPtrBegin, + int64_t DstDeviceId, void *DstPtrBegin, + size_t Size, void *Code) { beginTargetDataOperation(); if (ompt_callback_target_data_op_emi_fn) { // HostOpId will be set by the tool. Invoke the tool supplied data op EMI // callback ompt_callback_target_data_op_emi_fn( ompt_scope_begin, TargetTaskData, &TargetData, &HostOpId, - ompt_target_data_transfer_from_device, TgtPtrBegin, DeviceId, - HstPtrBegin, - /*TgtDeviceNum=*/omp_get_initial_device(), Size, Code); + ompt_target_data_transfer_from_device, SrcPtrBegin, SrcDeviceId, + DstPtrBegin, DstDeviceId, Size, Code); } else if (ompt_callback_target_data_op_fn) { // HostOpId is set by the runtime HostOpId = createOpId(); // Invoke the tool supplied data op callback ompt_callback_target_data_op_fn( TargetData.value, HostOpId, ompt_target_data_transfer_from_device, - TgtPtrBegin, DeviceId, HstPtrBegin, - /*TgtDeviceNum=*/omp_get_initial_device(), Size, Code); + SrcPtrBegin, SrcDeviceId, DstPtrBegin, DstDeviceId, Size, Code); } } -void Interface::endTargetDataRetrieve(int64_t DeviceId, void *HstPtrBegin, - void *TgtPtrBegin, size_t Size, - void *Code) { +void Interface::endTargetDataRetrieve(int64_t SrcDeviceId, void *SrcPtrBegin, + int64_t DstDeviceId, void *DstPtrBegin, + size_t Size, void 
*Code) { // Only EMI callback handles end scope if (ompt_callback_target_data_op_emi_fn) { // HostOpId will be set by the tool. Invoke the tool supplied data op EMI // callback ompt_callback_target_data_op_emi_fn( ompt_scope_end, TargetTaskData, &TargetData, &HostOpId, - ompt_target_data_transfer_from_device, TgtPtrBegin, DeviceId, - HstPtrBegin, - /*TgtDeviceNum=*/omp_get_initial_device(), Size, Code); + ompt_target_data_transfer_from_device, SrcPtrBegin, SrcDeviceId, + DstPtrBegin, DstDeviceId, Size, Code); } endTargetDataOperation(); } diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp index 5fe3f508b739c..3345277d91d3a 100644 --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -151,7 +151,7 @@ int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, OMPT_IF_BUILT( InterfaceRAII TargetDataSubmitRAII( RegionInterface.getCallbacks(), - DeviceID, TgtPtrBegin, HstPtrBegin, Size, + omp_get_initial_device(), HstPtrBegin, DeviceID, TgtPtrBegin, Size, /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);) if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize) @@ -173,7 +173,7 @@ int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin, OMPT_IF_BUILT( InterfaceRAII TargetDataRetrieveRAII( RegionInterface.getCallbacks(), - DeviceID, HstPtrBegin, TgtPtrBegin, Size, + DeviceID, TgtPtrBegin, omp_get_initial_device(), HstPtrBegin, Size, /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);) if (!RTL->data_retrieve_async || !RTL->synchronize) @@ -185,6 +185,17 @@ int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin, // Copy data from current device to destination device directly int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, int64_t Size, AsyncInfoTy &AsyncInfo) { + /// RAII to establish tool anchors before and after data exchange + /// Note: Despite the fact that this is a data exchange, we use 'from_device' + /// operation enum (w.r.t. 
ompt_target_data_op_t) as there is currently + /// no better alternative. It is still possible to distinguish this + /// scenario from a real data retrieve by checking if both involved + /// device numbers are less than omp_get_num_devices(). + OMPT_IF_BUILT( + InterfaceRAII TargetDataExchangeRAII( + RegionInterface.getCallbacks(), + RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr, Size, + /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);) if (!AsyncInfo || !RTL->data_exchange_async || !RTL->synchronize) { assert(RTL->data_exchange && "RTL->data_exchange is nullptr"); return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr, diff --git a/openmp/libomptarget/test/ompt/callbacks.h b/openmp/libomptarget/test/ompt/callbacks.h index 1f9b7c177b286..95437d9cdcfb1 100644 --- a/openmp/libomptarget/test/ompt/callbacks.h +++ b/openmp/libomptarget/test/ompt/callbacks.h @@ -81,11 +81,14 @@ static void on_ompt_callback_target_data_op_emi( assert(codeptr_ra != 0 && "Unexpected null codeptr"); if (endpoint == ompt_scope_begin) *host_op_id = next_op_id++; + // target_task_data may be null, avoid dereferencing it + uint64_t target_task_data_value = + (target_task_data) ? 
target_task_data->value : 0; printf(" Callback DataOp EMI: endpoint=%d optype=%d target_task_data=%p " "(0x%lx) target_data=%p (0x%lx) host_op_id=%p (0x%lx) src=%p " "src_device_num=%d " "dest=%p dest_device_num=%d bytes=%lu code=%p\n", - endpoint, optype, target_task_data, target_task_data->value, + endpoint, optype, target_task_data, target_task_data_value, target_data, target_data->value, host_op_id, *host_op_id, src_addr, src_device_num, dest_addr, dest_device_num, bytes, codeptr_ra); } diff --git a/openmp/libomptarget/test/ompt/target_memcpy.c b/openmp/libomptarget/test/ompt/target_memcpy.c index 444f4b7bdbda3..80a8d6a4b32e5 100644 --- a/openmp/libomptarget/test/ompt/target_memcpy.c +++ b/openmp/libomptarget/test/ompt/target_memcpy.c @@ -33,6 +33,10 @@ int main() { if (omp_target_memcpy(dev_ptr, &host_var1, sizeof(int), 0, 0, dev, host)) abort(); + // D2D transfer + if (omp_target_memcpy(dev_ptr, dev_ptr, sizeof(int), 0, 0, dev, dev)) + abort(); + // D2H transfer if (omp_target_memcpy(&host_var2, dev_ptr, sizeof(int), 0, 0, host, dev)) abort(); @@ -46,16 +50,25 @@ int main() { // clang-format off /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=1 +/// CHECK-SAME: src_device_num=[[HOST:[0-9]+]] +/// CHECK-SAME: dest_device_num=[[DEVICE:[0-9]+]] /// CHECK-NOT: code=(nil) -/// CHECK: code=[[CODE1:.*]] +/// CHECK: code=[[CODE1:0x[0-f]+]] /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=2 +/// CHECK-SAME: src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]] /// CHECK-NOT: code=(nil) /// CHECK-NOT: code=[[CODE1]] -/// CHECK: code=[[CODE2:.*]] +/// CHECK: code=[[CODE2:0x[0-f]+]] /// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 +/// CHECK-SAME: src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]] /// CHECK-NOT: code=(nil) /// CHECK-NOT: code=[[CODE2]] -/// CHECK: code=[[CODE3:.*]] -/// CHECK: Callback 
DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 +/// CHECK: code=[[CODE3:0x[0-f]+]] +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=3 +/// CHECK-SAME: src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]] /// CHECK-NOT: code=(nil) /// CHECK-NOT: code=[[CODE3]] +/// CHECK: code=[[CODE4:0x[0-f]+]] +/// CHECK: Callback DataOp: target_id=[[TARGET_ID:[0-9]+]] host_op_id=[[HOST_OP_ID:[0-9]+]] optype=4 +/// CHECK-NOT: code=(nil) +/// CHECK-NOT: code=[[CODE4]] diff --git a/openmp/libomptarget/test/ompt/target_memcpy_emi.c b/openmp/libomptarget/test/ompt/target_memcpy_emi.c new file mode 100644 index 0000000000000..5347f38b87b6f --- /dev/null +++ b/openmp/libomptarget/test/ompt/target_memcpy_emi.c @@ -0,0 +1,85 @@ +// RUN: %libomptarget-compile-run-and-check-generic +// REQUIRES: ompt +// UNSUPPORTED: aarch64-unknown-linux-gnu +// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO +// UNSUPPORTED: x86_64-pc-linux-gnu +// UNSUPPORTED: x86_64-pc-linux-gnu-LTO + +/* + * Verify all three data transfer directions: H2D, D2D and D2H + */ + +#include +#include +#include + +#include "callbacks.h" +#include "register_emi.h" + +int main(void) { + int NumDevices = omp_get_num_devices(); + assert(NumDevices > 0 && "No device(s) present."); + int Device = omp_get_default_device(); + int Host = omp_get_initial_device(); + // Note: Zero value depicts an OFFLOAD_SUCCESS + int Status; + + printf("Allocating Memory on Device\n"); + int *DevPtr = (int *)omp_target_alloc(sizeof(int), Device); + assert(DevPtr && "Could not allocate memory on device."); + int *HstPtr = (int *)malloc(sizeof(int)); + *HstPtr = 42; + + printf("Testing: Host to Device\n"); + Status = omp_target_memcpy(DevPtr, HstPtr, sizeof(int), 0, 0, Device, Host); + assert(Status == 0 && "H2D memory copy operation failed.\n"); + + printf("Testing: Device to Device\n"); + Status = omp_target_memcpy(DevPtr, DevPtr, sizeof(int), 0, 0, Device, Device); + 
assert(Status == 0 && "D2D memory copy operation failed.\n"); + + printf("Testing: Device to Host\n"); + Status = omp_target_memcpy(HstPtr, DevPtr, sizeof(int), 0, 0, Host, Device); + assert(Status == 0 && "D2H memory copy operation failed.\n"); + + printf("Checking Correctness\n"); + assert(*HstPtr == 42); + + printf("Freeing Memory on Device\n"); + free(HstPtr); + omp_target_free(DevPtr, Device); + + return 0; +} + +// clang-format off + +/// CHECK: Callback Init: + +/// CHECK: Allocating Memory on Device +/// CHECK: Callback DataOp EMI: endpoint=1 optype=1 +/// CHECK-SAME: src_device_num=[[HOST:[0-9]+]] +/// CHECK-SAME: dest_device_num=[[DEVICE:[0-9]+]] +/// CHECK: Callback DataOp EMI: endpoint=2 optype=1 {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]] + +/// CHECK: Testing: Host to Device +/// CHECK: Callback DataOp EMI: endpoint=1 optype=2 {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]] +/// CHECK: Callback DataOp EMI: endpoint=2 optype=2 {{.+}} src_device_num=[[HOST]] {{.+}} dest_device_num=[[DEVICE]] + +/// CHECK: Testing: Device to Device +/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]] +/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[DEVICE]] + +/// CHECK: Testing: Device to Host +/// CHECK: Callback DataOp EMI: endpoint=1 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]] +/// CHECK: Callback DataOp EMI: endpoint=2 optype=3 {{.+}} src_device_num=[[DEVICE]] {{.+}} dest_device_num=[[HOST]] + +/// CHECK: Checking Correctness + +/// CHECK: Freeing Memory on Device +/// CHECK: Callback DataOp EMI: endpoint=1 optype=4 {{.+}} src_device_num=[[DEVICE]] +/// CHECK: Callback DataOp EMI: endpoint=2 optype=4 {{.+}} src_device_num=[[DEVICE]] + +/// CHECK: Callback Fini: + +// clang-format on From a5ccf8522b96c56fc6bda54cf68a64c5d65b75cb Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 26 Feb 2024 09:29:02 +0100 Subject: [PATCH 291/546] [clang][Interp] Not all RVO call expressions are initializing We do not necessarily prepare storage for the return value when we are returning a complex value. --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 9 ++++++++- clang/test/AST/Interp/complex.cpp | 6 ++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 49ba8e95f1799..b5402ec8caaec 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -2703,7 +2703,14 @@ bool ByteCodeExprGen::VisitCallExpr(const CallExpr *E) { return false; } } else { - assert(Initializing); + // We need the result. Prepare a pointer to return or + // dup the current one. + if (!Initializing) { + if (std::optional LocalIndex = allocateLocal(E)) { + if (!this->emitGetPtrLocal(*LocalIndex, E)) + return false; + } + } if (!this->emitDupPtr(E)) return false; } diff --git a/clang/test/AST/Interp/complex.cpp b/clang/test/AST/Interp/complex.cpp index 2b65ccf9946e7..b6091d90867a0 100644 --- a/clang/test/AST/Interp/complex.cpp +++ b/clang/test/AST/Interp/complex.cpp @@ -124,6 +124,12 @@ void func(void) { result = arr * ii; } +constexpr _Complex float getComplexFloat() { + return {1,2}; +} +static_assert(__real(getComplexFloat()) == 1, ""); +static_assert(__imag(getComplexFloat()) == 2, ""); + namespace CastToBool { constexpr _Complex int F = {0, 1}; static_assert(F, ""); From 73f11f9579a3206608ad9a07b5793ba451676087 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 26 Feb 2024 10:55:21 +0000 Subject: [PATCH 292/546] [lldb][test] Correct results regex for Windows On Windows the line has \r\n at the end. 
--- lldb/test/API/lldbtest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/API/lldbtest.py b/lldb/test/API/lldbtest.py index bae73e71820f7..c888fb574f4b7 100644 --- a/lldb/test/API/lldbtest.py +++ b/lldb/test/API/lldbtest.py @@ -101,7 +101,7 @@ def execute(self, test, litConfig): # Example: "OK (skipped=1, expected failures=1)" # Example: "FAILED (failures=3)" # Example: "OK" - result_regex = r"^(?:OK|FAILED)(?: \((.*)\))?$" + result_regex = r"^(?:OK|FAILED)(?: \((.*)\))?\r?$" results = re.search(result_regex, err, re.MULTILINE) # If parsing fails mark this test as unresolved. From 53697a5dcdc4d83cbe0cb6d88e33c3f1bb3ea487 Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Mon, 26 Feb 2024 12:02:59 +0100 Subject: [PATCH 293/546] [AMDGPU] Refactor unit test. NFC (#82976) I'm about to add more tests here (downstream for now). Change-Id: Ibd5edb398f544c90e6e8b5e49b1777a407f0594a --- .../Target/AMDGPU/AMDGPUUnitTests.cpp | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp index f02413376759b..d0a3cfa84ee01 100644 --- a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp +++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp @@ -92,9 +92,23 @@ static const std::pair W32FS = {"+wavefrontsize32", "w32"}, W64FS = {"+wavefrontsize64", "w64"}; -static void testGPRLimits( - const char *RegName, bool TestW32W64, - std::function test) { +using TestFuncTy = + function_ref; + +static bool testAndRecord(std::stringstream &Table, const GCNSubtarget &ST, + TestFuncTy test) { + bool Success = true; + unsigned MaxOcc = ST.getMaxWavesPerEU(); + for (unsigned Occ = MaxOcc; Occ > 0; --Occ) { + Table << std::right << std::setw(3) << Occ << " "; + Success = test(Table, Occ, ST) && Success; + Table << '\n'; + } + return Success; +} + +static void testGPRLimits(const char *RegName, bool TestW32W64, + TestFuncTy test) { SmallVector 
CPUs; AMDGPU::fillValidArchListAMDGCN(CPUs); @@ -117,13 +131,7 @@ static void testGPRLimits( FS = &W32FS; std::stringstream Table; - bool Success = true; - unsigned MaxOcc = ST.getMaxWavesPerEU(); - for (unsigned Occ = MaxOcc; Occ > 0; --Occ) { - Table << std::right << std::setw(3) << Occ << " "; - Success = test(Table, Occ, ST) && Success; - Table << '\n'; - } + bool Success = testAndRecord(Table, ST, test); if (!Success || PrintCpuRegLimits) TablePerCPUs[Table.str()].push_back((CanonCPUName + FS->second).str()); @@ -145,13 +153,14 @@ static void testGPRLimits( } TEST(AMDGPU, TestVGPRLimitsPerOccupancy) { - testGPRLimits("VGPR", true, [](std::stringstream &OS, unsigned Occ, - GCNSubtarget &ST) { + auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST) { unsigned MaxVGPRNum = ST.getAddressableNumVGPRs(); return checkMinMax( OS, Occ, ST.getOccupancyWithNumVGPRs(MaxVGPRNum), ST.getMaxWavesPerEU(), [&](unsigned NumGPRs) { return ST.getOccupancyWithNumVGPRs(NumGPRs); }, [&](unsigned Occ) { return ST.getMinNumVGPRs(Occ); }, [&](unsigned Occ) { return ST.getMaxNumVGPRs(Occ); }); - }); + }; + + testGPRLimits("VGPR", true, test); } From d0c99f4b1ddc55f3af2ee2d084bc8c97a2ab1acf Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 26 Feb 2024 12:04:41 +0100 Subject: [PATCH 294/546] [libc++] Remove __member_pointer_traits_imp (#82081) They aren't ever used, so they can be removed. 
--- libcxx/include/__type_traits/invoke.h | 196 +------------------------- 1 file changed, 1 insertion(+), 195 deletions(-) diff --git a/libcxx/include/__type_traits/invoke.h b/libcxx/include/__type_traits/invoke.h index c0487e25a0d8b..44ef2a83b9cc0 100644 --- a/libcxx/include/__type_traits/invoke.h +++ b/libcxx/include/__type_traits/invoke.h @@ -11,8 +11,6 @@ #define _LIBCPP___TYPE_TRAITS_INVOKE_H #include <__config> -#include <__type_traits/add_lvalue_reference.h> -#include <__type_traits/apply_cv.h> #include <__type_traits/conditional.h> #include <__type_traits/decay.h> #include <__type_traits/enable_if.h> @@ -25,7 +23,6 @@ #include <__type_traits/is_same.h> #include <__type_traits/is_void.h> #include <__type_traits/nat.h> -#include <__type_traits/remove_cv.h> #include <__utility/declval.h> #include <__utility/forward.h> @@ -35,197 +32,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD -struct __any { - __any(...); -}; - -template -struct __member_pointer_traits_imp {}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...), true, false> { - typedef _Class _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...), true, false> { - typedef _Class _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) const, true, false> { - typedef _Class const _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) const, true, false> { - typedef _Class const _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) 
volatile, true, false> { - typedef _Class volatile _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) volatile, true, false> { - typedef _Class volatile _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) const volatile, true, false> { - typedef _Class const volatile _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) const volatile, true, false> { - typedef _Class const volatile _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...)&, true, false> { - typedef _Class& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...)&, true, false> { - typedef _Class& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) const&, true, false> { - typedef _Class const& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) const&, true, false> { - typedef _Class const& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) volatile&, true, false> { - typedef _Class volatile& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) 
volatile&, true, false> { - typedef _Class volatile& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) const volatile&, true, false> { - typedef _Class const volatile& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) const volatile&, true, false> { - typedef _Class const volatile& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...)&&, true, false> { - typedef _Class&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...)&&, true, false> { - typedef _Class&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) const&&, true, false> { - typedef _Class const&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) const&&, true, false> { - typedef _Class const&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) volatile&&, true, false> { - typedef _Class volatile&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) volatile&&, true, false> { - typedef _Class volatile&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param...) 
const volatile&&, true, false> { - typedef _Class const volatile&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param...); -}; - -template -struct __member_pointer_traits_imp<_Rp (_Class::*)(_Param..., ...) const volatile&&, true, false> { - typedef _Class const volatile&& _ClassType; - typedef _Rp _ReturnType; - typedef _Rp(_FnType)(_Param..., ...); -}; - -template -struct __member_pointer_traits_imp<_Rp _Class::*, false, true> { - typedef _Class _ClassType; - typedef _Rp _ReturnType; -}; - -template -struct __member_pointer_traits - : public __member_pointer_traits_imp<__remove_cv_t<_MP>, - is_member_function_pointer<_MP>::value, - is_member_object_pointer<_MP>::value> { - // typedef ... _ClassType; - // typedef ... _ReturnType; - // typedef ... _FnType; -}; - template struct __member_pointer_class_type {}; @@ -285,7 +91,7 @@ using __enable_if_bullet6 = // fall back - none of the bullets template -__nat __invoke(__any, _Args&&... __args); +__nat __invoke(_Args&&... __args); // bullets 1, 2 and 3 From d0b1fec9e1510d01dad2c9c429573eaa75f0963c Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 26 Feb 2024 11:15:34 +0000 Subject: [PATCH 295/546] [lldb][test][Windows] Remove expected fail for a thread state test No idea why but this is now passing (though if it randomly fails I won't be surprised). See https://github.com/llvm/llvm-project/issues/25034 for background on the original expected fail. 
--- lldb/test/API/functionalities/thread/state/TestThreadStates.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lldb/test/API/functionalities/thread/state/TestThreadStates.py b/lldb/test/API/functionalities/thread/state/TestThreadStates.py index 56954c9f34c7e..97a6e250dcc39 100644 --- a/lldb/test/API/functionalities/thread/state/TestThreadStates.py +++ b/lldb/test/API/functionalities/thread/state/TestThreadStates.py @@ -38,7 +38,6 @@ def test_state_after_continue(self): @skipIfDarwin # 'llvm.org/pr23669', cause Python crash randomly @expectedFailureDarwin("llvm.org/pr23669") - @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24660") @expectedFailureNetBSD # thread states not properly maintained @unittest.expectedFailure # llvm.org/pr16712 From bb87c914fec6526fbda81991ce0d35e60040ab9f Mon Sep 17 00:00:00 2001 From: CarolineConcatto Date: Mon, 26 Feb 2024 11:30:04 +0000 Subject: [PATCH 296/546] [AArch64][SVE]Add error message in the AsmParser for SVEPattern (#82668) All assembly instructions that have an operand using sve_pred_enum and mistakenly use '#' in front of it would fail without an error message. 
--- llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 2 +- llvm/test/MC/AArch64/SVE/cntb-diagnostics.s | 5 +++++ llvm/test/MC/AArch64/SVE/ptrue-diagnostics.s | 5 +++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 9774b5491ef11..b807aaf76fdb0 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -7939,7 +7939,7 @@ ParseStatus AArch64AsmParser::tryParseSVEPattern(OperandVector &Operands) { auto *MCE = dyn_cast(ImmVal); if (!MCE) - return ParseStatus::Failure; + return TokError("invalid operand for instruction"); Pattern = MCE->getValue(); } else { diff --git a/llvm/test/MC/AArch64/SVE/cntb-diagnostics.s b/llvm/test/MC/AArch64/SVE/cntb-diagnostics.s index 90ec58809fdd9..d338deb636eaa 100644 --- a/llvm/test/MC/AArch64/SVE/cntb-diagnostics.s +++ b/llvm/test/MC/AArch64/SVE/cntb-diagnostics.s @@ -55,3 +55,8 @@ cntb x0, vl512 // CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand // CHECK-NEXT: cntb x0, vl512 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +cntb x0, #all, mul #1 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: cntb x0, #all, mul #1 +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git a/llvm/test/MC/AArch64/SVE/ptrue-diagnostics.s b/llvm/test/MC/AArch64/SVE/ptrue-diagnostics.s index 51001265acbc7..02a94b23661e6 100644 --- a/llvm/test/MC/AArch64/SVE/ptrue-diagnostics.s +++ b/llvm/test/MC/AArch64/SVE/ptrue-diagnostics.s @@ -27,3 +27,8 @@ ptrue p0.s, #32 // CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid predicate pattern // CHECK-NEXT: ptrue p0.s, #32 // CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + +ptrue p0.s, #all +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: ptrue p0.s, #all +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}:$ From b4b490496ab8994fee41005471d075812bdb3a65 Mon Sep 17 
00:00:00 2001 From: Luke Lau Date: Mon, 26 Feb 2024 19:33:44 +0800 Subject: [PATCH 297/546] [RISCV] Fix insert_subvector with fixed vector type creating invalid node (#82975) If the vector type is a fixed vector type, we convert it to a container scalable vector type to compute its reg class. But we need to keep the old fixed type so we create a result node with the same type. This code path is currently dead so I haven't been able to create a test case for it. But I have an upcoming patch for insert_subvector lowering that will exercise this. --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index c922098c55094..1b8c1434c9f2d 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2066,14 +2066,15 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { assert(Idx == 0 && V.isUndef()); SubVecContainerVT = TLI.getContainerForFixedLengthVector(SubVecVT); } + MVT ContainerVT = VT; if (VT.isFixedLengthVector()) - VT = TLI.getContainerForFixedLengthVector(VT); + ContainerVT = TLI.getContainerForFixedLengthVector(VT); const auto *TRI = Subtarget->getRegisterInfo(); unsigned SubRegIdx; std::tie(SubRegIdx, Idx) = RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs( - VT, SubVecContainerVT, Idx, TRI); + ContainerVT, SubVecContainerVT, Idx, TRI); // If the Idx hasn't been completely eliminated then this is a subvector // insert which doesn't naturally align to a vector register. These must @@ -2093,7 +2094,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // If we haven't set a SubRegIdx, then we must be going between // equally-sized LMUL groups (e.g. VR -> VR). This can be done as a copy. 
if (SubRegIdx == RISCV::NoSubRegister) { - unsigned InRegClassID = RISCVTargetLowering::getRegClassIDForVecVT(VT); + unsigned InRegClassID = + RISCVTargetLowering::getRegClassIDForVecVT(ContainerVT); assert(RISCVTargetLowering::getRegClassIDForVecVT(SubVecContainerVT) == InRegClassID && "Unexpected subvector extraction"); From 954a048d0d03d874d214cbe9ce0da456a0da35d3 Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Mon, 26 Feb 2024 11:44:54 +0000 Subject: [PATCH 298/546] [RemoveDIs] Fix SimplifyCFG behaviour to match existing behaviour (#82981) llvm.dbg.labels are deleted in SpeculativelyExecuteBB so DPLabels should be too. Modify existing test to check this (NB I couldn't find a dedicated debug-info test that checks this behaviour). --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 5 +++-- llvm/test/Transforms/SimplifyCFG/branch-fold-dbg.ll | 8 +++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 9562d44400448..bbdbfbbf776f3 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3208,8 +3208,9 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, // end of this block. for (auto &It : make_range(ThenBB->begin(), ThenBB->end())) for (DbgRecord &DR : make_early_inc_range(It.getDbgValueRange())) - if (DPValue *DPV = dyn_cast(&DR); DPV && !DPV->isDbgAssign()) - It.dropOneDbgValue(DPV); + // Drop all records except assign-kind DPValues (dbg.assign equivalent). 
+ if (DPValue *DPV = dyn_cast(&DR); !DPV || !DPV->isDbgAssign()) + It.dropOneDbgValue(&DR); BB->splice(BI->getIterator(), ThenBB, ThenBB->begin(), std::prev(ThenBB->end())); diff --git a/llvm/test/Transforms/SimplifyCFG/branch-fold-dbg.ll b/llvm/test/Transforms/SimplifyCFG/branch-fold-dbg.ll index 833d72773cdc8..ae43bab2f4fe5 100644 --- a/llvm/test/Transforms/SimplifyCFG/branch-fold-dbg.ll +++ b/llvm/test/Transforms/SimplifyCFG/branch-fold-dbg.ll @@ -26,10 +26,11 @@ define i1 @foo(i32) nounwind ssp !dbg !0 { ; CHECK-NEXT: [[TMP8:%.*]] = icmp slt i32 [[TMP0]], 0 ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[OR_COND2]], i1 false, i1 [[TMP8]], !dbg [[DBG7]] ; CHECK-NEXT: br label [[COMMON_RET]], !dbg [[DBG7]] +; CHECK-NOT: BB4 ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[SPEC_SELECT]], [[BB2]] ] ; CHECK-NEXT: ret i1 [[COMMON_RET_OP]], !dbg [[DBG14:![0-9]+]] -; + Entry: %1 = icmp slt i32 %0, 0, !dbg !5 br i1 %1, label %BB5, label %BB1, !dbg !5 @@ -55,6 +56,10 @@ BB3: ; preds = %BB2 BB4: ; preds = %BB3 %8 = icmp slt i32 %0, 0, !dbg !5 + ;; Manually inserted intrinsics; these should get deleted when this block is + ;; folded into BB2. + call void @llvm.dbg.value(metadata ptr null, metadata !7, metadata !DIExpression()), !dbg !12 + call void @llvm.dbg.label(metadata !18), !dbg !12 ret i1 %8, !dbg !14 BB5: ; preds = %BB3, %BB2, %BB1, %Entry @@ -84,3 +89,4 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone !15 = !DIFile(filename: "a.c", directory: "/private/tmp") !16 = !{i32 1, !"Debug Info Version", i32 3} !17 = !{i32 2, !"Dwarf Version", i32 4} +!18 = !DILabel(scope: !0, name: "label", file: !1, line: 1) From 3d084e37ab038200df5f5ef371fdea2fcda05680 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 26 Feb 2024 20:01:10 +0800 Subject: [PATCH 299/546] [RISCV] Add tests for fixed length concat_vector. NFC These shufflevector chains will get combined into a n-ary concat_vectors node. 
--- .../RISCV/rvv/fixed-vectors-shuffle-concat.ll | 247 ++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll new file mode 100644 index 0000000000000..6ef5aa846d6d9 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll @@ -0,0 +1,247 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s + +define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: concat_2xv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: ret + %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> + ret <8 x i32> %ab +} + +define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) { +; CHECK-LABEL: concat_4xv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v11 +; CHECK-NEXT: vmv1r.v v14, v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v14, 2 +; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 6 +; CHECK-NEXT: ret + %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> + %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> + %abcd = shufflevector <4 x i32> %ab, <4 x i32> %cd, <8 x i32> + ret <8 x i32> %abcd +} + +define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x i32> %d, <1 x i32> %e, <1 x i32> %f, <1 x i32> %g, <1 x i32> %h) { +; CHECK-LABEL: concat_8xv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v 
v16, v15 +; CHECK-NEXT: vmv1r.v v18, v13 +; CHECK-NEXT: vmv1r.v v20, v11 +; CHECK-NEXT: vmv1r.v v22, v9 +; CHECK-NEXT: vsetivli zero, 2, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v22, 1 +; CHECK-NEXT: vsetivli zero, 3, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v20, 3 +; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v12, 4 +; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v18, 5 +; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v8, v14, 6 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v16, 7 +; CHECK-NEXT: ret + %ab = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> + %cd = shufflevector <1 x i32> %c, <1 x i32> %d, <2 x i32> + %abcd = shufflevector <2 x i32> %ab, <2 x i32> %cd, <4 x i32> + %ef = shufflevector <1 x i32> %e, <1 x i32> %f, <2 x i32> + %gh = shufflevector <1 x i32> %g, <1 x i32> %h, <2 x i32> + %efgh = shufflevector <2 x i32> %ef, <2 x i32> %gh, <4 x i32> + %abcdefgh = shufflevector <4 x i32> %abcd, <4 x i32> %efgh, <8 x i32> + ret <8 x i32> %abcdefgh +} + +define <16 x i32> @concat_2xv8i32(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: concat_2xv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v12, v10 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 8 +; CHECK-NEXT: ret + %v = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> + ret <16 x i32> %v +} + +define <16 x i32> @concat_4xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { +; CHECK-LABEL: concat_4xv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v11 +; CHECK-NEXT: vmv1r.v v16, v10 +; CHECK-NEXT: vmv1r.v v20, v9 +; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; CHECK-NEXT: vslideup.vi v8, v20, 4 +; CHECK-NEXT: vsetivli zero, 12, e32, m4, tu, ma +; CHECK-NEXT: vslideup.vi v8, v16, 8 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, 
ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 12 +; CHECK-NEXT: ret + %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> + %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> + %abcd = shufflevector <8 x i32> %ab, <8 x i32> %cd, <16 x i32> + ret <16 x i32> %abcd +} + +define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d, <2 x i32> %e, <2 x i32> %f, <2 x i32> %g, <2 x i32> %h) { +; CHECK-LABEL: concat_8xv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v15 +; CHECK-NEXT: vmv1r.v v20, v14 +; CHECK-NEXT: vmv1r.v v24, v13 +; CHECK-NEXT: vmv1r.v v28, v11 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmv1r.v v4, v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m4, tu, ma +; CHECK-NEXT: vslideup.vi v8, v4, 2 +; CHECK-NEXT: vsetivli zero, 6, e32, m4, tu, ma +; CHECK-NEXT: vslideup.vi v8, v0, 4 +; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; CHECK-NEXT: vslideup.vi v8, v28, 6 +; CHECK-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; CHECK-NEXT: vslideup.vi v8, v12, 8 +; CHECK-NEXT: vsetivli zero, 12, e32, m4, tu, ma +; CHECK-NEXT: vslideup.vi v8, v24, 10 +; CHECK-NEXT: vsetivli zero, 14, e32, m4, tu, ma +; CHECK-NEXT: vslideup.vi v8, v20, 12 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vslideup.vi v8, v16, 14 +; CHECK-NEXT: ret + %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> + %cd = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> + %abcd = shufflevector <4 x i32> %ab, <4 x i32> %cd, <8 x i32> + %ef = shufflevector <2 x i32> %e, <2 x i32> %f, <4 x i32> + %gh = shufflevector <2 x i32> %g, <2 x i32> %h, <4 x i32> + %efgh = shufflevector <4 x i32> %ef, <4 x i32> %gh, <8 x i32> + %abcdefgh = shufflevector <8 x i32> %abcd, <8 x i32> %efgh, <16 x i32> + ret <16 x i32> %abcdefgh +} + +define <32 x i32> @concat_2xv16i32(<16 x i32> %a, <16 x i32> %b) { +; CHECK-LABEL: concat_2xv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv4r.v v16, v12 +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma 
+; CHECK-NEXT: vslideup.vi v8, v16, 16 +; CHECK-NEXT: ret + %ab = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> + ret <32 x i32> %ab +} + +define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { +; CHECK-LABEL: concat_4xv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v16, v14 +; CHECK-NEXT: vmv2r.v v24, v12 +; CHECK-NEXT: vmv2r.v v0, v10 +; CHECK-NEXT: vsetivli zero, 16, e32, m8, tu, ma +; CHECK-NEXT: vslideup.vi v8, v0, 8 +; CHECK-NEXT: vsetivli zero, 24, e32, m8, tu, ma +; CHECK-NEXT: vslideup.vi v8, v24, 16 +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vslideup.vi v8, v16, 24 +; CHECK-NEXT: ret + %ab = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> + %cd = shufflevector <8 x i32> %c, <8 x i32> %d, <16 x i32> + %abcd = shufflevector <16 x i32> %ab, <16 x i32> %cd, <32 x i32> + ret <32 x i32> %abcd +} + +define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h) { +; CHECK-LABEL: concat_8xv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-NEXT: vmv1r.v v16, v15 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a1, a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v16, v14 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v16, v13 +; CHECK-NEXT: 
csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v16, v12 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v11 +; CHECK-NEXT: vmv1r.v v24, v10 +; CHECK-NEXT: vmv1r.v v16, v9 +; CHECK-NEXT: vsetivli zero, 8, e32, m8, tu, ma +; CHECK-NEXT: vslideup.vi v8, v16, 4 +; CHECK-NEXT: vsetivli zero, 12, e32, m8, tu, ma +; CHECK-NEXT: vslideup.vi v8, v24, 8 +; CHECK-NEXT: vsetivli zero, 16, e32, m8, tu, ma +; CHECK-NEXT: vslideup.vi v8, v0, 12 +; CHECK-NEXT: vsetivli zero, 20, e32, m8, tu, ma +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vslideup.vi v8, v16, 16 +; CHECK-NEXT: vsetivli zero, 24, e32, m8, tu, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vslideup.vi v8, v16, 20 +; CHECK-NEXT: vsetivli zero, 28, e32, m8, tu, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vslideup.vi v8, v16, 24 +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a1, a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vslideup.vi v8, v16, 28 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> + %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 
x i32> + %abcd = shufflevector <8 x i32> %ab, <8 x i32> %cd, <16 x i32> + %ef = shufflevector <4 x i32> %e, <4 x i32> %f, <8 x i32> + %gh = shufflevector <4 x i32> %g, <4 x i32> %h, <8 x i32> + %efgh = shufflevector <8 x i32> %ef, <8 x i32> %gh, <16 x i32> + %abcdefgh = shufflevector <16 x i32> %abcd, <16 x i32> %efgh, <32 x i32> + ret <32 x i32> %abcdefgh +} From 28233408a2c8670d7d94ae1bf18a2bb5f7194c32 Mon Sep 17 00:00:00 2001 From: Jack Styles <99514724+Stylie777@users.noreply.github.com> Date: Mon, 26 Feb 2024 12:12:31 +0000 Subject: [PATCH 300/546] [CodeGen] [ARM] Make RISC-V Init Undef Pass Target Independent and add support for the ARM Architecture. (#77770) When using Greedy Register Allocation, there are times where early-clobber values are ignored, and assigned the same register. This is illegal behaviour for these instructions. To get around this, using Pseudo instructions for early-clobber registers gives them a definition and allows Greedy to assign them to a different register. This then meets the ARM Architecture Reference Manual and matches the defined behaviour. This patch takes the existing RISC-V patch and makes it target independent, then adds support for the ARM Architecture. Doing this will ensure early-clobber constraints are followed when using the ARM Architecture. Making the pass target independent will also open up the possibility that support for other architectures can be added in the future.
--- llvm/include/llvm/CodeGen/Passes.h | 5 + llvm/include/llvm/CodeGen/TargetInstrInfo.h | 9 ++ .../include/llvm/CodeGen/TargetRegisterInfo.h | 22 +++ .../llvm/CodeGen/TargetSubtargetInfo.h | 6 + llvm/include/llvm/InitializePasses.h | 1 + llvm/include/llvm/Passes/CodeGenPassBuilder.h | 2 + .../llvm/Passes/MachinePassRegistry.def | 1 + llvm/lib/CodeGen/CMakeLists.txt | 1 + llvm/lib/CodeGen/CodeGen.cpp | 1 + .../InitUndef.cpp} | 152 ++++++++---------- llvm/lib/CodeGen/TargetPassConfig.cpp | 2 + llvm/lib/Target/ARM/ARMAsmPrinter.cpp | 6 + llvm/lib/Target/ARM/ARMBaseInstrInfo.h | 18 +++ llvm/lib/Target/ARM/ARMBaseRegisterInfo.h | 27 ++++ llvm/lib/Target/ARM/ARMInstrInfo.td | 12 ++ llvm/lib/Target/ARM/ARMSubtarget.h | 7 + llvm/lib/Target/RISCV/CMakeLists.txt | 1 - llvm/lib/Target/RISCV/RISCV.h | 4 - llvm/lib/Target/RISCV/RISCVInstrInfo.h | 17 ++ llvm/lib/Target/RISCV/RISCVRegisterInfo.h | 21 +++ llvm/lib/Target/RISCV/RISCVSubtarget.h | 2 + llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 10 +- llvm/test/CodeGen/AArch64/O3-pipeline.ll | 1 + llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 4 + llvm/test/CodeGen/ARM/O3-pipeline.ll | 1 + llvm/test/CodeGen/LoongArch/opt-pipeline.ll | 1 + llvm/test/CodeGen/PowerPC/O3-pipeline.ll | 1 + llvm/test/CodeGen/RISCV/O0-pipeline.ll | 2 +- llvm/test/CodeGen/RISCV/O3-pipeline.ll | 2 +- .../rvv/handle-noreg-with-implicit-def.mir | 2 +- .../rvv/subregister-undef-early-clobber.mir | 2 +- .../RISCV/rvv/undef-earlyclobber-chain.mir | 2 +- .../CodeGen/Thumb2/mve-intrinsics/vcaddq.ll | 11 ++ .../Thumb2/mve-laneinterleaving-cost.ll | 4 +- llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll | 114 ++++++------- llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll | 28 +++- llvm/test/CodeGen/X86/opt-pipeline.ll | 1 + .../secondary/llvm/lib/Target/RISCV/BUILD.gn | 1 - 38 files changed, 333 insertions(+), 171 deletions(-) rename llvm/lib/{Target/RISCV/RISCVRVVInitUndef.cpp => CodeGen/InitUndef.cpp} (61%) diff --git a/llvm/include/llvm/CodeGen/Passes.h 
b/llvm/include/llvm/CodeGen/Passes.h index 3f0d81fa1d14b..f850767270a4f 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -196,6 +196,11 @@ namespace llvm { /// This pass reads flow sensitive profile. extern char &MIRProfileLoaderPassID; + // This pass gives undef values a Pseudo Instruction definition for + // Instructions to ensure early-clobber is followed when using the greedy + // register allocator. + extern char &InitUndefID; + /// FastRegisterAllocation Pass - This pass register allocates as fast as /// possible. It is best suited for debug code where live ranges are short. /// diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 138c65785430f..e7787aafb98e2 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -2223,6 +2223,15 @@ class TargetInstrInfo : public MCInstrInfo { llvm_unreachable("unknown number of operands necessary"); } + /// Gets the opcode for the Pseudo Instruction used to initialize + /// the undef value. If no Instruction is available, this will + /// fail compilation. + virtual unsigned getUndefInitOpcode(unsigned RegClassID) const { + (void)RegClassID; + + llvm_unreachable("Unexpected register class."); + } + private: mutable std::unique_ptr Formatter; unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode; diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 5098fc68df3b2..e7c9ecd2e1851 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -1172,6 +1172,28 @@ class TargetRegisterInfo : public MCRegisterInfo { virtual bool isNonallocatableRegisterCalleeSave(MCRegister Reg) const { return false; } + + /// Returns the Largest Super Class that is being initialized. 
There + /// should be a Pseudo Instruction implemented for the super class + /// that is being returned to ensure that Init Undef can apply the + /// initialization correctly. + virtual const TargetRegisterClass * + getLargestSuperClass(const TargetRegisterClass *RC) const { + llvm_unreachable("Unexpected target register class."); + } + + /// Returns if the architecture being targeted has the required Pseudo + /// Instructions for initializing the register. By default this returns false, + /// but where it is overriden for an architecture, the behaviour will be + /// different. This can either be a check to ensure the Register Class is + /// present, or to return true as an indication the architecture supports the + /// pass. If using the method that does not check for the Register Class, it + /// is imperative to ensure all required Pseudo Instructions are implemented, + /// otherwise compilation may fail with an `Unexpected register class` error. + virtual bool + doesRegClassHavePseudoInitUndef(const TargetRegisterClass *RC) const { + return false; + } }; //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h index a064dec7d8ab3..7f8ed5c501989 100644 --- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h +++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -327,6 +327,12 @@ class TargetSubtargetInfo : public MCSubtargetInfo { /// Get the list of MacroFusion predicates. virtual std::vector getMacroFusions() const { return {}; }; + + /// supportsInitUndef is used to determine if an architecture supports + /// the Init Undef Pass. By default, it is assumed that it will not support + /// the pass, with architecture specific overrides providing the information + /// where they are implemented. 
+ virtual bool supportsInitUndef() const { return false; } }; } // end namespace llvm diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index ee91c3ec3ddc2..e4bf3868d0006 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -300,6 +300,7 @@ void initializeTLSVariableHoistLegacyPassPass(PassRegistry &); void initializeTwoAddressInstructionPassPass(PassRegistry&); void initializeTypeBasedAAWrapperPassPass(PassRegistry&); void initializeTypePromotionLegacyPass(PassRegistry&); +void initializeInitUndefPass(PassRegistry &); void initializeUniformityInfoWrapperPassPass(PassRegistry &); void initializeUnifyLoopExitsLegacyPassPass(PassRegistry &); void initializeUnpackMachineBundlesPass(PassRegistry&); diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index dc60727729f73..82a17e882b3c4 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -1066,6 +1066,8 @@ void CodeGenPassBuilder::addOptimizedRegAlloc( AddMachinePass &addPass) const { addPass(DetectDeadLanesPass()); + addPass(InitUndefPass()); + addPass(ProcessImplicitDefsPass()); // Edge splitting is smarter with machine loop info. 
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index d8972080beeb0..016602730e0e9 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -173,6 +173,7 @@ DUMMY_MACHINE_FUNCTION_PASS("fs-profile-loader", MIRProfileLoaderNewPass) DUMMY_MACHINE_FUNCTION_PASS("funclet-layout", FuncletLayoutPass) DUMMY_MACHINE_FUNCTION_PASS("gc-empty-basic-blocks", GCEmptyBasicBlocksPass) DUMMY_MACHINE_FUNCTION_PASS("implicit-null-checks", ImplicitNullChecksPass) +DUMMY_MACHINE_FUNCTION_PASS("init-undef-pass", InitUndefPass) DUMMY_MACHINE_FUNCTION_PASS("instruction-select", InstructionSelectPass) DUMMY_MACHINE_FUNCTION_PASS("irtranslator", IRTranslatorPass) DUMMY_MACHINE_FUNCTION_PASS("kcfi", MachineKCFIPass) diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index d49bcf8a0c8ee..e02c1d6417e07 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -75,6 +75,7 @@ add_llvm_component_library(LLVMCodeGen IfConversion.cpp ImplicitNullChecks.cpp IndirectBrExpandPass.cpp + InitUndef.cpp InlineSpiller.cpp InterferenceCache.cpp InterleavedAccessPass.cpp diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index be1813451228d..544f1b7f59353 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -54,6 +54,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeIfConverterPass(Registry); initializeImplicitNullChecksPass(Registry); initializeIndirectBrExpandLegacyPassPass(Registry); + initializeInitUndefPass(Registry); initializeInterleavedLoadCombinePass(Registry); initializeInterleavedAccessPass(Registry); initializeJMCInstrumenterPass(Registry); diff --git a/llvm/lib/Target/RISCV/RISCVRVVInitUndef.cpp b/llvm/lib/CodeGen/InitUndef.cpp similarity index 61% rename from llvm/lib/Target/RISCV/RISCVRVVInitUndef.cpp rename to llvm/lib/CodeGen/InitUndef.cpp 
index 735fc1350c009..96ac385b6abf8 100644 --- a/llvm/lib/Target/RISCV/RISCVRVVInitUndef.cpp +++ b/llvm/lib/CodeGen/InitUndef.cpp @@ -1,4 +1,4 @@ -//===- RISCVRVVInitUndef.cpp - Initialize undef vector value to pseudo ----===// +//===- InitUndef.cpp - Initialize undef value to pseudo ----===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,23 +6,22 @@ // //===----------------------------------------------------------------------===// // -// This file implements a function pass that initializes undef vector value to -// temporary pseudo instruction and remove it in expandpseudo pass to prevent -// register allocation resulting in a constraint violated result for vector -// instruction. It also rewrites the NoReg tied operand back to an -// IMPLICIT_DEF. +// This file implements a function pass that initializes undef value to +// temporary pseudo instruction to prevent register allocation resulting in a +// constraint violated result for the particular instruction. It also rewrites +// the NoReg tied operand back to an IMPLICIT_DEF. 
// -// RISC-V vector instruction has register overlapping constraint for certain -// instructions, and will cause illegal instruction trap if violated, we use -// early clobber to model this constraint, but it can't prevent register -// allocator allocated same or overlapped if the input register is undef value, -// so convert IMPLICIT_DEF to temporary pseudo instruction and remove it later -// could prevent that happen, it's not best way to resolve this, and it might +// Certain instructions have register overlapping constraints, and +// will cause illegal instruction trap if violated, we use early clobber to +// model this constraint, but it can't prevent register allocator allocating +// same or overlapped if the input register is undef value, so convert +// IMPLICIT_DEF to temporary pseudo instruction and remove it later could +// prevent that happen, it's not best way to resolve this, and it might // change the order of program or increase the register pressure, so ideally we // should model the constraint right, but before we model the constraint right, // it's the only way to prevent that happen. // -// When we enable the subregister liveness option, it will also trigger same +// When we enable the subregister liveness option, it will also trigger the same // issue due to the partial of register is undef. If we pseudoinit the whole // register, then it will generate redundant COPY instruction. Currently, it // will generate INSERT_SUBREG to make sure the whole register is occupied @@ -31,7 +30,7 @@ // // See also: https://github.com/llvm/llvm-project/issues/50157 // -// Additionally, this pass rewrites tied operands of vector instructions +// Additionally, this pass rewrites tied operands of instructions // from NoReg to IMPLICIT_DEF. (Not that this is a non-overlapping set of // operands to the above.) 
We use NoReg to side step a MachineCSE // optimization quality problem but need to convert back before @@ -39,23 +38,31 @@ // //===----------------------------------------------------------------------===// -#include "RISCV.h" -#include "RISCVSubtarget.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/DetectDeadLanes.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/MC/MCRegister.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" + using namespace llvm; -#define DEBUG_TYPE "riscv-init-undef" -#define RISCV_INIT_UNDEF_NAME "RISC-V init undef pass" +#define DEBUG_TYPE "init-undef" +#define INIT_UNDEF_NAME "Init Undef Pass" namespace { -class RISCVInitUndef : public MachineFunctionPass { +class InitUndef : public MachineFunctionPass { const TargetInstrInfo *TII; MachineRegisterInfo *MRI; - const RISCVSubtarget *ST; + const TargetSubtargetInfo *ST; const TargetRegisterInfo *TRI; // Newly added vregs, assumed to be fully rewritten @@ -65,7 +72,7 @@ class RISCVInitUndef : public MachineFunctionPass { public: static char ID; - RISCVInitUndef() : MachineFunctionPass(ID) {} + InitUndef() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -73,14 +80,11 @@ class RISCVInitUndef : public MachineFunctionPass { MachineFunctionPass::getAnalysisUsage(AU); } - StringRef getPassName() const override { return RISCV_INIT_UNDEF_NAME; } + StringRef getPassName() const override { return INIT_UNDEF_NAME; } private: bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB, const DeadLaneDetector &DLD); - bool isVectorRegClass(const Register R); - const 
TargetRegisterClass * - getVRLargestSuperClass(const TargetRegisterClass *RC) const; bool handleSubReg(MachineFunction &MF, MachineInstr &MI, const DeadLaneDetector &DLD); bool fixupIllOperand(MachineInstr *MI, MachineOperand &MO); @@ -89,45 +93,9 @@ class RISCVInitUndef : public MachineFunctionPass { } // end anonymous namespace -char RISCVInitUndef::ID = 0; -INITIALIZE_PASS(RISCVInitUndef, DEBUG_TYPE, RISCV_INIT_UNDEF_NAME, false, false) -char &llvm::RISCVInitUndefID = RISCVInitUndef::ID; - -const TargetRegisterClass * -RISCVInitUndef::getVRLargestSuperClass(const TargetRegisterClass *RC) const { - if (RISCV::VRM8RegClass.hasSubClassEq(RC)) - return &RISCV::VRM8RegClass; - if (RISCV::VRM4RegClass.hasSubClassEq(RC)) - return &RISCV::VRM4RegClass; - if (RISCV::VRM2RegClass.hasSubClassEq(RC)) - return &RISCV::VRM2RegClass; - if (RISCV::VRRegClass.hasSubClassEq(RC)) - return &RISCV::VRRegClass; - return RC; -} - -bool RISCVInitUndef::isVectorRegClass(const Register R) { - const TargetRegisterClass *RC = MRI->getRegClass(R); - return RISCV::VRRegClass.hasSubClassEq(RC) || - RISCV::VRM2RegClass.hasSubClassEq(RC) || - RISCV::VRM4RegClass.hasSubClassEq(RC) || - RISCV::VRM8RegClass.hasSubClassEq(RC); -} - -static unsigned getUndefInitOpcode(unsigned RegClassID) { - switch (RegClassID) { - case RISCV::VRRegClassID: - return RISCV::PseudoRVVInitUndefM1; - case RISCV::VRM2RegClassID: - return RISCV::PseudoRVVInitUndefM2; - case RISCV::VRM4RegClassID: - return RISCV::PseudoRVVInitUndefM4; - case RISCV::VRM8RegClassID: - return RISCV::PseudoRVVInitUndefM8; - default: - llvm_unreachable("Unexpected register class."); - } -} +char InitUndef::ID = 0; +INITIALIZE_PASS(InitUndef, DEBUG_TYPE, INIT_UNDEF_NAME, false, false) +char &llvm::InitUndefID = InitUndef::ID; static bool isEarlyClobberMI(MachineInstr &MI) { return llvm::any_of(MI.defs(), [](const MachineOperand &DefMO) { @@ -143,7 +111,7 @@ static bool findImplictDefMIFromReg(Register Reg, MachineRegisterInfo *MRI) { return 
false; } -bool RISCVInitUndef::handleReg(MachineInstr *MI) { +bool InitUndef::handleReg(MachineInstr *MI) { bool Changed = false; for (auto &UseMO : MI->uses()) { if (!UseMO.isReg()) @@ -152,7 +120,7 @@ bool RISCVInitUndef::handleReg(MachineInstr *MI) { continue; if (!UseMO.getReg().isVirtual()) continue; - if (!isVectorRegClass(UseMO.getReg())) + if (!TRI->doesRegClassHavePseudoInitUndef(MRI->getRegClass(UseMO.getReg()))) continue; if (UseMO.isUndef() || findImplictDefMIFromReg(UseMO.getReg(), MRI)) @@ -161,8 +129,8 @@ bool RISCVInitUndef::handleReg(MachineInstr *MI) { return Changed; } -bool RISCVInitUndef::handleSubReg(MachineFunction &MF, MachineInstr &MI, - const DeadLaneDetector &DLD) { +bool InitUndef::handleSubReg(MachineFunction &MF, MachineInstr &MI, + const DeadLaneDetector &DLD) { bool Changed = false; for (MachineOperand &UseMO : MI.uses()) { @@ -172,6 +140,8 @@ bool RISCVInitUndef::handleSubReg(MachineFunction &MF, MachineInstr &MI, continue; if (UseMO.isTied()) continue; + if (!TRI->doesRegClassHavePseudoInitUndef(MRI->getRegClass(UseMO.getReg()))) + continue; Register Reg = UseMO.getReg(); if (NewRegs.count(Reg)) @@ -183,7 +153,7 @@ bool RISCVInitUndef::handleSubReg(MachineFunction &MF, MachineInstr &MI, continue; const TargetRegisterClass *TargetRegClass = - getVRLargestSuperClass(MRI->getRegClass(Reg)); + TRI->getLargestSuperClass(MRI->getRegClass(Reg)); LaneBitmask NeedDef = Info.UsedLanes & ~Info.DefinedLanes; @@ -202,11 +172,12 @@ bool RISCVInitUndef::handleSubReg(MachineFunction &MF, MachineInstr &MI, Register LatestReg = Reg; for (auto ind : SubRegIndexNeedInsert) { Changed = true; - const TargetRegisterClass *SubRegClass = - getVRLargestSuperClass(TRI->getSubRegisterClass(TargetRegClass, ind)); + const TargetRegisterClass *SubRegClass = TRI->getLargestSuperClass( + TRI->getSubRegisterClass(TargetRegClass, ind)); Register TmpInitSubReg = MRI->createVirtualRegister(SubRegClass); + LLVM_DEBUG(dbgs() << "Register Class ID" << 
SubRegClass->getID() << "\n"); BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), - TII->get(getUndefInitOpcode(SubRegClass->getID())), + TII->get(TII->getUndefInitOpcode(SubRegClass->getID())), TmpInitSubReg); Register NewReg = MRI->createVirtualRegister(TargetRegClass); BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), @@ -223,15 +194,16 @@ bool RISCVInitUndef::handleSubReg(MachineFunction &MF, MachineInstr &MI, return Changed; } -bool RISCVInitUndef::fixupIllOperand(MachineInstr *MI, MachineOperand &MO) { +bool InitUndef::fixupIllOperand(MachineInstr *MI, MachineOperand &MO) { LLVM_DEBUG( - dbgs() << "Emitting PseudoRVVInitUndef for implicit vector register " + dbgs() << "Emitting PseudoInitUndef Instruction for implicit register " << MO.getReg() << '\n'); const TargetRegisterClass *TargetRegClass = - getVRLargestSuperClass(MRI->getRegClass(MO.getReg())); - unsigned Opcode = getUndefInitOpcode(TargetRegClass->getID()); + TRI->getLargestSuperClass(MRI->getRegClass(MO.getReg())); + LLVM_DEBUG(dbgs() << "Register Class ID" << TargetRegClass->getID() << "\n"); + unsigned Opcode = TII->getUndefInitOpcode(TargetRegClass->getID()); Register NewReg = MRI->createVirtualRegister(TargetRegClass); BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(Opcode), NewReg); MO.setReg(NewReg); @@ -240,9 +212,8 @@ bool RISCVInitUndef::fixupIllOperand(MachineInstr *MI, MachineOperand &MO) { return true; } -bool RISCVInitUndef::processBasicBlock(MachineFunction &MF, - MachineBasicBlock &MBB, - const DeadLaneDetector &DLD) { +bool InitUndef::processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB, + const DeadLaneDetector &DLD) { bool Changed = false; for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { MachineInstr &MI = *I; @@ -252,15 +223,15 @@ bool RISCVInitUndef::processBasicBlock(MachineFunction &MF, unsigned UseOpIdx; if (MI.getNumDefs() != 0 && MI.isRegTiedToUseOperand(0, &UseOpIdx)) { MachineOperand &UseMO = MI.getOperand(UseOpIdx); - if 
(UseMO.getReg() == RISCV::NoRegister) { + if (UseMO.getReg() == MCRegister::NoRegister) { const TargetRegisterClass *RC = - TII->getRegClass(MI.getDesc(), UseOpIdx, TRI, MF); + TII->getRegClass(MI.getDesc(), UseOpIdx, TRI, MF); Register NewDest = MRI->createVirtualRegister(RC); // We don't have a way to update dead lanes, so keep track of the // new register so that we avoid querying it later. NewRegs.insert(NewDest); - BuildMI(MBB, I, I->getDebugLoc(), - TII->get(TargetOpcode::IMPLICIT_DEF), NewDest); + BuildMI(MBB, I, I->getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF), + NewDest); UseMO.setReg(NewDest); Changed = true; } @@ -275,9 +246,16 @@ bool RISCVInitUndef::processBasicBlock(MachineFunction &MF, return Changed; } -bool RISCVInitUndef::runOnMachineFunction(MachineFunction &MF) { - ST = &MF.getSubtarget(); - if (!ST->hasVInstructions()) +bool InitUndef::runOnMachineFunction(MachineFunction &MF) { + ST = &MF.getSubtarget(); + + // supportsInitUndef is implemented to reflect if an architecture has support + // for the InitUndef pass. Support comes from having the relevant Pseudo + // instructions that can be used to initialize the register. The function + // returns false by default so requires an implementation per architecture. + // Support can be added by overriding the function in a way that best fits + // the architecture. 
+ if (!ST->supportsInitUndef()) return false; MRI = &MF.getRegInfo(); @@ -297,5 +275,3 @@ bool RISCVInitUndef::runOnMachineFunction(MachineFunction &MF) { return Changed; } - -FunctionPass *llvm::createRISCVInitUndefPass() { return new RISCVInitUndef(); } diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 2ed39a5696e20..cf068ece8d4ca 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1427,6 +1427,8 @@ void TargetPassConfig::addFastRegAlloc() { void TargetPassConfig::addOptimizedRegAlloc() { addPass(&DetectDeadLanesID); + addPass(&InitUndefID); + addPass(&ProcessImplicitDefsID); // LiveVariables currently requires pure SSA form. diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 15cda9b9432d5..642739a29d6b0 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -2409,6 +2409,12 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) { case ARM::SEH_EpilogEnd: ATS.emitARMWinCFIEpilogEnd(); return; + + case ARM::PseudoARMInitUndefMQPR: + case ARM::PseudoARMInitUndefSPR: + case ARM::PseudoARMInitUndefDPR_VFP2: + case ARM::PseudoARMInitUndefGPR: + return; } MCInst TmpInst; diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index c3b475e0306ee..30f0730774b78 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -13,16 +13,21 @@ #ifndef LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H #define LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H +#include "ARMBaseRegisterInfo.h" #include "MCTargetDesc/ARMBaseInfo.h" +#include "MCTargetDesc/ARMMCTargetDesc.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" +#include 
"llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsARM.h" +#include "llvm/Support/ErrorHandling.h" #include #include @@ -536,6 +541,19 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { std::optional isAddImmediate(const MachineInstr &MI, Register Reg) const override; + + unsigned getUndefInitOpcode(unsigned RegClassID) const override { + if (RegClassID == ARM::MQPRRegClass.getID()) + return ARM::PseudoARMInitUndefMQPR; + if (RegClassID == ARM::SPRRegClass.getID()) + return ARM::PseudoARMInitUndefSPR; + if (RegClassID == ARM::DPR_VFP2RegClass.getID()) + return ARM::PseudoARMInitUndefDPR_VFP2; + if (RegClassID == ARM::GPRRegClass.getID()) + return ARM::PseudoARMInitUndefGPR; + + llvm_unreachable("Unexpected register class."); + } }; /// Get the operands corresponding to the given \p Pred value. By default, the diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index 926d702b4092a..53803cff8b90a 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -240,6 +240,33 @@ class ARMBaseRegisterInfo : public ARMGenRegisterInfo { unsigned SrcSubReg) const override; int getSEHRegNum(unsigned i) const { return getEncodingValue(i); } + + const TargetRegisterClass * + getLargestSuperClass(const TargetRegisterClass *RC) const override { + if (ARM::MQPRRegClass.hasSubClassEq(RC)) + return &ARM::MQPRRegClass; + if (ARM::SPRRegClass.hasSubClassEq(RC)) + return &ARM::SPRRegClass; + if (ARM::DPR_VFP2RegClass.hasSubClassEq(RC)) + return &ARM::DPR_VFP2RegClass; + if (ARM::GPRRegClass.hasSubClassEq(RC)) + return &ARM::GPRRegClass; + return RC; + } + + bool doesRegClassHavePseudoInitUndef( + const TargetRegisterClass *RC) const override { + (void)RC; + // For the ARM Architecture we want to always return true because all + // required PseudoInitUndef types have 
been added. If compilation fails due + // to `Unexpected register class`, this is likely to be because the specific + // register being used is not support by Init Undef and needs the Pseudo + // Instruction adding to ARMInstrInfo.td. If this is implemented as a + // conditional check, this could create a false positive where Init Undef is + // not running, skipping the instruction and moving to the next. This could + // lead to illegal instructions being generated by the register allocator. + return true; + } }; } // end namespace llvm diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 074dea36b6414..08b519e4d5cbf 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -6534,3 +6534,15 @@ let isPseudo = 1 in { let isTerminator = 1 in def SEH_EpilogEnd : PseudoInst<(outs), (ins), NoItinerary, []>, Sched<[]>; } + + +//===----------------------------------------------------------------------===// +// Pseudo Instructions for use when early-clobber is defined and Greedy Register +// Allocation is used. This ensures the constraint is used properly. 
+//===----------------------------------------------------------------------===// +let isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in { + def PseudoARMInitUndefMQPR : PseudoInst<(outs MQPR:$vd), (ins), NoItinerary, []>; + def PseudoARMInitUndefSPR : PseudoInst<(outs SPR:$sd), (ins), NoItinerary, []>; + def PseudoARMInitUndefDPR_VFP2 : PseudoInst<(outs DPR_VFP2:$dd), (ins), NoItinerary, []>; + def PseudoARMInitUndefGPR : PseudoInst<(outs GPR:$rd), (ins), NoItinerary, []>; +} diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index 91f3978b041a3..044b1c4c54e0c 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -278,6 +278,13 @@ class ARMSubtarget : public ARMGenSubtargetInfo { return &InstrInfo->getRegisterInfo(); } + /// The correct instructions have been implemented to initialize undef + /// registers, therefore the ARM Architecture is supported by the Init Undef + /// Pass. This will return true as the pass needs to be supported for all + /// types of instructions. The pass will then perform more checks to ensure it + /// should be applying the Pseudo Instructions. 
+ bool supportsInitUndef() const override { return true; } + const CallLowering *getCallLowering() const override; InstructionSelector *getInstructionSelector() const override; const LegalizerInfo *getLegalizerInfo() const override; diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index ac88cd49db4e4..8715403f3839a 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -51,7 +51,6 @@ add_llvm_target(RISCVCodeGen RISCVMoveMerger.cpp RISCVPushPopOptimizer.cpp RISCVRegisterInfo.cpp - RISCVRVVInitUndef.cpp RISCVSubtarget.cpp RISCVTargetMachine.cpp RISCVTargetObjectFile.cpp diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index 9eb18099894b2..7af543f018ccb 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -72,10 +72,6 @@ void initializeRISCVInsertWriteVXRMPass(PassRegistry &); FunctionPass *createRISCVRedundantCopyEliminationPass(); void initializeRISCVRedundantCopyEliminationPass(PassRegistry &); -FunctionPass *createRISCVInitUndefPass(); -void initializeRISCVInitUndefPass(PassRegistry &); -extern char &RISCVInitUndefID; - FunctionPass *createRISCVMoveMergePass(); void initializeRISCVMoveMergePass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 0f7d3e4e43390..2d56734259963 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_RISCV_RISCVINSTRINFO_H #define LLVM_LIB_TARGET_RISCV_RISCVINSTRINFO_H +#include "RISCV.h" #include "RISCVRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/IR/DiagnosticInfo.h" @@ -20,6 +21,7 @@ #define GET_INSTRINFO_HEADER #define GET_INSTRINFO_OPERAND_ENUM #include "RISCVGenInstrInfo.inc" +#include "RISCVGenRegisterInfo.inc" namespace llvm { @@ -262,6 +264,21 @@ class RISCVInstrInfo : public RISCVGenInstrInfo { ArrayRef> 
getSerializableMachineMemOperandTargetFlags() const override; + unsigned getUndefInitOpcode(unsigned RegClassID) const override { + switch (RegClassID) { + case RISCV::VRRegClassID: + return RISCV::PseudoRVVInitUndefM1; + case RISCV::VRM2RegClassID: + return RISCV::PseudoRVVInitUndefM2; + case RISCV::VRM4RegClassID: + return RISCV::PseudoRVVInitUndefM4; + case RISCV::VRM8RegClassID: + return RISCV::PseudoRVVInitUndefM8; + default: + llvm_unreachable("Unexpected register class."); + } + } + protected: const RISCVSubtarget &STI; diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h index 431ea23b3e2d0..e46fe8ecb900f 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h @@ -96,6 +96,27 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { SmallVectorImpl &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const override; + + const TargetRegisterClass * + getLargestSuperClass(const TargetRegisterClass *RC) const override { + if (RISCV::VRM8RegClass.hasSubClassEq(RC)) + return &RISCV::VRM8RegClass; + if (RISCV::VRM4RegClass.hasSubClassEq(RC)) + return &RISCV::VRM4RegClass; + if (RISCV::VRM2RegClass.hasSubClassEq(RC)) + return &RISCV::VRM2RegClass; + if (RISCV::VRRegClass.hasSubClassEq(RC)) + return &RISCV::VRRegClass; + return RC; + } + + bool doesRegClassHavePseudoInitUndef( + const TargetRegisterClass *RC) const override { + return RISCV::VRRegClass.hasSubClassEq(RC) || + RISCV::VRM2RegClass.hasSubClassEq(RC) || + RISCV::VRM4RegClass.hasSubClassEq(RC) || + RISCV::VRM8RegClass.hasSubClassEq(RC); + } }; } diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 9ebf278d6749f..ba108912d9340 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -289,6 +289,8 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { }; unsigned 
getMinimumJumpTableEntries() const; + + bool supportsInitUndef() const override { return hasVInstructions(); } }; } // End llvm namespace diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index be2d4c5c17d1e..6fe0abaccb9d9 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -124,7 +124,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeRISCVInsertReadWriteCSRPass(*PR); initializeRISCVInsertWriteVXRMPass(*PR); initializeRISCVDAGToDAGISelPass(*PR); - initializeRISCVInitUndefPass(*PR); initializeRISCVMoveMergePass(*PR); initializeRISCVPushPopOptPass(*PR); } @@ -383,7 +382,6 @@ class RISCVPassConfig : public TargetPassConfig { bool addRegAssignAndRewriteOptimized() override; void addPreRegAlloc() override; void addPostRegAlloc() override; - void addOptimizedRegAlloc() override; void addFastRegAlloc() override; }; } // namespace @@ -564,14 +562,8 @@ void RISCVPassConfig::addPreRegAlloc() { addPass(createRISCVInsertWriteVXRMPass()); } -void RISCVPassConfig::addOptimizedRegAlloc() { - insertPass(&DetectDeadLanesID, &RISCVInitUndefID); - - TargetPassConfig::addOptimizedRegAlloc(); -} - void RISCVPassConfig::addFastRegAlloc() { - addPass(createRISCVInitUndefPass()); + addPass(&InitUndefID); TargetPassConfig::addFastRegAlloc(); } diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 638f26298ee26..ae0dbed09979b 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -155,6 +155,7 @@ ; CHECK-NEXT: AArch64 MI Peephole Optimization pass ; CHECK-NEXT: AArch64 Dead register definitions ; CHECK-NEXT: Detect Dead Lanes +; CHECK-NEXT: Init Undef Pass ; CHECK-NEXT: Process Implicit Definitions ; CHECK-NEXT: Remove unreachable machine basic blocks ; CHECK-NEXT: Live Variable Analysis diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll 
b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 48f00a82e3e1c..c67328a025b85 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -321,6 +321,7 @@ ; GCN-O1-NEXT: Register Usage Information Propagation ; GCN-O1-NEXT: Detect Dead Lanes ; GCN-O1-NEXT: Remove dead machine instructions +; GCN-O1-NEXT: Init Undef Pass ; GCN-O1-NEXT: Process Implicit Definitions ; GCN-O1-NEXT: Remove unreachable machine basic blocks ; GCN-O1-NEXT: Live Variable Analysis @@ -618,6 +619,7 @@ ; GCN-O1-OPTS-NEXT: Register Usage Information Propagation ; GCN-O1-OPTS-NEXT: Detect Dead Lanes ; GCN-O1-OPTS-NEXT: Remove dead machine instructions +; GCN-O1-OPTS-NEXT: Init Undef Pass ; GCN-O1-OPTS-NEXT: Process Implicit Definitions ; GCN-O1-OPTS-NEXT: Remove unreachable machine basic blocks ; GCN-O1-OPTS-NEXT: Live Variable Analysis @@ -920,6 +922,7 @@ ; GCN-O2-NEXT: Register Usage Information Propagation ; GCN-O2-NEXT: Detect Dead Lanes ; GCN-O2-NEXT: Remove dead machine instructions +; GCN-O2-NEXT: Init Undef Pass ; GCN-O2-NEXT: Process Implicit Definitions ; GCN-O2-NEXT: Remove unreachable machine basic blocks ; GCN-O2-NEXT: Live Variable Analysis @@ -1235,6 +1238,7 @@ ; GCN-O3-NEXT: Register Usage Information Propagation ; GCN-O3-NEXT: Detect Dead Lanes ; GCN-O3-NEXT: Remove dead machine instructions +; GCN-O3-NEXT: Init Undef Pass ; GCN-O3-NEXT: Process Implicit Definitions ; GCN-O3-NEXT: Remove unreachable machine basic blocks ; GCN-O3-NEXT: Live Variable Analysis diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll index 5e565970fc3a8..5914e98549fcc 100644 --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -113,6 +113,7 @@ ; CHECK-NEXT: ARM pre- register allocation load / store optimization pass ; CHECK-NEXT: ARM A15 S->D optimizer ; CHECK-NEXT: Detect Dead Lanes +; CHECK-NEXT: Init Undef Pass ; CHECK-NEXT: Process Implicit Definitions ; CHECK-NEXT: Remove unreachable 
machine basic blocks ; CHECK-NEXT: Live Variable Analysis diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll index 3134d940545e8..a31eb8d11a35a 100644 --- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll +++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll @@ -105,6 +105,7 @@ ; CHECK-NEXT: Remove dead machine instructions ; CHECK-NEXT: LoongArch Pre-RA pseudo instruction expansion pass ; CHECK-NEXT: Detect Dead Lanes +; CHECK-NEXT: Init Undef Pass ; CHECK-NEXT: Process Implicit Definitions ; CHECK-NEXT: Remove unreachable machine basic blocks ; CHECK-NEXT: Live Variable Analysis diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll index 6ce4416211cc4..f94f91b38fecc 100644 --- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll +++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll @@ -149,6 +149,7 @@ ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Modulo Software Pipelining ; CHECK-NEXT: Detect Dead Lanes +; CHECK-NEXT: Init Undef Pass ; CHECK-NEXT: Process Implicit Definitions ; CHECK-NEXT: Remove unreachable machine basic blocks ; CHECK-NEXT: Live Variable Analysis diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll index e90fa24761bc1..faf37545e1a11 100644 --- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll @@ -43,7 +43,7 @@ ; CHECK-NEXT: RISC-V Insert VSETVLI pass ; CHECK-NEXT: RISC-V Insert Read/Write CSR Pass ; CHECK-NEXT: RISC-V Insert Write VXRM Pass -; CHECK-NEXT: RISC-V init undef pass +; CHECK-NEXT: Init Undef Pass ; CHECK-NEXT: Eliminate PHI nodes for register allocation ; CHECK-NEXT: Two-Address instruction pass ; CHECK-NEXT: Fast Register Allocator diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index 364c1e430b915..90472f246918f 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -120,7 +120,7 
@@ ; CHECK-NEXT: RISC-V Insert Read/Write CSR Pass ; CHECK-NEXT: RISC-V Insert Write VXRM Pass ; CHECK-NEXT: Detect Dead Lanes -; CHECK-NEXT: RISC-V init undef pass +; CHECK-NEXT: Init Undef Pass ; CHECK-NEXT: Process Implicit Definitions ; CHECK-NEXT: Remove unreachable machine basic blocks ; CHECK-NEXT: Live Variable Analysis diff --git a/llvm/test/CodeGen/RISCV/rvv/handle-noreg-with-implicit-def.mir b/llvm/test/CodeGen/RISCV/rvv/handle-noreg-with-implicit-def.mir index 4102aa8aa4d72..e090b313d4f7b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/handle-noreg-with-implicit-def.mir +++ b/llvm/test/CodeGen/RISCV/rvv/handle-noreg-with-implicit-def.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 -# RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs -run-pass=riscv-init-undef -o - %s | FileCheck %s --check-prefix=MIR +# RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs -run-pass=init-undef -o - %s | FileCheck %s --check-prefix=MIR ... --- name: vrgather_all_undef diff --git a/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir b/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir index bf12a4df88d38..9cafb323dc65c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir +++ b/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc %s -mtriple=riscv64 -mattr=+v -riscv-enable-subreg-liveness -run-pass=riscv-init-undef -o - | FileCheck %s +# RUN: llc %s -mtriple=riscv64 -mattr=+v -riscv-enable-subreg-liveness -run-pass=init-undef -o - | FileCheck %s ... 
--- diff --git a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.mir b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.mir index 58b2687824aa1..dcf61c048ff0e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.mir +++ b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=riscv32 -mattr=+v -riscv-enable-subreg-liveness -run-pass riscv-init-undef -run-pass machineverifier %s -o - | FileCheck %s +# RUN: llc -mtriple=riscv32 -mattr=+v -riscv-enable-subreg-liveness -run-pass init-undef -run-pass machineverifier %s -o - | FileCheck %s --- | source_filename = "" diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcaddq.ll index 9bb24fc61ccef..02234c6372536 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcaddq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcaddq.ll @@ -699,6 +699,17 @@ entry: ret <4 x i32> %0 } +define arm_aapcs_vfpcc <4 x i32> @test_vhcaddq_rot270_s32_undef() { +; CHECK-LABEL: test_vhcaddq_rot270_s32_undef: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vhcadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #270 +; CHECK-NOT: vhcadd.s32 q[[REG:[0-9]+]], q{{[0-9]+}}, q[[REG]], #270 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x i32> @llvm.arm.mve.vcaddq.v4i32(i32 0, i32 1, <4 x i32> undef, <4 x i32> undef) + ret <4 x i32> %0 +} + define arm_aapcs_vfpcc <16 x i8> @test_vhcaddq_rot90_x_s8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) { ; CHECK-LABEL: test_vhcaddq_rot90_x_s8: ; CHECK: @ %bb.0: @ %entry diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll index c95fe2296e099..203ce1f881189 100644 --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll @@ -373,13 +373,13 @@ define arm_aapcs_vfpcc void @mul_i32(ptr %A, ptr %B, i64 %C, 
ptr %D) { ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: ldr.w lr, [sp, #20] -; CHECK-NEXT: vmov.f32 s10, s1 ; CHECK-NEXT: vmov.f32 s14, s5 +; CHECK-NEXT: vmov.f32 s10, s1 ; CHECK-NEXT: vmov r5, s4 ; CHECK-NEXT: vmov.f32 s4, s6 ; CHECK-NEXT: vmov.f32 s6, s7 -; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: smull r12, r3, r1, r0 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: vmov.f32 s0, s2 diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll index bd672d1ba4f66..6d581afe9fb31 100644 --- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -222,88 +222,88 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: vldrw.u32 q1, [r4] ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q4, [r5], #16 -; CHECK-NEXT: vldrw.u32 q3, [r0], #16 +; CHECK-NEXT: vldrw.u32 q3, [r5], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 ; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: mov.w r2, #-1 -; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: vmov.f32 s20, s18 +; CHECK-NEXT: vmov.f32 s20, s14 +; CHECK-NEXT: vmov.f32 s18, s11 +; CHECK-NEXT: vmov.f32 s22, s15 ; CHECK-NEXT: mov.w r8, #0 -; CHECK-NEXT: vmov.f32 s10, s15 -; CHECK-NEXT: vmov.f32 s22, s19 -; CHECK-NEXT: vmullb.s32 q6, q5, q2 -; CHECK-NEXT: vmov.f32 s18, s17 +; CHECK-NEXT: vmullb.s32 q6, q5, q4 +; CHECK-NEXT: vmov.f32 s14, s13 ; CHECK-NEXT: vmov r4, r7, d12 ; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: vmov.f32 s10, s9 ; CHECK-NEXT: rsbs.w r5, r4, #-2147483648 ; CHECK-NEXT: sbcs.w r5, r2, r7 ; CHECK-NEXT: csetm r5, lt ; CHECK-NEXT: bfi r8, r5, #0, #8 ; CHECK-NEXT: vmov r10, r5, d13 ; CHECK-NEXT: asrl r10, r5, #31 -; CHECK-NEXT: vmov r6, s18 
+; CHECK-NEXT: vmov r6, s14 ; CHECK-NEXT: rsbs.w r3, r10, #-2147483648 -; CHECK-NEXT: vmov q2[2], q2[0], r4, r10 +; CHECK-NEXT: vmov q4[2], q4[0], r4, r10 ; CHECK-NEXT: sbcs.w r3, r2, r5 -; CHECK-NEXT: vmov q2[3], q2[1], r7, r5 +; CHECK-NEXT: vmov q4[3], q4[1], r7, r5 ; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r8, r3, #8, #8 ; CHECK-NEXT: vmsr p0, r8 ; CHECK-NEXT: mvn r8, #-2147483648 -; CHECK-NEXT: vpsel q2, q2, q0 -; CHECK-NEXT: vmov r3, r4, d4 +; CHECK-NEXT: vpsel q4, q4, q0 +; CHECK-NEXT: vmov r3, r4, d8 ; CHECK-NEXT: subs.w r3, r3, r8 ; CHECK-NEXT: sbcs r3, r4, #0 ; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #0, #8 -; CHECK-NEXT: vmov r3, r5, d5 +; CHECK-NEXT: vmov r3, r5, d9 ; CHECK-NEXT: subs.w r3, r3, r8 ; CHECK-NEXT: sbcs r3, r5, #0 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #8, #8 -; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vmsr p0, r4 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: vpsel q2, q2, q1 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: vpsel q4, q4, q1 ; CHECK-NEXT: smull r4, r7, r4, r3 ; CHECK-NEXT: asrl r4, r7, #31 ; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 ; CHECK-NEXT: sbcs.w r3, r2, r7 ; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r5, r3, #0, #8 -; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: smull r6, r3, r6, r3 ; CHECK-NEXT: asrl r6, r3, #31 ; CHECK-NEXT: rsbs.w r1, r6, #-2147483648 -; CHECK-NEXT: vmov q3[2], q3[0], r4, r6 +; CHECK-NEXT: vmov q2[2], q2[0], r4, r6 ; CHECK-NEXT: sbcs.w r1, r2, r3 -; CHECK-NEXT: vmov q3[3], q3[1], r7, r3 +; CHECK-NEXT: vmov q2[3], q2[1], r7, r3 ; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r5, r1, #8, #8 ; CHECK-NEXT: vmsr p0, r5 ; CHECK-NEXT: ldrd r5, r2, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: vpsel q3, q3, q0 -; CHECK-NEXT: vmov r1, r3, d6 +; CHECK-NEXT: vpsel q2, q2, q0 +; CHECK-NEXT: vmov r1, r3, d4 ; CHECK-NEXT: subs.w r1, r1, r8 ; CHECK-NEXT: sbcs r1, r3, #0 ; CHECK-NEXT: mov.w r3, #0 ; 
CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r3, r1, #0, #8 -; CHECK-NEXT: vmov r1, r4, d7 +; CHECK-NEXT: vmov r1, r4, d5 ; CHECK-NEXT: subs.w r1, r1, r8 ; CHECK-NEXT: sbcs r1, r4, #0 ; CHECK-NEXT: csetm r1, lt ; CHECK-NEXT: bfi r3, r1, #8, #8 ; CHECK-NEXT: vmsr p0, r3 -; CHECK-NEXT: vpsel q3, q3, q1 -; CHECK-NEXT: vmov.f32 s13, s14 -; CHECK-NEXT: vmov.f32 s14, s8 -; CHECK-NEXT: vmov.f32 s15, s10 -; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: vpsel q2, q2, q1 +; CHECK-NEXT: vmov.f32 s9, s10 +; CHECK-NEXT: vmov.f32 s10, s16 +; CHECK-NEXT: vmov.f32 s11, s18 +; CHECK-NEXT: vstrb.8 q2, [r2], #16 ; CHECK-NEXT: le lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block ; CHECK-NEXT: ldrd r1, r3, [sp] @ 8-byte Folded Reload @@ -462,14 +462,14 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n ; CHECK-NEXT: vcmp.u32 cs, q1, q4 ; CHECK-NEXT: vstr p0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q5, [r0], #16 -; CHECK-NEXT: vldrwt.u32 q6, [r1], #16 -; CHECK-NEXT: vmov.f32 s16, s22 -; CHECK-NEXT: vmov.f32 s18, s23 -; CHECK-NEXT: vmov.f32 s28, s26 -; CHECK-NEXT: vmov.f32 s30, s27 -; CHECK-NEXT: vmullb.s32 q0, q7, q4 -; CHECK-NEXT: vmov.f32 s22, s25 +; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 +; CHECK-NEXT: vldrwt.u32 q5, [r1], #16 +; CHECK-NEXT: vmov.f32 s24, s18 +; CHECK-NEXT: vmov.f32 s26, s19 +; CHECK-NEXT: vmov.f32 s28, s22 +; CHECK-NEXT: vmov.f32 s30, s23 +; CHECK-NEXT: vmullb.s32 q0, q7, q6 +; CHECK-NEXT: vmov.f32 s18, s21 ; CHECK-NEXT: vmov r10, r5, d0 ; CHECK-NEXT: asrl r10, r5, #31 ; CHECK-NEXT: rsbs.w r7, r10, #-2147483648 @@ -483,7 +483,7 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n ; CHECK-NEXT: sbcs.w r3, r12, r7 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r7 ; CHECK-NEXT: csetm r3, lt -; CHECK-NEXT: vmov r7, s22 +; CHECK-NEXT: vmov r7, s18 ; CHECK-NEXT: bfi r4, r3, #8, #8 ; CHECK-NEXT: vmsr p0, r4 ; CHECK-NEXT: vpsel q0, q0, q2 @@ -498,11 +498,11 @@ define arm_aapcs_vfpcc void 
@ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n ; CHECK-NEXT: sbcs r3, r5, #0 ; CHECK-NEXT: csetm r3, lt ; CHECK-NEXT: bfi r4, r3, #8, #8 -; CHECK-NEXT: vmov r3, s20 +; CHECK-NEXT: vmov r3, s16 ; CHECK-NEXT: vmsr p0, r4 -; CHECK-NEXT: vmov r4, s24 -; CHECK-NEXT: vpsel q4, q0, q3 -; CHECK-NEXT: vmov.f32 s2, s21 +; CHECK-NEXT: vmov r4, s20 +; CHECK-NEXT: vpsel q6, q0, q3 +; CHECK-NEXT: vmov.f32 s2, s17 ; CHECK-NEXT: smull r10, r5, r4, r3 ; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: asrl r10, r5, #31 @@ -536,8 +536,8 @@ define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr n ; CHECK-NEXT: vpsel q0, q0, q3 ; CHECK-NEXT: vldr p0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vmov.f32 s3, s18 +; CHECK-NEXT: vmov.f32 s2, s24 +; CHECK-NEXT: vmov.f32 s3, s26 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB2_2 @@ -778,18 +778,17 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: .LBB4_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: vldrw.u32 q3, [r1], #16 -; CHECK-NEXT: vmov.f32 s8, s6 -; CHECK-NEXT: vmov.f32 s16, s14 -; CHECK-NEXT: vmov.f32 s10, s7 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmullb.u32 q5, q4, q2 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov.f32 s12, s6 +; CHECK-NEXT: vmov.f32 s16, s10 +; CHECK-NEXT: vmov.f32 s14, s7 +; CHECK-NEXT: vmov.f32 s18, s11 +; CHECK-NEXT: vmullb.u32 q5, q4, q3 ; CHECK-NEXT: vmov.f32 s6, s5 ; CHECK-NEXT: vmov r10, r5, d10 ; CHECK-NEXT: lsrl r10, r5, #31 -; CHECK-NEXT: vmov.f32 s14, s13 +; CHECK-NEXT: vmov.f32 s10, s9 ; CHECK-NEXT: subs.w r6, r10, #-1 -; CHECK-NEXT: vmullb.u32 q4, q3, q1 ; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: csetm r5, lo @@ -797,15 +796,16 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: 
vmov r4, r5, d11 ; CHECK-NEXT: lsrl r4, r5, #31 ; CHECK-NEXT: subs.w r7, r4, #-1 -; CHECK-NEXT: vmov q2[2], q2[0], r10, r4 +; CHECK-NEXT: vmov q3[2], q3[0], r10, r4 ; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: csetm r5, lo ; CHECK-NEXT: bfi r6, r5, #8, #8 +; CHECK-NEXT: vmsr p0, r6 +; CHECK-NEXT: vpsel q3, q3, q0 +; CHECK-NEXT: vmullb.u32 q4, q2, q1 ; CHECK-NEXT: vmov r10, r5, d8 ; CHECK-NEXT: lsrl r10, r5, #31 -; CHECK-NEXT: vmsr p0, r6 ; CHECK-NEXT: subs.w r6, r10, #-1 -; CHECK-NEXT: vpsel q2, q2, q0 ; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: csetm r5, lo @@ -820,8 +820,8 @@ define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr no ; CHECK-NEXT: vmsr p0, r6 ; CHECK-NEXT: vpsel q1, q1, q0 ; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vmov.f32 s6, s12 +; CHECK-NEXT: vmov.f32 s7, s14 ; CHECK-NEXT: vstrb.8 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB4_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll index 217caeebe6335..cebc0d9c0e172 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll @@ -190,12 +190,17 @@ entry: define arm_aapcs_vfpcc <4 x i64> @sext32_0213_0ext(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_0213_0ext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vmov q3[2], q3[0], r0, r0 -; CHECK-NEXT: vmullb.s32 q2, q0, q3 +; CHECK-NEXT: vmov.f32 s17, s4 ; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmullb.s32 q2, q4, q3 ; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: vmullb.s32 q1, q0, q3 ; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> @@ -210,12 +215,17 @@ entry: define arm_aapcs_vfpcc <4 x i64> @sext32_0ext_0213(<8 x i32> 
%src1, i32 %src2) { ; CHECK-LABEL: sext32_0ext_0213: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vmov q3[2], q3[0], r0, r0 -; CHECK-NEXT: vmullb.s32 q2, q3, q0 +; CHECK-NEXT: vmov.f32 s17, s4 ; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmullb.s32 q2, q3, q4 ; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: vmullb.s32 q1, q3, q0 ; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> @@ -466,12 +476,17 @@ entry: define arm_aapcs_vfpcc <4 x i64> @zext32_0213_0ext(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: zext32_0213_0ext: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vmov q3[2], q3[0], r0, r0 -; CHECK-NEXT: vmullb.u32 q2, q0, q3 +; CHECK-NEXT: vmov.f32 s17, s4 ; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmullb.u32 q2, q4, q3 ; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: vmullb.u32 q1, q0, q3 ; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> @@ -486,12 +501,17 @@ entry: define arm_aapcs_vfpcc <4 x i64> @zext32_0ext_0213(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: zext32_0ext_0213: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vmov q3[2], q3[0], r0, r0 -; CHECK-NEXT: vmullb.u32 q2, q3, q0 +; CHECK-NEXT: vmov.f32 s17, s4 ; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmullb.u32 q2, q3, q4 ; CHECK-NEXT: vmov.f32 s2, s3 ; CHECK-NEXT: vmullb.u32 q1, q3, q0 ; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll index 6f2bba84a6ecc..43589dc993dab 100644 --- 
a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -131,6 +131,7 @@ ; CHECK-NEXT: Machine Natural Loop Construction ; CHECK-NEXT: Tile Register Pre-configure ; CHECK-NEXT: Detect Dead Lanes +; CHECK-NEXT: Init Undef Pass ; CHECK-NEXT: Process Implicit Definitions ; CHECK-NEXT: Remove unreachable machine basic blocks ; CHECK-NEXT: Live Variable Analysis diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn index 43eaa72047e65..12366a35d6f1a 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/RISCV/BUILD.gn @@ -132,7 +132,6 @@ static_library("LLVMRISCVCodeGen") { "RISCVOptWInstrs.cpp", "RISCVPostRAExpandPseudoInsts.cpp", "RISCVPushPopOptimizer.cpp", - "RISCVRVVInitUndef.cpp", "RISCVRedundantCopyElimination.cpp", "RISCVRegisterInfo.cpp", "RISCVSubtarget.cpp", From af971396a9c77a57eb66fcb7eac3f671a7084680 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 26 Feb 2024 09:52:12 +0100 Subject: [PATCH 301/546] [clang][Interp] Handle missing Lambda field initializer --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 3 +++ clang/test/SemaCXX/cxx1z-lambda-star-this.cpp | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index b5402ec8caaec..e96afb1078cc7 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -1731,6 +1731,9 @@ bool ByteCodeExprGen::VisitLambdaExpr(const LambdaExpr *E) { const Expr *Init = *CaptureInitIt; ++CaptureInitIt; + if (!Init) + continue; + if (std::optional T = classify(Init)) { if (!this->visit(Init)) return false; diff --git a/clang/test/SemaCXX/cxx1z-lambda-star-this.cpp b/clang/test/SemaCXX/cxx1z-lambda-star-this.cpp index 95bc32b603ddf..45b78139d0b01 100644 --- a/clang/test/SemaCXX/cxx1z-lambda-star-this.cpp +++ 
b/clang/test/SemaCXX/cxx1z-lambda-star-this.cpp @@ -3,6 +3,11 @@ // RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fms-extensions %s -DMS_EXTENSIONS // RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fdelayed-template-parsing -fms-extensions %s -DMS_EXTENSIONS -DDELAYED_TEMPLATE_PARSING +// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -emit-llvm-only %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fdelayed-template-parsing %s -DDELAYED_TEMPLATE_PARSING -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fms-extensions %s -DMS_EXTENSIONS -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fdelayed-template-parsing -fms-extensions %s -DMS_EXTENSIONS -DDELAYED_TEMPLATE_PARSING -fexperimental-new-constant-interpreter + template constexpr bool is_same = false; template From 8779cf68e80dcc0b15e8034f39e6ce18b08352b6 Mon Sep 17 00:00:00 2001 From: Oliver Stannard Date: Fri, 23 Feb 2024 09:31:25 +0000 Subject: [PATCH 302/546] Pre-commit test showing bug #80287 This test shows the bug where LR is used as a general-purpose register on a code path where it is not spilled to the stack. 
--- llvm/test/CodeGen/ARM/ldst-opt-lr-restored.ll | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 llvm/test/CodeGen/ARM/ldst-opt-lr-restored.ll diff --git a/llvm/test/CodeGen/ARM/ldst-opt-lr-restored.ll b/llvm/test/CodeGen/ARM/ldst-opt-lr-restored.ll new file mode 100644 index 0000000000000..883a812139ded --- /dev/null +++ b/llvm/test/CodeGen/ARM/ldst-opt-lr-restored.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple thumbv7a-none-eabi < %s | FileCheck %s + +@val0 = global i32 0, align 4 +@val1 = global i32 0, align 4 +@val2 = global i32 0, align 4 + +define i32 @foo(ptr %ctx) { +; CHECK-LABEL: foo: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cbz r0, .LBB0_2 +; CHECK-NEXT: @ %bb.1: @ %if.end +; CHECK-NEXT: movw r2, :lower16:val0 +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: movw r12, :lower16:val2 +; CHECK-NEXT: movw r3, :lower16:val1 +; CHECK-NEXT: movt r2, :upper16:val0 +; CHECK-NEXT: add.w lr, r1, #4 +; CHECK-NEXT: movt r12, :upper16:val2 +; CHECK-NEXT: movt r3, :upper16:val1 +; CHECK-NEXT: stm.w lr, {r2, r3, r12} +; CHECK-NEXT: str r0, [r1, #16] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .LBB0_2: @ %if.then +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: bl bar +; CHECK-NEXT: mov.w r0, #-1 +; CHECK-NEXT: pop {r7, pc} +entry: + %tobool.not = icmp eq ptr %ctx, null + br i1 %tobool.not, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @bar() #2 + br label %return + +if.end: ; preds = %entry + %cmd_a = getelementptr inbounds i8, ptr %ctx, i32 4 + store ptr @val0, ptr %cmd_a, align 4 + %cmd_b = getelementptr inbounds i8, ptr %ctx, i32 8 + store ptr @val1, ptr %cmd_b, align 4 + %cmd_c = getelementptr inbounds i8, ptr %ctx, i32 12 + store ptr @val2, ptr %cmd_c, align 4 + %cmd_d = getelementptr inbounds i8, ptr %ctx, i32 16 + store ptr null, ptr %cmd_d, align 4 + br label %return + 
+return: ; preds = %if.end, %if.then + %retval.0 = phi i32 [ 0, %if.end ], [ -1, %if.then ] + ret i32 %retval.0 +} + +declare void @bar() From 749384c08e042739342c88b521c8ba5dac1b9276 Mon Sep 17 00:00:00 2001 From: ostannard Date: Mon, 26 Feb 2024 12:23:25 +0000 Subject: [PATCH 303/546] [ARM] Update IsRestored for LR based on all returns (#82745) PR #75527 fixed ARMFrameLowering to set the IsRestored flag for LR based on all of the return instructions in the function, not just one. However, there is also code in ARMLoadStoreOptimizer which changes return instructions, but it set IsRestored based on the one instruction it changed, not the whole function. The fix is to factor out the code added in #75527, and also call it from ARMLoadStoreOptimizer if it made a change to return instructions. Fixes #80287. --- llvm/lib/Target/ARM/ARMFrameLowering.cpp | 11 +++++---- llvm/lib/Target/ARM/ARMFrameLowering.h | 4 ++++ llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 23 ++++++++----------- llvm/test/CodeGen/ARM/ldst-opt-lr-restored.ll | 11 +++++---- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index eeb7f64aa5810..9b54dd4e4e618 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -2781,10 +2781,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, AFI->setLRIsSpilled(SavedRegs.test(ARM::LR)); } -void ARMFrameLowering::processFunctionBeforeFrameFinalized( - MachineFunction &MF, RegScavenger *RS) const { - TargetFrameLowering::processFunctionBeforeFrameFinalized(MF, RS); - +void ARMFrameLowering::updateLRRestored(MachineFunction &MF) { MachineFrameInfo &MFI = MF.getFrameInfo(); if (!MFI.isCalleeSavedInfoValid()) return; @@ -2808,6 +2805,12 @@ void ARMFrameLowering::processFunctionBeforeFrameFinalized( } } +void ARMFrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, RegScavenger *RS) const 
{ + TargetFrameLowering::processFunctionBeforeFrameFinalized(MF, RS); + updateLRRestored(MF); +} + void ARMFrameLowering::getCalleeSaves(const MachineFunction &MF, BitVector &SavedRegs) const { TargetFrameLowering::getCalleeSaves(MF, SavedRegs); diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.h b/llvm/lib/Target/ARM/ARMFrameLowering.h index 8d2b8beb9a58f..3c7358d8cd53e 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.h +++ b/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -59,6 +59,10 @@ class ARMFrameLowering : public TargetFrameLowering { void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; + /// Update the IsRestored flag on LR if it is spilled, based on the return + /// instructions. + static void updateLRRestored(MachineFunction &MF); + void processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS = nullptr) const override; diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index ed9d30c3c3ab9..6121055eb0217 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2062,17 +2062,6 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { MO.setReg(ARM::PC); PrevMI.copyImplicitOps(*MBB.getParent(), *MBBI); MBB.erase(MBBI); - // We now restore LR into PC so it is not live-out of the return block - // anymore: Clear the CSI Restored bit. 
- MachineFrameInfo &MFI = MBB.getParent()->getFrameInfo(); - // CSI should be fixed after PrologEpilog Insertion - assert(MFI.isCalleeSavedInfoValid() && "CSI should be valid"); - for (CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) { - if (Info.getReg() == ARM::LR) { - Info.setRestored(false); - break; - } - } return true; } } @@ -2120,14 +2109,22 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { isThumb2 = AFI->isThumb2Function(); isThumb1 = AFI->isThumbFunction() && !isThumb2; - bool Modified = false; + bool Modified = false, ModifiedLDMReturn = false; for (MachineBasicBlock &MBB : Fn) { Modified |= LoadStoreMultipleOpti(MBB); if (STI->hasV5TOps() && !AFI->shouldSignReturnAddress()) - Modified |= MergeReturnIntoLDM(MBB); + ModifiedLDMReturn |= MergeReturnIntoLDM(MBB); if (isThumb1) Modified |= CombineMovBx(MBB); } + Modified |= ModifiedLDMReturn; + + // If we merged a BX instruction into an LDM, we need to re-calculate whether + // LR is restored. This check needs to consider the whole function, not just + // the instruction(s) we changed, because there may be other BX returns which + // still need LR to be restored. 
+ if (ModifiedLDMReturn) + ARMFrameLowering::updateLRRestored(Fn); Allocator.DestroyAll(); return Modified; diff --git a/llvm/test/CodeGen/ARM/ldst-opt-lr-restored.ll b/llvm/test/CodeGen/ARM/ldst-opt-lr-restored.ll index 883a812139ded..9494880f99025 100644 --- a/llvm/test/CodeGen/ARM/ldst-opt-lr-restored.ll +++ b/llvm/test/CodeGen/ARM/ldst-opt-lr-restored.ll @@ -10,16 +10,17 @@ define i32 @foo(ptr %ctx) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: cbz r0, .LBB0_2 ; CHECK-NEXT: @ %bb.1: @ %if.end +; CHECK-NEXT: movw r12, :lower16:val2 +; CHECK-NEXT: movw r3, :lower16:val1 ; CHECK-NEXT: movw r2, :lower16:val0 ; CHECK-NEXT: mov r1, r0 ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: movw r12, :lower16:val2 -; CHECK-NEXT: movw r3, :lower16:val1 -; CHECK-NEXT: movt r2, :upper16:val0 -; CHECK-NEXT: add.w lr, r1, #4 ; CHECK-NEXT: movt r12, :upper16:val2 ; CHECK-NEXT: movt r3, :upper16:val1 -; CHECK-NEXT: stm.w lr, {r2, r3, r12} +; CHECK-NEXT: movt r2, :upper16:val0 +; CHECK-NEXT: str r2, [r1, #4] +; CHECK-NEXT: str r3, [r1, #8] +; CHECK-NEXT: str.w r12, [r1, #12] ; CHECK-NEXT: str r0, [r1, #16] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .LBB0_2: @ %if.then From 76dd4bc036f4709f7c28e38e5ae12ade8f07e8c5 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Mon, 26 Feb 2024 12:32:42 +0000 Subject: [PATCH 304/546] [RemoveDIs] Add iterator-taking constructors and Create methods (#82778) Part of removing debug-intrinsics from LLVM requires using iterators whenever we insert an instruction into a block. That means we need all instruction constructors and factory functions to have an iterator taking option, which this patch adds. The whole of this patch should be NFC: it's adding new flavours of existing constructors, and plumbing those through to the Instruction constructor that takes iterators. It's almost entirely boilerplate copy-and-paste too. 
--- llvm/include/llvm/IR/InstrTypes.h | 219 ++++++++++ llvm/include/llvm/IR/Instruction.h | 2 + llvm/include/llvm/IR/Instructions.h | 631 +++++++++++++++++++++++++++- llvm/lib/IR/Instruction.cpp | 10 + llvm/lib/IR/Instructions.cpp | 504 +++++++++++++++++++++- 5 files changed, 1350 insertions(+), 16 deletions(-) diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index 6eba902fa0416..4ee51cd192ed7 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -54,6 +54,10 @@ typedef unsigned ID; class UnaryInstruction : public Instruction { protected: + UnaryInstruction(Type *Ty, unsigned iType, Value *V, BasicBlock::iterator IB) + : Instruction(Ty, iType, &Op<0>(), 1, IB) { + Op<0>() = V; + } UnaryInstruction(Type *Ty, unsigned iType, Value *V, Instruction *IB = nullptr) : Instruction(Ty, iType, &Op<0>(), 1, IB) { @@ -101,6 +105,8 @@ class UnaryOperator : public UnaryInstruction { void AssertOK(); protected: + UnaryOperator(UnaryOps iType, Value *S, Type *Ty, + const Twine &Name, BasicBlock::iterator InsertBefore); UnaryOperator(UnaryOps iType, Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore); UnaryOperator(UnaryOps iType, Value *S, Type *Ty, @@ -112,6 +118,12 @@ class UnaryOperator : public UnaryInstruction { UnaryOperator *cloneImpl() const; public: + /// Construct a unary instruction, given the opcode and an operand. + /// Insert the instruction into a BasicBlock right before the specified + /// instruction (InsertBefore must be a valid iterator). + /// + static UnaryOperator *Create(UnaryOps Op, Value *S, const Twine &Name, + BasicBlock::iterator InsertBefore); /// Construct a unary instruction, given the opcode and an operand. 
/// Optionally (if InstBefore is specified) insert the instruction @@ -150,6 +162,20 @@ class UnaryOperator : public UnaryInstruction { return Create(Instruction::OPC, V, Name, I);\ } #include "llvm/IR/Instruction.def" +#define HANDLE_UNARY_INST(N, OPC, CLASS) \ + static UnaryOperator *Create##OPC(Value *V, const Twine &Name, \ + BasicBlock::iterator It) {\ + return Create(Instruction::OPC, V, Name, It);\ + } +#include "llvm/IR/Instruction.def" + + static UnaryOperator * + CreateWithCopiedFlags(UnaryOps Opc, Value *V, Instruction *CopyO, + const Twine &Name, BasicBlock::iterator InsertBefore) { + UnaryOperator *UO = Create(Opc, V, Name, InsertBefore); + UO->copyIRFlags(CopyO); + return UO; + } static UnaryOperator * CreateWithCopiedFlags(UnaryOps Opc, Value *V, Instruction *CopyO, @@ -160,6 +186,13 @@ class UnaryOperator : public UnaryInstruction { return UO; } + static UnaryOperator *CreateFNegFMF(Value *Op, Instruction *FMFSource, + const Twine &Name, + BasicBlock::iterator InsertBefore) { + return CreateWithCopiedFlags(Instruction::FNeg, Op, FMFSource, Name, + InsertBefore); + } + static UnaryOperator *CreateFNegFMF(Value *Op, Instruction *FMFSource, const Twine &Name = "", Instruction *InsertBefore = nullptr) { @@ -188,6 +221,8 @@ class BinaryOperator : public Instruction { void AssertOK(); protected: + BinaryOperator(BinaryOps iType, Value *S1, Value *S2, Type *Ty, + const Twine &Name, BasicBlock::iterator InsertBefore); BinaryOperator(BinaryOps iType, Value *S1, Value *S2, Type *Ty, const Twine &Name, Instruction *InsertBefore); BinaryOperator(BinaryOps iType, Value *S1, Value *S2, Type *Ty, @@ -206,6 +241,14 @@ class BinaryOperator : public Instruction { /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + /// Construct a binary instruction, given the opcode and the two + /// operands. Insert the instruction into a BasicBlock right before the + /// specified instruction. 
+ /// + static BinaryOperator *Create(BinaryOps Op, Value *S1, Value *S2, + const Twine &Name, + BasicBlock::iterator InsertBefore); + /// Construct a binary instruction, given the opcode and the two /// operands. Optionally (if InstBefore is specified) insert the instruction /// into a BasicBlock right before the specified instruction. The specified @@ -243,6 +286,20 @@ class BinaryOperator : public Instruction { return Create(Instruction::OPC, V1, V2, Name, I);\ } #include "llvm/IR/Instruction.def" +#define HANDLE_BINARY_INST(N, OPC, CLASS) \ + static BinaryOperator *Create##OPC(Value *V1, Value *V2, \ + const Twine &Name, BasicBlock::iterator It) {\ + return Create(Instruction::OPC, V1, V2, Name, It);\ + } +#include "llvm/IR/Instruction.def" + + static BinaryOperator * + CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, + const Twine &Name, BasicBlock::iterator InsertBefore) { + BinaryOperator *BO = Create(Opc, V1, V2, Name, InsertBefore); + BO->copyIRFlags(CopyO); + return BO; + } static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, @@ -297,6 +354,12 @@ class BinaryOperator : public Instruction { BO->setHasNoSignedWrap(true); return BO; } + static BinaryOperator *CreateNSW(BinaryOps Opc, Value *V1, Value *V2, + const Twine &Name, BasicBlock::iterator It) { + BinaryOperator *BO = Create(Opc, V1, V2, Name, It); + BO->setHasNoSignedWrap(true); + return BO; + } static BinaryOperator *CreateNUW(BinaryOps Opc, Value *V1, Value *V2, const Twine &Name = "") { @@ -316,6 +379,12 @@ class BinaryOperator : public Instruction { BO->setHasNoUnsignedWrap(true); return BO; } + static BinaryOperator *CreateNUW(BinaryOps Opc, Value *V1, Value *V2, + const Twine &Name, BasicBlock::iterator It) { + BinaryOperator *BO = Create(Opc, V1, V2, Name, It); + BO->setHasNoUnsignedWrap(true); + return BO; + } static BinaryOperator *CreateExact(BinaryOps Opc, Value *V1, Value *V2, const Twine &Name = "") { @@ -335,6 +404,13 
@@ class BinaryOperator : public Instruction { BO->setIsExact(true); return BO; } + static BinaryOperator *CreateExact(BinaryOps Opc, Value *V1, Value *V2, + const Twine &Name, + BasicBlock::iterator It) { + BinaryOperator *BO = Create(Opc, V1, V2, Name, It); + BO->setIsExact(true); + return BO; + } static inline BinaryOperator * CreateDisjoint(BinaryOps Opc, Value *V1, Value *V2, const Twine &Name = ""); @@ -344,6 +420,9 @@ class BinaryOperator : public Instruction { static inline BinaryOperator *CreateDisjoint(BinaryOps Opc, Value *V1, Value *V2, const Twine &Name, Instruction *I); + static inline BinaryOperator *CreateDisjoint(BinaryOps Opc, Value *V1, + Value *V2, const Twine &Name, + BasicBlock::iterator It); #define DEFINE_HELPERS(OPC, NUWNSWEXACT) \ static BinaryOperator *Create##NUWNSWEXACT##OPC(Value *V1, Value *V2, \ @@ -357,6 +436,10 @@ class BinaryOperator : public Instruction { static BinaryOperator *Create##NUWNSWEXACT##OPC( \ Value *V1, Value *V2, const Twine &Name, Instruction *I) { \ return Create##NUWNSWEXACT(Instruction::OPC, V1, V2, Name, I); \ + } \ + static BinaryOperator *Create##NUWNSWEXACT##OPC( \ + Value *V1, Value *V2, const Twine &Name, BasicBlock::iterator It) { \ + return Create##NUWNSWEXACT(Instruction::OPC, V1, V2, Name, It); \ } DEFINE_HELPERS(Add, NSW) // CreateNSWAdd @@ -382,18 +465,26 @@ class BinaryOperator : public Instruction { /// /// Create the NEG and NOT instructions out of SUB and XOR instructions. 
/// + static BinaryOperator *CreateNeg(Value *Op, const Twine &Name, + BasicBlock::iterator InsertBefore); static BinaryOperator *CreateNeg(Value *Op, const Twine &Name = "", Instruction *InsertBefore = nullptr); static BinaryOperator *CreateNeg(Value *Op, const Twine &Name, BasicBlock *InsertAtEnd); + static BinaryOperator *CreateNSWNeg(Value *Op, const Twine &Name, + BasicBlock::iterator InsertBefore); static BinaryOperator *CreateNSWNeg(Value *Op, const Twine &Name = "", Instruction *InsertBefore = nullptr); static BinaryOperator *CreateNSWNeg(Value *Op, const Twine &Name, BasicBlock *InsertAtEnd); + static BinaryOperator *CreateNUWNeg(Value *Op, const Twine &Name, + BasicBlock::iterator InsertBefore); static BinaryOperator *CreateNUWNeg(Value *Op, const Twine &Name = "", Instruction *InsertBefore = nullptr); static BinaryOperator *CreateNUWNeg(Value *Op, const Twine &Name, BasicBlock *InsertAtEnd); + static BinaryOperator *CreateNot(Value *Op, const Twine &Name, + BasicBlock::iterator InsertBefore); static BinaryOperator *CreateNot(Value *Op, const Twine &Name = "", Instruction *InsertBefore = nullptr); static BinaryOperator *CreateNot(Value *Op, const Twine &Name, @@ -469,6 +560,13 @@ BinaryOperator *BinaryOperator::CreateDisjoint(BinaryOps Opc, Value *V1, cast(BO)->setIsDisjoint(true); return BO; } +BinaryOperator *BinaryOperator::CreateDisjoint(BinaryOps Opc, Value *V1, + Value *V2, const Twine &Name, + BasicBlock::iterator It) { + BinaryOperator *BO = Create(Opc, V1, V2, Name, It); + cast(BO)->setIsDisjoint(true); + return BO; +} //===----------------------------------------------------------------------===// // CastInst Class @@ -482,6 +580,12 @@ BinaryOperator *BinaryOperator::CreateDisjoint(BinaryOps Opc, Value *V1, /// Base class of casting instructions. 
class CastInst : public UnaryInstruction { protected: + /// Constructor with insert-before-instruction semantics for subclasses + CastInst(Type *Ty, unsigned iType, Value *S, const Twine &NameStr, + BasicBlock::iterator InsertBefore) + : UnaryInstruction(Ty, iType, S, InsertBefore) { + setName(NameStr); + } /// Constructor with insert-before-instruction semantics for subclasses CastInst(Type *Ty, unsigned iType, Value *S, const Twine &NameStr = "", Instruction *InsertBefore = nullptr) @@ -496,6 +600,19 @@ class CastInst : public UnaryInstruction { } public: + /// Provides a way to construct any of the CastInst subclasses using an + /// opcode instead of the subclass's constructor. The opcode must be in the + /// CastOps category (Instruction::isCast(opcode) returns true). This + /// constructor has insert-before-instruction semantics to automatically + /// insert the new CastInst before InsertBefore, which must be a valid + /// iterator. Construct any of the CastInst subclasses. + static CastInst * + Create(Instruction::CastOps, ///< The opcode of the cast instruction + Value *S, ///< The value to be casted (operand 0) + Type *Ty, ///< The type to which cast should be made + const Twine &Name, ///< Name for the instruction + BasicBlock::iterator InsertBefore ///< Place to insert the instruction + ); /// Provides a way to construct any of the CastInst subclasses using an /// opcode instead of the subclass's constructor. The opcode must be in the /// CastOps category (Instruction::isCast(opcode) returns true). 
This @@ -523,6 +640,14 @@ class CastInst : public UnaryInstruction { BasicBlock *InsertAtEnd ///< The block to insert the instruction into ); + /// Create a ZExt or BitCast cast instruction + static CastInst *CreateZExtOrBitCast( + Value *S, ///< The value to be casted (operand 0) + Type *Ty, ///< The type to which cast should be made + const Twine &Name, ///< Name for the instruction + BasicBlock::iterator InsertBefore ///< Place to insert the instruction + ); + /// Create a ZExt or BitCast cast instruction static CastInst *CreateZExtOrBitCast( Value *S, ///< The value to be casted (operand 0) @@ -539,6 +664,14 @@ class CastInst : public UnaryInstruction { BasicBlock *InsertAtEnd ///< The block to insert the instruction into ); + /// Create a SExt or BitCast cast instruction + static CastInst *CreateSExtOrBitCast( + Value *S, ///< The value to be casted (operand 0) + Type *Ty, ///< The type to which cast should be made + const Twine &Name, ///< Name for the instruction + BasicBlock::iterator InsertBefore ///< Place to insert the instruction + ); + /// Create a SExt or BitCast cast instruction static CastInst *CreateSExtOrBitCast( Value *S, ///< The value to be casted (operand 0) @@ -563,6 +696,14 @@ class CastInst : public UnaryInstruction { BasicBlock *InsertAtEnd ///< The block to insert the instruction into ); + /// Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction. + static CastInst *CreatePointerCast( + Value *S, ///< The pointer value to be casted (operand 0) + Type *Ty, ///< The type to which cast should be made + const Twine &Name, ///< Name for the instruction + BasicBlock::iterator InsertBefore ///< Place to insert the instruction + ); + /// Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction. 
static CastInst *CreatePointerCast( Value *S, ///< The pointer value to be casted (operand 0) @@ -579,6 +720,14 @@ class CastInst : public UnaryInstruction { BasicBlock *InsertAtEnd ///< The block to insert the instruction into ); + /// Create a BitCast or an AddrSpaceCast cast instruction. + static CastInst *CreatePointerBitCastOrAddrSpaceCast( + Value *S, ///< The pointer value to be casted (operand 0) + Type *Ty, ///< The type to which cast should be made + const Twine &Name, ///< Name for the instruction + BasicBlock::iterator InsertBefore ///< Place to insert the instruction + ); + /// Create a BitCast or an AddrSpaceCast cast instruction. static CastInst *CreatePointerBitCastOrAddrSpaceCast( Value *S, ///< The pointer value to be casted (operand 0) @@ -587,6 +736,19 @@ class CastInst : public UnaryInstruction { Instruction *InsertBefore = nullptr ///< Place to insert the instruction ); + /// Create a BitCast, a PtrToInt, or an IntToPTr cast instruction. + /// + /// If the value is a pointer type and the destination an integer type, + /// creates a PtrToInt cast. If the value is an integer type and the + /// destination a pointer type, creates an IntToPtr cast. Otherwise, creates + /// a bitcast. + static CastInst *CreateBitOrPointerCast( + Value *S, ///< The pointer value to be casted (operand 0) + Type *Ty, ///< The type to which cast should be made + const Twine &Name, ///< Name for the instruction + BasicBlock::iterator InsertBefore ///< Place to insert the instruction + ); + /// Create a BitCast, a PtrToInt, or an IntToPTr cast instruction. /// /// If the value is a pointer type and the destination an integer type, @@ -600,6 +762,15 @@ class CastInst : public UnaryInstruction { Instruction *InsertBefore = nullptr ///< Place to insert the instruction ); + /// Create a ZExt, BitCast, or Trunc for int -> int casts. 
+ static CastInst *CreateIntegerCast( + Value *S, ///< The pointer value to be casted (operand 0) + Type *Ty, ///< The type to which cast should be made + bool isSigned, ///< Whether to regard S as signed or not + const Twine &Name, ///< Name for the instruction + BasicBlock::iterator InsertBefore ///< Place to insert the instruction + ); + /// Create a ZExt, BitCast, or Trunc for int -> int casts. static CastInst *CreateIntegerCast( Value *S, ///< The pointer value to be casted (operand 0) @@ -618,6 +789,14 @@ class CastInst : public UnaryInstruction { BasicBlock *InsertAtEnd ///< The block to insert the instruction into ); + /// Create an FPExt, BitCast, or FPTrunc for fp -> fp casts + static CastInst *CreateFPCast( + Value *S, ///< The floating point value to be casted + Type *Ty, ///< The floating point type to cast to + const Twine &Name, ///< Name for the instruction + BasicBlock::iterator InsertBefore ///< Place to insert the instruction + ); + /// Create an FPExt, BitCast, or FPTrunc for fp -> fp casts static CastInst *CreateFPCast( Value *S, ///< The floating point value to be casted @@ -634,6 +813,14 @@ class CastInst : public UnaryInstruction { BasicBlock *InsertAtEnd ///< The block to insert the instruction into ); + /// Create a Trunc or BitCast cast instruction + static CastInst *CreateTruncOrBitCast( + Value *S, ///< The value to be casted (operand 0) + Type *Ty, ///< The type to which cast should be made + const Twine &Name, ///< Name for the instruction + BasicBlock::iterator InsertBefore ///< Place to insert the instruction + ); + /// Create a Trunc or BitCast cast instruction static CastInst *CreateTruncOrBitCast( Value *S, ///< The value to be casted (operand 0) @@ -830,6 +1017,10 @@ class CmpInst : public Instruction { } protected: + CmpInst(Type *ty, Instruction::OtherOps op, Predicate pred, Value *LHS, + Value *RHS, const Twine &Name, BasicBlock::iterator InsertBefore, + Instruction *FlagsSource = nullptr); + CmpInst(Type *ty, 
Instruction::OtherOps op, Predicate pred, Value *LHS, Value *RHS, const Twine &Name = "", Instruction *InsertBefore = nullptr, @@ -844,6 +1035,13 @@ class CmpInst : public Instruction { void *operator new(size_t S) { return User::operator new(S, 2); } void operator delete(void *Ptr) { User::operator delete(Ptr); } + /// Construct a compare instruction, given the opcode, the predicate and + /// the two operands. Insert the instruction into a BasicBlock right before + /// the specified instruction. + /// Create a CmpInst + static CmpInst *Create(OtherOps Op, Predicate predicate, Value *S1, Value *S2, + const Twine &Name, BasicBlock::iterator InsertBefore); + /// Construct a compare instruction, given the opcode, the predicate and /// the two operands. Optionally (if InstBefore is specified) insert the /// instruction into a BasicBlock right before the specified instruction. @@ -1300,6 +1498,15 @@ class CallBase : public Instruction { public: using Instruction::getContext; + /// Create a clone of \p CB with a different set of operand bundles and + /// insert it before \p InsertPt. + /// + /// The returned call instruction is identical \p CB in every way except that + /// the operand bundles for the new instruction are set to the operand bundles + /// in \p Bundles. + static CallBase *Create(CallBase *CB, ArrayRef Bundles, + BasicBlock::iterator InsertPt); + /// Create a clone of \p CB with a different set of operand bundles and /// insert it before \p InsertPt. /// @@ -1309,6 +1516,14 @@ class CallBase : public Instruction { static CallBase *Create(CallBase *CB, ArrayRef Bundles, Instruction *InsertPt = nullptr); + /// Create a clone of \p CB with the operand bundle with the tag matching + /// \p Bundle's tag replaced with Bundle, and insert it before \p InsertPt. + /// + /// The returned call instruction is identical \p CI in every way except that + /// the specified operand bundle has been replaced. 
+ static CallBase *Create(CallBase *CB, OperandBundleDef Bundle, + BasicBlock::iterator InsertPt); + /// Create a clone of \p CB with the operand bundle with the tag matching /// \p Bundle's tag replaced with Bundle, and insert it before \p InsertPt. /// @@ -2391,6 +2606,10 @@ class FuncletPadInst : public Instruction { private: FuncletPadInst(const FuncletPadInst &CPI); + explicit FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, + ArrayRef Args, unsigned Values, + const Twine &NameStr, + BasicBlock::iterator InsertBefore); explicit FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, ArrayRef Args, unsigned Values, const Twine &NameStr, Instruction *InsertBefore); diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h index c0e159a342d5b..75f399ec2fcd5 100644 --- a/llvm/include/llvm/IR/Instruction.h +++ b/llvm/include/llvm/IR/Instruction.h @@ -1007,6 +1007,8 @@ class Instruction : public User, setValueSubclassData(Storage); } + Instruction(Type *Ty, unsigned iType, Use *Ops, unsigned NumOps, + InstListType::iterator InsertBefore); Instruction(Type *Ty, unsigned iType, Use *Ops, unsigned NumOps, Instruction *InsertBefore = nullptr); Instruction(Type *Ty, unsigned iType, Use *Ops, unsigned NumOps, diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 1db4ff2f09bca..bc357074e5cb2 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -73,16 +73,22 @@ class AllocaInst : public UnaryInstruction { AllocaInst *cloneImpl() const; public: + explicit AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, + const Twine &Name, BasicBlock::iterator InsertBefore); explicit AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, const Twine &Name, Instruction *InsertBefore); AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, const Twine &Name, BasicBlock *InsertAtEnd); + AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name, + 
BasicBlock::iterator InsertBefore); AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name, Instruction *InsertBefore); AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name, BasicBlock *InsertAtEnd); + AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, Align Align, + const Twine &Name, BasicBlock::iterator); AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, Align Align, const Twine &Name = "", Instruction *InsertBefore = nullptr); AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, Align Align, @@ -192,17 +198,26 @@ class LoadInst : public UnaryInstruction { LoadInst *cloneImpl() const; public: + LoadInst(Type *Ty, Value *Ptr, const Twine &NameStr, + BasicBlock::iterator InsertBefore); LoadInst(Type *Ty, Value *Ptr, const Twine &NameStr, Instruction *InsertBefore); LoadInst(Type *Ty, Value *Ptr, const Twine &NameStr, BasicBlock *InsertAtEnd); + LoadInst(Type *Ty, Value *Ptr, const Twine &NameStr, bool isVolatile, + BasicBlock::iterator InsertBefore); LoadInst(Type *Ty, Value *Ptr, const Twine &NameStr, bool isVolatile, Instruction *InsertBefore); LoadInst(Type *Ty, Value *Ptr, const Twine &NameStr, bool isVolatile, BasicBlock *InsertAtEnd); + LoadInst(Type *Ty, Value *Ptr, const Twine &NameStr, bool isVolatile, + Align Align, BasicBlock::iterator InsertBefore); LoadInst(Type *Ty, Value *Ptr, const Twine &NameStr, bool isVolatile, Align Align, Instruction *InsertBefore = nullptr); LoadInst(Type *Ty, Value *Ptr, const Twine &NameStr, bool isVolatile, Align Align, BasicBlock *InsertAtEnd); + LoadInst(Type *Ty, Value *Ptr, const Twine &NameStr, bool isVolatile, + Align Align, AtomicOrdering Order, SyncScope::ID SSID, + BasicBlock::iterator InsertBefore); LoadInst(Type *Ty, Value *Ptr, const Twine &NameStr, bool isVolatile, Align Align, AtomicOrdering Order, SyncScope::ID SSID = SyncScope::System, @@ -456,6 +471,8 @@ class FenceInst : public Instruction { public: // Ordering may only be Acquire, Release, AcquireRelease, or // 
SequentiallyConsistent. + FenceInst(LLVMContext &C, AtomicOrdering Ordering, SyncScope::ID SSID, + BasicBlock::iterator InsertBefore); FenceInst(LLVMContext &C, AtomicOrdering Ordering, SyncScope::ID SSID = SyncScope::System, Instruction *InsertBefore = nullptr); @@ -536,6 +553,10 @@ class AtomicCmpXchgInst : public Instruction { AtomicCmpXchgInst *cloneImpl() const; public: + AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal, Align Alignment, + AtomicOrdering SuccessOrdering, + AtomicOrdering FailureOrdering, SyncScope::ID SSID, + BasicBlock::iterator InsertBefore); AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal, Align Alignment, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID, @@ -798,6 +819,9 @@ class AtomicRMWInst : public Instruction { typename Bitfield::Element; public: + AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val, Align Alignment, + AtomicOrdering Ordering, SyncScope::ID SSID, + BasicBlock::iterator InsertBefore); AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val, Align Alignment, AtomicOrdering Ordering, SyncScope::ID SSID, Instruction *InsertBefore = nullptr); @@ -953,9 +977,13 @@ class GetElementPtrInst : public Instruction { GetElementPtrInst(const GetElementPtrInst &GEPI); /// Constructors - Create a getelementptr instruction with a base pointer an - /// list of indices. The first ctor can optionally insert before an existing - /// instruction, the second appends the new instruction to the specified - /// BasicBlock. + /// list of indices. The first and second ctor can optionally insert before an + /// existing instruction, the third appends the new instruction to the + /// specified BasicBlock. 
+ inline GetElementPtrInst(Type *PointeeType, Value *Ptr, + ArrayRef IdxList, unsigned Values, + const Twine &NameStr, + BasicBlock::iterator InsertBefore); inline GetElementPtrInst(Type *PointeeType, Value *Ptr, ArrayRef IdxList, unsigned Values, const Twine &NameStr, Instruction *InsertBefore); @@ -972,6 +1000,16 @@ class GetElementPtrInst : public Instruction { GetElementPtrInst *cloneImpl() const; public: + static GetElementPtrInst *Create(Type *PointeeType, Value *Ptr, + ArrayRef IdxList, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + unsigned Values = 1 + unsigned(IdxList.size()); + assert(PointeeType && "Must specify element type"); + return new (Values) GetElementPtrInst(PointeeType, Ptr, IdxList, Values, + NameStr, InsertBefore); + } + static GetElementPtrInst *Create(Type *PointeeType, Value *Ptr, ArrayRef IdxList, const Twine &NameStr = "", @@ -994,6 +1032,16 @@ class GetElementPtrInst : public Instruction { /// Create an "inbounds" getelementptr. See the documentation for the /// "inbounds" flag in LangRef.html for details. 
+ static GetElementPtrInst *CreateInBounds(Type *PointeeType, Value *Ptr, + ArrayRef IdxList, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + GetElementPtrInst *GEP = + Create(PointeeType, Ptr, IdxList, NameStr, InsertBefore); + GEP->setIsInBounds(true); + return GEP; + } + static GetElementPtrInst * CreateInBounds(Type *PointeeType, Value *Ptr, ArrayRef IdxList, const Twine &NameStr = "", @@ -1152,6 +1200,18 @@ struct OperandTraits : public VariadicOperandTraits { }; +GetElementPtrInst::GetElementPtrInst(Type *PointeeType, Value *Ptr, + ArrayRef IdxList, unsigned Values, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) + : Instruction(getGEPReturnType(Ptr, IdxList), GetElementPtr, + OperandTraits::op_end(this) - Values, + Values, InsertBefore), + SourceElementType(PointeeType), + ResultElementType(getIndexedType(PointeeType, IdxList)) { + init(Ptr, IdxList, NameStr); +} + GetElementPtrInst::GetElementPtrInst(Type *PointeeType, Value *Ptr, ArrayRef IdxList, unsigned Values, const Twine &NameStr, @@ -1206,6 +1266,21 @@ class ICmpInst: public CmpInst { ICmpInst *cloneImpl() const; public: + /// Constructor with insert-before-instruction semantics. + ICmpInst( + BasicBlock::iterator InsertBefore, ///< Where to insert + Predicate pred, ///< The predicate to use for the comparison + Value *LHS, ///< The left-hand-side of the expression + Value *RHS, ///< The right-hand-side of the expression + const Twine &NameStr = "" ///< Name of the instruction + ) : CmpInst(makeCmpResultType(LHS->getType()), + Instruction::ICmp, pred, LHS, RHS, NameStr, + InsertBefore) { +#ifndef NDEBUG + AssertOK(); +#endif + } + /// Constructor with insert-before-instruction semantics. ICmpInst( Instruction *InsertBefore, ///< Where to insert @@ -1378,6 +1453,19 @@ class FCmpInst: public CmpInst { FCmpInst *cloneImpl() const; public: + /// Constructor with insert-before-instruction semantics. 
+ FCmpInst( + BasicBlock::iterator InsertBefore, ///< Where to insert + Predicate pred, ///< The predicate to use for the comparison + Value *LHS, ///< The left-hand-side of the expression + Value *RHS, ///< The right-hand-side of the expression + const Twine &NameStr = "" ///< Name of the instruction + ) : CmpInst(makeCmpResultType(LHS->getType()), + Instruction::FCmp, pred, LHS, RHS, NameStr, + InsertBefore) { + AssertOK(); + } + /// Constructor with insert-before-instruction semantics. FCmpInst( Instruction *InsertBefore, ///< Where to insert @@ -1477,6 +1565,15 @@ class FCmpInst: public CmpInst { class CallInst : public CallBase { CallInst(const CallInst &CI); + /// Construct a CallInst from a range of arguments + inline CallInst(FunctionType *Ty, Value *Func, ArrayRef Args, + ArrayRef Bundles, const Twine &NameStr, + BasicBlock::iterator InsertBefore); + + inline CallInst(FunctionType *Ty, Value *Func, ArrayRef Args, + const Twine &NameStr, BasicBlock::iterator InsertBefore) + : CallInst(Ty, Func, Args, std::nullopt, NameStr, InsertBefore) {} + /// Construct a CallInst given a range of arguments. 
/// Construct a CallInst from a range of arguments inline CallInst(FunctionType *Ty, Value *Func, ArrayRef Args, @@ -1493,6 +1590,9 @@ class CallInst : public CallBase { ArrayRef Bundles, const Twine &NameStr, BasicBlock *InsertAtEnd); + explicit CallInst(FunctionType *Ty, Value *F, const Twine &NameStr, + BasicBlock::iterator InsertBefore); + explicit CallInst(FunctionType *Ty, Value *F, const Twine &NameStr, Instruction *InsertBefore); @@ -1517,11 +1617,23 @@ class CallInst : public CallBase { CallInst *cloneImpl() const; public: + static CallInst *Create(FunctionType *Ty, Value *F, const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + return new (ComputeNumOperands(0)) CallInst(Ty, F, NameStr, InsertBefore); + } + static CallInst *Create(FunctionType *Ty, Value *F, const Twine &NameStr = "", Instruction *InsertBefore = nullptr) { return new (ComputeNumOperands(0)) CallInst(Ty, F, NameStr, InsertBefore); } + static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef Args, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + return new (ComputeNumOperands(Args.size())) + CallInst(Ty, Func, Args, std::nullopt, NameStr, InsertBefore); + } + static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef Args, const Twine &NameStr, Instruction *InsertBefore = nullptr) { @@ -1529,6 +1641,18 @@ class CallInst : public CallBase { CallInst(Ty, Func, Args, std::nullopt, NameStr, InsertBefore); } + static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef Args, + ArrayRef Bundles, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + const int NumOperands = + ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)); + const unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo); + + return new (NumOperands, DescriptorBytes) + CallInst(Ty, Func, Args, Bundles, NameStr, InsertBefore); + } + static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef Args, ArrayRef Bundles = std::nullopt, const Twine &NameStr = "", 
@@ -1563,12 +1687,26 @@ class CallInst : public CallBase { CallInst(Ty, Func, Args, Bundles, NameStr, InsertAtEnd); } + static CallInst *Create(FunctionCallee Func, const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + return Create(Func.getFunctionType(), Func.getCallee(), NameStr, + InsertBefore); + } + static CallInst *Create(FunctionCallee Func, const Twine &NameStr = "", Instruction *InsertBefore = nullptr) { return Create(Func.getFunctionType(), Func.getCallee(), NameStr, InsertBefore); } + static CallInst *Create(FunctionCallee Func, ArrayRef Args, + ArrayRef Bundles, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + return Create(Func.getFunctionType(), Func.getCallee(), Args, Bundles, + NameStr, InsertBefore); + } + static CallInst *Create(FunctionCallee Func, ArrayRef Args, ArrayRef Bundles = std::nullopt, const Twine &NameStr = "", @@ -1577,6 +1715,13 @@ class CallInst : public CallBase { NameStr, InsertBefore); } + static CallInst *Create(FunctionCallee Func, ArrayRef Args, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + return Create(Func.getFunctionType(), Func.getCallee(), Args, NameStr, + InsertBefore); + } + static CallInst *Create(FunctionCallee Func, ArrayRef Args, const Twine &NameStr, Instruction *InsertBefore = nullptr) { @@ -1609,6 +1754,8 @@ class CallInst : public CallBase { /// The returned call instruction is identical \p CI in every way except that /// the operand bundles for the new instruction are set to the operand bundles /// in \p Bundles. 
+ static CallInst *Create(CallInst *CI, ArrayRef Bundles, + BasicBlock::iterator InsertPt); static CallInst *Create(CallInst *CI, ArrayRef Bundles, Instruction *InsertPt = nullptr); @@ -1682,6 +1829,17 @@ CallInst::CallInst(FunctionType *Ty, Value *Func, ArrayRef Args, init(Ty, Func, Args, Bundles, NameStr); } +CallInst::CallInst(FunctionType *Ty, Value *Func, ArrayRef Args, + ArrayRef Bundles, const Twine &NameStr, + BasicBlock::iterator InsertBefore) + : CallBase(Ty->getReturnType(), Instruction::Call, + OperandTraits::op_end(this) - + (Args.size() + CountBundleInputs(Bundles) + 1), + unsigned(Args.size() + CountBundleInputs(Bundles) + 1), + InsertBefore) { + init(Ty, Func, Args, Bundles, NameStr); +} + CallInst::CallInst(FunctionType *Ty, Value *Func, ArrayRef Args, ArrayRef Bundles, const Twine &NameStr, Instruction *InsertBefore) @@ -1700,6 +1858,14 @@ CallInst::CallInst(FunctionType *Ty, Value *Func, ArrayRef Args, /// This class represents the LLVM 'select' instruction. /// class SelectInst : public Instruction { + SelectInst(Value *C, Value *S1, Value *S2, const Twine &NameStr, + BasicBlock::iterator InsertBefore) + : Instruction(S1->getType(), Instruction::Select, &Op<0>(), 3, + InsertBefore) { + init(C, S1, S2); + setName(NameStr); + } + SelectInst(Value *C, Value *S1, Value *S2, const Twine &NameStr, Instruction *InsertBefore) : Instruction(S1->getType(), Instruction::Select, @@ -1730,6 +1896,16 @@ class SelectInst : public Instruction { SelectInst *cloneImpl() const; public: + static SelectInst *Create(Value *C, Value *S1, Value *S2, + const Twine &NameStr, + BasicBlock::iterator InsertBefore, + Instruction *MDFrom = nullptr) { + SelectInst *Sel = new (3) SelectInst(C, S1, S2, NameStr, InsertBefore); + if (MDFrom) + Sel->copyMetadata(*MDFrom); + return Sel; + } + static SelectInst *Create(Value *C, Value *S1, Value *S2, const Twine &NameStr = "", Instruction *InsertBefore = nullptr, @@ -1802,6 +1978,12 @@ class VAArgInst : public UnaryInstruction { 
VAArgInst *cloneImpl() const; public: + VAArgInst(Value *List, Type *Ty, const Twine &NameStr, + BasicBlock::iterator InsertBefore) + : UnaryInstruction(Ty, VAArg, List, InsertBefore) { + setName(NameStr); + } + VAArgInst(Value *List, Type *Ty, const Twine &NameStr = "", Instruction *InsertBefore = nullptr) : UnaryInstruction(Ty, VAArg, List, InsertBefore) { @@ -1835,6 +2017,8 @@ class VAArgInst : public UnaryInstruction { /// element from a VectorType value /// class ExtractElementInst : public Instruction { + ExtractElementInst(Value *Vec, Value *Idx, const Twine &NameStr, + BasicBlock::iterator InsertBefore); ExtractElementInst(Value *Vec, Value *Idx, const Twine &NameStr = "", Instruction *InsertBefore = nullptr); ExtractElementInst(Value *Vec, Value *Idx, const Twine &NameStr, @@ -1847,6 +2031,12 @@ class ExtractElementInst : public Instruction { ExtractElementInst *cloneImpl() const; public: + static ExtractElementInst *Create(Value *Vec, Value *Idx, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + return new (2) ExtractElementInst(Vec, Idx, NameStr, InsertBefore); + } + static ExtractElementInst *Create(Value *Vec, Value *Idx, const Twine &NameStr = "", Instruction *InsertBefore = nullptr) { @@ -1899,6 +2089,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ExtractElementInst, Value) /// element into a VectorType value /// class InsertElementInst : public Instruction { + InsertElementInst(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr, + BasicBlock::iterator InsertBefore); InsertElementInst(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr = "", Instruction *InsertBefore = nullptr); @@ -1912,6 +2104,12 @@ class InsertElementInst : public Instruction { InsertElementInst *cloneImpl() const; public: + static InsertElementInst *Create(Value *Vec, Value *NewElt, Value *Idx, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + return new (3) InsertElementInst(Vec, NewElt, Idx, NameStr, InsertBefore); + } + static 
InsertElementInst *Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr = "", Instruction *InsertBefore = nullptr) { @@ -1981,19 +2179,27 @@ class ShuffleVectorInst : public Instruction { ShuffleVectorInst *cloneImpl() const; public: + ShuffleVectorInst(Value *V1, Value *Mask, const Twine &NameStr, + BasicBlock::iterator InsertBefore); ShuffleVectorInst(Value *V1, Value *Mask, const Twine &NameStr = "", Instruction *InsertBefore = nullptr); ShuffleVectorInst(Value *V1, Value *Mask, const Twine &NameStr, BasicBlock *InsertAtEnd); + ShuffleVectorInst(Value *V1, ArrayRef Mask, const Twine &NameStr, + BasicBlock::iterator InsertBefore); ShuffleVectorInst(Value *V1, ArrayRef Mask, const Twine &NameStr = "", Instruction *InsertBefore = nullptr); ShuffleVectorInst(Value *V1, ArrayRef Mask, const Twine &NameStr, BasicBlock *InsertAtEnd); + ShuffleVectorInst(Value *V1, Value *V2, Value *Mask, const Twine &NameStr, + BasicBlock::iterator InsertBefor); ShuffleVectorInst(Value *V1, Value *V2, Value *Mask, const Twine &NameStr = "", Instruction *InsertBefor = nullptr); ShuffleVectorInst(Value *V1, Value *V2, Value *Mask, const Twine &NameStr, BasicBlock *InsertAtEnd); + ShuffleVectorInst(Value *V1, Value *V2, ArrayRef Mask, + const Twine &NameStr, BasicBlock::iterator InsertBefor); ShuffleVectorInst(Value *V1, Value *V2, ArrayRef Mask, const Twine &NameStr = "", Instruction *InsertBefor = nullptr); @@ -2468,9 +2674,12 @@ class ExtractValueInst : public UnaryInstruction { ExtractValueInst(const ExtractValueInst &EVI); /// Constructors - Create a extractvalue instruction with a base aggregate - /// value and a list of indices. The first ctor can optionally insert before - /// an existing instruction, the second appends the new instruction to the - /// specified BasicBlock. + /// value and a list of indices. The first and second ctor can optionally + /// insert before an existing instruction, the third appends the new + /// instruction to the specified BasicBlock. 
+ inline ExtractValueInst(Value *Agg, ArrayRef Idxs, + const Twine &NameStr, + BasicBlock::iterator InsertBefore); inline ExtractValueInst(Value *Agg, ArrayRef Idxs, const Twine &NameStr, @@ -2488,6 +2697,13 @@ class ExtractValueInst : public UnaryInstruction { ExtractValueInst *cloneImpl() const; public: + static ExtractValueInst *Create(Value *Agg, ArrayRef Idxs, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + return new + ExtractValueInst(Agg, Idxs, NameStr, InsertBefore); + } + static ExtractValueInst *Create(Value *Agg, ArrayRef Idxs, const Twine &NameStr = "", @@ -2548,6 +2764,14 @@ class ExtractValueInst : public UnaryInstruction { } }; +ExtractValueInst::ExtractValueInst(Value *Agg, ArrayRef Idxs, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) + : UnaryInstruction(checkGEPType(getIndexedType(Agg->getType(), Idxs)), + ExtractValue, Agg, InsertBefore) { + init(Idxs, NameStr); +} + ExtractValueInst::ExtractValueInst(Value *Agg, ArrayRef Idxs, const Twine &NameStr, @@ -2579,9 +2803,12 @@ class InsertValueInst : public Instruction { InsertValueInst(const InsertValueInst &IVI); /// Constructors - Create a insertvalue instruction with a base aggregate - /// value, a value to insert, and a list of indices. The first ctor can - /// optionally insert before an existing instruction, the second appends + /// value, a value to insert, and a list of indices. The first and second ctor + /// can optionally insert before an existing instruction, the third appends /// the new instruction to the specified BasicBlock. 
+ inline InsertValueInst(Value *Agg, Value *Val, ArrayRef Idxs, + const Twine &NameStr, + BasicBlock::iterator InsertBefore); inline InsertValueInst(Value *Agg, Value *Val, ArrayRef Idxs, const Twine &NameStr, @@ -2590,8 +2817,10 @@ class InsertValueInst : public Instruction { ArrayRef Idxs, const Twine &NameStr, BasicBlock *InsertAtEnd); - /// Constructors - These two constructors are convenience methods because one - /// and two index insertvalue instructions are so common. + /// Constructors - These three constructors are convenience methods because + /// one and two index insertvalue instructions are so common. + InsertValueInst(Value *Agg, Value *Val, unsigned Idx, const Twine &NameStr, + BasicBlock::iterator InsertBefore); InsertValueInst(Value *Agg, Value *Val, unsigned Idx, const Twine &NameStr = "", Instruction *InsertBefore = nullptr); @@ -2612,6 +2841,12 @@ class InsertValueInst : public Instruction { void *operator new(size_t S) { return User::operator new(S, 2); } void operator delete(void *Ptr) { User::operator delete(Ptr); } + static InsertValueInst *Create(Value *Agg, Value *Val, + ArrayRef Idxs, const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + return new InsertValueInst(Agg, Val, Idxs, NameStr, InsertBefore); + } + static InsertValueInst *Create(Value *Agg, Value *Val, ArrayRef Idxs, const Twine &NameStr = "", @@ -2683,6 +2918,16 @@ struct OperandTraits : public FixedNumOperandTraits { }; +InsertValueInst::InsertValueInst(Value *Agg, + Value *Val, + ArrayRef Idxs, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) + : Instruction(Agg->getType(), InsertValue, OperandTraits::op_begin(this), + 2, InsertBefore) { + init(Agg, Val, Idxs, NameStr); +} + InsertValueInst::InsertValueInst(Value *Agg, Value *Val, ArrayRef Idxs, @@ -2722,6 +2967,15 @@ class PHINode : public Instruction { PHINode(const PHINode &PN); + explicit PHINode(Type *Ty, unsigned NumReservedValues, const Twine &NameStr, + BasicBlock::iterator InsertBefore) + : 
Instruction(Ty, Instruction::PHI, nullptr, 0, InsertBefore), + ReservedSpace(NumReservedValues) { + assert(!Ty->isTokenTy() && "PHI nodes cannot have token type!"); + setName(NameStr); + allocHungoffUses(ReservedSpace); + } + explicit PHINode(Type *Ty, unsigned NumReservedValues, const Twine &NameStr = "", Instruction *InsertBefore = nullptr) @@ -2757,6 +3011,12 @@ class PHINode : public Instruction { public: /// Constructors - NumReservedValues is a hint for the number of incoming /// edges that this phi node will have (use 0 if you really have no idea). + static PHINode *Create(Type *Ty, unsigned NumReservedValues, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + return new PHINode(Ty, NumReservedValues, NameStr, InsertBefore); + } + static PHINode *Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr = "", Instruction *InsertBefore = nullptr) { @@ -2981,6 +3241,9 @@ class LandingPadInst : public Instruction { enum ClauseType { Catch, Filter }; private: + explicit LandingPadInst(Type *RetTy, unsigned NumReservedValues, + const Twine &NameStr, + BasicBlock::iterator InsertBefore); explicit LandingPadInst(Type *RetTy, unsigned NumReservedValues, const Twine &NameStr, Instruction *InsertBefore); explicit LandingPadInst(Type *RetTy, unsigned NumReservedValues, @@ -3003,6 +3266,9 @@ class LandingPadInst : public Instruction { /// Constructors - NumReservedClauses is a hint for the number of incoming /// clauses that this landingpad will have (use 0 if you really have no idea). 
+ static LandingPadInst *Create(Type *RetTy, unsigned NumReservedClauses, + const Twine &NameStr, + BasicBlock::iterator InsertBefore); static LandingPadInst *Create(Type *RetTy, unsigned NumReservedClauses, const Twine &NameStr = "", Instruction *InsertBefore = nullptr); @@ -3077,6 +3343,8 @@ class ReturnInst : public Instruction { // ReturnInst() - 'ret void' instruction // ReturnInst( null) - 'ret void' instruction // ReturnInst(Value* X) - 'ret X' instruction + // ReturnInst(null, Iterator It) - 'ret void' instruction, insert before I + // ReturnInst(Value* X, Iterator It) - 'ret X' instruction, insert before I // ReturnInst( null, Inst *I) - 'ret void' instruction, insert before I // ReturnInst(Value* X, Inst *I) - 'ret X' instruction, insert before I // ReturnInst( null, BB *B) - 'ret void' instruction, insert @ end of B @@ -3084,6 +3352,8 @@ class ReturnInst : public Instruction { // // NOTE: If the Value* passed is of type void then the constructor behaves as // if it was passed NULL. 
+ explicit ReturnInst(LLVMContext &C, Value *retVal, + BasicBlock::iterator InsertBefore); explicit ReturnInst(LLVMContext &C, Value *retVal = nullptr, Instruction *InsertBefore = nullptr); ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd); @@ -3096,6 +3366,11 @@ class ReturnInst : public Instruction { ReturnInst *cloneImpl() const; public: + static ReturnInst *Create(LLVMContext &C, Value *retVal, + BasicBlock::iterator InsertBefore) { + return new (!!retVal) ReturnInst(C, retVal, InsertBefore); + } + static ReturnInst* Create(LLVMContext &C, Value *retVal = nullptr, Instruction *InsertBefore = nullptr) { return new(!!retVal) ReturnInst(C, retVal, InsertBefore); @@ -3160,10 +3435,15 @@ class BranchInst : public Instruction { // BranchInst constructors (where {B, T, F} are blocks, and C is a condition): // BranchInst(BB *B) - 'br B' // BranchInst(BB* T, BB *F, Value *C) - 'br C, T, F' + // BranchInst(BB* B, Iter It) - 'br B' insert before I + // BranchInst(BB* T, BB *F, Value *C, Iter It) - 'br C, T, F', insert before I // BranchInst(BB* B, Inst *I) - 'br B' insert before I // BranchInst(BB* T, BB *F, Value *C, Inst *I) - 'br C, T, F', insert before I // BranchInst(BB* B, BB *I) - 'br B' insert at end // BranchInst(BB* T, BB *F, Value *C, BB *I) - 'br C, T, F', insert at end + explicit BranchInst(BasicBlock *IfTrue, BasicBlock::iterator InsertBefore); + BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond, + BasicBlock::iterator InsertBefore); explicit BranchInst(BasicBlock *IfTrue, Instruction *InsertBefore = nullptr); BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond, Instruction *InsertBefore = nullptr); @@ -3207,11 +3487,21 @@ class BranchInst : public Instruction { const BasicBlock *operator->() const { return operator*(); } }; + static BranchInst *Create(BasicBlock *IfTrue, + BasicBlock::iterator InsertBefore) { + return new(1) BranchInst(IfTrue, InsertBefore); + } + static BranchInst *Create(BasicBlock *IfTrue, 
Instruction *InsertBefore = nullptr) { return new(1) BranchInst(IfTrue, InsertBefore); } + static BranchInst *Create(BasicBlock *IfTrue, BasicBlock *IfFalse, + Value *Cond, BasicBlock::iterator InsertBefore) { + return new(3) BranchInst(IfTrue, IfFalse, Cond, InsertBefore); + } + static BranchInst *Create(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond, Instruction *InsertBefore = nullptr) { return new(3) BranchInst(IfTrue, IfFalse, Cond, InsertBefore); @@ -3304,6 +3594,13 @@ class SwitchInst : public Instruction { // Operand[2n+1] = BasicBlock to go to on match SwitchInst(const SwitchInst &SI); + /// Create a new switch instruction, specifying a value to switch on and a + /// default destination. The number of additional cases can be specified here + /// to make memory allocation more efficient. This constructor can also + /// auto-insert before another instruction. + SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases, + BasicBlock::iterator InsertBefore); + /// Create a new switch instruction, specifying a value to switch on and a /// default destination. The number of additional cases can be specified here /// to make memory allocation more efficient. This constructor can also @@ -3484,6 +3781,12 @@ class SwitchInst : public Instruction { using CaseIt = CaseIteratorImpl; using ConstCaseIt = CaseIteratorImpl; + static SwitchInst *Create(Value *Value, BasicBlock *Default, + unsigned NumCases, + BasicBlock::iterator InsertBefore) { + return new SwitchInst(Value, Default, NumCases, InsertBefore); + } + static SwitchInst *Create(Value *Value, BasicBlock *Default, unsigned NumCases, Instruction *InsertBefore = nullptr) { @@ -3704,6 +4007,13 @@ class IndirectBrInst : public Instruction { // Operand[n+1] = n-th destination IndirectBrInst(const IndirectBrInst &IBI); + /// Create a new indirectbr instruction, specifying an + /// Address to jump to. The number of expected destinations can be specified + /// here to make memory allocation more efficient. 
This constructor can also + /// autoinsert before another instruction. + IndirectBrInst(Value *Address, unsigned NumDests, + BasicBlock::iterator InsertBefore); + /// Create a new indirectbr instruction, specifying an /// Address to jump to. The number of expected destinations can be specified /// here to make memory allocation more efficient. This constructor can also @@ -3758,6 +4068,11 @@ class IndirectBrInst : public Instruction { const BasicBlock *operator->() const { return operator*(); } }; + static IndirectBrInst *Create(Value *Address, unsigned NumDests, + BasicBlock::iterator InsertBefore) { + return new IndirectBrInst(Address, NumDests, InsertBefore); + } + static IndirectBrInst *Create(Value *Address, unsigned NumDests, Instruction *InsertBefore = nullptr) { return new IndirectBrInst(Address, NumDests, InsertBefore); @@ -3845,6 +4160,12 @@ class InvokeInst : public CallBase { InvokeInst(const InvokeInst &BI); + /// Construct an InvokeInst given a range of arguments. + inline InvokeInst(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, + BasicBlock *IfException, ArrayRef Args, + ArrayRef Bundles, int NumOperands, + const Twine &NameStr, BasicBlock::iterator InsertBefore); + /// Construct an InvokeInst given a range of arguments. 
/// /// Construct an InvokeInst from a range of arguments @@ -3876,6 +4197,16 @@ class InvokeInst : public CallBase { InvokeInst *cloneImpl() const; public: + static InvokeInst *Create(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, + BasicBlock *IfException, ArrayRef Args, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + int NumOperands = ComputeNumOperands(Args.size()); + return new (NumOperands) + InvokeInst(Ty, Func, IfNormal, IfException, Args, std::nullopt, + NumOperands, NameStr, InsertBefore); + } + static InvokeInst *Create(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, const Twine &NameStr, @@ -3886,6 +4217,20 @@ class InvokeInst : public CallBase { NumOperands, NameStr, InsertBefore); } + static InvokeInst *Create(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, + BasicBlock *IfException, ArrayRef Args, + ArrayRef Bundles, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + int NumOperands = + ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)); + unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo); + + return new (NumOperands, DescriptorBytes) + InvokeInst(Ty, Func, IfNormal, IfException, Args, Bundles, NumOperands, + NameStr, InsertBefore); + } + static InvokeInst *Create(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, ArrayRef Bundles = std::nullopt, @@ -3922,6 +4267,14 @@ class InvokeInst : public CallBase { NameStr, InsertAtEnd); } + static InvokeInst *Create(FunctionCallee Func, BasicBlock *IfNormal, + BasicBlock *IfException, ArrayRef Args, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + return Create(Func.getFunctionType(), Func.getCallee(), IfNormal, + IfException, Args, std::nullopt, NameStr, InsertBefore); + } + static InvokeInst *Create(FunctionCallee Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, const Twine &NameStr, @@ -3930,6 +4283,15 @@ class InvokeInst 
: public CallBase { IfException, Args, std::nullopt, NameStr, InsertBefore); } + static InvokeInst *Create(FunctionCallee Func, BasicBlock *IfNormal, + BasicBlock *IfException, ArrayRef Args, + ArrayRef Bundles, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + return Create(Func.getFunctionType(), Func.getCallee(), IfNormal, + IfException, Args, Bundles, NameStr, InsertBefore); + } + static InvokeInst *Create(FunctionCallee Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, ArrayRef Bundles = std::nullopt, @@ -3960,6 +4322,8 @@ class InvokeInst : public CallBase { /// The returned invoke instruction is identical to \p II in every way except /// that the operand bundles for the new instruction are set to the operand /// bundles in \p Bundles. + static InvokeInst *Create(InvokeInst *II, ArrayRef Bundles, + BasicBlock::iterator InsertPt); static InvokeInst *Create(InvokeInst *II, ArrayRef Bundles, Instruction *InsertPt = nullptr); @@ -4013,6 +4377,16 @@ class InvokeInst : public CallBase { } }; +InvokeInst::InvokeInst(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, + BasicBlock *IfException, ArrayRef Args, + ArrayRef Bundles, int NumOperands, + const Twine &NameStr, BasicBlock::iterator InsertBefore) + : CallBase(Ty->getReturnType(), Instruction::Invoke, + OperandTraits::op_end(this) - NumOperands, NumOperands, + InsertBefore) { + init(Ty, Func, IfNormal, IfException, Args, Bundles, NameStr); +} + InvokeInst::InvokeInst(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, ArrayRef Bundles, int NumOperands, @@ -4047,6 +4421,13 @@ class CallBrInst : public CallBase { CallBrInst(const CallBrInst &BI); + /// Construct a CallBrInst given a range of arguments. 
+ inline CallBrInst(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest, + ArrayRef IndirectDests, + ArrayRef Args, ArrayRef Bundles, + int NumOperands, const Twine &NameStr, + BasicBlock::iterator InsertBefore); + /// Construct a CallBrInst given a range of arguments. /// /// Construct a CallBrInst from a range of arguments @@ -4081,6 +4462,17 @@ class CallBrInst : public CallBase { CallBrInst *cloneImpl() const; public: + static CallBrInst *Create(FunctionType *Ty, Value *Func, + BasicBlock *DefaultDest, + ArrayRef IndirectDests, + ArrayRef Args, const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + int NumOperands = ComputeNumOperands(Args.size(), IndirectDests.size()); + return new (NumOperands) + CallBrInst(Ty, Func, DefaultDest, IndirectDests, Args, std::nullopt, + NumOperands, NameStr, InsertBefore); + } + static CallBrInst *Create(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest, ArrayRef IndirectDests, @@ -4092,6 +4484,20 @@ class CallBrInst : public CallBase { NumOperands, NameStr, InsertBefore); } + static CallBrInst * + Create(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest, + ArrayRef IndirectDests, ArrayRef Args, + ArrayRef Bundles, const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + int NumOperands = ComputeNumOperands(Args.size(), IndirectDests.size(), + CountBundleInputs(Bundles)); + unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo); + + return new (NumOperands, DescriptorBytes) + CallBrInst(Ty, Func, DefaultDest, IndirectDests, Args, Bundles, + NumOperands, NameStr, InsertBefore); + } + static CallBrInst * Create(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest, ArrayRef IndirectDests, ArrayRef Args, @@ -4132,6 +4538,14 @@ class CallBrInst : public CallBase { NumOperands, NameStr, InsertAtEnd); } + static CallBrInst *Create(FunctionCallee Func, BasicBlock *DefaultDest, + ArrayRef IndirectDests, + ArrayRef Args, const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + return 
Create(Func.getFunctionType(), Func.getCallee(), DefaultDest, + IndirectDests, Args, NameStr, InsertBefore); + } + static CallBrInst *Create(FunctionCallee Func, BasicBlock *DefaultDest, ArrayRef IndirectDests, ArrayRef Args, const Twine &NameStr, @@ -4140,6 +4554,16 @@ class CallBrInst : public CallBase { IndirectDests, Args, NameStr, InsertBefore); } + static CallBrInst *Create(FunctionCallee Func, BasicBlock *DefaultDest, + ArrayRef IndirectDests, + ArrayRef Args, + ArrayRef Bundles, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + return Create(Func.getFunctionType(), Func.getCallee(), DefaultDest, + IndirectDests, Args, Bundles, NameStr, InsertBefore); + } + static CallBrInst *Create(FunctionCallee Func, BasicBlock *DefaultDest, ArrayRef IndirectDests, ArrayRef Args, @@ -4174,6 +4598,8 @@ class CallBrInst : public CallBase { /// The returned callbr instruction is identical to \p CBI in every way /// except that the operand bundles for the new instruction are set to the /// operand bundles in \p Bundles. 
+ static CallBrInst *Create(CallBrInst *CBI, ArrayRef Bundles, + BasicBlock::iterator InsertPt); static CallBrInst *Create(CallBrInst *CBI, ArrayRef Bundles, Instruction *InsertPt = nullptr); @@ -4245,6 +4671,17 @@ class CallBrInst : public CallBase { } }; +CallBrInst::CallBrInst(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest, + ArrayRef IndirectDests, + ArrayRef Args, + ArrayRef Bundles, int NumOperands, + const Twine &NameStr, BasicBlock::iterator InsertBefore) + : CallBase(Ty->getReturnType(), Instruction::CallBr, + OperandTraits::op_end(this) - NumOperands, NumOperands, + InsertBefore) { + init(Ty, Func, DefaultDest, IndirectDests, Args, Bundles, NameStr); +} + CallBrInst::CallBrInst(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest, ArrayRef IndirectDests, ArrayRef Args, @@ -4278,6 +4715,7 @@ class ResumeInst : public Instruction { ResumeInst(const ResumeInst &RI); explicit ResumeInst(Value *Exn, Instruction *InsertBefore=nullptr); + explicit ResumeInst(Value *Exn, BasicBlock::iterator InsertBefore); ResumeInst(Value *Exn, BasicBlock *InsertAtEnd); protected: @@ -4287,6 +4725,10 @@ class ResumeInst : public Instruction { ResumeInst *cloneImpl() const; public: + static ResumeInst *Create(Value *Exn, BasicBlock::iterator InsertBefore) { + return new (1) ResumeInst(Exn, InsertBefore); + } + static ResumeInst *Create(Value *Exn, Instruction *InsertBefore = nullptr) { return new(1) ResumeInst(Exn, InsertBefore); } @@ -4343,6 +4785,14 @@ class CatchSwitchInst : public Instruction { // Operand[n] = BasicBlock to go to on match CatchSwitchInst(const CatchSwitchInst &CSI); + /// Create a new switch instruction, specifying a + /// default destination. The number of additional handlers can be specified + /// here to make memory allocation more efficient. + /// This constructor can also autoinsert before another instruction. 
+ CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest, + unsigned NumHandlers, const Twine &NameStr, + BasicBlock::iterator InsertBefore); + /// Create a new switch instruction, specifying a /// default destination. The number of additional handlers can be specified /// here to make memory allocation more efficient. @@ -4374,6 +4824,13 @@ class CatchSwitchInst : public Instruction { public: void operator delete(void *Ptr) { return User::operator delete(Ptr); } + static CatchSwitchInst *Create(Value *ParentPad, BasicBlock *UnwindDest, + unsigned NumHandlers, const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + return new CatchSwitchInst(ParentPad, UnwindDest, NumHandlers, NameStr, + InsertBefore); + } + static CatchSwitchInst *Create(Value *ParentPad, BasicBlock *UnwindDest, unsigned NumHandlers, const Twine &NameStr = "", @@ -4511,6 +4968,11 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CatchSwitchInst, Value) //===----------------------------------------------------------------------===// class CleanupPadInst : public FuncletPadInst { private: + explicit CleanupPadInst(Value *ParentPad, ArrayRef Args, + unsigned Values, const Twine &NameStr, + BasicBlock::iterator InsertBefore) + : FuncletPadInst(Instruction::CleanupPad, ParentPad, Args, Values, + NameStr, InsertBefore) {} explicit CleanupPadInst(Value *ParentPad, ArrayRef Args, unsigned Values, const Twine &NameStr, Instruction *InsertBefore) @@ -4523,6 +4985,14 @@ class CleanupPadInst : public FuncletPadInst { NameStr, InsertAtEnd) {} public: + static CleanupPadInst *Create(Value *ParentPad, ArrayRef Args, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + unsigned Values = 1 + Args.size(); + return new (Values) + CleanupPadInst(ParentPad, Args, Values, NameStr, InsertBefore); + } + static CleanupPadInst *Create(Value *ParentPad, ArrayRef Args = std::nullopt, const Twine &NameStr = "", @@ -4553,6 +5023,11 @@ class CleanupPadInst : public FuncletPadInst { 
//===----------------------------------------------------------------------===// class CatchPadInst : public FuncletPadInst { private: + explicit CatchPadInst(Value *CatchSwitch, ArrayRef Args, + unsigned Values, const Twine &NameStr, + BasicBlock::iterator InsertBefore) + : FuncletPadInst(Instruction::CatchPad, CatchSwitch, Args, Values, + NameStr, InsertBefore) {} explicit CatchPadInst(Value *CatchSwitch, ArrayRef Args, unsigned Values, const Twine &NameStr, Instruction *InsertBefore) @@ -4565,6 +5040,14 @@ class CatchPadInst : public FuncletPadInst { NameStr, InsertAtEnd) {} public: + static CatchPadInst *Create(Value *CatchSwitch, ArrayRef Args, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) { + unsigned Values = 1 + Args.size(); + return new (Values) + CatchPadInst(CatchSwitch, Args, Values, NameStr, InsertBefore); + } + static CatchPadInst *Create(Value *CatchSwitch, ArrayRef Args, const Twine &NameStr = "", Instruction *InsertBefore = nullptr) { @@ -4604,6 +5087,8 @@ class CatchPadInst : public FuncletPadInst { class CatchReturnInst : public Instruction { CatchReturnInst(const CatchReturnInst &RI); + CatchReturnInst(Value *CatchPad, BasicBlock *BB, + BasicBlock::iterator InsertBefore); CatchReturnInst(Value *CatchPad, BasicBlock *BB, Instruction *InsertBefore); CatchReturnInst(Value *CatchPad, BasicBlock *BB, BasicBlock *InsertAtEnd); @@ -4616,6 +5101,13 @@ class CatchReturnInst : public Instruction { CatchReturnInst *cloneImpl() const; public: + static CatchReturnInst *Create(Value *CatchPad, BasicBlock *BB, + BasicBlock::iterator InsertBefore) { + assert(CatchPad); + assert(BB); + return new (2) CatchReturnInst(CatchPad, BB, InsertBefore); + } + static CatchReturnInst *Create(Value *CatchPad, BasicBlock *BB, Instruction *InsertBefore = nullptr) { assert(CatchPad); @@ -4688,6 +5180,8 @@ class CleanupReturnInst : public Instruction { private: CleanupReturnInst(const CleanupReturnInst &RI); + CleanupReturnInst(Value *CleanupPad, BasicBlock 
*UnwindBB, unsigned Values, + BasicBlock::iterator InsertBefore); CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, unsigned Values, Instruction *InsertBefore = nullptr); CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, unsigned Values, @@ -4702,6 +5196,16 @@ class CleanupReturnInst : public Instruction { CleanupReturnInst *cloneImpl() const; public: + static CleanupReturnInst *Create(Value *CleanupPad, BasicBlock *UnwindBB, + BasicBlock::iterator InsertBefore) { + assert(CleanupPad); + unsigned Values = 1; + if (UnwindBB) + ++Values; + return new (Values) + CleanupReturnInst(CleanupPad, UnwindBB, Values, InsertBefore); + } + static CleanupReturnInst *Create(Value *CleanupPad, BasicBlock *UnwindBB = nullptr, Instruction *InsertBefore = nullptr) { @@ -4799,6 +5303,7 @@ class UnreachableInst : public Instruction { UnreachableInst *cloneImpl() const; public: + explicit UnreachableInst(LLVMContext &C, BasicBlock::iterator InsertBefore); explicit UnreachableInst(LLVMContext &C, Instruction *InsertBefore = nullptr); explicit UnreachableInst(LLVMContext &C, BasicBlock *InsertAtEnd); @@ -4840,6 +5345,14 @@ class TruncInst : public CastInst { TruncInst *cloneImpl() const; public: + /// Constructor with insert-before-instruction semantics + TruncInst( + Value *S, ///< The value to be truncated + Type *Ty, ///< The (smaller) type to truncate to + const Twine &NameStr, ///< A name for the new instruction + BasicBlock::iterator InsertBefore ///< Where to insert the new instruction + ); + /// Constructor with insert-before-instruction semantics TruncInst( Value *S, ///< The value to be truncated @@ -4879,6 +5392,14 @@ class ZExtInst : public CastInst { ZExtInst *cloneImpl() const; public: + /// Constructor with insert-before-instruction semantics + ZExtInst( + Value *S, ///< The value to be zero extended + Type *Ty, ///< The type to zero extend to + const Twine &NameStr, ///< A name for the new instruction + BasicBlock::iterator InsertBefore ///< Where to 
insert the new instruction + ); + /// Constructor with insert-before-instruction semantics ZExtInst( Value *S, ///< The value to be zero extended @@ -4918,6 +5439,14 @@ class SExtInst : public CastInst { SExtInst *cloneImpl() const; public: + /// Constructor with insert-before-instruction semantics + SExtInst( + Value *S, ///< The value to be sign extended + Type *Ty, ///< The type to sign extend to + const Twine &NameStr, ///< A name for the new instruction + BasicBlock::iterator InsertBefore ///< Where to insert the new instruction + ); + /// Constructor with insert-before-instruction semantics SExtInst( Value *S, ///< The value to be sign extended @@ -4957,6 +5486,14 @@ class FPTruncInst : public CastInst { FPTruncInst *cloneImpl() const; public: + /// Constructor with insert-before-instruction semantics + FPTruncInst( + Value *S, ///< The value to be truncated + Type *Ty, ///< The type to truncate to + const Twine &NameStr, ///< A name for the new instruction + BasicBlock::iterator InsertBefore ///< Where to insert the new instruction + ); + /// Constructor with insert-before-instruction semantics FPTruncInst( Value *S, ///< The value to be truncated @@ -4996,6 +5533,14 @@ class FPExtInst : public CastInst { FPExtInst *cloneImpl() const; public: + /// Constructor with insert-before-instruction semantics + FPExtInst( + Value *S, ///< The value to be extended + Type *Ty, ///< The type to extend to + const Twine &NameStr, ///< A name for the new instruction + BasicBlock::iterator InsertBefore ///< Where to insert the new instruction + ); + /// Constructor with insert-before-instruction semantics FPExtInst( Value *S, ///< The value to be extended @@ -5035,6 +5580,14 @@ class UIToFPInst : public CastInst { UIToFPInst *cloneImpl() const; public: + /// Constructor with insert-before-instruction semantics + UIToFPInst( + Value *S, ///< The value to be converted + Type *Ty, ///< The type to convert to + const Twine &NameStr, ///< A name for the new instruction + 
BasicBlock::iterator InsertBefore ///< Where to insert the new instruction + ); + /// Constructor with insert-before-instruction semantics UIToFPInst( Value *S, ///< The value to be converted @@ -5074,6 +5627,14 @@ class SIToFPInst : public CastInst { SIToFPInst *cloneImpl() const; public: + /// Constructor with insert-before-instruction semantics + SIToFPInst( + Value *S, ///< The value to be converted + Type *Ty, ///< The type to convert to + const Twine &NameStr, ///< A name for the new instruction + BasicBlock::iterator InsertBefore ///< Where to insert the new instruction + ); + /// Constructor with insert-before-instruction semantics SIToFPInst( Value *S, ///< The value to be converted @@ -5113,6 +5674,14 @@ class FPToUIInst : public CastInst { FPToUIInst *cloneImpl() const; public: + /// Constructor with insert-before-instruction semantics + FPToUIInst( + Value *S, ///< The value to be converted + Type *Ty, ///< The type to convert to + const Twine &NameStr, ///< A name for the new instruction + BasicBlock::iterator InsertBefore ///< Where to insert the new instruction + ); + /// Constructor with insert-before-instruction semantics FPToUIInst( Value *S, ///< The value to be converted @@ -5152,6 +5721,14 @@ class FPToSIInst : public CastInst { FPToSIInst *cloneImpl() const; public: + /// Constructor with insert-before-instruction semantics + FPToSIInst( + Value *S, ///< The value to be converted + Type *Ty, ///< The type to convert to + const Twine &NameStr, ///< A name for the new instruction + BasicBlock::iterator InsertBefore ///< Where to insert the new instruction + ); + /// Constructor with insert-before-instruction semantics FPToSIInst( Value *S, ///< The value to be converted @@ -5187,6 +5764,14 @@ class IntToPtrInst : public CastInst { // Note: Instruction needs to be a friend here to call cloneImpl. 
friend class Instruction; + /// Constructor with insert-before-instruction semantics + IntToPtrInst( + Value *S, ///< The value to be converted + Type *Ty, ///< The type to convert to + const Twine &NameStr, ///< A name for the new instruction + BasicBlock::iterator InsertBefore ///< Where to insert the new instruction + ); + /// Constructor with insert-before-instruction semantics IntToPtrInst( Value *S, ///< The value to be converted @@ -5234,6 +5819,14 @@ class PtrToIntInst : public CastInst { PtrToIntInst *cloneImpl() const; public: + /// Constructor with insert-before-instruction semantics + PtrToIntInst( + Value *S, ///< The value to be converted + Type *Ty, ///< The type to convert to + const Twine &NameStr, ///< A name for the new instruction + BasicBlock::iterator InsertBefore ///< Where to insert the new instruction + ); + /// Constructor with insert-before-instruction semantics PtrToIntInst( Value *S, ///< The value to be converted @@ -5285,6 +5878,14 @@ class BitCastInst : public CastInst { BitCastInst *cloneImpl() const; public: + /// Constructor with insert-before-instruction semantics + BitCastInst( + Value *S, ///< The value to be casted + Type *Ty, ///< The type to casted to + const Twine &NameStr, ///< A name for the new instruction + BasicBlock::iterator InsertBefore ///< Where to insert the new instruction + ); + /// Constructor with insert-before-instruction semantics BitCastInst( Value *S, ///< The value to be casted @@ -5325,6 +5926,14 @@ class AddrSpaceCastInst : public CastInst { AddrSpaceCastInst *cloneImpl() const; public: + /// Constructor with insert-before-instruction semantics + AddrSpaceCastInst( + Value *S, ///< The value to be casted + Type *Ty, ///< The type to casted to + const Twine &NameStr, ///< A name for the new instruction + BasicBlock::iterator InsertBefore ///< Where to insert the new instruction + ); + /// Constructor with insert-before-instruction semantics AddrSpaceCastInst( Value *S, ///< The value to be casted @@ 
-5467,6 +6076,8 @@ class FreezeInst : public UnaryInstruction { FreezeInst *cloneImpl() const; public: + explicit FreezeInst(Value *S, const Twine &NameStr, + BasicBlock::iterator InsertBefore); explicit FreezeInst(Value *S, const Twine &NameStr = "", Instruction *InsertBefore = nullptr); diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index 345b050b7077a..c54f8d7aca4a9 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -22,6 +22,16 @@ #include "llvm/IR/Type.h" using namespace llvm; +Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps, + InstListType::iterator InsertBefore) + : User(ty, Value::InstructionVal + it, Ops, NumOps), Parent(nullptr) { + + // When called with an iterator, there must be a block to insert into. + BasicBlock *BB = InsertBefore->getParent(); + assert(BB && "Instruction to insert before is not in a basic block!"); + insertInto(BB, InsertBefore); +} + Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps, Instruction *InsertBefore) : User(ty, Value::InstructionVal + it, Ops, NumOps), Parent(nullptr) { diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index fc5c9b201487e..25778570ebf34 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -230,6 +230,13 @@ bool PHINode::hasConstantOrUndefValue() const { // LandingPadInst Implementation //===----------------------------------------------------------------------===// +LandingPadInst::LandingPadInst(Type *RetTy, unsigned NumReservedValues, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) + : Instruction(RetTy, Instruction::LandingPad, nullptr, 0, InsertBefore) { + init(NumReservedValues, NameStr); +} + LandingPadInst::LandingPadInst(Type *RetTy, unsigned NumReservedValues, const Twine &NameStr, Instruction *InsertBefore) : Instruction(RetTy, Instruction::LandingPad, nullptr, 0, InsertBefore) { @@ -736,6 +743,20 @@ CallInst::CallInst(const CallInst 
&CI) SubclassOptionalData = CI.SubclassOptionalData; } +CallInst *CallInst::Create(CallInst *CI, ArrayRef OpB, + BasicBlock::iterator InsertPt) { + std::vector Args(CI->arg_begin(), CI->arg_end()); + + auto *NewCI = CallInst::Create(CI->getFunctionType(), CI->getCalledOperand(), + Args, OpB, CI->getName(), InsertPt); + NewCI->setTailCallKind(CI->getTailCallKind()); + NewCI->setCallingConv(CI->getCallingConv()); + NewCI->SubclassOptionalData = CI->SubclassOptionalData; + NewCI->setAttributes(CI->getAttributes()); + NewCI->setDebugLoc(CI->getDebugLoc()); + return NewCI; +} + CallInst *CallInst::Create(CallInst *CI, ArrayRef OpB, Instruction *InsertPt) { std::vector Args(CI->arg_begin(), CI->arg_end()); @@ -960,7 +981,17 @@ ReturnInst::ReturnInst(const ReturnInst &RI) SubclassOptionalData = RI.SubclassOptionalData; } -ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, Instruction *InsertBefore) +ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, + BasicBlock::iterator InsertBefore) + : Instruction(Type::getVoidTy(C), Instruction::Ret, + OperandTraits::op_end(this) - !!retVal, !!retVal, + InsertBefore) { + if (retVal) + Op<0>() = retVal; +} + +ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, + Instruction *InsertBefore) : Instruction(Type::getVoidTy(C), Instruction::Ret, OperandTraits::op_end(this) - !!retVal, !!retVal, InsertBefore) { @@ -990,6 +1021,12 @@ ResumeInst::ResumeInst(const ResumeInst &RI) Op<0>() = RI.Op<0>(); } +ResumeInst::ResumeInst(Value *Exn, BasicBlock::iterator InsertBefore) + : Instruction(Type::getVoidTy(Exn->getContext()), Instruction::Resume, + OperandTraits::op_begin(this), 1, InsertBefore) { + Op<0>() = Exn; +} + ResumeInst::ResumeInst(Value *Exn, Instruction *InsertBefore) : Instruction(Type::getVoidTy(Exn->getContext()), Instruction::Resume, OperandTraits::op_begin(this), 1, InsertBefore) { @@ -1027,6 +1064,16 @@ void CleanupReturnInst::init(Value *CleanupPad, BasicBlock *UnwindBB) { Op<1>() = UnwindBB; } 
+CleanupReturnInst::CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, + unsigned Values, + BasicBlock::iterator InsertBefore) + : Instruction(Type::getVoidTy(CleanupPad->getContext()), + Instruction::CleanupRet, + OperandTraits::op_end(this) - Values, + Values, InsertBefore) { + init(CleanupPad, UnwindBB); +} + CleanupReturnInst::CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, unsigned Values, Instruction *InsertBefore) : Instruction(Type::getVoidTy(CleanupPad->getContext()), @@ -1060,6 +1107,14 @@ CatchReturnInst::CatchReturnInst(const CatchReturnInst &CRI) Op<1>() = CRI.Op<1>(); } +CatchReturnInst::CatchReturnInst(Value *CatchPad, BasicBlock *BB, + BasicBlock::iterator InsertBefore) + : Instruction(Type::getVoidTy(BB->getContext()), Instruction::CatchRet, + OperandTraits::op_begin(this), 2, + InsertBefore) { + init(CatchPad, BB); +} + CatchReturnInst::CatchReturnInst(Value *CatchPad, BasicBlock *BB, Instruction *InsertBefore) : Instruction(Type::getVoidTy(BB->getContext()), Instruction::CatchRet, @@ -1179,6 +1234,16 @@ FuncletPadInst::FuncletPadInst(const FuncletPadInst &FPI) setParentPad(FPI.getParentPad()); } +FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, + ArrayRef Args, unsigned Values, + const Twine &NameStr, + BasicBlock::iterator InsertBefore) + : Instruction(ParentPad->getType(), Op, + OperandTraits::op_end(this) - Values, Values, + InsertBefore) { + init(ParentPad, Args, NameStr); +} + FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, ArrayRef Args, unsigned Values, const Twine &NameStr, Instruction *InsertBefore) @@ -1201,6 +1266,10 @@ FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, // UnreachableInst Implementation //===----------------------------------------------------------------------===// +UnreachableInst::UnreachableInst(LLVMContext &Context, + BasicBlock::iterator InsertBefore) + : Instruction(Type::getVoidTy(Context), 
Instruction::Unreachable, nullptr, + 0, InsertBefore) {} UnreachableInst::UnreachableInst(LLVMContext &Context, Instruction *InsertBefore) : Instruction(Type::getVoidTy(Context), Instruction::Unreachable, nullptr, @@ -1219,6 +1288,14 @@ void BranchInst::AssertOK() { "May only branch on boolean predicates!"); } +BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock::iterator InsertBefore) + : Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br, + OperandTraits::op_end(this) - 1, 1, + InsertBefore) { + assert(IfTrue && "Branch destination may not be null!"); + Op<-1>() = IfTrue; +} + BranchInst::BranchInst(BasicBlock *IfTrue, Instruction *InsertBefore) : Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br, OperandTraits::op_end(this) - 1, 1, @@ -1227,6 +1304,20 @@ BranchInst::BranchInst(BasicBlock *IfTrue, Instruction *InsertBefore) Op<-1>() = IfTrue; } +BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond, + BasicBlock::iterator InsertBefore) + : Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br, + OperandTraits::op_end(this) - 3, 3, + InsertBefore) { + // Assign in order of operand index to make use-list order predictable. 
+ Op<-3>() = Cond; + Op<-2>() = IfFalse; + Op<-1>() = IfTrue; +#ifndef NDEBUG + AssertOK(); +#endif +} + BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond, Instruction *InsertBefore) : Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br, @@ -1309,11 +1400,19 @@ static Align computeAllocaDefaultAlign(Type *Ty, BasicBlock *BB) { return DL.getPrefTypeAlign(Ty); } +static Align computeAllocaDefaultAlign(Type *Ty, BasicBlock::iterator It) { + return computeAllocaDefaultAlign(Ty, It->getParent()); +} + static Align computeAllocaDefaultAlign(Type *Ty, Instruction *I) { assert(I && "Insertion position cannot be null when alignment not provided!"); return computeAllocaDefaultAlign(Ty, I->getParent()); } +AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name, + BasicBlock::iterator InsertBefore) + : AllocaInst(Ty, AddrSpace, /*ArraySize=*/nullptr, Name, InsertBefore) {} + AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name, Instruction *InsertBefore) : AllocaInst(Ty, AddrSpace, /*ArraySize=*/nullptr, Name, InsertBefore) {} @@ -1322,6 +1421,12 @@ AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name, BasicBlock *InsertAtEnd) : AllocaInst(Ty, AddrSpace, /*ArraySize=*/nullptr, Name, InsertAtEnd) {} +AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, + const Twine &Name, BasicBlock::iterator InsertBefore) + : AllocaInst(Ty, AddrSpace, ArraySize, + computeAllocaDefaultAlign(Ty, InsertBefore), Name, + InsertBefore) {} + AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, const Twine &Name, Instruction *InsertBefore) : AllocaInst(Ty, AddrSpace, ArraySize, @@ -1334,6 +1439,17 @@ AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, computeAllocaDefaultAlign(Ty, InsertAtEnd), Name, InsertAtEnd) {} +AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, + Align Align, const Twine &Name, + BasicBlock::iterator 
InsertBefore) + : UnaryInstruction(PointerType::get(Ty, AddrSpace), Alloca, + getAISize(Ty->getContext(), ArraySize), InsertBefore), + AllocatedType(Ty) { + setAlignment(Align); + assert(!Ty->isVoidTy() && "Cannot allocate void!"); + setName(Name); +} + AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, Align Align, const Twine &Name, Instruction *InsertBefore) @@ -1391,11 +1507,19 @@ static Align computeLoadStoreDefaultAlign(Type *Ty, BasicBlock *BB) { return DL.getABITypeAlign(Ty); } +static Align computeLoadStoreDefaultAlign(Type *Ty, BasicBlock::iterator It) { + return computeLoadStoreDefaultAlign(Ty, It->getParent()); +} + static Align computeLoadStoreDefaultAlign(Type *Ty, Instruction *I) { assert(I && "Insertion position cannot be null when alignment not provided!"); return computeLoadStoreDefaultAlign(Ty, I->getParent()); } +LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, + BasicBlock::iterator InsertBef) + : LoadInst(Ty, Ptr, Name, /*isVolatile=*/false, InsertBef) {} + LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, Instruction *InsertBef) : LoadInst(Ty, Ptr, Name, /*isVolatile=*/false, InsertBef) {} @@ -1404,6 +1528,11 @@ LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, BasicBlock *InsertAE) : LoadInst(Ty, Ptr, Name, /*isVolatile=*/false, InsertAE) {} +LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile, + BasicBlock::iterator InsertBef) + : LoadInst(Ty, Ptr, Name, isVolatile, + computeLoadStoreDefaultAlign(Ty, InsertBef), InsertBef) {} + LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile, Instruction *InsertBef) : LoadInst(Ty, Ptr, Name, isVolatile, @@ -1414,6 +1543,11 @@ LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile, : LoadInst(Ty, Ptr, Name, isVolatile, computeLoadStoreDefaultAlign(Ty, InsertAE), InsertAE) {} +LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile, + Align Align, BasicBlock::iterator 
InsertBef) + : LoadInst(Ty, Ptr, Name, isVolatile, Align, AtomicOrdering::NotAtomic, + SyncScope::System, InsertBef) {} + LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile, Align Align, Instruction *InsertBef) : LoadInst(Ty, Ptr, Name, isVolatile, Align, AtomicOrdering::NotAtomic, @@ -1424,6 +1558,17 @@ LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile, : LoadInst(Ty, Ptr, Name, isVolatile, Align, AtomicOrdering::NotAtomic, SyncScope::System, InsertAE) {} +LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile, + Align Align, AtomicOrdering Order, SyncScope::ID SSID, + BasicBlock::iterator InsertBef) + : UnaryInstruction(Ty, Load, Ptr, InsertBef) { + setVolatile(isVolatile); + setAlignment(Align); + setAtomic(Order, SSID); + AssertOK(); + setName(Name); +} + LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile, Align Align, AtomicOrdering Order, SyncScope::ID SSID, Instruction *InsertBef) @@ -1565,6 +1710,19 @@ void AtomicCmpXchgInst::Init(Value *Ptr, Value *Cmp, Value *NewVal, "Cmp type and NewVal type must be same!"); } +AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal, + Align Alignment, + AtomicOrdering SuccessOrdering, + AtomicOrdering FailureOrdering, + SyncScope::ID SSID, + BasicBlock::iterator InsertBefore) + : Instruction( + StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())), + AtomicCmpXchg, OperandTraits::op_begin(this), + OperandTraits::operands(this), InsertBefore) { + Init(Ptr, Cmp, NewVal, Alignment, SuccessOrdering, FailureOrdering, SSID); +} + AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal, Align Alignment, AtomicOrdering SuccessOrdering, @@ -1617,6 +1775,16 @@ void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val, "AtomicRMW instructions must be atomic!"); } +AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val, + Align Alignment, AtomicOrdering Ordering, + 
SyncScope::ID SSID, + BasicBlock::iterator InsertBefore) + : Instruction(Val->getType(), AtomicRMW, + OperandTraits::op_begin(this), + OperandTraits::operands(this), InsertBefore) { + Init(Operation, Ptr, Val, Alignment, Ordering, SSID); +} + AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val, Align Alignment, AtomicOrdering Ordering, SyncScope::ID SSID, Instruction *InsertBefore) @@ -1682,6 +1850,13 @@ StringRef AtomicRMWInst::getOperationName(BinOp Op) { // FenceInst Implementation //===----------------------------------------------------------------------===// +FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering, + SyncScope::ID SSID, BasicBlock::iterator InsertBefore) + : Instruction(Type::getVoidTy(C), Fence, nullptr, 0, InsertBefore) { + setOrdering(Ordering); + setSyncScopeID(SSID); +} + FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering, SyncScope::ID SSID, Instruction *InsertBefore) @@ -1827,6 +2002,19 @@ bool GetElementPtrInst::collectOffset( // ExtractElementInst Implementation //===----------------------------------------------------------------------===// +ExtractElementInst::ExtractElementInst(Value *Val, Value *Index, + const Twine &Name, + BasicBlock::iterator InsertBef) + : Instruction( + cast(Val->getType())->getElementType(), ExtractElement, + OperandTraits::op_begin(this), 2, InsertBef) { + assert(isValidOperands(Val, Index) && + "Invalid extractelement instruction operands!"); + Op<0>() = Val; + Op<1>() = Index; + setName(Name); +} + ExtractElementInst::ExtractElementInst(Value *Val, Value *Index, const Twine &Name, Instruction *InsertBef) @@ -1866,6 +2054,20 @@ bool ExtractElementInst::isValidOperands(const Value *Val, const Value *Index) { // InsertElementInst Implementation //===----------------------------------------------------------------------===// +InsertElementInst::InsertElementInst(Value *Vec, Value *Elt, Value *Index, + const Twine &Name, + BasicBlock::iterator InsertBef) + : 
Instruction(Vec->getType(), InsertElement, + OperandTraits::op_begin(this), 3, + InsertBef) { + assert(isValidOperands(Vec, Elt, Index) && + "Invalid insertelement instruction operands!"); + Op<0>() = Vec; + Op<1>() = Elt; + Op<2>() = Index; + setName(Name); +} + InsertElementInst::InsertElementInst(Value *Vec, Value *Elt, Value *Index, const Twine &Name, Instruction *InsertBef) @@ -1917,6 +2119,11 @@ static Value *createPlaceholderForShuffleVector(Value *V) { return PoisonValue::get(V->getType()); } +ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *Mask, const Twine &Name, + BasicBlock::iterator InsertBefore) + : ShuffleVectorInst(V1, createPlaceholderForShuffleVector(V1), Mask, Name, + InsertBefore) {} + ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *Mask, const Twine &Name, Instruction *InsertBefore) : ShuffleVectorInst(V1, createPlaceholderForShuffleVector(V1), Mask, Name, @@ -1927,6 +2134,12 @@ ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *Mask, const Twine &Name, : ShuffleVectorInst(V1, createPlaceholderForShuffleVector(V1), Mask, Name, InsertAtEnd) {} +ShuffleVectorInst::ShuffleVectorInst(Value *V1, ArrayRef Mask, + const Twine &Name, + BasicBlock::iterator InsertBefore) + : ShuffleVectorInst(V1, createPlaceholderForShuffleVector(V1), Mask, Name, + InsertBefore) {} + ShuffleVectorInst::ShuffleVectorInst(Value *V1, ArrayRef Mask, const Twine &Name, Instruction *InsertBefore) @@ -1938,6 +2151,25 @@ ShuffleVectorInst::ShuffleVectorInst(Value *V1, ArrayRef Mask, : ShuffleVectorInst(V1, createPlaceholderForShuffleVector(V1), Mask, Name, InsertAtEnd) {} +ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, Value *Mask, + const Twine &Name, + BasicBlock::iterator InsertBefore) + : Instruction( + VectorType::get(cast(V1->getType())->getElementType(), + cast(Mask->getType())->getElementCount()), + ShuffleVector, OperandTraits::op_begin(this), + OperandTraits::operands(this), InsertBefore) { + assert(isValidOperands(V1, V2, Mask) && + 
"Invalid shuffle vector instruction operands!"); + + Op<0>() = V1; + Op<1>() = V2; + SmallVector MaskArr; + getShuffleMask(cast(Mask), MaskArr); + setShuffleMask(MaskArr); + setName(Name); +} + ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, Value *Mask, const Twine &Name, Instruction *InsertBefore) @@ -1975,6 +2207,22 @@ ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, Value *Mask, setName(Name); } +ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, ArrayRef Mask, + const Twine &Name, + BasicBlock::iterator InsertBefore) + : Instruction( + VectorType::get(cast(V1->getType())->getElementType(), + Mask.size(), isa(V1->getType())), + ShuffleVector, OperandTraits::op_begin(this), + OperandTraits::operands(this), InsertBefore) { + assert(isValidOperands(V1, V2, Mask) && + "Invalid shuffle vector instruction operands!"); + Op<0>() = V1; + Op<1>() = V2; + setShuffleMask(Mask); + setName(Name); +} + ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, ArrayRef Mask, const Twine &Name, Instruction *InsertBefore) @@ -2802,6 +3050,15 @@ Type *ExtractValueInst::getIndexedType(Type *Agg, // UnaryOperator Class //===----------------------------------------------------------------------===// +UnaryOperator::UnaryOperator(UnaryOps iType, Value *S, Type *Ty, + const Twine &Name, + BasicBlock::iterator InsertBefore) + : UnaryInstruction(Ty, iType, S, InsertBefore) { + Op<0>() = S; + setName(Name); + AssertOK(); +} + UnaryOperator::UnaryOperator(UnaryOps iType, Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore) @@ -2820,6 +3077,11 @@ UnaryOperator::UnaryOperator(UnaryOps iType, Value *S, AssertOK(); } +UnaryOperator *UnaryOperator::Create(UnaryOps Op, Value *S, const Twine &Name, + BasicBlock::iterator InsertBefore) { + return new UnaryOperator(Op, S, S->getType(), Name, InsertBefore); +} + UnaryOperator *UnaryOperator::Create(UnaryOps Op, Value *S, const Twine &Name, Instruction *InsertBefore) { @@ -2855,6 +3117,17 @@ void 
UnaryOperator::AssertOK() { // BinaryOperator Class //===----------------------------------------------------------------------===// +BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2, Type *Ty, + const Twine &Name, + BasicBlock::iterator InsertBefore) + : Instruction(Ty, iType, OperandTraits::op_begin(this), + OperandTraits::operands(this), InsertBefore) { + Op<0>() = S1; + Op<1>() = S2; + setName(Name); + AssertOK(); +} + BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2, Type *Ty, const Twine &Name, Instruction *InsertBefore) @@ -2965,6 +3238,13 @@ BinaryOperator *BinaryOperator::Create(BinaryOps Op, Value *S1, Value *S2, return Res; } +BinaryOperator *BinaryOperator::CreateNeg(Value *Op, const Twine &Name, + BasicBlock::iterator InsertBefore) { + Value *Zero = ConstantInt::get(Op->getType(), 0); + return new BinaryOperator(Instruction::Sub, Zero, Op, Op->getType(), Name, + InsertBefore); +} + BinaryOperator *BinaryOperator::CreateNeg(Value *Op, const Twine &Name, Instruction *InsertBefore) { Value *Zero = ConstantInt::get(Op->getType(), 0); @@ -3324,6 +3604,29 @@ unsigned CastInst::isEliminableCastPair( } } +CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty, + const Twine &Name, + BasicBlock::iterator InsertBefore) { + assert(castIsValid(op, S, Ty) && "Invalid cast!"); + // Construct and return the appropriate CastInst subclass + switch (op) { + case Trunc: return new TruncInst (S, Ty, Name, InsertBefore); + case ZExt: return new ZExtInst (S, Ty, Name, InsertBefore); + case SExt: return new SExtInst (S, Ty, Name, InsertBefore); + case FPTrunc: return new FPTruncInst (S, Ty, Name, InsertBefore); + case FPExt: return new FPExtInst (S, Ty, Name, InsertBefore); + case UIToFP: return new UIToFPInst (S, Ty, Name, InsertBefore); + case SIToFP: return new SIToFPInst (S, Ty, Name, InsertBefore); + case FPToUI: return new FPToUIInst (S, Ty, Name, InsertBefore); + case FPToSI: return new FPToSIInst (S, Ty, Name, 
InsertBefore); + case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertBefore); + case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertBefore); + case BitCast: return new BitCastInst (S, Ty, Name, InsertBefore); + case AddrSpaceCast: return new AddrSpaceCastInst (S, Ty, Name, InsertBefore); + default: llvm_unreachable("Invalid opcode provided"); + } +} + CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore) { assert(castIsValid(op, S, Ty) && "Invalid cast!"); @@ -3368,6 +3671,13 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty, } } +CastInst *CastInst::CreateZExtOrBitCast(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) { + if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits()) + return Create(Instruction::BitCast, S, Ty, Name, InsertBefore); + return Create(Instruction::ZExt, S, Ty, Name, InsertBefore); +} + CastInst *CastInst::CreateZExtOrBitCast(Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore) { @@ -3384,6 +3694,13 @@ CastInst *CastInst::CreateZExtOrBitCast(Value *S, Type *Ty, return Create(Instruction::ZExt, S, Ty, Name, InsertAtEnd); } +CastInst *CastInst::CreateSExtOrBitCast(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) { + if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits()) + return Create(Instruction::BitCast, S, Ty, Name, InsertBefore); + return Create(Instruction::SExt, S, Ty, Name, InsertBefore); +} + CastInst *CastInst::CreateSExtOrBitCast(Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore) { @@ -3400,6 +3717,13 @@ CastInst *CastInst::CreateSExtOrBitCast(Value *S, Type *Ty, return Create(Instruction::SExt, S, Ty, Name, InsertAtEnd); } +CastInst *CastInst::CreateTruncOrBitCast(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) { + if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits()) + return 
Create(Instruction::BitCast, S, Ty, Name, InsertBefore); + return Create(Instruction::Trunc, S, Ty, Name, InsertBefore); +} + CastInst *CastInst::CreateTruncOrBitCast(Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore) { @@ -3435,8 +3759,25 @@ CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty, } /// Create a BitCast or a PtrToInt cast instruction -CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty, - const Twine &Name, +CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) { + assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast"); + assert((Ty->isIntOrIntVectorTy() || Ty->isPtrOrPtrVectorTy()) && + "Invalid cast"); + assert(Ty->isVectorTy() == S->getType()->isVectorTy() && "Invalid cast"); + assert((!Ty->isVectorTy() || + cast(Ty)->getElementCount() == + cast(S->getType())->getElementCount()) && + "Invalid cast"); + + if (Ty->isIntOrIntVectorTy()) + return Create(Instruction::PtrToInt, S, Ty, Name, InsertBefore); + + return CreatePointerBitCastOrAddrSpaceCast(S, Ty, Name, InsertBefore); +} + +/// Create a BitCast or a PtrToInt cast instruction +CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore) { assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast"); assert((Ty->isIntOrIntVectorTy() || Ty->isPtrOrPtrVectorTy()) && @@ -3467,9 +3808,18 @@ CastInst *CastInst::CreatePointerBitCastOrAddrSpaceCast( } CastInst *CastInst::CreatePointerBitCastOrAddrSpaceCast( - Value *S, Type *Ty, - const Twine &Name, - Instruction *InsertBefore) { + Value *S, Type *Ty, const Twine &Name, BasicBlock::iterator InsertBefore) { + assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast"); + assert(Ty->isPtrOrPtrVectorTy() && "Invalid cast"); + + if (S->getType()->getPointerAddressSpace() != Ty->getPointerAddressSpace()) + return Create(Instruction::AddrSpaceCast, S, Ty, Name, InsertBefore); + + return Create(Instruction::BitCast, S, Ty, 
Name, InsertBefore); +} + +CastInst *CastInst::CreatePointerBitCastOrAddrSpaceCast( + Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore) { assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast"); assert(Ty->isPtrOrPtrVectorTy() && "Invalid cast"); @@ -3490,6 +3840,20 @@ CastInst *CastInst::CreateBitOrPointerCast(Value *S, Type *Ty, return Create(Instruction::BitCast, S, Ty, Name, InsertBefore); } +CastInst *CastInst::CreateIntegerCast(Value *C, Type *Ty, bool isSigned, + const Twine &Name, + BasicBlock::iterator InsertBefore) { + assert(C->getType()->isIntOrIntVectorTy() && Ty->isIntOrIntVectorTy() && + "Invalid integer cast"); + unsigned SrcBits = C->getType()->getScalarSizeInBits(); + unsigned DstBits = Ty->getScalarSizeInBits(); + Instruction::CastOps opcode = + (SrcBits == DstBits ? Instruction::BitCast : + (SrcBits > DstBits ? Instruction::Trunc : + (isSigned ? Instruction::SExt : Instruction::ZExt))); + return Create(opcode, C, Ty, Name, InsertBefore); +} + CastInst *CastInst::CreateIntegerCast(Value *C, Type *Ty, bool isSigned, const Twine &Name, Instruction *InsertBefore) { @@ -3518,6 +3882,18 @@ CastInst *CastInst::CreateIntegerCast(Value *C, Type *Ty, return Create(opcode, C, Ty, Name, InsertAtEnd); } +CastInst *CastInst::CreateFPCast(Value *C, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) { + assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() && + "Invalid cast"); + unsigned SrcBits = C->getType()->getScalarSizeInBits(); + unsigned DstBits = Ty->getScalarSizeInBits(); + Instruction::CastOps opcode = + (SrcBits == DstBits ? Instruction::BitCast : + (SrcBits > DstBits ? 
Instruction::FPTrunc : Instruction::FPExt)); + return Create(opcode, C, Ty, Name, InsertBefore); +} + CastInst *CastInst::CreateFPCast(Value *C, Type *Ty, const Twine &Name, Instruction *InsertBefore) { @@ -3809,6 +4185,12 @@ CastInst::castIsValid(Instruction::CastOps op, Type *SrcTy, Type *DstTy) { } } +TruncInst::TruncInst(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) + : CastInst(Ty, Trunc, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal Trunc"); +} + TruncInst::TruncInst( Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore ) : CastInst(Ty, Trunc, S, Name, InsertBefore) { @@ -3821,6 +4203,12 @@ TruncInst::TruncInst( assert(castIsValid(getOpcode(), S, Ty) && "Illegal Trunc"); } +ZExtInst::ZExtInst(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) + : CastInst(Ty, ZExt, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal ZExt"); +} + ZExtInst::ZExtInst( Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore ) : CastInst(Ty, ZExt, S, Name, InsertBefore) { @@ -3832,6 +4220,13 @@ ZExtInst::ZExtInst( ) : CastInst(Ty, ZExt, S, Name, InsertAtEnd) { assert(castIsValid(getOpcode(), S, Ty) && "Illegal ZExt"); } + +SExtInst::SExtInst(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) + : CastInst(Ty, SExt, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal SExt"); +} + SExtInst::SExtInst( Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore ) : CastInst(Ty, SExt, S, Name, InsertBefore) { @@ -3844,6 +4239,12 @@ SExtInst::SExtInst( assert(castIsValid(getOpcode(), S, Ty) && "Illegal SExt"); } +FPTruncInst::FPTruncInst(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) + : CastInst(Ty, FPTrunc, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPTrunc"); +} + FPTruncInst::FPTruncInst( Value *S, Type *Ty, const Twine &Name, 
Instruction *InsertBefore ) : CastInst(Ty, FPTrunc, S, Name, InsertBefore) { @@ -3856,6 +4257,12 @@ FPTruncInst::FPTruncInst( assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPTrunc"); } +FPExtInst::FPExtInst(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) + : CastInst(Ty, FPExt, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPExt"); +} + FPExtInst::FPExtInst( Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore ) : CastInst(Ty, FPExt, S, Name, InsertBefore) { @@ -3868,6 +4275,12 @@ FPExtInst::FPExtInst( assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPExt"); } +UIToFPInst::UIToFPInst(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) + : CastInst(Ty, UIToFP, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal UIToFP"); +} + UIToFPInst::UIToFPInst( Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore ) : CastInst(Ty, UIToFP, S, Name, InsertBefore) { @@ -3880,6 +4293,12 @@ UIToFPInst::UIToFPInst( assert(castIsValid(getOpcode(), S, Ty) && "Illegal UIToFP"); } +SIToFPInst::SIToFPInst(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) + : CastInst(Ty, SIToFP, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal SIToFP"); +} + SIToFPInst::SIToFPInst( Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore ) : CastInst(Ty, SIToFP, S, Name, InsertBefore) { @@ -3892,6 +4311,12 @@ SIToFPInst::SIToFPInst( assert(castIsValid(getOpcode(), S, Ty) && "Illegal SIToFP"); } +FPToUIInst::FPToUIInst(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) + : CastInst(Ty, FPToUI, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToUI"); +} + FPToUIInst::FPToUIInst( Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore ) : CastInst(Ty, FPToUI, S, Name, InsertBefore) { @@ -3904,6 +4329,12 @@ FPToUIInst::FPToUIInst( 
assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToUI"); } +FPToSIInst::FPToSIInst(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) + : CastInst(Ty, FPToSI, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToSI"); +} + FPToSIInst::FPToSIInst( Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore ) : CastInst(Ty, FPToSI, S, Name, InsertBefore) { @@ -3916,6 +4347,12 @@ FPToSIInst::FPToSIInst( assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToSI"); } +PtrToIntInst::PtrToIntInst(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) + : CastInst(Ty, PtrToInt, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt"); +} + PtrToIntInst::PtrToIntInst( Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore ) : CastInst(Ty, PtrToInt, S, Name, InsertBefore) { @@ -3928,6 +4365,12 @@ PtrToIntInst::PtrToIntInst( assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt"); } +IntToPtrInst::IntToPtrInst(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) + : CastInst(Ty, IntToPtr, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal IntToPtr"); +} + IntToPtrInst::IntToPtrInst( Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore ) : CastInst(Ty, IntToPtr, S, Name, InsertBefore) { @@ -3940,6 +4383,12 @@ IntToPtrInst::IntToPtrInst( assert(castIsValid(getOpcode(), S, Ty) && "Illegal IntToPtr"); } +BitCastInst::BitCastInst(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) + : CastInst(Ty, BitCast, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal BitCast"); +} + BitCastInst::BitCastInst( Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore ) : CastInst(Ty, BitCast, S, Name, InsertBefore) { @@ -3952,6 +4401,12 @@ BitCastInst::BitCastInst( assert(castIsValid(getOpcode(), S, Ty) && "Illegal BitCast"); } 
+AddrSpaceCastInst::AddrSpaceCastInst(Value *S, Type *Ty, const Twine &Name, + BasicBlock::iterator InsertBefore) + : CastInst(Ty, AddrSpaceCast, S, Name, InsertBefore) { + assert(castIsValid(getOpcode(), S, Ty) && "Illegal AddrSpaceCast"); +} + AddrSpaceCastInst::AddrSpaceCastInst( Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore ) : CastInst(Ty, AddrSpaceCast, S, Name, InsertBefore) { @@ -3968,6 +4423,19 @@ AddrSpaceCastInst::AddrSpaceCastInst( // CmpInst Classes //===----------------------------------------------------------------------===// +CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS, + Value *RHS, const Twine &Name, + BasicBlock::iterator InsertBefore, Instruction *FlagsSource) + : Instruction(ty, op, OperandTraits::op_begin(this), + OperandTraits::operands(this), InsertBefore) { + Op<0>() = LHS; + Op<1>() = RHS; + setPredicate((Predicate)predicate); + setName(Name); + if (FlagsSource) + copyIRFlags(FlagsSource); +} + CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS, Value *RHS, const Twine &Name, Instruction *InsertBefore, Instruction *FlagsSource) @@ -4471,6 +4939,17 @@ void SwitchInst::init(Value *Value, BasicBlock *Default, unsigned NumReserved) { Op<1>() = Default; } +/// SwitchInst ctor - Create a new switch instruction, specifying a value to +/// switch on and a default destination. The number of additional cases can +/// be specified here to make memory allocation more efficient. This +/// constructor can also autoinsert before another instruction. +SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases, + BasicBlock::iterator InsertBefore) + : Instruction(Type::getVoidTy(Value->getContext()), Instruction::Switch, + nullptr, 0, InsertBefore) { + init(Value, Default, 2 + NumCases * 2); +} + /// SwitchInst ctor - Create a new switch instruction, specifying a value to /// switch on and a default destination. 
The number of additional cases can /// be specified here to make memory allocation more efficient. This @@ -4693,6 +5172,13 @@ void IndirectBrInst::growOperands() { growHungoffUses(ReservedSpace); } +IndirectBrInst::IndirectBrInst(Value *Address, unsigned NumCases, + BasicBlock::iterator InsertBefore) + : Instruction(Type::getVoidTy(Address->getContext()), + Instruction::IndirectBr, nullptr, 0, InsertBefore) { + init(Address, NumCases); +} + IndirectBrInst::IndirectBrInst(Value *Address, unsigned NumCases, Instruction *InsertBefore) : Instruction(Type::getVoidTy(Address->getContext()), @@ -4750,6 +5236,12 @@ void IndirectBrInst::removeDestination(unsigned idx) { // FreezeInst Implementation //===----------------------------------------------------------------------===// +FreezeInst::FreezeInst(Value *S, const Twine &Name, + BasicBlock::iterator InsertBefore) + : UnaryInstruction(S->getType(), Freeze, S, InsertBefore) { + setName(Name); +} + FreezeInst::FreezeInst(Value *S, const Twine &Name, Instruction *InsertBefore) : UnaryInstruction(S->getType(), Freeze, S, InsertBefore) { From f290c000d87bfc72a31b151dffa2d190596ebe91 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 26 Feb 2024 12:35:59 +0000 Subject: [PATCH 305/546] [TBAA] Add additional bitfield tests. Additional test for https://github.com/llvm/llvm-project/pull/82922/. 
--- clang/test/CodeGen/tbaa-struct.cpp | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/clang/test/CodeGen/tbaa-struct.cpp b/clang/test/CodeGen/tbaa-struct.cpp index ff5521fcf3f60..e25fbc1a77810 100644 --- a/clang/test/CodeGen/tbaa-struct.cpp +++ b/clang/test/CodeGen/tbaa-struct.cpp @@ -109,8 +109,6 @@ struct NamedBitfields { double f3; }; -NamedBitfields g; - void copy8(NamedBitfields *a1, NamedBitfields *a2) { // CHECK-LABEL: _Z5copy8P14NamedBitfieldsS0_ // CHECK: tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) %a1, ptr noundef nonnull align 8 dereferenceable(16) %a2, i64 16, i1 false), @@ -119,6 +117,23 @@ void copy8(NamedBitfields *a1, NamedBitfields *a2) { *a1 = *a2; } +struct NamedBitfields2 { + char a, b, c; + signed f0 : 3; + unsigned f1 : 4; + char f2 : 7; + double f3; + unsigned f4 : 4; +}; + +void copy9(NamedBitfields2 *a1, NamedBitfields2 *a2) { +// CHECK-LABEL: _Z5copy9P15NamedBitfields2S0_ +// CHECK: tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) %a1, ptr noundef nonnull align 8 dereferenceable(24) %a2, i64 24, i1 false), +// CHECK-OLD-SAME: !tbaa.struct [[TS7:!.*]] +// CHECK-NEW-SAME: !tbaa [[TAG_NamedBitfields2:!.+]], !tbaa.struct + *a1 = *a2; +} + // CHECK-OLD: [[TS]] = !{i64 0, i64 2, !{{.*}}, i64 4, i64 4, !{{.*}}, i64 8, i64 1, !{{.*}}, i64 12, i64 4, !{{.*}}} // CHECK-OLD: [[CHAR:!.*]] = !{!"omnipotent char", !{{.*}}} // CHECK-OLD: [[TAG_INT:!.*]] = !{[[INT:!.*]], [[INT]], i64 0} @@ -133,6 +148,7 @@ void copy8(NamedBitfields *a1, NamedBitfields *a2) { // CHECK-OLD: [[TS6]] = !{i64 0, i64 4, [[TAG_INT]], i64 1, i64 4, [[TAG_INT]], i64 2, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE:!.+]]} // CHECK-OLD: [[TAG_DOUBLE]] = !{[[DOUBLE:!.+]], [[DOUBLE]], i64 0} // CHECK-OLD [[DOUBLE]] = !{!"double", [[CHAR]], i64 0} +// CHECK-OLD: [[TS7]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 3, 
i64 4, [[TAG_INT]], i64 3, i64 4, [[TAG_INT]], i64 4, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]], i64 16, i64 4, [[TAG_INT]]} // CHECK-NEW-DAG: [[TYPE_char:!.*]] = !{{{.*}}, i64 1, !"omnipotent char"} // CHECK-NEW-DAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0, i64 0} @@ -150,3 +166,5 @@ void copy8(NamedBitfields *a1, NamedBitfields *a2) { // CHECK-NEW-DAG: [[TAG_NamedBitfields]] = !{[[TYPE_NamedBitfields:!.+]], [[TYPE_NamedBitfields]], i64 0, i64 16} // CHECK-NEW-DAG: [[TYPE_NamedBitfields]] = !{[[TYPE_char]], i64 16, !"_ZTS14NamedBitfields", [[TYPE_int]], i64 0, i64 4, [[TYPE_int]], i64 1, i64 4, [[TYPE_char]], i64 2, i64 1, [[TYPE_double:!.+]], i64 8, i64 8} // CHECK-NEW-DAG: [[TYPE_double]] = !{[[TYPE_char]], i64 8, !"double"} +// CHECK-NEW-DAG: [[TAG_NamedBitfields2]] = !{[[TYPE_NamedBitfields2:!.+]], [[TYPE_NamedBitfields2]], i64 0, i64 24} +// CHECK-NEW-DAG: [[TYPE_NamedBitfields2]] = !{[[TYPE_char]], i64 24, !"_ZTS15NamedBitfields2", [[TYPE_char]], i64 0, i64 1, [[TYPE_char]], i64 1, i64 1, [[TYPE_char]], i64 2, i64 1, [[TYPE_int]], i64 3, i64 4, [[TYPE_int]], i64 3, i64 4, [[TYPE_char]], i64 4, i64 1, [[TYPE_double]], i64 8, i64 8, [[TYPE_int]], i64 16, i64 4} From 433f8e741e7d4a5b7dad3e078dd847efa6afee5e Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Mon, 26 Feb 2024 13:46:13 +0100 Subject: [PATCH 306/546] MachineSSAUpdater: use all vreg attributes instead of reg class only (#78431) When initializing MachineSSAUpdater save all attributes of current virtual register and create new virtual registers with same attributes. Now new virtual registers have same both register class or bank and LLT. Previously new virtual registers had same register class but LLT was not set (LLT was set to default/empty LLT). Required by GlobalISel for AMDGPU, new 'lane mask' virtual registers created by MachineSSAUpdater need to have both register class and LLT. 
patch 4 from: https://github.com/llvm/llvm-project/pull/73337 --- llvm/include/llvm/CodeGen/MachineSSAUpdater.h | 6 +- llvm/lib/CodeGen/MachineSSAUpdater.cpp | 44 ++-- ...-divergent-i1-phis-no-lane-mask-merging.ll | 99 +++++---- ...divergent-i1-phis-no-lane-mask-merging.mir | 16 +- ...vergence-divergent-i1-used-outside-loop.ll | 198 ++++++++++++------ ...ergence-divergent-i1-used-outside-loop.mir | 112 +++++----- .../GlobalISel/divergence-structurizer.ll | 117 ++++++++--- .../GlobalISel/divergence-structurizer.mir | 100 +++++---- .../divergence-temporal-divergent-i1.ll | 80 ++++--- .../divergence-temporal-divergent-i1.mir | 24 +-- .../GlobalISel/divergent-control-flow.ll | 43 ++-- 11 files changed, 480 insertions(+), 359 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineSSAUpdater.h b/llvm/include/llvm/CodeGen/MachineSSAUpdater.h index bbd09d7d151ba..3305e90f696d4 100644 --- a/llvm/include/llvm/CodeGen/MachineSSAUpdater.h +++ b/llvm/include/llvm/CodeGen/MachineSSAUpdater.h @@ -13,6 +13,7 @@ #ifndef LLVM_CODEGEN_MACHINESSAUPDATER_H #define LLVM_CODEGEN_MACHINESSAUPDATER_H +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Register.h" namespace llvm { @@ -40,8 +41,8 @@ class MachineSSAUpdater { //typedef DenseMap AvailableValsTy; void *AV = nullptr; - /// VRC - Register class of the current virtual register. - const TargetRegisterClass *VRC = nullptr; + /// Register class or bank and LLT of current virtual register. + MachineRegisterInfo::VRegAttrs RegAttrs; /// InsertedPHIs - If this is non-null, the MachineSSAUpdater adds all PHI /// nodes that it creates to the vector. @@ -62,7 +63,6 @@ class MachineSSAUpdater { /// Initialize - Reset this object to get ready for a new set of SSA /// updates. void Initialize(Register V); - void Initialize(const TargetRegisterClass *RC); /// AddAvailableValue - Indicate that a rewritten value is available at the /// end of the specified block with the specified value. 
diff --git a/llvm/lib/CodeGen/MachineSSAUpdater.cpp b/llvm/lib/CodeGen/MachineSSAUpdater.cpp index 48076663ddf53..8d6ea9c488da5 100644 --- a/llvm/lib/CodeGen/MachineSSAUpdater.cpp +++ b/llvm/lib/CodeGen/MachineSSAUpdater.cpp @@ -51,17 +51,13 @@ MachineSSAUpdater::~MachineSSAUpdater() { /// Initialize - Reset this object to get ready for a new set of SSA /// updates. -void MachineSSAUpdater::Initialize(const TargetRegisterClass *RC) { +void MachineSSAUpdater::Initialize(Register V) { if (!AV) AV = new AvailableValsTy(); else getAvailableVals(AV).clear(); - VRC = RC; -} - -void MachineSSAUpdater::Initialize(Register V) { - Initialize(MRI->getRegClass(V)); + RegAttrs = MRI->getVRegAttrs(V); } /// HasValueForBlock - Return true if the MachineSSAUpdater already has a value for @@ -115,13 +111,12 @@ Register LookForIdenticalPHI(MachineBasicBlock *BB, /// InsertNewDef - Insert an empty PHI or IMPLICIT_DEF instruction which define /// a value of the given register class at the start of the specified basic /// block. It returns the virtual register defined by the instruction. -static -MachineInstrBuilder InsertNewDef(unsigned Opcode, - MachineBasicBlock *BB, MachineBasicBlock::iterator I, - const TargetRegisterClass *RC, - MachineRegisterInfo *MRI, - const TargetInstrInfo *TII) { - Register NewVR = MRI->createVirtualRegister(RC); +static MachineInstrBuilder InsertNewDef(unsigned Opcode, MachineBasicBlock *BB, + MachineBasicBlock::iterator I, + MachineRegisterInfo::VRegAttrs RegAttrs, + MachineRegisterInfo *MRI, + const TargetInstrInfo *TII) { + Register NewVR = MRI->createVirtualRegister(RegAttrs); return BuildMI(*BB, I, DebugLoc(), TII->get(Opcode), NewVR); } @@ -158,9 +153,9 @@ Register MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB, if (ExistingValueOnly) return Register(); // Insert an implicit_def to represent an undef value. 
- MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF, - BB, BB->getFirstTerminator(), - VRC, MRI, TII); + MachineInstr *NewDef = + InsertNewDef(TargetOpcode::IMPLICIT_DEF, BB, BB->getFirstTerminator(), + RegAttrs, MRI, TII); return NewDef->getOperand(0).getReg(); } @@ -197,8 +192,8 @@ Register MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB, // Otherwise, we do need a PHI: insert one now. MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->begin(); - MachineInstrBuilder InsertedPHI = InsertNewDef(TargetOpcode::PHI, BB, - Loc, VRC, MRI, TII); + MachineInstrBuilder InsertedPHI = + InsertNewDef(TargetOpcode::PHI, BB, Loc, RegAttrs, MRI, TII); // Fill in all the predecessors of the PHI. for (unsigned i = 0, e = PredValues.size(); i != e; ++i) @@ -300,10 +295,9 @@ class SSAUpdaterTraits { static Register GetUndefVal(MachineBasicBlock *BB, MachineSSAUpdater *Updater) { // Insert an implicit_def to represent an undef value. - MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF, - BB, BB->getFirstNonPHI(), - Updater->VRC, Updater->MRI, - Updater->TII); + MachineInstr *NewDef = + InsertNewDef(TargetOpcode::IMPLICIT_DEF, BB, BB->getFirstNonPHI(), + Updater->RegAttrs, Updater->MRI, Updater->TII); return NewDef->getOperand(0).getReg(); } @@ -312,9 +306,9 @@ class SSAUpdaterTraits { static Register CreateEmptyPHI(MachineBasicBlock *BB, unsigned NumPreds, MachineSSAUpdater *Updater) { MachineBasicBlock::iterator Loc = BB->empty() ? 
BB->end() : BB->begin(); - MachineInstr *PHI = InsertNewDef(TargetOpcode::PHI, BB, Loc, - Updater->VRC, Updater->MRI, - Updater->TII); + MachineInstr *PHI = + InsertNewDef(TargetOpcode::PHI, BB, Loc, Updater->RegAttrs, + Updater->MRI, Updater->TII); return PHI->getOperand(0).getReg(); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index 06a8f80e6aa34..0f70c1996d6e0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s -; REQUIRES: do-not-run-me ; Divergent phis that don't require lowering using lane mask merging @@ -66,15 +65,16 @@ exit: define amdgpu_ps void @divergent_i1_phi_uniform_branch_simple(ptr addrspace(1) %out, i32 %tid, i32 inreg %cond) { ; GFX10-LABEL: divergent_i1_phi_uniform_branch_simple: ; GFX10: ; %bb.0: ; %A +; GFX10-NEXT: v_cmp_le_u32_e64 s1, 6, v2 ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 -; GFX10-NEXT: s_cbranch_scc0 .LBB1_2 -; GFX10-NEXT: ; %bb.1: -; GFX10-NEXT: v_cmp_le_u32_e64 s0, 6, v2 -; GFX10-NEXT: s_branch .LBB1_3 -; GFX10-NEXT: .LBB1_2: ; %B -; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 1, v2 -; GFX10-NEXT: .LBB1_3: ; %exit -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0 +; GFX10-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX10-NEXT: ; %bb.1: ; %B +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: s_andn2_b32 s0, s1, exec_lo +; GFX10-NEXT: s_and_b32 s1, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s1, s0, s1 +; GFX10-NEXT: .LBB1_2: ; %exit +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s1 ; GFX10-NEXT: 
v_add_nc_u32_e32 v2, 2, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -101,23 +101,27 @@ define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) { ; GFX10-LABEL: divergent_i1_phi_used_inside_loop: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 1 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v4, s5 +; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: .LBB2_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4 ; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 1, v3 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo +; GFX10-NEXT: s_and_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s6, s6, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -144,44 +148,49 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_mov_b32_e32 v8, 0x3e8 -; GFX10-NEXT: v_mov_b32_e32 v9, s4 -; GFX10-NEXT: 
v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x3e8 +; GFX10-NEXT: v_mov_b32_e32 v8, s5 +; GFX10-NEXT: ; implicit-def: $sgpr6 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo ; GFX10-NEXT: s_branch .LBB3_2 ; GFX10-NEXT: .LBB3_1: ; %loop_body ; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v10, v9 -; GFX10-NEXT: v_xor_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v0 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v9, v8 +; GFX10-NEXT: s_xor_b32 s4, s4, -1 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo +; GFX10-NEXT: s_and_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s6, s6, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execz .LBB3_6 ; GFX10-NEXT: .LBB3_2: ; %loop_start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v9 -; GFX10-NEXT: s_mov_b32 s5, 1 +; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8 +; GFX10-NEXT: s_mov_b32 s7, 1 +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v9 ; GFX10-NEXT: s_cbranch_vccz .LBB3_4 ; GFX10-NEXT: ; %bb.3: ; %else ; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: flat_store_dword v[6:7], v8 +; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: flat_store_dword v[6:7], v1 ; GFX10-NEXT: .LBB3_4: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX10-NEXT: s_xor_b32 s5, s5, 1 -; GFX10-NEXT: s_and_b32 s5, s5, 1 -; GFX10-NEXT: s_cmp_lg_u32 s5, 0 +; GFX10-NEXT: s_xor_b32 s7, s7, 1 +; GFX10-NEXT: s_and_b32 s7, s7, 1 +; GFX10-NEXT: s_cmp_lg_u32 s7, 0 ; GFX10-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10-NEXT: ; %bb.5: ; %if ; 
GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX10-NEXT: flat_store_dword v[4:5], v8 +; GFX10-NEXT: flat_store_dword v[4:5], v1 ; GFX10-NEXT: s_branch .LBB3_1 ; GFX10-NEXT: .LBB3_6: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[2:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -221,8 +230,8 @@ exit: define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 x i32> inreg %.WorkgroupId, <3 x i32> %.LocalInvocationId) #0 { ; GFX10-LABEL: single_lane_execution_attribute: ; GFX10: ; %bb.0: ; %.entry -; GFX10-NEXT: s_mov_b32 s12, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_mov_b32 s12, 0 ; GFX10-NEXT: s_mov_b32 s13, -1 ; GFX10-NEXT: s_mov_b32 s2, s0 ; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], s[12:13] @@ -230,7 +239,6 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v1, -1, 0 ; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s2, 1 ; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 @@ -257,13 +265,12 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 ; GFX10-NEXT: s_cbranch_vccnz .LBB4_2 ; GFX10-NEXT: ; %bb.3: ; %.preheader._crit_edge ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX10-NEXT: s_mov_b32 s13, 0 ; GFX10-NEXT: s_or_b32 s2, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: .LBB4_4: ; %Flow -; GFX10-NEXT: s_and_b32 s2, s2, 1 -; GFX10-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10-NEXT: s_cbranch_scc0 .LBB4_6 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s13 +; 
GFX10-NEXT: s_cbranch_vccz .LBB4_6 ; GFX10-NEXT: ; %bb.5: ; %.19 ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 ; GFX10-NEXT: v_or_b32_e32 v3, 2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir index 55f22b0bbb4df..5549c89dc402f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir @@ -1,10 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s -# Test is updated but copies between S1-register-with-reg-class and -# register-with-reg-class-no-LLT fail machine verification -# REQUIRES: do-not-run-me-with-machine-verifier - --- | define void @divergent_i1_phi_uniform_branch() {ret void} define void @divergent_i1_phi_uniform_branch_simple() {ret void} @@ -147,8 +143,8 @@ body: | ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C4]], [[C3]] @@ -206,11 +202,11 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; 
GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]] ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) @@ -301,11 +297,11 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %39(s1), %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %39(s1), %bb.5 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %19(s1), %bb.5 - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI2]](s32), [[C3]] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index 808c2c2ded201..e9df20f9688e6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ 
-1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s -; REQUIRES: do-not-run-me ; This file contains various tests that have divergent i1s used outside of ; the loop. These are lane masks is sgpr and need to have correct value in @@ -17,22 +16,29 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val, ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX10-NEXT: s_andn2_b32 s5, s4, exec_lo +; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s6, s5, s6 +; GFX10-NEXT: ; implicit-def: $sgpr5 ; GFX10-NEXT: .LBB0_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v6, v1 -; GFX10-NEXT: v_mov_b32_e32 v5, v4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v1 +; GFX10-NEXT: s_xor_b32 s7, s6, -1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0 -; GFX10-NEXT: v_xor_b32_e32 v4, 1, v5 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 s8, s6, exec_lo +; GFX10-NEXT: s_and_b32 s7, exec_lo, s7 +; GFX10-NEXT: s_andn2_b32 s5, s5, exec_lo +; GFX10-NEXT: s_and_b32 s6, exec_lo, s6 +; GFX10-NEXT: s_or_b32 s7, s8, s7 +; GFX10-NEXT: s_or_b32 s5, s5, s6 +; GFX10-NEXT: s_mov_b32 s6, s7 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s5 ; GFX10-NEXT: flat_store_dword v[2:3], v0 ; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -60,8 +66,11 @@ define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s4, -1 -; GFX10-NEXT: v_mov_b32_e32 v5, 1 +; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_andn2_b32 s5, s4, exec_lo +; GFX10-NEXT: s_and_b32 s4, exec_lo, -1 +; GFX10-NEXT: s_or_b32 s4, s5, s4 ; GFX10-NEXT: s_branch .LBB1_2 ; GFX10-NEXT: .LBB1_1: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1 @@ -70,20 +79,26 @@ define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr ; GFX10-NEXT: v_add_co_u32 v1, s4, v1, 4 ; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s4, 0, v2, s4 ; GFX10-NEXT: v_cmp_le_i32_e32 vcc_lo, 10, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 +; GFX10-NEXT: s_andn2_b32 s7, s5, exec_lo +; GFX10-NEXT: s_and_b32 s8, exec_lo, s6 +; GFX10-NEXT: s_or_b32 s4, s7, s8 ; GFX10-NEXT: s_cbranch_vccz .LBB1_4 ; GFX10-NEXT: .LBB1_2: ; %loop.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v5 -; GFX10-NEXT: s_mov_b32 s6, s5 +; GFX10-NEXT: s_mov_b32 s5, s4 +; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo +; GFX10-NEXT: s_and_b32 s6, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s6, s4, s6 ; GFX10-NEXT: s_and_saveexec_b32 s4, s5 ; GFX10-NEXT: s_cbranch_execz .LBB1_1 ; GFX10-NEXT: ; %bb.3: ; %is.eq.zero ; GFX10-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GFX10-NEXT: global_load_dword v5, v[1:2], off +; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v5 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 +; GFX10-NEXT: s_and_b32 s7, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s6, s6, s7 ; GFX10-NEXT: s_branch .LBB1_1 ; GFX10-NEXT: .LBB1_4: ; %exit ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s5 @@ -123,23 +138,28 @@ define void 
@divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val, ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1 -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: ; implicit-def: $sgpr6 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX10-NEXT: .LBB2_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4 -; GFX10-NEXT: v_xor_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4 -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4 +; GFX10-NEXT: v_cmp_gt_f32_e64 s4, v5, v0 +; GFX10-NEXT: s_xor_b32 s7, vcc_lo, -1 +; GFX10-NEXT: s_or_b32 s5, s4, s5 +; GFX10-NEXT: v_mov_b32_e32 v4, s7 +; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo +; GFX10-NEXT: s_and_b32 s6, exec_lo, s7 +; GFX10-NEXT: s_or_b32 s6, s4, s6 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v1 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[2:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -177,44 +197,59 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: 
s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s6, 1 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB3_6 ; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader ; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: ; implicit-def: $sgpr6 +; GFX10-NEXT: ; implicit-def: $sgpr7 +; GFX10-NEXT: ; implicit-def: $sgpr8 ; GFX10-NEXT: s_branch .LBB3_3 ; GFX10-NEXT: .LBB3_2: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GFX10-NEXT: s_xor_b32 s7, s7, 1 -; GFX10-NEXT: s_and_b32 s6, exec_lo, s6 -; GFX10-NEXT: s_or_b32 s5, s6, s5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX10-NEXT: s_xor_b32 s9, s8, -1 +; GFX10-NEXT: s_and_b32 s10, exec_lo, s7 +; GFX10-NEXT: s_or_b32 s5, s10, s5 +; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo +; GFX10-NEXT: s_and_b32 s9, exec_lo, s9 +; GFX10-NEXT: s_or_b32 s6, s6, s9 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execz .LBB3_5 ; GFX10-NEXT: .LBB3_3: ; %loop.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: s_mov_b32 s7, 1 +; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo +; GFX10-NEXT: s_and_b32 s9, exec_lo, -1 +; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo +; GFX10-NEXT: s_or_b32 s8, s8, s9 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6] +; GFX10-NEXT: s_or_b32 s7, s7, s9 ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo ; GFX10-NEXT: global_load_dword v6, v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 -; GFX10-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s9, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB3_2 ; GFX10-NEXT: ; %bb.4: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v5 -; GFX10-NEXT: v_cmp_lt_i32_e64 s6, v5, v0 -; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0 +; 
GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo +; GFX10-NEXT: s_and_b32 s10, exec_lo, 0 +; GFX10-NEXT: s_andn2_b32 s7, s7, exec_lo ; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: s_and_b32 s11, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s8, s8, s10 +; GFX10-NEXT: s_or_b32 s7, s7, s11 ; GFX10-NEXT: s_branch .LBB3_2 ; GFX10-NEXT: .LBB3_5: ; %loop.exit.guard ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_and_b32 s5, 1, s7 -; GFX10-NEXT: v_cmp_ne_u32_e64 s6, 0, s5 +; GFX10-NEXT: s_andn2_b32 s5, -1, exec_lo +; GFX10-NEXT: s_and_b32 s6, exec_lo, s6 +; GFX10-NEXT: s_or_b32 s6, s5, s6 ; GFX10-NEXT: .LBB3_6: ; %Flow1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s6 @@ -266,21 +301,24 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_branch .LBB4_2 ; GFX10-NEXT: .LBB4_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 -; GFX10-NEXT: s_and_b32 s4, 1, s6 -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_and_b32 s4, exec_lo, s7 ; GFX10-NEXT: s_or_b32 s5, s4, s5 +; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo +; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s6, s4, s6 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execz .LBB4_6 ; GFX10-NEXT: .LBB4_2: ; %cond.block.0 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_mov_b32_e32 v4, v5 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4 -; GFX10-NEXT: s_and_saveexec_b32 s6, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB4_4 ; GFX10-NEXT: ; %bb.3: ; %if.block.0 ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1 @@ -292,20 +330,22 @@ define void 
@divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace ; GFX10-NEXT: .LBB4_4: ; %loop.break.block ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4 -; GFX10-NEXT: s_mov_b32 s6, 1 +; GFX10-NEXT: s_mov_b32 s7, 1 ; GFX10-NEXT: ; implicit-def: $vgpr5 -; GFX10-NEXT: s_and_saveexec_b32 s7, s4 +; GFX10-NEXT: s_and_saveexec_b32 s8, s4 ; GFX10-NEXT: s_cbranch_execz .LBB4_1 ; GFX10-NEXT: ; %bb.5: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v4 -; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo +; GFX10-NEXT: s_and_b32 s7, exec_lo, 0 +; GFX10-NEXT: s_or_b32 s7, s4, s7 ; GFX10-NEXT: s_branch .LBB4_1 ; GFX10-NEXT: .LBB4_6: ; %cond.block.1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s4, s6 ; GFX10-NEXT: s_cbranch_execz .LBB4_8 ; GFX10-NEXT: ; %bb.7: ; %if.block.1 ; GFX10-NEXT: global_store_dword v[6:7], v4, off @@ -370,34 +410,44 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa ; GFX10-LABEL: divergent_i1_freeze_used_outside_loop: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: v_mov_b32_e32 v6, 1 +; GFX10-NEXT: s_mov_b32 s3, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, s0 +; GFX10-NEXT: ; implicit-def: $sgpr1 +; GFX10-NEXT: ; implicit-def: $sgpr2 ; GFX10-NEXT: s_branch .LBB5_2 ; GFX10-NEXT: .LBB5_1: ; %loop.cond ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: v_add_nc_u32_e32 v7, 1, v5 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s1 -; GFX10-NEXT: v_mov_b32_e32 v5, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v5 ; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; 
GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo +; GFX10-NEXT: s_and_b32 s4, exec_lo, s2 +; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo +; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_or_b32 s1, s1, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB5_4 ; GFX10-NEXT: .LBB5_2: ; %loop.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v6 -; GFX10-NEXT: s_and_saveexec_b32 s2, s1 +; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo +; GFX10-NEXT: s_and_b32 s4, exec_lo, s3 +; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: s_and_saveexec_b32 s4, s3 ; GFX10-NEXT: s_cbranch_execz .LBB5_1 ; GFX10-NEXT: ; %bb.3: ; %is.eq.zero ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo ; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6] ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v1, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo ; GFX10-NEXT: global_load_dword v6, v[6:7], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v6 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v6 +; GFX10-NEXT: s_and_b32 s3, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: ; implicit-def: $sgpr3 ; GFX10-NEXT: s_branch .LBB5_1 ; GFX10-NEXT: .LBB5_4: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -436,40 +486,52 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-LABEL: loop_with_1break: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: ; implicit-def: $sgpr1 +; GFX10-NEXT: ; implicit-def: $sgpr2 +; GFX10-NEXT: ; implicit-def: $sgpr3 ; GFX10-NEXT: v_mov_b32_e32 v6, s0 ; GFX10-NEXT: s_branch .LBB6_2 ; GFX10-NEXT: .LBB6_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-NEXT: s_and_b32 s1, exec_lo, s2 -; GFX10-NEXT: s_or_b32 
s0, s1, s0 -; GFX10-NEXT: s_and_b32 s1, 1, s3 -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, s1 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_and_b32 s4, exec_lo, s2 +; GFX10-NEXT: s_or_b32 s0, s4, s0 +; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo +; GFX10-NEXT: s_and_b32 s4, exec_lo, s3 +; GFX10-NEXT: s_or_b32 s1, s1, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB6_4 ; GFX10-NEXT: .LBB6_2: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 1 +; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo +; GFX10-NEXT: s_and_b32 s4, exec_lo, -1 +; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo +; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] +; GFX10-NEXT: s_or_b32 s2, s2, s4 ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo ; GFX10-NEXT: global_load_dword v9, v[9:10], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB6_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6 -; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v6 -; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6 +; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo ; GFX10-NEXT: global_load_dword v9, v[7:8], off +; GFX10-NEXT: s_and_b32 s5, exec_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v6, v10 +; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo +; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s3, s3, s5 +; GFX10-NEXT: s_or_b32 s2, s2, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 ; GFX10-NEXT: global_store_dword v[7:8], 
v9, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir index d84ccece81c78..ace9bec6e1c2c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir @@ -1,10 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s -# Test is updated but copies between S1-register-with-reg-class and -# register-with-reg-class-no-LLT fail machine verification -# REQUIRES: do-not-run-me-with-machine-verifier - --- name: divergent_i1_phi_used_outside_loop legalized: true @@ -24,8 +20,8 @@ body: | ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]] ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1) - ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]] + ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc @@ -34,14 +30,14 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %36(s1), %bb.1 - ; GFX10-NEXT: 
[[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.0, %24(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %36(s1), %bb.1 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.0, %24(s1), %bb.1 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]] + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1) - ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]] + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY7]], [[C2]] ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) @@ -127,8 +123,8 @@ body: | ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C1]](s1) - ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[DEF]] + ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[DEF]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc @@ -137,14 +133,14 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x40000000), 
%bb.3(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %41, %bb.3 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[S_OR_B32_]](s1), %bb.0, %27(s1), %bb.3 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %41(s1), %bb.3 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[S_OR_B32_]](s1), %bb.0, %27(s1), %bb.3 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.3 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(p1) = G_PHI [[MV]](p1), %bb.0, %11(p1), %bb.3 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]] + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]](s1) ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1) - ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]] + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc @@ -166,8 +162,8 @@ body: | ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.2 - ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]] + ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.2 + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]](s1) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY12]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS 
intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 @@ -264,11 +260,11 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %24(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %24(s1), %bb.1 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]] ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) @@ -369,8 +365,8 @@ body: | ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[C1]](s1), %bb.0, %39(s1), %bb.8 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[C1]](s1), %bb.0, %39(s1), %bb.8 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY6]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.5 @@ -378,14 +374,14 @@ body: | ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF3]](s1), %bb.1, %72(s1), %bb.7 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.1, %61, %bb.7 - ; GFX10-NEXT: 
[[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.1, %48, %bb.7 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.1, %72(s1), %bb.7 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.1, %61(s1), %bb.7 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.1, %48(s1), %bb.7 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7 ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1 - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]] - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]] - ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]] + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1) + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) @@ -437,11 +433,11 @@ body: | ; GFX10-NEXT: bb.7: ; GFX10-NEXT: successors: %bb.8(0x04000000), %bb.3(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.3, [[S_OR_B32_3]](s1), %bb.4 - ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.3, [[S_OR_B32_2]](s1), %bb.4 + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.3, [[S_OR_B32_3]](s1), %bb.4 + ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.3, [[S_OR_B32_2]](s1), %bb.4 ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3 - ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]] - ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]] + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1) + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS 
intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32) ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY17]], [[C9]] @@ -578,10 +574,10 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]](s1), %bb.0, %38(s1), %bb.6 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF1]](s1), %bb.0, %38(s1), %bb.6 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) @@ -624,9 +620,9 @@ body: | ; GFX10-NEXT: bb.6: ; GFX10-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[C2]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[C2]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4 - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]] + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32) ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI1]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc @@ -761,14 +757,14 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, 
%53(s1), %bb.3 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %42, %bb.3 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[C1]](s1), %bb.0, %32(s1), %bb.3 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %53(s1), %bb.3 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %42(s1), %bb.3 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[C1]](s1), %bb.0, %32(s1), %bb.3 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %12(s32), %bb.3 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]] - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI2]] + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI2]](s1) ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc @@ -796,10 +792,10 @@ body: | ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2 - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[PHI2]], %bb.1, [[DEF2]](s1), %bb.2 - ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]] - ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]] + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2 + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[PHI2]](s1), %bb.1, [[DEF2]](s1), %bb.2 + ; GFX10-NEXT: 
[[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[COPY11]] ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[FREEZE]](s1) @@ -912,14 +908,14 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, %56, %bb.5 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %43, %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %56(s1), %bb.5 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %43(s1), %bb.5 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]] - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]] - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]] + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1) ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1) @@ -979,11 +975,11 @@ body: | ; GFX10-NEXT: bb.5: ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3 - ; 
GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3 + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3 + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3 ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1 - ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]] - ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]] + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll index fb1253c7a17d9..609fff51863a0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s -; REQUIRES: do-not-run-me ; Simples case, if - then, that requires lane mask merging, ; %phi lane mask will hold %val_A at %A. 
Lanes that are active in %B @@ -12,7 +11,10 @@ define amdgpu_ps void @divergent_i1_phi_if_then(ptr addrspace(1) %out, i32 %tid, ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-NEXT: ; %bb.1: ; %B -; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 1, v2 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: ; %bb.2: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0 @@ -41,16 +43,23 @@ define amdgpu_ps void @divergent_i1_phi_if_else(ptr addrspace(1) %out, i32 %tid, ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_and_b32 s0, 1, s0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0, s0 +; GFX10-NEXT: ; implicit-def: $sgpr0 ; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10-NEXT: ; %bb.1: ; %B -; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 2, v2 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 +; GFX10-NEXT: s_andn2_b32 s0, s2, exec_lo ; GFX10-NEXT: ; implicit-def: $vgpr2 +; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: ; %bb.2: ; %Flow ; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1 ; GFX10-NEXT: ; %bb.3: ; %A -; GFX10-NEXT: v_cmp_le_u32_e64 s0, 1, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: ; %bb.4: ; %exit ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, s0 @@ -98,36 +107,42 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a ; GFX10-LABEL: loop_with_1break: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: ; implicit-def: $sgpr1 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: s_branch .LBB2_2 ; 
GFX10-NEXT: .LBB2_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-NEXT: s_and_b32 s1, exec_lo, s2 -; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_and_b32 s2, exec_lo, s1 +; GFX10-NEXT: s_or_b32 s0, s2, s0 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB2_4 ; GFX10-NEXT: .LBB2_2: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo +; GFX10-NEXT: s_and_b32 s2, exec_lo, -1 +; GFX10-NEXT: s_or_b32 s1, s1, s2 ; GFX10-NEXT: v_lshlrev_b64 v[5:6], 2, v[4:5] ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v2, v5 ; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v3, v6, vcc_lo ; GFX10-NEXT: global_load_dword v7, v[7:8], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v7 -; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB2_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v0, v5 ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v1, v6, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v4 -; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v4 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v4 +; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-NEXT: global_load_dword v7, v[5:6], off ; GFX10-NEXT: v_mov_b32_e32 v4, v8 +; GFX10-NEXT: s_and_b32 s3, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v7, 1, v7 ; GFX10-NEXT: global_store_dword v[5:6], v7, off @@ -161,35 +176,42 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-LABEL: loop_with_2breaks: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: ; implicit-def: 
$sgpr1 ; GFX10-NEXT: v_mov_b32_e32 v6, s0 ; GFX10-NEXT: s_branch .LBB3_3 ; GFX10-NEXT: .LBB3_1: ; %Flow3 ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo +; GFX10-NEXT: s_and_b32 s3, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: .LBB3_2: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-NEXT: s_and_b32 s1, exec_lo, s2 -; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_and_b32 s2, exec_lo, s1 +; GFX10-NEXT: s_or_b32 s0, s2, s0 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB3_6 ; GFX10-NEXT: .LBB3_3: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo +; GFX10-NEXT: s_and_b32 s2, exec_lo, -1 +; GFX10-NEXT: s_or_b32 s1, s1, s2 ; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo ; GFX10-NEXT: global_load_dword v9, v[9:10], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB3_2 ; GFX10-NEXT: ; %bb.4: ; %B ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1 ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v4, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo +; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: global_load_dword v9, v[9:10], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 @@ -200,9 +222,12 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo ; GFX10-NEXT: 
v_add_nc_u32_e32 v10, 1, v6 -; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v6 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6 +; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo ; GFX10-NEXT: global_load_dword v9, v[7:8], off ; GFX10-NEXT: v_mov_b32_e32 v6, v10 +; GFX10-NEXT: s_and_b32 s5, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s4, s4, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 ; GFX10-NEXT: global_store_dword v[7:8], v9, off @@ -242,38 +267,48 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-LABEL: loop_with_3breaks: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: ; implicit-def: $sgpr1 ; GFX10-NEXT: v_mov_b32_e32 v8, s0 ; GFX10-NEXT: s_branch .LBB4_4 ; GFX10-NEXT: .LBB4_1: ; %Flow5 ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo +; GFX10-NEXT: s_and_b32 s5, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s4, s4, s5 ; GFX10-NEXT: .LBB4_2: ; %Flow4 ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo +; GFX10-NEXT: s_and_b32 s3, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: .LBB4_3: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-NEXT: s_and_b32 s1, exec_lo, s2 -; GFX10-NEXT: s_or_b32 s0, s1, s0 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_and_b32 s2, exec_lo, s1 +; GFX10-NEXT: s_or_b32 s0, s2, s0 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB4_8 ; GFX10-NEXT: .LBB4_4: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo +; GFX10-NEXT: s_and_b32 s2, exec_lo, -1 +; GFX10-NEXT: s_or_b32 s1, s1, s2 ; GFX10-NEXT: v_lshlrev_b64 v[9:10], 2, v[8:9] ; 
GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v2, v9 ; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v3, v10, vcc_lo ; GFX10-NEXT: global_load_dword v11, v[11:12], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 -; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB4_3 ; GFX10-NEXT: ; %bb.5: ; %B ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 ; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v4, v9 ; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v5, v10, vcc_lo +; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: global_load_dword v11, v[11:12], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 @@ -283,6 +318,7 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1 ; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v6, v9 ; GFX10-NEXT: v_add_co_ci_u32_e32 v12, vcc_lo, v7, v10, vcc_lo +; GFX10-NEXT: s_mov_b32 s5, 1 ; GFX10-NEXT: global_load_dword v11, v[11:12], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 @@ -293,9 +329,12 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) % ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v0, v9 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v1, v10, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v12, 1, v8 -; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v8 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v8 +; GFX10-NEXT: s_andn2_b32 s5, -1, exec_lo ; GFX10-NEXT: global_load_dword v11, v[9:10], off ; GFX10-NEXT: v_mov_b32_e32 v8, v12 +; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s5, s5, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v11 ; GFX10-NEXT: global_store_dword v[9:10], v11, off @@ -345,40 +384,52 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad ; GFX10-LABEL: loop_with_div_break_with_body: ; GFX10: ; %bb.0: ; %entry ; 
GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: ; implicit-def: $sgpr1 +; GFX10-NEXT: ; implicit-def: $sgpr2 +; GFX10-NEXT: ; implicit-def: $sgpr3 ; GFX10-NEXT: v_mov_b32_e32 v6, s0 ; GFX10-NEXT: s_branch .LBB5_2 ; GFX10-NEXT: .LBB5_1: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX10-NEXT: s_and_b32 s1, exec_lo, s2 -; GFX10-NEXT: s_or_b32 s0, s1, s0 -; GFX10-NEXT: s_and_b32 s1, 1, s3 -; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, s1 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_and_b32 s4, exec_lo, s2 +; GFX10-NEXT: s_or_b32 s0, s4, s0 +; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo +; GFX10-NEXT: s_and_b32 s4, exec_lo, s3 +; GFX10-NEXT: s_or_b32 s1, s1, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execz .LBB5_4 ; GFX10-NEXT: .LBB5_2: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GFX10-NEXT: s_mov_b32 s2, -1 -; GFX10-NEXT: s_mov_b32 s3, 1 +; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo +; GFX10-NEXT: s_and_b32 s4, exec_lo, -1 +; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo +; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: v_lshlrev_b64 v[7:8], 2, v[6:7] +; GFX10-NEXT: s_or_b32 s2, s2, s4 ; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v2, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo ; GFX10-NEXT: global_load_dword v9, v[9:10], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB5_1 ; GFX10-NEXT: ; %bb.3: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7 ; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6 -; GFX10-NEXT: v_cmp_gt_u32_e64 s2, 0x64, v6 -; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v6 +; 
GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo ; GFX10-NEXT: global_load_dword v9, v[7:8], off +; GFX10-NEXT: s_and_b32 s5, exec_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v6, v10 +; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo +; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s3, s3, s5 +; GFX10-NEXT: s_or_b32 s2, s2, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9 ; GFX10-NEXT: global_store_dword v[7:8], v9, off diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir index 0e6588b4593c9..df5505e1b28bb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir @@ -1,10 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s -# Test is updated but copies between S1-register-with-reg-class and -# register-with-reg-class-no-LLT fail machine verification -# REQUIRES: do-not-run-me-with-machine-verifier - --- name: divergent_i1_phi_if_then legalized: true @@ -39,8 +35,8 @@ body: | ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 1 @@ -104,8 +100,8 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %19(s1), %bb.3 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %19(s1), %bb.3 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[COPY5]](s1) ; GFX10-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_ELSE [[SI_IF]](s32), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 @@ -133,8 +129,8 @@ body: | ; GFX10-NEXT: G_BR %bb.1 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.4: - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[COPY5]](s1), %bb.1, [[S_OR_B32_]](s1), %bb.2 - ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]] + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY5]](s1), %bb.1, [[S_OR_B32_]](s1), %bb.2 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_ELSE]](s32) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 @@ -210,10 +206,10 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %35, %bb.3 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %35(s1), %bb.3 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.3, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.3 - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY 
[[C1]](s1) ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32) @@ -251,9 +247,9 @@ body: | ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.2, [[DEF]](s32), %bb.1 - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]] + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), [[PHI1]](s32) ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec @@ -348,10 +344,10 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %48, %bb.3 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %48(s1), %bb.3 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.3, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.3 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1) ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32) @@ -385,9 +381,9 @@ body: | ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, %47(s1), %bb.5 + ; GFX10-NEXT: 
[[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %47(s1), %bb.5 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %32(s32), %bb.5, [[DEF]](s32), %bb.1 - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]] + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI1]](s32) ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec @@ -414,9 +410,9 @@ body: | ; GFX10-NEXT: bb.5: ; GFX10-NEXT: successors: %bb.3(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[C4]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[C4]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2 - ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]] + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[COPY12]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc @@ -540,10 +536,10 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %61, %bb.3 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %61(s1), %bb.3 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3 - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] + ; GFX10-NEXT: 
[[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1) ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32) @@ -577,9 +573,9 @@ body: | ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.8(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, %60(s1), %bb.5 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %60(s1), %bb.5 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %35(s32), %bb.5, [[DEF]](s32), %bb.1 - ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]] + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY12]](s1), [[PHI1]](s32) ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec @@ -602,9 +598,9 @@ body: | ; GFX10-NEXT: bb.5: ; GFX10-NEXT: successors: %bb.3(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[C4]](s1), %bb.2, %71(s1), %bb.7 + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[C4]](s1), %bb.2, %71(s1), %bb.7 ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI %46(s32), %bb.7, [[DEF]](s32), %bb.2 - ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]] + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc @@ -633,9 +629,9 @@ body: | ; GFX10-NEXT: bb.7: ; GFX10-NEXT: successors: %bb.5(0x80000000) ; GFX10-NEXT: 
{{ $}} - ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[C7]](s1), %bb.4, [[S_OR_B32_2]](s1), %bb.6 + ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[C7]](s1), %bb.4, [[S_OR_B32_2]](s1), %bb.6 ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.6, [[DEF]](s32), %bb.4 - ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]] + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1) ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[COPY17]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc @@ -782,14 +778,14 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, %56, %bb.5 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %43, %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %56(s1), %bb.5 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %43(s1), %bb.5 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]] - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]] - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]] + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: 
[[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1) ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1) @@ -849,11 +845,11 @@ body: | ; GFX10-NEXT: bb.5: ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3 - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3 + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3 + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3 ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1 - ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]] - ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]] + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1) ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32) @@ -986,12 +982,12 @@ body: | ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI %67(s1), %bb.6, %70, %bb.7 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI %49(s1), %bb.6, %48(s1), %bb.7 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI %35(s1), %bb.6, %34(s1), %bb.7 - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]] - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]] - ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]] + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI %67(s1), %bb.6, %70(s1), %bb.7 + ; 
GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI %49(s1), %bb.6, %48(s1), %bb.7 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI %35(s1), %bb.6, %34(s1), %bb.7 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1) ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY9]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32) ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), %17(s32) @@ -1061,16 +1057,16 @@ body: | ; GFX10-NEXT: bb.7: ; GFX10-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4 - ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[DEF3]](s1), %bb.0, [[PHI7]], %bb.2, [[S_OR_B32_1]](s1), %bb.4 - ; GFX10-NEXT: [[PHI8:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, [[PHI1]], %bb.2, [[DEF5]](s1), %bb.4 - ; GFX10-NEXT: [[PHI9:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, [[PHI2]], %bb.2, [[DEF4]](s1), %bb.4 + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4 + ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.0, [[PHI7]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4 + ; GFX10-NEXT: [[PHI8:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, [[PHI1]](s1), %bb.2, [[DEF5]](s1), %bb.4 + ; GFX10-NEXT: [[PHI9:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, [[PHI2]](s1), %bb.2, [[DEF4]](s1), %bb.4 ; GFX10-NEXT: [[PHI10:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4, [[PHI10]](s32), %bb.2, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI11:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INTRINSIC_CONVERGENT]](s32), %bb.2, [[C]](s32), 
%bb.0 - ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]] - ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]] - ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI8]] - ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI9]] + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1) + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1) + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI8]](s1) + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI9]](s1) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1) ; GFX10-NEXT: [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY20]](s1), $exec_lo, implicit-def $scc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll index 431da1de1fd48..312c6a3822ce4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll @@ -1,29 +1,31 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s -; REQUIRES: do-not-run-me define void @temporal_divergent_i1_phi(float %val, ptr %addr) { ; GFX10-LABEL: temporal_divergent_i1_phi: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 1 +; GFX10-NEXT: v_mov_b32_e32 v4, s5 +; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: .LBB0_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v6, v3 -; 
GFX10-NEXT: v_mov_b32_e32 v5, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v3 -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0 -; GFX10-NEXT: v_xor_b32_e32 v4, 1, v5 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 1, v3 +; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo +; GFX10-NEXT: s_and_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s6, s6, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -49,23 +51,27 @@ define void @temporal_divergent_i1_non_phi(float %val, ptr %addr) { ; GFX10-LABEL: temporal_divergent_i1_non_phi: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 1 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: v_mov_b32_e32 v4, s5 +; GFX10-NEXT: ; implicit-def: $sgpr6 ; GFX10-NEXT: .LBB1_1: ; %loop ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4 ; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 1, v3 ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v0 -; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; 
GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v6 +; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 +; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo +; GFX10-NEXT: s_and_b32 s4, exec_lo, s4 +; GFX10-NEXT: s_or_b32 s6, s6, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %exit -; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6 ; GFX10-NEXT: flat_store_dword v[1:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -96,30 +102,41 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad ; GFX10-NEXT: v_mov_b32_e32 v4, s1 ; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: v_mov_b32_e32 v5, s4 +; GFX10-NEXT: ; implicit-def: $sgpr0 +; GFX10-NEXT: ; implicit-def: $sgpr1 ; GFX10-NEXT: s_branch .LBB2_3 ; GFX10-NEXT: .LBB2_1: ; %loop.body ; GFX10-NEXT: ; in Loop: Header=BB2_3 Depth=1 ; GFX10-NEXT: v_add_co_u32 v6, vcc_lo, v0, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v5 -; GFX10-NEXT: v_cmp_lt_u32_e64 s0, v5, v2 -; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v5, v2 +; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-NEXT: global_load_dword v8, v[6:7], off +; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, v9 +; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s1, s1, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8 ; GFX10-NEXT: global_store_dword v[6:7], v8, off ; GFX10-NEXT: .LBB2_2: ; %Flow ; GFX10-NEXT: ; in Loop: Header=BB2_3 Depth=1 -; GFX10-NEXT: s_and_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_or_b32 s4, s0, s4 -; GFX10-NEXT: s_and_b32 s0, 1, s1 -; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX10-NEXT: s_and_b32 s5, 1, s5 +; 
GFX10-NEXT: s_and_b32 s6, exec_lo, s1 +; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s5 +; GFX10-NEXT: s_or_b32 s4, s6, s4 +; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-NEXT: s_and_b32 s5, exec_lo, s5 +; GFX10-NEXT: s_or_b32 s0, s0, s5 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execz .LBB2_5 ; GFX10-NEXT: .LBB2_3: ; %A ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo +; GFX10-NEXT: s_and_b32 s5, exec_lo, -1 +; GFX10-NEXT: s_or_b32 s1, s1, s5 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6] ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v3, v6 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v4, v7, vcc_lo @@ -128,8 +145,7 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 ; GFX10-NEXT: s_cbranch_vccnz .LBB2_1 ; GFX10-NEXT: ; %bb.4: ; in Loop: Header=BB2_3 Depth=1 -; GFX10-NEXT: s_mov_b32 s0, -1 -; GFX10-NEXT: s_mov_b32 s1, 1 +; GFX10-NEXT: s_mov_b32 s5, 1 ; GFX10-NEXT: ; implicit-def: $vgpr5 ; GFX10-NEXT: s_branch .LBB2_2 ; GFX10-NEXT: .LBB2_5: ; %loop.exit.guard diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir index 64d740287d9fc..abb491f938e54 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir @@ -1,10 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s -# Test is updated but copies between S1-register-with-reg-class and -# register-with-reg-class-no-LLT fail machine verification -# REQUIRES: do-not-run-me-with-machine-verifier - --- 
name: temporal_divergent_i1_phi legalized: true @@ -26,12 +22,12 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]] ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32) @@ -113,11 +109,11 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1 ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]] ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) @@ -207,12 +203,12 @@ body: | ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.3(0x50000000), %bb.5(0x30000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI 
[[DEF2]](s1), %bb.0, %53(s1), %bb.5 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %42, %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF2]](s1), %bb.0, %53(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %42(s1), %bb.5 ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %13(s32), %bb.5, [[C]](s32), %bb.0 ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %15(s32), %bb.5 - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]] - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]] + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1) + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1) ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI3]](s32) @@ -262,11 +258,11 @@ body: | ; GFX10-NEXT: bb.5: ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.3 + ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.3 ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1 ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1 ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1) - ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]] + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]](s1) ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY13]](s1), [[PHI2]](s32) ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), 
implicit-def $scc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index 6384c47398fce..c25b0f2128266 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -; REQUIRES: do-not-run-me +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s ; Make sure the branch targets are correct after lowering llvm.amdgcn.if @@ -203,26 +202,34 @@ define amdgpu_kernel void @break_loop(i32 %arg) { ; CHECK-NEXT: ; implicit-def: $vgpr1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_subrev_u32_e32 v0, s2, v0 -; CHECK-NEXT: s_branch .LBB5_2 -; CHECK-NEXT: .LBB5_1: ; %Flow -; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1 -; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] -; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; CHECK-NEXT: ; implicit-def: $sgpr2_sgpr3 +; CHECK-NEXT: s_branch .LBB5_3 +; CHECK-NEXT: .LBB5_1: ; %bb4 +; CHECK-NEXT: ; in Loop: Header=BB5_3 Depth=1 +; CHECK-NEXT: global_load_dword v2, v[0:1], off glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; CHECK-NEXT: v_cmp_ge_i32_e32 vcc, v0, v2 +; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc +; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; CHECK-NEXT: .LBB5_2: ; %Flow +; CHECK-NEXT: ; in Loop: Header=BB5_3 Depth=1 +; CHECK-NEXT: s_and_b64 s[4:5], exec, s[2:3] +; CHECK-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] ; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1] -; CHECK-NEXT: s_cbranch_execz .LBB5_4 -; CHECK-NEXT: .LBB5_2: ; %bb1 +; CHECK-NEXT: s_cbranch_execz .LBB5_5 +; CHECK-NEXT: .LBB5_3: ; %bb1 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
CHECK-NEXT: v_add_u32_e32 v1, 1, v1 +; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; CHECK-NEXT: s_and_b64 s[4:5], exec, -1 ; CHECK-NEXT: v_cmp_le_i32_e32 vcc, 0, v1 -; CHECK-NEXT: s_mov_b64 s[2:3], -1 -; CHECK-NEXT: s_cbranch_vccnz .LBB5_1 -; CHECK-NEXT: ; %bb.3: ; %bb4 -; CHECK-NEXT: ; in Loop: Header=BB5_2 Depth=1 -; CHECK-NEXT: global_load_dword v2, v[0:1], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_cmp_ge_i32_e64 s[2:3], v0, v2 -; CHECK-NEXT: s_branch .LBB5_1 -; CHECK-NEXT: .LBB5_4: ; %bb9 +; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; CHECK-NEXT: s_cbranch_vccz .LBB5_1 +; CHECK-NEXT: ; %bb.4: ; in Loop: Header=BB5_3 Depth=1 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_branch .LBB5_2 +; CHECK-NEXT: .LBB5_5: ; %bb9 ; CHECK-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() From 58aa995baf66fffb1284ecb289dc9f02c70de4fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 26 Feb 2024 13:56:10 +0100 Subject: [PATCH 307/546] [clang][Interp][NFC] Fix comment typo --- clang/lib/AST/Interp/Interp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index aecad1598ad2b..2b36a05e1af96 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -1048,7 +1048,7 @@ bool InitGlobal(InterpState &S, CodePtr OpPC, uint32_t I) { /// 1) Converts the value on top of the stack to an APValue /// 2) Sets that APValue on \Temp -/// 3) Initialized global with index \I with that +/// 3) Initializes global with index \I with that template ::T> bool InitGlobalTemp(InterpState &S, CodePtr OpPC, uint32_t I, const LifetimeExtendedTemporaryDecl *Temp) { From a35599b9ae5e7ad924b78c65f6348e0b711bad5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 26 Feb 2024 11:39:39 +0100 Subject: [PATCH 308/546] [clang][Interp] Implement a few _is_lock_free builtins Implementation looks similar to the one in the current 
interpreter. Except for three static assertions, test/Sema/atomic-ops.c works. --- clang/lib/AST/Interp/Context.cpp | 2 +- clang/lib/AST/Interp/InterpBuiltin.cpp | 74 ++++++++++++++++++++++++++ clang/test/AST/Interp/atomic.c | 51 +++++++++++++++++- 3 files changed, 124 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/Interp/Context.cpp b/clang/lib/AST/Interp/Context.cpp index 578dc44fe4f8d..b09019f3e65b7 100644 --- a/clang/lib/AST/Interp/Context.cpp +++ b/clang/lib/AST/Interp/Context.cpp @@ -160,7 +160,7 @@ std::optional Context::classify(QualType T) const { if (T->isReferenceType() || T->isPointerType()) return PT_Ptr; - if (const auto *AT = dyn_cast(T)) + if (const auto *AT = T->getAs()) return classify(AT->getValueType()); if (const auto *DT = dyn_cast(T)) diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp index 401af580e1aaf..2fb076d7793a8 100644 --- a/clang/lib/AST/Interp/InterpBuiltin.cpp +++ b/clang/lib/AST/Interp/InterpBuiltin.cpp @@ -887,6 +887,73 @@ static bool interp__builtin_bswap(InterpState &S, CodePtr OpPC, return true; } +/// bool __atomic_always_lock_free(size_t, void const volatile*) +/// bool __atomic_is_lock_free(size_t, void const volatile*) +/// bool __c11_atomic_is_lock_free(size_t) +static bool interp__builtin_atomic_lock_free(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const Function *Func, + const CallExpr *Call) { + unsigned BuiltinOp = Func->getBuiltinID(); + + PrimType ValT = *S.getContext().classify(Call->getArg(0)); + unsigned SizeValOffset = 0; + if (BuiltinOp != Builtin::BI__c11_atomic_is_lock_free) + SizeValOffset = align(primSize(ValT)) + align(primSize(PT_Ptr)); + const APSInt &SizeVal = peekToAPSInt(S.Stk, ValT, SizeValOffset); + + auto returnBool = [&S](bool Value) -> bool { + S.Stk.push(Value); + return true; + }; + + // For __atomic_is_lock_free(sizeof(_Atomic(T))), if the size is a power + // of two less than or equal to the maximum inline atomic width, we 
know it + // is lock-free. If the size isn't a power of two, or greater than the + // maximum alignment where we promote atomics, we know it is not lock-free + // (at least not in the sense of atomic_is_lock_free). Otherwise, + // the answer can only be determined at runtime; for example, 16-byte + // atomics have lock-free implementations on some, but not all, + // x86-64 processors. + + // Check power-of-two. + CharUnits Size = CharUnits::fromQuantity(SizeVal.getZExtValue()); + if (Size.isPowerOfTwo()) { + // Check against inlining width. + unsigned InlineWidthBits = + S.getCtx().getTargetInfo().getMaxAtomicInlineWidth(); + if (Size <= S.getCtx().toCharUnitsFromBits(InlineWidthBits)) { + + // OK, we will inline appropriately-aligned operations of this size, + // and _Atomic(T) is appropriately-aligned. + if (BuiltinOp == Builtin::BI__c11_atomic_is_lock_free || + Size == CharUnits::One()) + return returnBool(true); + + // Same for null pointers. + assert(BuiltinOp != Builtin::BI__c11_atomic_is_lock_free); + const Pointer &Ptr = S.Stk.peek(); + if (Ptr.isZero()) + return returnBool(true); + + QualType PointeeType = Call->getArg(1) + ->IgnoreImpCasts() + ->getType() + ->castAs() + ->getPointeeType(); + // OK, we will inline operations on this object. 
+ if (!PointeeType->isIncompleteType() && + S.getCtx().getTypeAlignInChars(PointeeType) >= Size) + return returnBool(true); + } + } + + if (BuiltinOp == Builtin::BI__atomic_always_lock_free) + return returnBool(false); + + return false; +} + bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, const CallExpr *Call) { InterpFrame *Frame = S.Current; @@ -1186,6 +1253,13 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, return false; break; + case Builtin::BI__atomic_always_lock_free: + case Builtin::BI__atomic_is_lock_free: + case Builtin::BI__c11_atomic_is_lock_free: + if (!interp__builtin_atomic_lock_free(S, OpPC, Frame, F, Call)) + return false; + break; + default: return false; } diff --git a/clang/test/AST/Interp/atomic.c b/clang/test/AST/Interp/atomic.c index 8d93b57c1945b..316e8d5bf3516 100644 --- a/clang/test/AST/Interp/atomic.c +++ b/clang/test/AST/Interp/atomic.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=both,expected %s -// RUN: %clang_cc1 -verify=both,ref %s +// RUN: %clang_cc1 -fgnuc-version=4.2.1 -fexperimental-new-constant-interpreter -verify=both,expected %s +// RUN: %clang_cc1 -fgnuc-version=4.2.1 -verify=both,ref %s /// FIXME: Copied from test/Sema/atomic-expr.c. /// this expression seems to be rejected for weird reasons, @@ -11,3 +11,50 @@ _Atomic int ai = 0; // The warning is correct but the error is not. 
_Atomic(int *) aip3 = &ai; // both-warning {{incompatible pointer types initializing '_Atomic(int *)' with an expression of type '_Atomic(int) *'}} \ // both-error {{initializer element is not a compile-time constant}} + +#include + + + +_Static_assert(__GCC_ATOMIC_BOOL_LOCK_FREE == 2, ""); +_Static_assert(__GCC_ATOMIC_BOOL_LOCK_FREE == __CLANG_ATOMIC_BOOL_LOCK_FREE, ""); +_Static_assert(__GCC_ATOMIC_CHAR_LOCK_FREE == 2, ""); +_Static_assert(__GCC_ATOMIC_CHAR_LOCK_FREE == __CLANG_ATOMIC_CHAR_LOCK_FREE, ""); +_Static_assert(__GCC_ATOMIC_CHAR16_T_LOCK_FREE == 2, ""); +_Static_assert(__GCC_ATOMIC_CHAR16_T_LOCK_FREE == __CLANG_ATOMIC_CHAR16_T_LOCK_FREE, ""); +_Static_assert(__GCC_ATOMIC_CHAR32_T_LOCK_FREE == 2, ""); +_Static_assert(__GCC_ATOMIC_CHAR32_T_LOCK_FREE == __CLANG_ATOMIC_CHAR32_T_LOCK_FREE, ""); +_Static_assert(__GCC_ATOMIC_WCHAR_T_LOCK_FREE == 2, ""); +_Static_assert(__GCC_ATOMIC_WCHAR_T_LOCK_FREE == __CLANG_ATOMIC_WCHAR_T_LOCK_FREE, ""); +_Static_assert(__GCC_ATOMIC_SHORT_LOCK_FREE == 2, ""); +_Static_assert(__GCC_ATOMIC_SHORT_LOCK_FREE == __CLANG_ATOMIC_SHORT_LOCK_FREE, ""); +_Static_assert(__GCC_ATOMIC_INT_LOCK_FREE == 2, ""); +_Static_assert(__GCC_ATOMIC_INT_LOCK_FREE == __CLANG_ATOMIC_INT_LOCK_FREE, ""); +_Static_assert(__GCC_ATOMIC_LONG_LOCK_FREE == 2, ""); +_Static_assert(__GCC_ATOMIC_LONG_LOCK_FREE == __CLANG_ATOMIC_LONG_LOCK_FREE, ""); +_Static_assert(__GCC_ATOMIC_LLONG_LOCK_FREE == 2, ""); +_Static_assert(__GCC_ATOMIC_LLONG_LOCK_FREE == __CLANG_ATOMIC_LLONG_LOCK_FREE, ""); +_Static_assert(__GCC_ATOMIC_POINTER_LOCK_FREE == 2, ""); +_Static_assert(__GCC_ATOMIC_POINTER_LOCK_FREE == __CLANG_ATOMIC_POINTER_LOCK_FREE, ""); + +_Static_assert(__c11_atomic_is_lock_free(1), ""); +_Static_assert(__c11_atomic_is_lock_free(2), ""); +_Static_assert(__c11_atomic_is_lock_free(3), ""); // both-error {{not an integral constant expression}} +_Static_assert(__c11_atomic_is_lock_free(4), ""); +_Static_assert(__c11_atomic_is_lock_free(8), ""); 
+_Static_assert(__c11_atomic_is_lock_free(16), ""); // both-error {{not an integral constant expression}} +_Static_assert(__c11_atomic_is_lock_free(17), ""); // both-error {{not an integral constant expression}} + +_Static_assert(__atomic_is_lock_free(1, 0), ""); +_Static_assert(__atomic_is_lock_free(2, 0), ""); +_Static_assert(__atomic_is_lock_free(3, 0), ""); // both-error {{not an integral constant expression}} +_Static_assert(__atomic_is_lock_free(4, 0), ""); +_Static_assert(__atomic_is_lock_free(8, 0), ""); +_Static_assert(__atomic_is_lock_free(16, 0), ""); // both-error {{not an integral constant expression}} +_Static_assert(__atomic_is_lock_free(17, 0), ""); // both-error {{not an integral constant expression}} + +_Static_assert(atomic_is_lock_free((atomic_char*)0), ""); +_Static_assert(atomic_is_lock_free((atomic_short*)0), ""); +_Static_assert(atomic_is_lock_free((atomic_int*)0), ""); +_Static_assert(atomic_is_lock_free((atomic_long*)0), ""); +_Static_assert(atomic_is_lock_free(0 + (atomic_char*)0), ""); From 60e7ae3f30e99423cf779c9d05513d2ae18df5aa Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 26 Feb 2024 13:02:08 +0000 Subject: [PATCH 309/546] [AMDGPU] Only try DecoderTables for the current subtarget. NFCI. (#82992) Speed up disassembly by only calling tryDecodeInst for DecoderTables that make sense for the current subtarget. This gives a 1.3x speed-up on check-llvm-mc-disassembler-amdgpu in my Release+Asserts build. 
--- .../Disassembler/AMDGPUDisassembler.cpp | 45 ++++++++++++------- .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 1 + llvm/lib/Target/AMDGPU/MIMGInstructions.td | 6 +-- 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index e1cca17bdbf43..8c42304ce0bee 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -466,15 +466,18 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (isGFX11Plus() && Bytes.size() >= 12 ) { DecoderUInt128 DecW = eat12Bytes(Bytes); - if (tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI, + if (isGFX11() && + tryDecodeInst(DecoderTableGFX1196, DecoderTableGFX11_FAKE1696, MI, DecW, Address, CS)) break; - if (tryDecodeInst(DecoderTableGFX1296, DecoderTableGFX12_FAKE1696, MI, + if (isGFX12() && + tryDecodeInst(DecoderTableGFX1296, DecoderTableGFX12_FAKE1696, MI, DecW, Address, CS)) break; - if (tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS)) + if (isGFX12() && + tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS)) break; } @@ -507,27 +510,32 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS)) break; - if (tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS)) + if ((isVI() || isGFX9()) && + tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS)) break; - if (tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS)) + if (isGFX9() && tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS)) break; - if (tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS)) + if (isGFX10() && tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS)) break; - if (tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW, + if (isGFX12() && + tryDecodeInst(DecoderTableGFX1264, 
DecoderTableGFX12_FAKE1664, MI, QW, Address, CS)) break; - if (tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW, + if (isGFX11() && + tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW, Address, CS)) break; - if (tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS)) + if (isGFX11() && + tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS)) break; - if (tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS)) + if (isGFX12() && + tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS)) break; } @@ -538,13 +546,14 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Bytes.size() >= 4) { const uint32_t DW = eatBytes(Bytes); - if (tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS)) + if ((isVI() || isGFX9()) && + tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS)) break; if (tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS)) break; - if (tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS)) + if (isGFX9() && tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS)) break; if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts) && @@ -555,14 +564,16 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS)) break; - if (tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS)) + if (isGFX10() && tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS)) break; - if (tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW, + if (isGFX11() && + tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW, Address, CS)) break; - if (tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW, + if (isGFX12() && + tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW, Address, CS)) break; } @@ -1750,6 +1761,10 @@ bool AMDGPUDisassembler::isGFX11Plus() const { return AMDGPU::isGFX11Plus(STI); } +bool AMDGPUDisassembler::isGFX12() const { + return 
STI.hasFeature(AMDGPU::FeatureGFX12); +} + bool AMDGPUDisassembler::isGFX12Plus() const { return AMDGPU::isGFX12Plus(STI); } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 2e1b6fb1c740b..6a4cb12087208 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -275,6 +275,7 @@ class AMDGPUDisassembler : public MCDisassembler { bool isGFX10Plus() const; bool isGFX11() const; bool isGFX11Plus() const; + bool isGFX12() const; bool isGFX12Plus() const; bool hasArchitectedFlatScratch() const; diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index fe4db0ebb0262..cc374fbae7cc5 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -527,7 +527,7 @@ multiclass MIMG_NoSampler_Src_Helper ; + !if(enableDisasm, "GFX8", "")>; if !not(ExtendedImageInst) then def _V1_gfx90a : MIMG_NoSampler_Helper_gfx90a ; @@ -754,7 +754,7 @@ multiclass MIMG_Store_Addr_Helper ; + !if(enableDisasm, "GFX8", "")>; let hasPostISelHook = 1 in def _V1_gfx90a : MIMG_Store_Helper_gfx90a ; @@ -1298,7 +1298,7 @@ multiclass MIMG_Sampler_Src_Helper ; + !if(!and(enableDisasm, addr.Disassemble), "GFX8", "")>; if !not(ExtendedImageInst) then def _V # addr.NumWords # _gfx90a : MIMG_Sampler_gfx90a Date: Mon, 26 Feb 2024 14:08:16 +0100 Subject: [PATCH 310/546] [clang][NFC] Prefer usings over typedefs (#82920) --- .../clang/Analysis/FlowSensitive/DataflowValues.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowValues.h b/clang/include/clang/Analysis/FlowSensitive/DataflowValues.h index 2248bcdf3a512..8e88f9c4d9c2d 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowValues.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowValues.h @@ -45,12 +45,12 @@ 
class DataflowValues { //===--------------------------------------------------------------------===// public: - typedef typename ValueTypes::ValTy ValTy; - typedef typename ValueTypes::AnalysisDataTy AnalysisDataTy; - typedef _AnalysisDirTag AnalysisDirTag; - typedef llvm::DenseMap EdgeDataMapTy; - typedef llvm::DenseMap BlockDataMapTy; - typedef llvm::DenseMap StmtDataMapTy; + using ValTy = typename ValueTypes::ValTy; + using AnalysisDataTy = typename ValueTypes::AnalysisDataTy; + using AnalysisDirTag = _AnalysisDirTag; + using EdgeDataMapTy = llvm::DenseMap; + using BlockDataMapTy = llvm::DenseMap; + using StmtDataMapTy = llvm::DenseMap; //===--------------------------------------------------------------------===// // Predicates. From c4e94633e8a48ee33115d5d3161ee142fc1c9700 Mon Sep 17 00:00:00 2001 From: Samira Bazuzi Date: Mon, 26 Feb 2024 08:23:46 -0500 Subject: [PATCH 311/546] Revert "[clang][dataflow] Correctly handle `InitListExpr` of union type." (#82856) Reverts llvm/llvm-project#82348, which caused crashes when analyzing empty InitListExprs for unions, e.g. 
```cc union U { double double_value; int int_value; }; void target() { U value; value = {}; } ``` Co-authored-by: Samira Bazuzi --- .../FlowSensitive/DataflowEnvironment.h | 9 +++---- .../FlowSensitive/DataflowEnvironment.cpp | 18 +++---------- clang/lib/Analysis/FlowSensitive/Transfer.cpp | 25 ++++++++----------- .../Analysis/FlowSensitive/TestingSupport.h | 19 -------------- .../Analysis/FlowSensitive/TransferTest.cpp | 14 ++--------- 5 files changed, 20 insertions(+), 65 deletions(-) diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h index b3dc940705f87..0aecc749bf415 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h @@ -753,12 +753,9 @@ RecordStorageLocation *getImplicitObjectLocation(const CXXMemberCallExpr &MCE, RecordStorageLocation *getBaseObjectLocation(const MemberExpr &ME, const Environment &Env); -/// Returns the fields of a `RecordDecl` that are initialized by an -/// `InitListExpr`, in the order in which they appear in -/// `InitListExpr::inits()`. -/// `Init->getType()` must be a record type. -std::vector -getFieldsForInitListExpr(const InitListExpr *InitList); +/// Returns the fields of `RD` that are initialized by an `InitListExpr`, in the +/// order in which they appear in `InitListExpr::inits()`. +std::vector getFieldsForInitListExpr(const RecordDecl *RD); /// Associates a new `RecordValue` with `Loc` and returns the new value. 
RecordValue &refreshRecordValue(RecordStorageLocation &Loc, Environment &Env); diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp index 0cfc26ea952cd..d487944ce9211 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp @@ -361,8 +361,8 @@ getFieldsGlobalsAndFuncs(const Stmt &S, FieldSet &Fields, if (const auto *FD = dyn_cast(VD)) Fields.insert(FD); } else if (auto *InitList = dyn_cast(&S)) { - if (InitList->getType()->isRecordType()) - for (const auto *FD : getFieldsForInitListExpr(InitList)) + if (RecordDecl *RD = InitList->getType()->getAsRecordDecl()) + for (const auto *FD : getFieldsForInitListExpr(RD)) Fields.insert(FD); } } @@ -1104,22 +1104,12 @@ RecordStorageLocation *getBaseObjectLocation(const MemberExpr &ME, return Env.get(*Base); } -std::vector -getFieldsForInitListExpr(const InitListExpr *InitList) { - const RecordDecl *RD = InitList->getType()->getAsRecordDecl(); - assert(RD != nullptr); - - std::vector Fields; - - if (InitList->getType()->isUnionType()) { - Fields.push_back(InitList->getInitializedFieldInUnion()); - return Fields; - } - +std::vector getFieldsForInitListExpr(const RecordDecl *RD) { // Unnamed bitfields are only used for padding and do not appear in // `InitListExpr`'s inits. However, those fields do appear in `RecordDecl`'s // field list, and we thus need to remove them before mapping inits to // fields to avoid mapping inits to the wrongs fields. 
+ std::vector Fields; llvm::copy_if( RD->fields(), std::back_inserter(Fields), [](const FieldDecl *Field) { return !Field->isUnnamedBitfield(); }); diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp index cd1f04e53cff6..fe13e919bddcd 100644 --- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp +++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp @@ -663,7 +663,14 @@ class TransferVisitor : public ConstStmtVisitor { void VisitInitListExpr(const InitListExpr *S) { QualType Type = S->getType(); - if (!Type->isRecordType()) { + if (Type->isUnionType()) { + // FIXME: Initialize unions properly. + if (auto *Val = Env.createValue(Type)) + Env.setValue(*S, *Val); + return; + } + + if (!Type->isStructureOrClassType()) { // Until array initialization is implemented, we don't need to care about // cases where `getNumInits() > 1`. if (S->getNumInits() == 1) @@ -681,9 +688,10 @@ class TransferVisitor : public ConstStmtVisitor { llvm::DenseMap FieldLocs; // This only contains the direct fields for the given type. - std::vector FieldsForInit = getFieldsForInitListExpr(S); + std::vector FieldsForInit = + getFieldsForInitListExpr(Type->getAsRecordDecl()); - // `S->inits()` contains all the initializer expressions, including the + // `S->inits()` contains all the initializer epressions, including the // ones for direct base classes. auto Inits = S->inits(); size_t InitIdx = 0; @@ -723,17 +731,6 @@ class TransferVisitor : public ConstStmtVisitor { FieldLocs.insert({Field, &Loc}); } - // In the case of a union, we don't in general have initializers for all - // of the fields. Create storage locations for the remaining fields (but - // don't associate them with values). 
- if (Type->isUnionType()) { - for (const FieldDecl *Field : - Env.getDataflowAnalysisContext().getModeledFields(Type)) { - if (auto [it, inserted] = FieldLocs.insert({Field, nullptr}); inserted) - it->second = &Env.createStorageLocation(Field->getType()); - } - } - // Check that we satisfy the invariant that a `RecordStorageLoation` // contains exactly the set of modeled fields for that type. // `ModeledFields` includes fields from all the bases, but only the diff --git a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h index b7cf6cc966edb..0d36d2802897f 100644 --- a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h +++ b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h @@ -432,8 +432,6 @@ llvm::Error checkDataflowWithNoopAnalysis( {}); /// Returns the `ValueDecl` for the given identifier. -/// The returned pointer is guaranteed to be non-null; the function asserts if -/// no `ValueDecl` with the given name is found. /// /// Requirements: /// @@ -477,15 +475,6 @@ ValueT &getValueForDecl(ASTContext &ASTCtx, const Environment &Env, return *cast(Env.getValue(*VD)); } -/// Returns the storage location for the field called `Name` of `Loc`. -/// Optionally casts the field storage location to `T`. -template -std::enable_if_t, T &> -getFieldLoc(const RecordStorageLocation &Loc, llvm::StringRef Name, - ASTContext &ASTCtx) { - return *cast(Loc.getChild(*findValueDecl(ASTCtx, Name))); -} - /// Returns the value of a `Field` on the record referenced by `Loc.` /// Returns null if `Loc` is null. inline Value *getFieldValue(const RecordStorageLocation *Loc, @@ -498,14 +487,6 @@ inline Value *getFieldValue(const RecordStorageLocation *Loc, return Env.getValue(*FieldLoc); } -/// Returns the value of a `Field` on the record referenced by `Loc.` -/// Returns null if `Loc` is null. 
-inline Value *getFieldValue(const RecordStorageLocation *Loc, - llvm::StringRef Name, ASTContext &ASTCtx, - const Environment &Env) { - return getFieldValue(Loc, *findValueDecl(ASTCtx, Name), Env); -} - /// Creates and owns constraints which are boolean values. class ConstraintContext { unsigned NextAtom = 0; diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index e7d74581865a3..a65b0446ac781 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -2377,24 +2377,14 @@ TEST(TransferTest, InitListExprAsUnion) { } F; public: - constexpr target() : F{nullptr} { - int *null = nullptr; - F.b; // Make sure we reference 'b' so it is modeled. - // [[p]] - } + constexpr target() : F{nullptr} {} }; )cc"; runDataflow( Code, [](const llvm::StringMap> &Results, ASTContext &ASTCtx) { - const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); - - auto &FLoc = getFieldLoc( - *Env.getThisPointeeStorageLocation(), "F", ASTCtx); - auto *AVal = cast(getFieldValue(&FLoc, "a", ASTCtx, Env)); - ASSERT_EQ(AVal, &getValueForDecl(ASTCtx, Env, "null")); - ASSERT_EQ(getFieldValue(&FLoc, "b", ASTCtx, Env), nullptr); + // Just verify that it doesn't crash. }); } From d41615e91a108bd1ae41361be97c569691ab9ebb Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 26 Feb 2024 13:22:54 +0000 Subject: [PATCH 312/546] [AMDGPU] Rename a DS class template argument. NFC. The name hasGDS better reflects what it is used for. 
--- llvm/lib/Target/AMDGPU/DSInstructions.td | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 9ff74d9958304..780479a3885bd 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1203,7 +1203,7 @@ def : GCNPat < class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12 op, DS_Pseudo ps, int ef, string opName = ps.Mnemonic, - bit hasGFX12Enc = 0> + bit hasGDS = true> : DS_Real, SIMCInstr { let Inst{7-0} = !if(ps.has_offset0, offset0, 0); @@ -1216,7 +1216,7 @@ class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12 op, DS_Pseudo ps, int ef, let Inst{55-48} = !if(ps.has_data1, data1{7-0}, 0); let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0); - let gds = !if(hasGFX12Enc, 0, ?); + let gds = !if(hasGDS, ?, 0); } //===----------------------------------------------------------------------===// @@ -1228,7 +1228,7 @@ let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in { defvar ps = !cast(NAME); def _gfx12 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12; + ps.Mnemonic, /*hasGDS=*/false>; } multiclass DS_Real_Renamed_gfx12 op, DS_Pseudo backing_pseudo, @@ -1236,7 +1236,7 @@ let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in { def _gfx12 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12, + real_name, /*hasGDS=*/false>, MnemonicAlias, Requires<[isGFX12Plus]>; } From 9c5ca6b0ce2fc91561708542163fae1db88c59e8 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 26 Feb 2024 13:35:59 +0000 Subject: [PATCH 313/546] Revert "Enable JumpTableToSwitch pass by default (#82546)" This reverts commit 1069823ce7d154aa8ef87ae5a0fd34b527eca2a0. 
This has caused second stage timeouts when building Flang on AArch64: https://lab.llvm.org/buildbot/#/builders/179/builds/9442 --- llvm/lib/Passes/PassBuilderPipelines.cpp | 2 +- llvm/test/Other/new-pm-defaults.ll | 6 +++++- llvm/test/Other/new-pm-thinlto-postlink-defaults.ll | 1 - llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll | 1 - .../Other/new-pm-thinlto-postlink-samplepgo-defaults.ll | 1 - llvm/test/Other/new-pm-thinlto-prelink-defaults.ll | 1 - llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll | 1 - .../test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll | 1 - 8 files changed, 6 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 17b55b63ac03c..142bd50b3798e 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -247,7 +247,7 @@ static cl::opt static cl::opt EnableJumpTableToSwitch( "enable-jump-table-to-switch", - cl::desc("Enable JumpTableToSwitch pass (default = on)"), cl::init(true)); + cl::desc("Enable JumpTableToSwitch pass (default = off)")); // This option is used in simplifying testing SampleFDO optimizations for // profile loading. 
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index 285077ff8e31a..51fb93daa4dfa 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -71,6 +71,10 @@ ; RUN: -passes='default' -S %s 2>&1 \ ; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-OPTIMIZER-LAST,CHECK-O23SZ +; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ +; RUN: -passes='default' -enable-jump-table-to-switch -S %s 2>&1 \ +; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-JUMP-TABLE-TO-SWITCH,CHECK-O23SZ,%llvmcheckext + ; RUN: opt -disable-verify -verify-analysis-invalidation=0 -eagerly-invalidate-analyses=0 -debug-pass-manager \ ; RUN: -passes='default' -enable-matrix -S %s 2>&1 \ ; RUN: | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,CHECK-O23SZ,%llvmcheckext,CHECK-MATRIX @@ -151,7 +155,7 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis -; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass +; CHECK-JUMP-TABLE-TO-SWITCH-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll index 29a4d79037427..064362eabbf83 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll @@ -90,7 +90,6 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis -; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: 
Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll index bf06782c86f86..19a44867e434a 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -78,7 +78,6 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis -; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index 0cc61121de01c..ac80a31d8fd4b 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -86,7 +86,6 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis -; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll index 0e5839797afe9..6486639e07b49 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll @@ -121,7 +121,6 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: 
Invalidating analysis: LazyValueAnalysis -; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll index 68c2e58146300..09f9f0f48badd 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll @@ -118,7 +118,6 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis -; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll index 8311a009711d1..47bdbfd2d357d 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll @@ -90,7 +90,6 @@ ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis -; CHECK-O23SZ-NEXT: Running pass: JumpTableToSwitchPass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass From 046682ef88a254443e8620bfd48b35bfa0a83809 Mon Sep 17 00:00:00 2001 From: Hirofumi Nakamura Date: Mon, 26 Feb 2024 22:50:51 +0900 Subject: [PATCH 314/546] [clang-format] Add AlignConsecutiveTableGenCondOperatorColons option. (#82878) To align colons inside TableGen !cond operators. 
--- clang/docs/ClangFormatStyleOptions.rst | 140 ++++++++++++++++++ clang/include/clang/Format/Format.h | 12 ++ clang/lib/Format/Format.cpp | 3 + clang/lib/Format/WhitespaceManager.cpp | 18 ++- clang/lib/Format/WhitespaceManager.h | 8 + clang/unittests/Format/FormatTestTableGen.cpp | 21 +++ 6 files changed, 199 insertions(+), 3 deletions(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 6515b16600191..d509bb8076797 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -955,6 +955,146 @@ the configuration (without a prefix: ``Auto``). } +.. _AlignConsecutiveTableGenCondOperatorColons: + +**AlignConsecutiveTableGenCondOperatorColons** (``AlignConsecutiveStyle``) :versionbadge:`clang-format 19` :ref:`¶ ` + Style of aligning consecutive TableGen cond operator colons. + Align the colons of cases inside !cond operators. + + .. code-block:: c++ + + !cond(!eq(size, 1) : 1, + !eq(size, 16): 1, + true : 0) + + Nested configuration flags: + + Alignment options. + + They can also be read as a whole for compatibility. The choices are: + - None + - Consecutive + - AcrossEmptyLines + - AcrossComments + - AcrossEmptyLinesAndComments + + For example, to align across empty lines and not across comments, either + of these work. + + .. code-block:: c++ + + AlignConsecutiveMacros: AcrossEmptyLines + + AlignConsecutiveMacros: + Enabled: true + AcrossEmptyLines: true + AcrossComments: false + + * ``bool Enabled`` Whether aligning is enabled. + + .. code-block:: c++ + + #define SHORT_NAME 42 + #define LONGER_NAME 0x007f + #define EVEN_LONGER_NAME (2) + #define foo(x) (x * x) + #define bar(y, z) (y + z) + + int a = 1; + int somelongname = 2; + double c = 3; + + int aaaa : 1; + int b : 12; + int ccc : 8; + + int aaaa = 12; + float b = 23; + std::string ccc; + + * ``bool AcrossEmptyLines`` Whether to align across empty lines. + + .. 
code-block:: c++ + + true: + int a = 1; + int somelongname = 2; + double c = 3; + + int d = 3; + + false: + int a = 1; + int somelongname = 2; + double c = 3; + + int d = 3; + + * ``bool AcrossComments`` Whether to align across comments. + + .. code-block:: c++ + + true: + int d = 3; + /* A comment. */ + double e = 4; + + false: + int d = 3; + /* A comment. */ + double e = 4; + + * ``bool AlignCompound`` Only for ``AlignConsecutiveAssignments``. Whether compound assignments + like ``+=`` are aligned along with ``=``. + + .. code-block:: c++ + + true: + a &= 2; + bbb = 2; + + false: + a &= 2; + bbb = 2; + + * ``bool AlignFunctionPointers`` Only for ``AlignConsecutiveDeclarations``. Whether function pointers are + aligned. + + .. code-block:: c++ + + true: + unsigned i; + int &r; + int *p; + int (*f)(); + + false: + unsigned i; + int &r; + int *p; + int (*f)(); + + * ``bool PadOperators`` Only for ``AlignConsecutiveAssignments``. Whether short assignment + operators are left-padded to the same length as long ones in order to + put all assignment operators to the right of the left hand side. + + .. code-block:: c++ + + true: + a >>= 2; + bbb = 2; + + a = 2; + bbb >>= 2; + + false: + a >>= 2; + bbb = 2; + + a = 2; + bbb >>= 2; + + .. _AlignEscapedNewlines: **AlignEscapedNewlines** (``EscapedNewlineAlignmentStyle``) :versionbadge:`clang-format 5` :ref:`¶ ` diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index 47923e06d2c2d..449ce9e53be14 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -414,6 +414,16 @@ struct FormatStyle { /// \version 17 ShortCaseStatementsAlignmentStyle AlignConsecutiveShortCaseStatements; + /// Style of aligning consecutive TableGen cond operator colons. + /// Align the colons of cases inside !cond operators. 
+ /// \code + /// !cond(!eq(size, 1) : 1, + /// !eq(size, 16): 1, + /// true : 0) + /// \endcode + /// \version 19 + AlignConsecutiveStyle AlignConsecutiveTableGenCondOperatorColons; + /// Different styles for aligning escaped newlines. enum EscapedNewlineAlignmentStyle : int8_t { /// Don't align escaped newlines. @@ -4805,6 +4815,8 @@ struct FormatStyle { AlignConsecutiveMacros == R.AlignConsecutiveMacros && AlignConsecutiveShortCaseStatements == R.AlignConsecutiveShortCaseStatements && + AlignConsecutiveTableGenCondOperatorColons == + R.AlignConsecutiveTableGenCondOperatorColons && AlignEscapedNewlines == R.AlignEscapedNewlines && AlignOperands == R.AlignOperands && AlignTrailingComments == R.AlignTrailingComments && diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 2f6b52510099a..794e326fb1c94 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -915,6 +915,8 @@ template <> struct MappingTraits { IO.mapOptional("AlignConsecutiveMacros", Style.AlignConsecutiveMacros); IO.mapOptional("AlignConsecutiveShortCaseStatements", Style.AlignConsecutiveShortCaseStatements); + IO.mapOptional("AlignConsecutiveTableGenCondOperatorColons", + Style.AlignConsecutiveTableGenCondOperatorColons); IO.mapOptional("AlignEscapedNewlines", Style.AlignEscapedNewlines); IO.mapOptional("AlignOperands", Style.AlignOperands); IO.mapOptional("AlignTrailingComments", Style.AlignTrailingComments); @@ -1420,6 +1422,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.AlignConsecutiveDeclarations = {}; LLVMStyle.AlignConsecutiveMacros = {}; LLVMStyle.AlignConsecutiveShortCaseStatements = {}; + LLVMStyle.AlignConsecutiveTableGenCondOperatorColons = {}; LLVMStyle.AlignEscapedNewlines = FormatStyle::ENAS_Right; LLVMStyle.AlignOperands = FormatStyle::OAS_Align; LLVMStyle.AlignTrailingComments = {}; diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index f9eed7f516bbe..dd9d5847a10dc 
100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -111,6 +111,8 @@ const tooling::Replacements &WhitespaceManager::generateReplacements() { alignConsecutiveDeclarations(); alignConsecutiveBitFields(); alignConsecutiveAssignments(); + if (Style.isTableGen()) + alignConsecutiveTableGenCondOperatorColons(); alignChainedConditionals(); alignTrailingComments(); alignEscapedNewlines(); @@ -849,7 +851,12 @@ void WhitespaceManager::alignConsecutiveAssignments() { } void WhitespaceManager::alignConsecutiveBitFields() { - if (!Style.AlignConsecutiveBitFields.Enabled) + alignConsecutiveColons(Style.AlignConsecutiveBitFields, TT_BitFieldColon); +} + +void WhitespaceManager::alignConsecutiveColons( + const FormatStyle::AlignConsecutiveStyle &AlignStyle, TokenType Type) { + if (!AlignStyle.Enabled) return; AlignTokens( @@ -863,9 +870,9 @@ void WhitespaceManager::alignConsecutiveBitFields() { if (&C != &Changes.back() && (&C + 1)->NewlinesBefore > 0) return false; - return C.Tok->is(TT_BitFieldColon); + return C.Tok->is(Type); }, - Changes, /*StartAt=*/0, Style.AlignConsecutiveBitFields); + Changes, /*StartAt=*/0, AlignStyle); } void WhitespaceManager::alignConsecutiveShortCaseStatements() { @@ -972,6 +979,11 @@ void WhitespaceManager::alignConsecutiveShortCaseStatements() { Changes); } +void WhitespaceManager::alignConsecutiveTableGenCondOperatorColons() { + alignConsecutiveColons(Style.AlignConsecutiveTableGenCondOperatorColons, + TT_TableGenCondOperatorColon); +} + void WhitespaceManager::alignConsecutiveDeclarations() { if (!Style.AlignConsecutiveDeclarations.Enabled) return; diff --git a/clang/lib/Format/WhitespaceManager.h b/clang/lib/Format/WhitespaceManager.h index 8ac73305871ae..c604cdb6f185a 100644 --- a/clang/lib/Format/WhitespaceManager.h +++ b/clang/lib/Format/WhitespaceManager.h @@ -226,6 +226,11 @@ class WhitespaceManager { /// Align consecutive bitfields over all \c Changes. 
void alignConsecutiveBitFields(); + /// Align consecutive colon. For bitfields, TableGen DAGArgs and defintions. + void + alignConsecutiveColons(const FormatStyle::AlignConsecutiveStyle &AlignStyle, + TokenType Type); + /// Align consecutive declarations over all \c Changes. void alignConsecutiveDeclarations(); @@ -235,6 +240,9 @@ class WhitespaceManager { /// Align consecutive short case statements over all \c Changes. void alignConsecutiveShortCaseStatements(); + /// Align consecutive TableGen cond operator colon over all \c Changes. + void alignConsecutiveTableGenCondOperatorColons(); + /// Align trailing comments over all \c Changes. void alignTrailingComments(); diff --git a/clang/unittests/Format/FormatTestTableGen.cpp b/clang/unittests/Format/FormatTestTableGen.cpp index c07fb85319f3a..6c110beabca40 100644 --- a/clang/unittests/Format/FormatTestTableGen.cpp +++ b/clang/unittests/Format/FormatTestTableGen.cpp @@ -44,6 +44,13 @@ class FormatTestTableGen : public ::testing::Test { static void verifyFormat(llvm::StringRef Result, llvm::StringRef MessedUp) { EXPECT_EQ(Result, format(MessedUp)); } + + static void verifyFormat(llvm::StringRef Code, const FormatStyle &Style) { + EXPECT_EQ(Code.str(), format(Code, 0, Code.size(), Style)) + << "Expected code is not stable"; + auto MessUp = test::messUp(Code); + EXPECT_EQ(Code.str(), format(MessUp, 0, MessUp.size(), Style)); + } }; TEST_F(FormatTestTableGen, FormatStringBreak) { @@ -325,5 +332,19 @@ TEST_F(FormatTestTableGen, Assert) { verifyFormat("assert !le(DefVar1, 0), \"Assert1\";\n"); } +TEST_F(FormatTestTableGen, CondOperatorAlignment) { + FormatStyle Style = getGoogleStyle(FormatStyle::LK_TableGen); + Style.ColumnLimit = 60; + verifyFormat("let CondOpe1 = !cond(!eq(size, 1): 1,\n" + " !eq(size, 16): 1,\n" + " true: 0);\n", + Style); + Style.AlignConsecutiveTableGenCondOperatorColons.Enabled = true; + verifyFormat("let CondOpe1 = !cond(!eq(size, 1) : 1,\n" + " !eq(size, 16): 1,\n" + " true : 0);\n", + Style); +} 
+ } // namespace format } // end namespace clang From 440b1743ee0c8bfb7bf0c4b503bde5ab9af88dc0 Mon Sep 17 00:00:00 2001 From: Egor Zhdan Date: Mon, 26 Feb 2024 13:52:27 +0000 Subject: [PATCH 315/546] [APINotes] Upstream Sema logic to apply API Notes to decls This upstreams more of the Clang API Notes functionality that is currently implemented in the Apple fork: https://github.com/apple/llvm-project/tree/next/clang/lib/APINotes This was extracted from a larger PR: https://github.com/llvm/llvm-project/pull/73017 --- .../clang/Basic/DiagnosticSemaKinds.td | 7 + clang/include/clang/Sema/Sema.h | 6 + clang/lib/Sema/CMakeLists.txt | 1 + clang/lib/Sema/SemaAPINotes.cpp | 988 ++++++++++++++++++ clang/lib/Sema/SemaDecl.cpp | 4 + clang/lib/Sema/SemaDeclAttr.cpp | 3 + clang/lib/Sema/SemaDeclCXX.cpp | 6 +- clang/lib/Sema/SemaDeclObjC.cpp | 4 + clang/lib/Sema/SemaObjCProperty.cpp | 5 + clang/lib/Sema/SemaTemplate.cpp | 7 + 10 files changed, 1030 insertions(+), 1 deletion(-) create mode 100644 clang/lib/Sema/SemaAPINotes.cpp diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index a7f2858477bee..dad1764ad4f86 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10760,6 +10760,13 @@ def warn_imp_cast_drops_unaligned : Warning< } // end of sema category +let CategoryName = "API Notes Issue" in { + +def err_incompatible_replacement_type : Error< + "API notes replacement type %0 has a different size from original type %1">; + +} // end of API Notes category + let CategoryName = "OpenMP Issue" in { // OpenMP support. 
def err_omp_expected_var_arg : Error< diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index e457694e4625d..c966a99c51968 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -4883,6 +4883,12 @@ class Sema final { bool checkCommonAttributeFeatures(const Stmt *S, const ParsedAttr &A, bool SkipArgCountCheck = false); + /// Map any API notes provided for this declaration to attributes on the + /// declaration. + /// + /// Triggered by declaration-attribute processing. + void ProcessAPINotes(Decl *D); + /// Determine if type T is a valid subject for a nonnull and similar /// attributes. By default, we look through references (the behavior used by /// nonnull), but if the second parameter is true, then we treat a reference diff --git a/clang/lib/Sema/CMakeLists.txt b/clang/lib/Sema/CMakeLists.txt index 862f9d4ffb825..e8bff07ced0cf 100644 --- a/clang/lib/Sema/CMakeLists.txt +++ b/clang/lib/Sema/CMakeLists.txt @@ -27,6 +27,7 @@ add_clang_library(clangSema Sema.cpp SemaAccess.cpp SemaAttr.cpp + SemaAPINotes.cpp SemaAvailability.cpp SemaCXXScopeSpec.cpp SemaCast.cpp diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp new file mode 100644 index 0000000000000..836c633e9d204 --- /dev/null +++ b/clang/lib/Sema/SemaAPINotes.cpp @@ -0,0 +1,988 @@ +//===--- SemaAPINotes.cpp - API Notes Handling ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the mapping from API notes to declaration attributes. 
+// +//===----------------------------------------------------------------------===// + +#include "clang/APINotes/APINotesReader.h" +#include "clang/AST/Decl.h" +#include "clang/AST/DeclObjC.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Lex/Lexer.h" +#include "clang/Sema/SemaInternal.h" + +using namespace clang; + +namespace { +enum class IsActive_t : bool { Inactive, Active }; +enum class IsSubstitution_t : bool { Original, Replacement }; + +struct VersionedInfoMetadata { + /// An empty version refers to unversioned metadata. + VersionTuple Version; + unsigned IsActive : 1; + unsigned IsReplacement : 1; + + VersionedInfoMetadata(VersionTuple Version, IsActive_t Active, + IsSubstitution_t Replacement) + : Version(Version), IsActive(Active == IsActive_t::Active), + IsReplacement(Replacement == IsSubstitution_t::Replacement) {} +}; +} // end anonymous namespace + +/// Determine whether this is a multi-level pointer type. +static bool isIndirectPointerType(QualType Type) { + QualType Pointee = Type->getPointeeType(); + if (Pointee.isNull()) + return false; + + return Pointee->isAnyPointerType() || Pointee->isObjCObjectPointerType() || + Pointee->isMemberPointerType(); +} + +/// Apply nullability to the given declaration. 
+static void applyNullability(Sema &S, Decl *D, NullabilityKind Nullability, + VersionedInfoMetadata Metadata) { + if (!Metadata.IsActive) + return; + + auto IsModified = [&](Decl *D, QualType QT, + NullabilityKind Nullability) -> bool { + QualType Original = QT; + S.CheckImplicitNullabilityTypeSpecifier(QT, Nullability, D->getLocation(), + isa(D), + /*OverrideExisting=*/true); + return QT.getTypePtr() != Original.getTypePtr(); + }; + + if (auto Function = dyn_cast(D)) { + if (IsModified(D, Function->getReturnType(), Nullability)) { + QualType FnType = Function->getType(); + Function->setType(FnType); + } + } else if (auto Method = dyn_cast(D)) { + QualType Type = Method->getReturnType(); + if (IsModified(D, Type, Nullability)) { + Method->setReturnType(Type); + + // Make it a context-sensitive keyword if we can. + if (!isIndirectPointerType(Type)) + Method->setObjCDeclQualifier(Decl::ObjCDeclQualifier( + Method->getObjCDeclQualifier() | Decl::OBJC_TQ_CSNullability)); + } + } else if (auto Value = dyn_cast(D)) { + QualType Type = Value->getType(); + if (IsModified(D, Type, Nullability)) { + Value->setType(Type); + + // Make it a context-sensitive keyword if we can. + if (auto Parm = dyn_cast(D)) { + if (Parm->isObjCMethodParameter() && !isIndirectPointerType(Type)) + Parm->setObjCDeclQualifier(Decl::ObjCDeclQualifier( + Parm->getObjCDeclQualifier() | Decl::OBJC_TQ_CSNullability)); + } + } + } else if (auto Property = dyn_cast(D)) { + QualType Type = Property->getType(); + if (IsModified(D, Type, Nullability)) { + Property->setType(Type, Property->getTypeSourceInfo()); + + // Make it a property attribute if we can. + if (!isIndirectPointerType(Type)) + Property->setPropertyAttributes( + ObjCPropertyAttribute::kind_null_resettable); + } + } +} + +/// Copy a string into ASTContext-allocated memory. 
+static StringRef ASTAllocateString(ASTContext &Ctx, StringRef String) { + void *mem = Ctx.Allocate(String.size(), alignof(char *)); + memcpy(mem, String.data(), String.size()); + return StringRef(static_cast(mem), String.size()); +} + +static AttributeCommonInfo getPlaceholderAttrInfo() { + return AttributeCommonInfo(SourceRange(), + AttributeCommonInfo::UnknownAttribute, + {AttributeCommonInfo::AS_GNU, + /*Spelling*/ 0, /*IsAlignas*/ false, + /*IsRegularKeywordAttribute*/ false}); +} + +namespace { +template struct AttrKindFor {}; + +#define ATTR(X) \ + template <> struct AttrKindFor { \ + static const attr::Kind value = attr::X; \ + }; +#include "clang/Basic/AttrList.inc" + +/// Handle an attribute introduced by API notes. +/// +/// \param IsAddition Whether we should add a new attribute +/// (otherwise, we might remove an existing attribute). +/// \param CreateAttr Create the new attribute to be added. +template +void handleAPINotedAttribute( + Sema &S, Decl *D, bool IsAddition, VersionedInfoMetadata Metadata, + llvm::function_ref CreateAttr, + llvm::function_ref GetExistingAttr) { + if (Metadata.IsActive) { + auto Existing = GetExistingAttr(D); + if (Existing != D->attr_end()) { + // Remove the existing attribute, and treat it as a superseded + // non-versioned attribute. + auto *Versioned = SwiftVersionedAdditionAttr::CreateImplicit( + S.Context, Metadata.Version, *Existing, /*IsReplacedByActive*/ true); + + D->getAttrs().erase(Existing); + D->addAttr(Versioned); + } + + // If we're supposed to add a new attribute, do so. 
+ if (IsAddition) { + if (auto Attr = CreateAttr()) + D->addAttr(Attr); + } + + return; + } + if (IsAddition) { + if (auto Attr = CreateAttr()) { + auto *Versioned = SwiftVersionedAdditionAttr::CreateImplicit( + S.Context, Metadata.Version, Attr, + /*IsReplacedByActive*/ Metadata.IsReplacement); + D->addAttr(Versioned); + } + } else { + // FIXME: This isn't preserving enough information for things like + // availability, where we're trying to remove a /specific/ kind of + // attribute. + auto *Versioned = SwiftVersionedRemovalAttr::CreateImplicit( + S.Context, Metadata.Version, AttrKindFor::value, + /*IsReplacedByActive*/ Metadata.IsReplacement); + D->addAttr(Versioned); + } +} + +template +void handleAPINotedAttribute(Sema &S, Decl *D, bool ShouldAddAttribute, + VersionedInfoMetadata Metadata, + llvm::function_ref CreateAttr) { + handleAPINotedAttribute( + S, D, ShouldAddAttribute, Metadata, CreateAttr, [](const Decl *D) { + return llvm::find_if(D->attrs(), + [](const Attr *Next) { return isa(Next); }); + }); +} +} // namespace + +template +static void handleAPINotedRetainCountAttribute(Sema &S, Decl *D, + bool ShouldAddAttribute, + VersionedInfoMetadata Metadata) { + // The template argument has a default to make the "removal" case more + // concise; it doesn't matter /which/ attribute is being removed. 
+ handleAPINotedAttribute( + S, D, ShouldAddAttribute, Metadata, + [&] { return new (S.Context) A(S.Context, getPlaceholderAttrInfo()); }, + [](const Decl *D) -> Decl::attr_iterator { + return llvm::find_if(D->attrs(), [](const Attr *Next) -> bool { + return isa(Next) || + isa(Next) || + isa(Next) || + isa(Next) || + isa(Next); + }); + }); +} + +static void handleAPINotedRetainCountConvention( + Sema &S, Decl *D, VersionedInfoMetadata Metadata, + std::optional Convention) { + if (!Convention) + return; + switch (*Convention) { + case api_notes::RetainCountConventionKind::None: + if (isa(D)) { + handleAPINotedRetainCountAttribute( + S, D, /*shouldAddAttribute*/ true, Metadata); + } else { + handleAPINotedRetainCountAttribute( + S, D, /*shouldAddAttribute*/ false, Metadata); + } + break; + case api_notes::RetainCountConventionKind::CFReturnsRetained: + handleAPINotedRetainCountAttribute( + S, D, /*shouldAddAttribute*/ true, Metadata); + break; + case api_notes::RetainCountConventionKind::CFReturnsNotRetained: + handleAPINotedRetainCountAttribute( + S, D, /*shouldAddAttribute*/ true, Metadata); + break; + case api_notes::RetainCountConventionKind::NSReturnsRetained: + handleAPINotedRetainCountAttribute( + S, D, /*shouldAddAttribute*/ true, Metadata); + break; + case api_notes::RetainCountConventionKind::NSReturnsNotRetained: + handleAPINotedRetainCountAttribute( + S, D, /*shouldAddAttribute*/ true, Metadata); + break; + } +} + +static void ProcessAPINotes(Sema &S, Decl *D, + const api_notes::CommonEntityInfo &Info, + VersionedInfoMetadata Metadata) { + // Availability + if (Info.Unavailable) { + handleAPINotedAttribute(S, D, true, Metadata, [&] { + return new (S.Context) + UnavailableAttr(S.Context, getPlaceholderAttrInfo(), + ASTAllocateString(S.Context, Info.UnavailableMsg)); + }); + } + + if (Info.UnavailableInSwift) { + handleAPINotedAttribute( + S, D, true, Metadata, + [&] { + return new (S.Context) AvailabilityAttr( + S.Context, getPlaceholderAttrInfo(), + 
&S.Context.Idents.get("swift"), VersionTuple(), VersionTuple(), + VersionTuple(), + /*Unavailable=*/true, + ASTAllocateString(S.Context, Info.UnavailableMsg), + /*Strict=*/false, + /*Replacement=*/StringRef(), + /*Priority=*/Sema::AP_Explicit); + }, + [](const Decl *D) { + return llvm::find_if(D->attrs(), [](const Attr *next) -> bool { + if (const auto *AA = dyn_cast(next)) + if (const auto *II = AA->getPlatform()) + return II->isStr("swift"); + return false; + }); + }); + } + + // swift_private + if (auto SwiftPrivate = Info.isSwiftPrivate()) { + handleAPINotedAttribute( + S, D, *SwiftPrivate, Metadata, [&] { + return new (S.Context) + SwiftPrivateAttr(S.Context, getPlaceholderAttrInfo()); + }); + } + + // swift_name + if (!Info.SwiftName.empty()) { + handleAPINotedAttribute( + S, D, true, Metadata, [&]() -> SwiftNameAttr * { + AttributeFactory AF{}; + AttributePool AP{AF}; + auto &C = S.getASTContext(); + ParsedAttr *SNA = + AP.create(&C.Idents.get("swift_name"), SourceRange(), nullptr, + SourceLocation(), nullptr, nullptr, nullptr, + ParsedAttr::Form::GNU()); + + if (!S.DiagnoseSwiftName(D, Info.SwiftName, D->getLocation(), *SNA, + /*IsAsync=*/false)) + return nullptr; + + return new (S.Context) + SwiftNameAttr(S.Context, getPlaceholderAttrInfo(), + ASTAllocateString(S.Context, Info.SwiftName)); + }); + } +} + +static void ProcessAPINotes(Sema &S, Decl *D, + const api_notes::CommonTypeInfo &Info, + VersionedInfoMetadata Metadata) { + // swift_bridge + if (auto SwiftBridge = Info.getSwiftBridge()) { + handleAPINotedAttribute( + S, D, !SwiftBridge->empty(), Metadata, [&] { + return new (S.Context) + SwiftBridgeAttr(S.Context, getPlaceholderAttrInfo(), + ASTAllocateString(S.Context, *SwiftBridge)); + }); + } + + // ns_error_domain + if (auto NSErrorDomain = Info.getNSErrorDomain()) { + handleAPINotedAttribute( + S, D, !NSErrorDomain->empty(), Metadata, [&] { + return new (S.Context) + NSErrorDomainAttr(S.Context, getPlaceholderAttrInfo(), + 
&S.Context.Idents.get(*NSErrorDomain)); + }); + } + + ProcessAPINotes(S, D, static_cast(Info), + Metadata); +} + +/// Check that the replacement type provided by API notes is reasonable. +/// +/// This is a very weak form of ABI check. +static bool checkAPINotesReplacementType(Sema &S, SourceLocation Loc, + QualType OrigType, + QualType ReplacementType) { + if (S.Context.getTypeSize(OrigType) != + S.Context.getTypeSize(ReplacementType)) { + S.Diag(Loc, diag::err_incompatible_replacement_type) + << ReplacementType << OrigType; + return true; + } + + return false; +} + +/// Process API notes for a variable or property. +static void ProcessAPINotes(Sema &S, Decl *D, + const api_notes::VariableInfo &Info, + VersionedInfoMetadata Metadata) { + // Type override. + if (Metadata.IsActive && !Info.getType().empty() && + S.ParseTypeFromStringCallback) { + auto ParsedType = S.ParseTypeFromStringCallback( + Info.getType(), "", D->getLocation()); + if (ParsedType.isUsable()) { + QualType Type = Sema::GetTypeFromParser(ParsedType.get()); + auto TypeInfo = + S.Context.getTrivialTypeSourceInfo(Type, D->getLocation()); + + if (auto Var = dyn_cast(D)) { + // Make adjustments to parameter types. + if (isa(Var)) { + Type = S.AdjustParameterTypeForObjCAutoRefCount( + Type, D->getLocation(), TypeInfo); + Type = S.Context.getAdjustedParameterType(Type); + } + + if (!checkAPINotesReplacementType(S, Var->getLocation(), Var->getType(), + Type)) { + Var->setType(Type); + Var->setTypeSourceInfo(TypeInfo); + } + } else if (auto Property = dyn_cast(D)) { + if (!checkAPINotesReplacementType(S, Property->getLocation(), + Property->getType(), Type)) + Property->setType(Type, TypeInfo); + + } else + llvm_unreachable("API notes allowed a type on an unknown declaration"); + } + } + + // Nullability. + if (auto Nullability = Info.getNullability()) + applyNullability(S, D, *Nullability, Metadata); + + // Handle common entity information. 
+ ProcessAPINotes(S, D, static_cast(Info), + Metadata); +} + +/// Process API notes for a parameter. +static void ProcessAPINotes(Sema &S, ParmVarDecl *D, + const api_notes::ParamInfo &Info, + VersionedInfoMetadata Metadata) { + // noescape + if (auto NoEscape = Info.isNoEscape()) + handleAPINotedAttribute(S, D, *NoEscape, Metadata, [&] { + return new (S.Context) NoEscapeAttr(S.Context, getPlaceholderAttrInfo()); + }); + + // Retain count convention + handleAPINotedRetainCountConvention(S, D, Metadata, + Info.getRetainCountConvention()); + + // Handle common entity information. + ProcessAPINotes(S, D, static_cast(Info), + Metadata); +} + +/// Process API notes for a global variable. +static void ProcessAPINotes(Sema &S, VarDecl *D, + const api_notes::GlobalVariableInfo &Info, + VersionedInfoMetadata metadata) { + // Handle common entity information. + ProcessAPINotes(S, D, static_cast(Info), + metadata); +} + +/// Process API notes for an Objective-C property. +static void ProcessAPINotes(Sema &S, ObjCPropertyDecl *D, + const api_notes::ObjCPropertyInfo &Info, + VersionedInfoMetadata Metadata) { + // Handle common entity information. + ProcessAPINotes(S, D, static_cast(Info), + Metadata); + + if (auto AsAccessors = Info.getSwiftImportAsAccessors()) { + handleAPINotedAttribute( + S, D, *AsAccessors, Metadata, [&] { + return new (S.Context) SwiftImportPropertyAsAccessorsAttr( + S.Context, getPlaceholderAttrInfo()); + }); + } +} + +namespace { +typedef llvm::PointerUnion FunctionOrMethod; +} + +/// Process API notes for a function or method. +static void ProcessAPINotes(Sema &S, FunctionOrMethod AnyFunc, + const api_notes::FunctionInfo &Info, + VersionedInfoMetadata Metadata) { + // Find the declaration itself. + FunctionDecl *FD = AnyFunc.dyn_cast(); + Decl *D = FD; + ObjCMethodDecl *MD = nullptr; + if (!D) { + MD = AnyFunc.get(); + D = MD; + } + + // Nullability of return type. 
+ if (Info.NullabilityAudited) + applyNullability(S, D, Info.getReturnTypeInfo(), Metadata); + + // Parameters. + unsigned NumParams = FD ? FD->getNumParams() : MD->param_size(); + + bool AnyTypeChanged = false; + for (unsigned I = 0; I != NumParams; ++I) { + ParmVarDecl *Param = FD ? FD->getParamDecl(I) : MD->param_begin()[I]; + QualType ParamTypeBefore = Param->getType(); + + if (I < Info.Params.size()) + ProcessAPINotes(S, Param, Info.Params[I], Metadata); + + // Nullability. + if (Info.NullabilityAudited) + applyNullability(S, Param, Info.getParamTypeInfo(I), Metadata); + + if (ParamTypeBefore.getAsOpaquePtr() != Param->getType().getAsOpaquePtr()) + AnyTypeChanged = true; + } + + // Result type override. + QualType OverriddenResultType; + if (Metadata.IsActive && !Info.ResultType.empty() && + S.ParseTypeFromStringCallback) { + auto ParsedType = S.ParseTypeFromStringCallback( + Info.ResultType, "", D->getLocation()); + if (ParsedType.isUsable()) { + QualType ResultType = Sema::GetTypeFromParser(ParsedType.get()); + + if (MD) { + if (!checkAPINotesReplacementType(S, D->getLocation(), + MD->getReturnType(), ResultType)) { + auto ResultTypeInfo = + S.Context.getTrivialTypeSourceInfo(ResultType, D->getLocation()); + MD->setReturnType(ResultType); + MD->setReturnTypeSourceInfo(ResultTypeInfo); + } + } else if (!checkAPINotesReplacementType( + S, FD->getLocation(), FD->getReturnType(), ResultType)) { + OverriddenResultType = ResultType; + AnyTypeChanged = true; + } + } + } + + // If the result type or any of the parameter types changed for a function + // declaration, we have to rebuild the type. 
+ if (FD && AnyTypeChanged) { + if (const auto *fnProtoType = FD->getType()->getAs()) { + if (OverriddenResultType.isNull()) + OverriddenResultType = fnProtoType->getReturnType(); + + SmallVector ParamTypes; + for (auto Param : FD->parameters()) + ParamTypes.push_back(Param->getType()); + + FD->setType(S.Context.getFunctionType(OverriddenResultType, ParamTypes, + fnProtoType->getExtProtoInfo())); + } else if (!OverriddenResultType.isNull()) { + const auto *FnNoProtoType = FD->getType()->castAs(); + FD->setType(S.Context.getFunctionNoProtoType( + OverriddenResultType, FnNoProtoType->getExtInfo())); + } + } + + // Retain count convention + handleAPINotedRetainCountConvention(S, D, Metadata, + Info.getRetainCountConvention()); + + // Handle common entity information. + ProcessAPINotes(S, D, static_cast(Info), + Metadata); +} + +/// Process API notes for a global function. +static void ProcessAPINotes(Sema &S, FunctionDecl *D, + const api_notes::GlobalFunctionInfo &Info, + VersionedInfoMetadata Metadata) { + // Handle common function information. + ProcessAPINotes(S, FunctionOrMethod(D), + static_cast(Info), Metadata); +} + +/// Process API notes for an enumerator. +static void ProcessAPINotes(Sema &S, EnumConstantDecl *D, + const api_notes::EnumConstantInfo &Info, + VersionedInfoMetadata Metadata) { + // Handle common information. + ProcessAPINotes(S, D, static_cast(Info), + Metadata); +} + +/// Process API notes for an Objective-C method. +static void ProcessAPINotes(Sema &S, ObjCMethodDecl *D, + const api_notes::ObjCMethodInfo &Info, + VersionedInfoMetadata Metadata) { + // Designated initializers. + if (Info.DesignatedInit) { + handleAPINotedAttribute( + S, D, true, Metadata, [&] { + if (ObjCInterfaceDecl *IFace = D->getClassInterface()) + IFace->setHasDesignatedInitializers(); + + return new (S.Context) ObjCDesignatedInitializerAttr( + S.Context, getPlaceholderAttrInfo()); + }); + } + + // Handle common function information. 
+ ProcessAPINotes(S, FunctionOrMethod(D), + static_cast(Info), Metadata); +} + +/// Process API notes for a tag. +static void ProcessAPINotes(Sema &S, TagDecl *D, const api_notes::TagInfo &Info, + VersionedInfoMetadata Metadata) { + if (auto ImportAs = Info.SwiftImportAs) + D->addAttr(SwiftAttrAttr::Create(S.Context, "import_" + ImportAs.value())); + + if (auto RetainOp = Info.SwiftRetainOp) + D->addAttr(SwiftAttrAttr::Create(S.Context, "retain:" + RetainOp.value())); + + if (auto ReleaseOp = Info.SwiftReleaseOp) + D->addAttr( + SwiftAttrAttr::Create(S.Context, "release:" + ReleaseOp.value())); + + if (auto Extensibility = Info.EnumExtensibility) { + using api_notes::EnumExtensibilityKind; + bool ShouldAddAttribute = (*Extensibility != EnumExtensibilityKind::None); + handleAPINotedAttribute( + S, D, ShouldAddAttribute, Metadata, [&] { + EnumExtensibilityAttr::Kind kind; + switch (*Extensibility) { + case EnumExtensibilityKind::None: + llvm_unreachable("remove only"); + case EnumExtensibilityKind::Open: + kind = EnumExtensibilityAttr::Open; + break; + case EnumExtensibilityKind::Closed: + kind = EnumExtensibilityAttr::Closed; + break; + } + return new (S.Context) + EnumExtensibilityAttr(S.Context, getPlaceholderAttrInfo(), kind); + }); + } + + if (auto FlagEnum = Info.isFlagEnum()) { + handleAPINotedAttribute(S, D, *FlagEnum, Metadata, [&] { + return new (S.Context) FlagEnumAttr(S.Context, getPlaceholderAttrInfo()); + }); + } + + // Handle common type information. + ProcessAPINotes(S, D, static_cast(Info), + Metadata); +} + +/// Process API notes for a typedef. 
+static void ProcessAPINotes(Sema &S, TypedefNameDecl *D, + const api_notes::TypedefInfo &Info, + VersionedInfoMetadata Metadata) { + // swift_wrapper + using SwiftWrapperKind = api_notes::SwiftNewTypeKind; + + if (auto SwiftWrapper = Info.SwiftWrapper) { + handleAPINotedAttribute( + S, D, *SwiftWrapper != SwiftWrapperKind::None, Metadata, [&] { + SwiftNewTypeAttr::NewtypeKind Kind; + switch (*SwiftWrapper) { + case SwiftWrapperKind::None: + llvm_unreachable("Shouldn't build an attribute"); + + case SwiftWrapperKind::Struct: + Kind = SwiftNewTypeAttr::NK_Struct; + break; + + case SwiftWrapperKind::Enum: + Kind = SwiftNewTypeAttr::NK_Enum; + break; + } + AttributeCommonInfo SyntaxInfo{ + SourceRange(), + AttributeCommonInfo::AT_SwiftNewType, + {AttributeCommonInfo::AS_GNU, SwiftNewTypeAttr::GNU_swift_wrapper, + /*IsAlignas*/ false, /*IsRegularKeywordAttribute*/ false}}; + return new (S.Context) SwiftNewTypeAttr(S.Context, SyntaxInfo, Kind); + }); + } + + // Handle common type information. + ProcessAPINotes(S, D, static_cast(Info), + Metadata); +} + +/// Process API notes for an Objective-C class or protocol. +static void ProcessAPINotes(Sema &S, ObjCContainerDecl *D, + const api_notes::ObjCContextInfo &Info, + VersionedInfoMetadata Metadata) { + // Handle common type information. + ProcessAPINotes(S, D, static_cast(Info), + Metadata); +} + +/// Process API notes for an Objective-C class. 
+static void ProcessAPINotes(Sema &S, ObjCInterfaceDecl *D, + const api_notes::ObjCContextInfo &Info, + VersionedInfoMetadata Metadata) { + if (auto AsNonGeneric = Info.getSwiftImportAsNonGeneric()) { + handleAPINotedAttribute( + S, D, *AsNonGeneric, Metadata, [&] { + return new (S.Context) + SwiftImportAsNonGenericAttr(S.Context, getPlaceholderAttrInfo()); + }); + } + + if (auto ObjcMembers = Info.getSwiftObjCMembers()) { + handleAPINotedAttribute( + S, D, *ObjcMembers, Metadata, [&] { + return new (S.Context) + SwiftObjCMembersAttr(S.Context, getPlaceholderAttrInfo()); + }); + } + + // Handle information common to Objective-C classes and protocols. + ProcessAPINotes(S, static_cast(D), Info, + Metadata); +} + +/// If we're applying API notes with an active, non-default version, and the +/// versioned API notes have a SwiftName but the declaration normally wouldn't +/// have one, add a removal attribute to make it clear that the new SwiftName +/// attribute only applies to the active version of \p D, not to all versions. +/// +/// This must be run \em before processing API notes for \p D, because otherwise +/// any existing SwiftName attribute will have been packaged up in a +/// SwiftVersionedAdditionAttr. +template +static void maybeAttachUnversionedSwiftName( + Sema &S, Decl *D, + const api_notes::APINotesReader::VersionedInfo Info) { + if (D->hasAttr()) + return; + if (!Info.getSelected()) + return; + + // Is the active slice versioned, and does it set a Swift name? + VersionTuple SelectedVersion; + SpecificInfo SelectedInfoSlice; + std::tie(SelectedVersion, SelectedInfoSlice) = Info[*Info.getSelected()]; + if (SelectedVersion.empty()) + return; + if (SelectedInfoSlice.SwiftName.empty()) + return; + + // Does the unversioned slice /not/ set a Swift name? 
+ for (const auto &VersionAndInfoSlice : Info) { + if (!VersionAndInfoSlice.first.empty()) + continue; + if (!VersionAndInfoSlice.second.SwiftName.empty()) + return; + } + + // Then explicitly call that out with a removal attribute. + VersionedInfoMetadata DummyFutureMetadata( + SelectedVersion, IsActive_t::Inactive, IsSubstitution_t::Replacement); + handleAPINotedAttribute( + S, D, /*add*/ false, DummyFutureMetadata, []() -> SwiftNameAttr * { + llvm_unreachable("should not try to add an attribute here"); + }); +} + +/// Processes all versions of versioned API notes. +/// +/// Just dispatches to the various ProcessAPINotes functions in this file. +template +static void ProcessVersionedAPINotes( + Sema &S, SpecificDecl *D, + const api_notes::APINotesReader::VersionedInfo Info) { + + maybeAttachUnversionedSwiftName(S, D, Info); + + unsigned Selected = Info.getSelected().value_or(Info.size()); + + VersionTuple Version; + SpecificInfo InfoSlice; + for (unsigned i = 0, e = Info.size(); i != e; ++i) { + std::tie(Version, InfoSlice) = Info[i]; + auto Active = (i == Selected) ? IsActive_t::Active : IsActive_t::Inactive; + auto Replacement = IsSubstitution_t::Original; + if (Active == IsActive_t::Inactive && Version.empty()) { + Replacement = IsSubstitution_t::Replacement; + Version = Info[Selected].first; + } + ProcessAPINotes(S, D, InfoSlice, + VersionedInfoMetadata(Version, Active, Replacement)); + } +} + +/// Process API notes that are associated with this declaration, mapping them +/// to attributes as appropriate. +void Sema::ProcessAPINotes(Decl *D) { + if (!D) + return; + + // Globals. 
+ if (D->getDeclContext()->isFileContext() || + D->getDeclContext()->isNamespace() || + D->getDeclContext()->isExternCContext() || + D->getDeclContext()->isExternCXXContext()) { + std::optional APINotesContext; + if (auto NamespaceContext = dyn_cast(D->getDeclContext())) { + for (auto Reader : + APINotes.findAPINotes(NamespaceContext->getLocation())) { + // Retrieve the context ID for the parent namespace of the decl. + std::stack NamespaceStack; + { + for (auto CurrentNamespace = NamespaceContext; CurrentNamespace; + CurrentNamespace = + dyn_cast(CurrentNamespace->getParent())) { + if (!CurrentNamespace->isInlineNamespace()) + NamespaceStack.push(CurrentNamespace); + } + } + std::optional NamespaceID; + while (!NamespaceStack.empty()) { + auto CurrentNamespace = NamespaceStack.top(); + NamespaceStack.pop(); + NamespaceID = Reader->lookupNamespaceID(CurrentNamespace->getName(), + NamespaceID); + if (!NamespaceID) + break; + } + if (NamespaceID) + APINotesContext = api_notes::Context( + *NamespaceID, api_notes::ContextKind::Namespace); + } + } + + // Global variables. + if (auto VD = dyn_cast(D)) { + for (auto Reader : APINotes.findAPINotes(D->getLocation())) { + auto Info = + Reader->lookupGlobalVariable(VD->getName(), APINotesContext); + ProcessVersionedAPINotes(*this, VD, Info); + } + + return; + } + + // Global functions. + if (auto FD = dyn_cast(D)) { + if (FD->getDeclName().isIdentifier()) { + for (auto Reader : APINotes.findAPINotes(D->getLocation())) { + auto Info = + Reader->lookupGlobalFunction(FD->getName(), APINotesContext); + ProcessVersionedAPINotes(*this, FD, Info); + } + } + + return; + } + + // Objective-C classes. + if (auto Class = dyn_cast(D)) { + for (auto Reader : APINotes.findAPINotes(D->getLocation())) { + auto Info = Reader->lookupObjCClassInfo(Class->getName()); + ProcessVersionedAPINotes(*this, Class, Info); + } + + return; + } + + // Objective-C protocols. 
+ if (auto Protocol = dyn_cast(D)) { + for (auto Reader : APINotes.findAPINotes(D->getLocation())) { + auto Info = Reader->lookupObjCProtocolInfo(Protocol->getName()); + ProcessVersionedAPINotes(*this, Protocol, Info); + } + + return; + } + + // Tags + if (auto Tag = dyn_cast(D)) { + std::string LookupName = Tag->getName().str(); + + // Use the source location to discern if this Tag is an OPTIONS macro. + // For now we would like to limit this trick of looking up the APINote tag + // using the EnumDecl's QualType in the case where the enum is anonymous. + // This is only being used to support APINotes lookup for C++ + // NS/CF_OPTIONS when C++-Interop is enabled. + std::string MacroName = + LookupName.empty() && Tag->getOuterLocStart().isMacroID() + ? clang::Lexer::getImmediateMacroName( + Tag->getOuterLocStart(), + Tag->getASTContext().getSourceManager(), LangOpts) + .str() + : ""; + + if (LookupName.empty() && isa(Tag) && + (MacroName == "CF_OPTIONS" || MacroName == "NS_OPTIONS" || + MacroName == "OBJC_OPTIONS" || MacroName == "SWIFT_OPTIONS")) { + + clang::QualType T = llvm::cast(Tag)->getIntegerType(); + LookupName = clang::QualType::getAsString( + T.split(), getASTContext().getPrintingPolicy()); + } + + for (auto Reader : APINotes.findAPINotes(D->getLocation())) { + auto Info = Reader->lookupTag(LookupName, APINotesContext); + ProcessVersionedAPINotes(*this, Tag, Info); + } + + return; + } + + // Typedefs + if (auto Typedef = dyn_cast(D)) { + for (auto Reader : APINotes.findAPINotes(D->getLocation())) { + auto Info = Reader->lookupTypedef(Typedef->getName(), APINotesContext); + ProcessVersionedAPINotes(*this, Typedef, Info); + } + + return; + } + } + + // Enumerators. 
+ if (D->getDeclContext()->getRedeclContext()->isFileContext() || + D->getDeclContext()->getRedeclContext()->isExternCContext()) { + if (auto EnumConstant = dyn_cast(D)) { + for (auto Reader : APINotes.findAPINotes(D->getLocation())) { + auto Info = Reader->lookupEnumConstant(EnumConstant->getName()); + ProcessVersionedAPINotes(*this, EnumConstant, Info); + } + + return; + } + } + + if (auto ObjCContainer = dyn_cast(D->getDeclContext())) { + // Location function that looks up an Objective-C context. + auto GetContext = [&](api_notes::APINotesReader *Reader) + -> std::optional { + if (auto Protocol = dyn_cast(ObjCContainer)) { + if (auto Found = Reader->lookupObjCProtocolID(Protocol->getName())) + return *Found; + + return std::nullopt; + } + + if (auto Impl = dyn_cast(ObjCContainer)) { + if (auto Cat = Impl->getCategoryDecl()) + ObjCContainer = Cat->getClassInterface(); + else + return std::nullopt; + } + + if (auto Category = dyn_cast(ObjCContainer)) { + if (Category->getClassInterface()) + ObjCContainer = Category->getClassInterface(); + else + return std::nullopt; + } + + if (auto Impl = dyn_cast(ObjCContainer)) { + if (Impl->getClassInterface()) + ObjCContainer = Impl->getClassInterface(); + else + return std::nullopt; + } + + if (auto Class = dyn_cast(ObjCContainer)) { + if (auto Found = Reader->lookupObjCClassID(Class->getName())) + return *Found; + + return std::nullopt; + } + + return std::nullopt; + }; + + // Objective-C methods. + if (auto Method = dyn_cast(D)) { + for (auto Reader : APINotes.findAPINotes(D->getLocation())) { + if (auto Context = GetContext(Reader)) { + // Map the selector. 
+ Selector Sel = Method->getSelector(); + SmallVector SelPieces; + if (Sel.isUnarySelector()) { + SelPieces.push_back(Sel.getNameForSlot(0)); + } else { + for (unsigned i = 0, n = Sel.getNumArgs(); i != n; ++i) + SelPieces.push_back(Sel.getNameForSlot(i)); + } + + api_notes::ObjCSelectorRef SelectorRef; + SelectorRef.NumArgs = Sel.getNumArgs(); + SelectorRef.Identifiers = SelPieces; + + auto Info = Reader->lookupObjCMethod(*Context, SelectorRef, + Method->isInstanceMethod()); + ProcessVersionedAPINotes(*this, Method, Info); + } + } + } + + // Objective-C properties. + if (auto Property = dyn_cast(D)) { + for (auto Reader : APINotes.findAPINotes(D->getLocation())) { + if (auto Context = GetContext(Reader)) { + bool isInstanceProperty = + (Property->getPropertyAttributesAsWritten() & + ObjCPropertyAttribute::kind_class) == 0; + auto Info = Reader->lookupObjCProperty(*Context, Property->getName(), + isInstanceProperty); + ProcessVersionedAPINotes(*this, Property, Info); + } + } + + return; + } + } +} diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 10b5c271f25c1..9fdd8eb236d1e 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -16424,6 +16424,7 @@ void Sema::ActOnFinishDelayedAttribute(Scope *S, Decl *D, if (TemplateDecl *TD = dyn_cast(D)) D = TD->getTemplatedDecl(); ProcessDeclAttributeList(S, D, Attrs); + ProcessAPINotes(D); if (CXXMethodDecl *Method = dyn_cast_or_null(D)) if (Method->isStatic()) @@ -19683,6 +19684,7 @@ void Sema::ActOnFields(Scope *S, SourceLocation RecLoc, Decl *EnclosingDecl, CDecl->setIvarRBraceLoc(RBrac); } } + ProcessAPINotes(Record); } /// Determine whether the given integral value is representable within @@ -19997,6 +19999,7 @@ Decl *Sema::ActOnEnumConstant(Scope *S, Decl *theEnumDecl, Decl *lastEnumConst, // Process attributes. ProcessDeclAttributeList(S, New, Attrs); AddPragmaAttributes(S, New); + ProcessAPINotes(New); // Register this decl in the current scope stack. 
New->setAccess(TheEnumDecl->getAccess()); @@ -20195,6 +20198,7 @@ void Sema::ActOnEnumBody(SourceLocation EnumLoc, SourceRange BraceRange, QualType EnumType = Context.getTypeDeclType(Enum); ProcessDeclAttributeList(S, Enum, Attrs); + ProcessAPINotes(Enum); if (Enum->isDependentType()) { for (unsigned i = 0, e = Elements.size(); i != e; ++i) { diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 8a204b1d3b88d..c1c28a73fd79c 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -10146,6 +10146,9 @@ void Sema::ProcessDeclAttributes(Scope *S, Decl *D, const Declarator &PD) { // Apply additional attributes specified by '#pragma clang attribute'. AddPragmaAttributes(S, D); + + // Look for API notes that map to attributes. + ProcessAPINotes(D); } /// Is the given declaration allowed to use a forbidden type? diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 7c009d9c8ec09..d4e1dc67cb50a 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -11724,6 +11724,7 @@ Decl *Sema::ActOnStartNamespaceDef(Scope *NamespcScope, ProcessDeclAttributeList(DeclRegionScope, Namespc, AttrList); AddPragmaAttributes(DeclRegionScope, Namespc); + ProcessAPINotes(Namespc); // FIXME: Should we be merging attributes? 
if (const VisibilityAttr *Attr = Namespc->getAttr()) @@ -12270,8 +12271,10 @@ Decl *Sema::ActOnUsingDirective(Scope *S, SourceLocation UsingLoc, Diag(IdentLoc, diag::err_expected_namespace_name) << SS.getRange(); } - if (UDir) + if (UDir) { ProcessDeclAttributeList(S, UDir, AttrList); + ProcessAPINotes(UDir); + } return UDir; } @@ -13563,6 +13566,7 @@ Decl *Sema::ActOnAliasDeclaration(Scope *S, AccessSpecifier AS, ProcessDeclAttributeList(S, NewTD, AttrList); AddPragmaAttributes(S, NewTD); + ProcessAPINotes(NewTD); CheckTypedefForVariablyModifiedType(S, NewTD); Invalid |= NewTD->isInvalidDecl(); diff --git a/clang/lib/Sema/SemaDeclObjC.cpp b/clang/lib/Sema/SemaDeclObjC.cpp index bb0d0cd2030ba..2011f4084dd2a 100644 --- a/clang/lib/Sema/SemaDeclObjC.cpp +++ b/clang/lib/Sema/SemaDeclObjC.cpp @@ -1072,6 +1072,7 @@ ObjCInterfaceDecl *Sema::ActOnStartClassInterface( ProcessDeclAttributeList(TUScope, IDecl, AttrList); AddPragmaAttributes(TUScope, IDecl); + ProcessAPINotes(IDecl); // Merge attributes from previous declarations. if (PrevIDecl) @@ -1273,6 +1274,7 @@ ObjCProtocolDecl *Sema::ActOnStartProtocolInterface( ProcessDeclAttributeList(TUScope, PDecl, AttrList); AddPragmaAttributes(TUScope, PDecl); + ProcessAPINotes(PDecl); // Merge attributes from previous declarations. if (PrevDecl) @@ -4807,6 +4809,7 @@ Decl *Sema::ActOnMethodDeclaration( // Apply the attributes to the parameter. ProcessDeclAttributeList(TUScope, Param, ArgInfo[i].ArgAttrs); AddPragmaAttributes(TUScope, Param); + ProcessAPINotes(Param); if (Param->hasAttr()) { Diag(Param->getLocation(), diag::err_block_on_nonlocal); @@ -4837,6 +4840,7 @@ Decl *Sema::ActOnMethodDeclaration( ProcessDeclAttributeList(TUScope, ObjCMethod, AttrList); AddPragmaAttributes(TUScope, ObjCMethod); + ProcessAPINotes(ObjCMethod); // Add the method now. 
const ObjCMethodDecl *PrevMethod = nullptr; diff --git a/clang/lib/Sema/SemaObjCProperty.cpp b/clang/lib/Sema/SemaObjCProperty.cpp index 349c7fc9c91bd..4636d89ebf2b8 100644 --- a/clang/lib/Sema/SemaObjCProperty.cpp +++ b/clang/lib/Sema/SemaObjCProperty.cpp @@ -2509,6 +2509,8 @@ void Sema::ProcessPropertyDecl(ObjCPropertyDecl *property) { GetterMethod->addAttr(SectionAttr::CreateImplicit( Context, SA->getName(), Loc, SectionAttr::GNU_section)); + ProcessAPINotes(GetterMethod); + if (getLangOpts().ObjCAutoRefCount) CheckARCMethodDecl(GetterMethod); } else @@ -2578,6 +2580,9 @@ void Sema::ProcessPropertyDecl(ObjCPropertyDecl *property) { if (const SectionAttr *SA = property->getAttr()) SetterMethod->addAttr(SectionAttr::CreateImplicit( Context, SA->getName(), Loc, SectionAttr::GNU_section)); + + ProcessAPINotes(SetterMethod); + // It's possible for the user to have set a very odd custom // setter selector that causes it to have a method family. if (getLangOpts().ObjCAutoRefCount) diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 7d3d665194add..c95c08c0422ed 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -2158,6 +2158,7 @@ DeclResult Sema::CheckClassTemplate( NewClass->startDefinition(); ProcessDeclAttributeList(S, NewClass, Attr); + ProcessAPINotes(NewClass); if (PrevClassTemplate) mergeDeclAttributes(NewClass, PrevClassTemplate->getTemplatedDecl()); @@ -9112,6 +9113,7 @@ DeclResult Sema::ActOnClassTemplateSpecialization( } ProcessDeclAttributeList(S, Specialization, Attr); + ProcessAPINotes(Specialization); // Add alignment attributes if necessary; these attributes are checked when // the ASTContext lays out the structure. @@ -10346,6 +10348,7 @@ DeclResult Sema::ActOnExplicitInstantiation( bool PreviouslyDLLExported = Specialization->hasAttr(); ProcessDeclAttributeList(S, Specialization, Attr); + ProcessAPINotes(Specialization); // Add the explicit instantiation into its lexical context. 
However, // since explicit instantiations are never found by name lookup, we @@ -10756,6 +10759,9 @@ DeclResult Sema::ActOnExplicitInstantiation(Scope *S, Prev->setTemplateSpecializationKind(TSK, D.getIdentifierLoc()); // Merge attributes. ProcessDeclAttributeList(S, Prev, D.getDeclSpec().getAttributes()); + if (PrevTemplate) + ProcessAPINotes(Prev); + if (TSK == TSK_ExplicitInstantiationDefinition) InstantiateVariableDefinition(D.getIdentifierLoc(), Prev); } @@ -10931,6 +10937,7 @@ DeclResult Sema::ActOnExplicitInstantiation(Scope *S, } ProcessDeclAttributeList(S, Specialization, D.getDeclSpec().getAttributes()); + ProcessAPINotes(Specialization); // In MSVC mode, dllimported explicit instantiation definitions are treated as // instantiation declarations. From 285bff39fd283b3a9a27c06525111d8d4f474e6e Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 26 Feb 2024 13:52:55 +0000 Subject: [PATCH 316/546] [lldb][test][Windows] Skip thread state test on Windows This actually passes on Windows but I don't know how to convey that with an xfail without clashing with the xfail for all platforms. At least this avoids a UPASS. --- lldb/test/API/functionalities/thread/state/TestThreadStates.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lldb/test/API/functionalities/thread/state/TestThreadStates.py b/lldb/test/API/functionalities/thread/state/TestThreadStates.py index 97a6e250dcc39..f4c17df523382 100644 --- a/lldb/test/API/functionalities/thread/state/TestThreadStates.py +++ b/lldb/test/API/functionalities/thread/state/TestThreadStates.py @@ -39,6 +39,9 @@ def test_state_after_continue(self): @skipIfDarwin # 'llvm.org/pr23669', cause Python crash randomly @expectedFailureDarwin("llvm.org/pr23669") @expectedFailureNetBSD + # This actually passes on Windows on Arm but it's hard to describe that + # and xfail it everywhere else. 
+ @skipIfWindows # thread states not properly maintained @unittest.expectedFailure # llvm.org/pr16712 def test_state_after_expression(self): From 8ce81e5924935436d49e0b4e835fa107531505b5 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 26 Feb 2024 14:00:14 +0000 Subject: [PATCH 317/546] [lldb][test][Windows] Don't assert that module cache is empty For whatever reason on Windows, it is not at this point. The copy of unit test we used to use would ignore failures during teardown but Python's does not. --- lldb/packages/Python/lldbsuite/test/lldbtest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index 493152166094e..c28a78a2c4a27 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -1060,7 +1060,9 @@ def tearDown(self): lldb.SBModule.GarbageCollectAllocatedModules() # Assert that the global module cache is empty. - self.assertEqual(lldb.SBModule.GetNumberAllocatedModules(), 0) + # FIXME: This assert fails on Windows. + if self.getPlatform() != "windows": + self.assertEqual(lldb.SBModule.GetNumberAllocatedModules(), 0) # ========================================================= # Various callbacks to allow introspection of test progress From 668cd1ca15a8b9c60a87e5244db9c97b3ba2e624 Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Mon, 26 Feb 2024 06:04:38 -0800 Subject: [PATCH 318/546] [OpenACC] Implement 'return' branch-out of Compute Construct (#82814) Like with 'break'/'continue', returning out of a compute construct is ill-formed, so this implements the diagnostic. However, unlike the OpenMP implementation of this same diagnostic, OpenACC doesn't have a concept of 'capture region', so this is implemented as just checking the 'scope'. 
--- .../clang/Basic/DiagnosticSemaKinds.td | 5 +++-- clang/include/clang/Sema/Scope.h | 13 ++++++++++++ clang/lib/Sema/SemaStmt.cpp | 16 +++++++++++---- clang/test/SemaOpenACC/no-branch-in-out.c | 20 +++++++++++++++++++ clang/test/SemaOpenACC/no-branch-in-out.cpp | 17 ++++++++++++++++ 5 files changed, 65 insertions(+), 6 deletions(-) create mode 100644 clang/test/SemaOpenACC/no-branch-in-out.cpp diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index dad1764ad4f86..57784a4ba2e38 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12210,6 +12210,7 @@ def warn_acc_clause_unimplemented def err_acc_construct_appertainment : Error<"OpenACC construct '%0' cannot be used here; it can only " "be used in a statement context">; -def err_acc_branch_in_out - : Error<"invalid branch %select{out of|into}0 OpenACC Compute Construct">; +def err_acc_branch_in_out_compute_construct + : Error<"invalid %select{branch|return}0 %select{out of|into}1 OpenACC " + "Compute Construct">; } // end of sema component. diff --git a/clang/include/clang/Sema/Scope.h b/clang/include/clang/Sema/Scope.h index e7f166fe3461f..b6b5a1f3479a2 100644 --- a/clang/include/clang/Sema/Scope.h +++ b/clang/include/clang/Sema/Scope.h @@ -521,6 +521,19 @@ class Scope { return getFlags() & Scope::OpenACCComputeConstructScope; } + bool isInOpenACCComputeConstructScope() const { + for (const Scope *S = this; S; S = S->getParent()) { + if (S->getFlags() & Scope::OpenACCComputeConstructScope) + return true; + else if (S->getFlags() & + (Scope::FnScope | Scope::ClassScope | Scope::BlockScope | + Scope::TemplateParamScope | Scope::FunctionPrototypeScope | + Scope::AtCatchScope | Scope::ObjCMethodScope)) + return false; + } + return false; + } + /// Determine whether this scope is a while/do/for statement, which can have /// continue statements embedded into it. 
bool isContinueScope() const { diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index fcad09a63662b..0a5c2b23a90c8 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3361,8 +3361,9 @@ Sema::ActOnContinueStmt(SourceLocation ContinueLoc, Scope *CurScope) { // of a compute construct counts as 'branching out of' the compute construct, // so diagnose here. if (S->isOpenACCComputeConstructScope()) - return StmtError(Diag(ContinueLoc, diag::err_acc_branch_in_out) - << /*out of */ 0); + return StmtError( + Diag(ContinueLoc, diag::err_acc_branch_in_out_compute_construct) + << /*branch*/ 0 << /*out of */ 0); CheckJumpOutOfSEHFinally(*this, ContinueLoc, *S); @@ -3390,8 +3391,9 @@ Sema::ActOnBreakStmt(SourceLocation BreakLoc, Scope *CurScope) { if (S->isOpenACCComputeConstructScope() || (S->isLoopScope() && S->getParent() && S->getParent()->isOpenACCComputeConstructScope())) - return StmtError(Diag(BreakLoc, diag::err_acc_branch_in_out) - << /*out of */ 0); + return StmtError( + Diag(BreakLoc, diag::err_acc_branch_in_out_compute_construct) + << /*branch*/ 0 << /*out of */ 0); CheckJumpOutOfSEHFinally(*this, BreakLoc, *S); @@ -3947,6 +3949,12 @@ Sema::ActOnReturnStmt(SourceLocation ReturnLoc, Expr *RetValExp, RetValExp, nullptr, /*RecoverUncorrectedTypos=*/true); if (RetVal.isInvalid()) return StmtError(); + + if (getCurScope()->isInOpenACCComputeConstructScope()) + return StmtError( + Diag(ReturnLoc, diag::err_acc_branch_in_out_compute_construct) + << /*return*/ 1 << /*out of */ 0); + StmtResult R = BuildReturnStmt(ReturnLoc, RetVal.get(), /*AllowRecovery=*/true); if (R.isInvalid() || ExprEvalContexts.back().isDiscardedStatementContext()) diff --git a/clang/test/SemaOpenACC/no-branch-in-out.c b/clang/test/SemaOpenACC/no-branch-in-out.c index 33a171f1b68d5..f8fb40a1ca8f7 100644 --- a/clang/test/SemaOpenACC/no-branch-in-out.c +++ b/clang/test/SemaOpenACC/no-branch-in-out.c @@ -93,3 +93,23 @@ void BreakContinue() { } +void 
Return() { +#pragma acc parallel + { + return;// expected-error{{invalid return out of OpenACC Compute Construct}} + } + +#pragma acc parallel + { + { + return;// expected-error{{invalid return out of OpenACC Compute Construct}} + } + } + +#pragma acc parallel + { + for (int i = 0; i < 5; ++i) { + return;// expected-error{{invalid return out of OpenACC Compute Construct}} + } + } +} diff --git a/clang/test/SemaOpenACC/no-branch-in-out.cpp b/clang/test/SemaOpenACC/no-branch-in-out.cpp new file mode 100644 index 0000000000000..232e372cedd35 --- /dev/null +++ b/clang/test/SemaOpenACC/no-branch-in-out.cpp @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 %s -verify -fopenacc -fcxx-exceptions + + +void ReturnTest() { +#pragma acc parallel + { + (void)[]() { return; }; + } + +#pragma acc parallel + { + try {} + catch(...){ + return; // expected-error{{invalid return out of OpenACC Compute Construct}} + } + } +} From 7c52d0c98187b55d2f513122c21daf49d88169a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 26 Feb 2024 15:07:08 +0100 Subject: [PATCH 319/546] [clang][Interp] Try to atomic.c on Mac This test was broken on MacOS, see the discussion in https://github.com/llvm/llvm-project/commit/a35599b9ae5e7ad924b78c65f6348e0b711bad5d --- clang/test/AST/Interp/atomic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/AST/Interp/atomic.c b/clang/test/AST/Interp/atomic.c index 316e8d5bf3516..c5fd9ab222934 100644 --- a/clang/test/AST/Interp/atomic.c +++ b/clang/test/AST/Interp/atomic.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fgnuc-version=4.2.1 -fexperimental-new-constant-interpreter -verify=both,expected %s -// RUN: %clang_cc1 -fgnuc-version=4.2.1 -verify=both,ref %s +// RUN: %clang_cc1 -fgnuc-version=4.2.1 -triple=i686-linux-gnu -ffreestanding -fexperimental-new-constant-interpreter -verify=both,expected %s +// RUN: %clang_cc1 -fgnuc-version=4.2.1 -triple=i686-linux-gnu -ffreestanding -verify=both,ref %s /// FIXME: Copied from 
test/Sema/atomic-expr.c. /// this expression seems to be rejected for weird reasons, From ce78dfa4f0470d79979818e322e76050fb082f4e Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 26 Feb 2024 14:10:24 +0000 Subject: [PATCH 320/546] [gn build] Port 28233408a2c8 --- llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn index e78ef13869e64..327b10c085c5b 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn @@ -74,6 +74,7 @@ static_library("CodeGen") { "IfConversion.cpp", "ImplicitNullChecks.cpp", "IndirectBrExpandPass.cpp", + "InitUndef.cpp", "InlineSpiller.cpp", "InterferenceCache.cpp", "InterleavedAccessPass.cpp", From 62e88bc89a718ed557784afb2572e1e664cfd94e Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 26 Feb 2024 14:10:25 +0000 Subject: [PATCH 321/546] [gn build] Port 440b1743ee0c --- llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn index 382ebf81cc30b..b03032300a764 100644 --- a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn @@ -50,6 +50,7 @@ static_library("Sema") { "Scope.cpp", "ScopeInfo.cpp", "Sema.cpp", + "SemaAPINotes.cpp", "SemaAccess.cpp", "SemaAttr.cpp", "SemaAvailability.cpp", From f887fad547c7103c05f33be81fecc03782216ce6 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 26 Feb 2024 14:10:26 +0000 Subject: [PATCH 322/546] [gn build] Port 8c5e9cf73713 --- llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn index 1df3f3a224854..cfdfd203fc62c 100644 --- 
a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn @@ -108,6 +108,7 @@ static_library("AST") { "Interp/InterpBlock.cpp", "Interp/InterpBuiltin.cpp", "Interp/InterpFrame.cpp", + "Interp/InterpShared.cpp", "Interp/InterpStack.cpp", "Interp/InterpState.cpp", "Interp/Pointer.cpp", From ac86a76ed5ac968914dd13fc6b6b76c337728098 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Mon, 26 Feb 2024 15:24:12 +0100 Subject: [PATCH 323/546] [libc][NFC] Delete unused file (#82980) Indentified in https://github.com/llvm/llvm-project/pull/77741#pullrequestreview-1893531270 --- .../crash-f7dbdb2b330aad91f520099159e736e91bb9ddbf | Bin 67 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 utils/bazel/crash-f7dbdb2b330aad91f520099159e736e91bb9ddbf diff --git a/utils/bazel/crash-f7dbdb2b330aad91f520099159e736e91bb9ddbf b/utils/bazel/crash-f7dbdb2b330aad91f520099159e736e91bb9ddbf deleted file mode 100644 index 5dd7dca5e358f48f0cc0fd92a530cc70cfea7189..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 67 ZcmZQzI0*&}K)?ke@B?P3v@=54UjQvc1_S^A From 83feb846482f0100cb29d460d3d8de2690fc32ad Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 26 Feb 2024 14:29:42 +0000 Subject: [PATCH 324/546] [AMDGPU] Reduce duplication in DS Real instruction definitions. NFC. (#83007) For renamed instructions, there is no need to mention the new name twice on every line defining a Real. 
--- llvm/lib/Target/AMDGPU/DSInstructions.td | 118 +++++++++++------------ 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 780479a3885bd..515b9476b25b7 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1231,25 +1231,25 @@ let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in { ps.Mnemonic, /*hasGDS=*/false>; } - multiclass DS_Real_Renamed_gfx12 op, DS_Pseudo backing_pseudo, - string real_name> { + multiclass DS_Real_Renamed_gfx12 op, string name> { + defvar ps = !cast(NAME); def _gfx12 : - Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12, - MnemonicAlias, + name, /*hasGDS=*/false>, + MnemonicAlias, Requires<[isGFX12Plus]>; } } // End AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" -defm DS_MIN_NUM_F32 : DS_Real_Renamed_gfx12<0x012, DS_MIN_F32, "ds_min_num_f32">; -defm DS_MAX_NUM_F32 : DS_Real_Renamed_gfx12<0x013, DS_MAX_F32, "ds_max_num_f32">; -defm DS_MIN_NUM_RTN_F32 : DS_Real_Renamed_gfx12<0x032, DS_MIN_RTN_F32, "ds_min_num_rtn_f32">; -defm DS_MAX_NUM_RTN_F32 : DS_Real_Renamed_gfx12<0x033, DS_MAX_RTN_F32, "ds_max_num_rtn_f32">; -defm DS_MIN_NUM_F64 : DS_Real_Renamed_gfx12<0x052, DS_MIN_F64, "ds_min_num_f64">; -defm DS_MAX_NUM_F64 : DS_Real_Renamed_gfx12<0x053, DS_MAX_F64, "ds_max_num_f64">; -defm DS_MIN_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x072, DS_MIN_RTN_F64, "ds_min_num_rtn_f64">; -defm DS_MAX_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x073, DS_MAX_RTN_F64, "ds_max_num_rtn_f64">; +defm DS_MIN_F32 : DS_Real_Renamed_gfx12<0x012, "ds_min_num_f32">; +defm DS_MAX_F32 : DS_Real_Renamed_gfx12<0x013, "ds_max_num_f32">; +defm DS_MIN_RTN_F32 : DS_Real_Renamed_gfx12<0x032, "ds_min_num_rtn_f32">; +defm DS_MAX_RTN_F32 : DS_Real_Renamed_gfx12<0x033, "ds_max_num_rtn_f32">; +defm DS_MIN_F64 : DS_Real_Renamed_gfx12<0x052, "ds_min_num_f64">; +defm DS_MAX_F64 : DS_Real_Renamed_gfx12<0x053, "ds_max_num_f64">; 
+defm DS_MIN_RTN_F64 : DS_Real_Renamed_gfx12<0x072, "ds_min_num_rtn_f64">; +defm DS_MAX_RTN_F64 : DS_Real_Renamed_gfx12<0x073, "ds_max_num_rtn_f64">; defm DS_COND_SUB_U32 : DS_Real_gfx12<0x098>; defm DS_SUB_CLAMP_U32 : DS_Real_gfx12<0x099>; defm DS_COND_SUB_RTN_U32 : DS_Real_gfx12<0x0a8>; @@ -1270,58 +1270,58 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { SIEncodingFamily.GFX11>; } - multiclass DS_Real_Renamed_gfx11 op, DS_Pseudo backing_pseudo, string real_name> { - def _gfx11 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12, - MnemonicAlias, Requires<[isGFX11Only]>; + multiclass DS_Real_Renamed_gfx11 op, string name> { + defvar ps = !cast(NAME); + def _gfx11 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12, + MnemonicAlias, Requires<[isGFX11Only]>; } } // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" multiclass DS_Real_gfx11_gfx12 op> : DS_Real_gfx11, DS_Real_gfx12; -multiclass DS_Real_Renamed_gfx11_gfx12 op, DS_Pseudo backing_pseudo, - string real_name> - : DS_Real_Renamed_gfx11, - DS_Real_Renamed_gfx12; - -defm DS_STORE_B32 : DS_Real_Renamed_gfx11_gfx12<0x00d, DS_WRITE_B32, "ds_store_b32">; -defm DS_STORE_2ADDR_B32 : DS_Real_Renamed_gfx11_gfx12<0x00e, DS_WRITE2_B32, "ds_store_2addr_b32">; -defm DS_STORE_2ADDR_STRIDE64_B32 : DS_Real_Renamed_gfx11_gfx12<0x00f, DS_WRITE2ST64_B32, "ds_store_2addr_stride64_b32">; -defm DS_STORE_B8 : DS_Real_Renamed_gfx11_gfx12<0x01e, DS_WRITE_B8, "ds_store_b8">; -defm DS_STORE_B16 : DS_Real_Renamed_gfx11_gfx12<0x01f, DS_WRITE_B16, "ds_store_b16">; -defm DS_STOREXCHG_RTN_B32 : DS_Real_Renamed_gfx11_gfx12<0x02d, DS_WRXCHG_RTN_B32, "ds_storexchg_rtn_b32">; -defm DS_STOREXCHG_2ADDR_RTN_B32 : DS_Real_Renamed_gfx11_gfx12<0x02e, DS_WRXCHG2_RTN_B32, "ds_storexchg_2addr_rtn_b32">; -defm DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32 : DS_Real_Renamed_gfx11_gfx12<0x02f, DS_WRXCHG2ST64_RTN_B32, "ds_storexchg_2addr_stride64_rtn_b32">; -defm DS_LOAD_B32 : DS_Real_Renamed_gfx11_gfx12<0x036, DS_READ_B32, "ds_load_b32">; -defm 
DS_LOAD_2ADDR_B32 : DS_Real_Renamed_gfx11_gfx12<0x037, DS_READ2_B32, "ds_load_2addr_b32">; -defm DS_LOAD_2ADDR_STRIDE64_B32 : DS_Real_Renamed_gfx11_gfx12<0x038, DS_READ2ST64_B32, "ds_load_2addr_stride64_b32">; -defm DS_LOAD_I8 : DS_Real_Renamed_gfx11_gfx12<0x039, DS_READ_I8, "ds_load_i8">; -defm DS_LOAD_U8 : DS_Real_Renamed_gfx11_gfx12<0x03a, DS_READ_U8, "ds_load_u8">; -defm DS_LOAD_I16 : DS_Real_Renamed_gfx11_gfx12<0x03b, DS_READ_I16, "ds_load_i16">; -defm DS_LOAD_U16 : DS_Real_Renamed_gfx11_gfx12<0x03c, DS_READ_U16, "ds_load_u16">; -defm DS_STORE_B64 : DS_Real_Renamed_gfx11_gfx12<0x04d, DS_WRITE_B64, "ds_store_b64">; -defm DS_STORE_2ADDR_B64 : DS_Real_Renamed_gfx11_gfx12<0x04e, DS_WRITE2_B64, "ds_store_2addr_b64">; -defm DS_STORE_2ADDR_STRIDE64_B64 : DS_Real_Renamed_gfx11_gfx12<0x04f, DS_WRITE2ST64_B64, "ds_store_2addr_stride64_b64">; -defm DS_STOREXCHG_RTN_B64 : DS_Real_Renamed_gfx11_gfx12<0x06d, DS_WRXCHG_RTN_B64, "ds_storexchg_rtn_b64">; -defm DS_STOREXCHG_2ADDR_RTN_B64 : DS_Real_Renamed_gfx11_gfx12<0x06e, DS_WRXCHG2_RTN_B64, "ds_storexchg_2addr_rtn_b64">; -defm DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64 : DS_Real_Renamed_gfx11_gfx12<0x06f, DS_WRXCHG2ST64_RTN_B64, "ds_storexchg_2addr_stride64_rtn_b64">; -defm DS_LOAD_B64 : DS_Real_Renamed_gfx11_gfx12<0x076, DS_READ_B64, "ds_load_b64">; -defm DS_LOAD_2ADDR_B64 : DS_Real_Renamed_gfx11_gfx12<0x077, DS_READ2_B64, "ds_load_2addr_b64">; -defm DS_LOAD_2ADDR_STRIDE64_B64 : DS_Real_Renamed_gfx11_gfx12<0x078, DS_READ2ST64_B64, "ds_load_2addr_stride64_b64">; -defm DS_STORE_B8_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a0, DS_WRITE_B8_D16_HI, "ds_store_b8_d16_hi">; -defm DS_STORE_B16_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a1, DS_WRITE_B16_D16_HI, "ds_store_b16_d16_hi">; -defm DS_LOAD_U8_D16 : DS_Real_Renamed_gfx11_gfx12<0x0a2, DS_READ_U8_D16, "ds_load_u8_d16">; -defm DS_LOAD_U8_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a3, DS_READ_U8_D16_HI, "ds_load_u8_d16_hi">; -defm DS_LOAD_I8_D16 : DS_Real_Renamed_gfx11_gfx12<0x0a4, 
DS_READ_I8_D16, "ds_load_i8_d16">; -defm DS_LOAD_I8_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a5, DS_READ_I8_D16_HI, "ds_load_i8_d16_hi">; -defm DS_LOAD_U16_D16 : DS_Real_Renamed_gfx11_gfx12<0x0a6, DS_READ_U16_D16, "ds_load_u16_d16">; -defm DS_LOAD_U16_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a7, DS_READ_U16_D16_HI, "ds_load_u16_d16_hi">; -defm DS_STORE_ADDTID_B32 : DS_Real_Renamed_gfx11_gfx12<0x0b0, DS_WRITE_ADDTID_B32, "ds_store_addtid_b32">; -defm DS_LOAD_ADDTID_B32 : DS_Real_Renamed_gfx11_gfx12<0x0b1, DS_READ_ADDTID_B32, "ds_load_addtid_b32">; -defm DS_STORE_B96 : DS_Real_Renamed_gfx11_gfx12<0x0de, DS_WRITE_B96, "ds_store_b96">; -defm DS_STORE_B128 : DS_Real_Renamed_gfx11_gfx12<0x0df, DS_WRITE_B128, "ds_store_b128">; -defm DS_LOAD_B96 : DS_Real_Renamed_gfx11_gfx12<0x0fe, DS_READ_B96, "ds_load_b96">; -defm DS_LOAD_B128 : DS_Real_Renamed_gfx11_gfx12<0x0ff, DS_READ_B128, "ds_load_b128">; +multiclass DS_Real_Renamed_gfx11_gfx12 op, string name> + : DS_Real_Renamed_gfx11, + DS_Real_Renamed_gfx12; + +defm DS_WRITE_B32 : DS_Real_Renamed_gfx11_gfx12<0x00d, "ds_store_b32">; +defm DS_WRITE2_B32 : DS_Real_Renamed_gfx11_gfx12<0x00e, "ds_store_2addr_b32">; +defm DS_WRITE2ST64_B32 : DS_Real_Renamed_gfx11_gfx12<0x00f, "ds_store_2addr_stride64_b32">; +defm DS_WRITE_B8 : DS_Real_Renamed_gfx11_gfx12<0x01e, "ds_store_b8">; +defm DS_WRITE_B16 : DS_Real_Renamed_gfx11_gfx12<0x01f, "ds_store_b16">; +defm DS_WRXCHG_RTN_B32 : DS_Real_Renamed_gfx11_gfx12<0x02d, "ds_storexchg_rtn_b32">; +defm DS_WRXCHG2_RTN_B32 : DS_Real_Renamed_gfx11_gfx12<0x02e, "ds_storexchg_2addr_rtn_b32">; +defm DS_WRXCHG2ST64_RTN_B32 : DS_Real_Renamed_gfx11_gfx12<0x02f, "ds_storexchg_2addr_stride64_rtn_b32">; +defm DS_READ_B32 : DS_Real_Renamed_gfx11_gfx12<0x036, "ds_load_b32">; +defm DS_READ2_B32 : DS_Real_Renamed_gfx11_gfx12<0x037, "ds_load_2addr_b32">; +defm DS_READ2ST64_B32 : DS_Real_Renamed_gfx11_gfx12<0x038, "ds_load_2addr_stride64_b32">; +defm DS_READ_I8 : DS_Real_Renamed_gfx11_gfx12<0x039, "ds_load_i8">; +defm 
DS_READ_U8 : DS_Real_Renamed_gfx11_gfx12<0x03a, "ds_load_u8">; +defm DS_READ_I16 : DS_Real_Renamed_gfx11_gfx12<0x03b, "ds_load_i16">; +defm DS_READ_U16 : DS_Real_Renamed_gfx11_gfx12<0x03c, "ds_load_u16">; +defm DS_WRITE_B64 : DS_Real_Renamed_gfx11_gfx12<0x04d, "ds_store_b64">; +defm DS_WRITE2_B64 : DS_Real_Renamed_gfx11_gfx12<0x04e, "ds_store_2addr_b64">; +defm DS_WRITE2ST64_B64 : DS_Real_Renamed_gfx11_gfx12<0x04f, "ds_store_2addr_stride64_b64">; +defm DS_WRXCHG_RTN_B64 : DS_Real_Renamed_gfx11_gfx12<0x06d, "ds_storexchg_rtn_b64">; +defm DS_WRXCHG2_RTN_B64 : DS_Real_Renamed_gfx11_gfx12<0x06e, "ds_storexchg_2addr_rtn_b64">; +defm DS_WRXCHG2ST64_RTN_B64 : DS_Real_Renamed_gfx11_gfx12<0x06f, "ds_storexchg_2addr_stride64_rtn_b64">; +defm DS_READ_B64 : DS_Real_Renamed_gfx11_gfx12<0x076, "ds_load_b64">; +defm DS_READ2_B64 : DS_Real_Renamed_gfx11_gfx12<0x077, "ds_load_2addr_b64">; +defm DS_READ2ST64_B64 : DS_Real_Renamed_gfx11_gfx12<0x078, "ds_load_2addr_stride64_b64">; +defm DS_WRITE_B8_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a0, "ds_store_b8_d16_hi">; +defm DS_WRITE_B16_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a1, "ds_store_b16_d16_hi">; +defm DS_READ_U8_D16 : DS_Real_Renamed_gfx11_gfx12<0x0a2, "ds_load_u8_d16">; +defm DS_READ_U8_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a3, "ds_load_u8_d16_hi">; +defm DS_READ_I8_D16 : DS_Real_Renamed_gfx11_gfx12<0x0a4, "ds_load_i8_d16">; +defm DS_READ_I8_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a5, "ds_load_i8_d16_hi">; +defm DS_READ_U16_D16 : DS_Real_Renamed_gfx11_gfx12<0x0a6, "ds_load_u16_d16">; +defm DS_READ_U16_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a7, "ds_load_u16_d16_hi">; +defm DS_WRITE_ADDTID_B32 : DS_Real_Renamed_gfx11_gfx12<0x0b0, "ds_store_addtid_b32">; +defm DS_READ_ADDTID_B32 : DS_Real_Renamed_gfx11_gfx12<0x0b1, "ds_load_addtid_b32">; +defm DS_WRITE_B96 : DS_Real_Renamed_gfx11_gfx12<0x0de, "ds_store_b96">; +defm DS_WRITE_B128 : DS_Real_Renamed_gfx11_gfx12<0x0df, "ds_store_b128">; +defm DS_READ_B96 : 
DS_Real_Renamed_gfx11_gfx12<0x0fe, "ds_load_b96">; +defm DS_READ_B128 : DS_Real_Renamed_gfx11_gfx12<0x0ff, "ds_load_b128">; // DS_CMPST_* are renamed to DS_CMPSTORE_* in GFX11, but also the data operands (src and cmp) are swapped // comparing to pre-GFX11. From 9cfb138eccb83b5876928b08be346fde5ca78b47 Mon Sep 17 00:00:00 2001 From: Krystian Stasiowski Date: Mon, 26 Feb 2024 09:58:55 -0500 Subject: [PATCH 325/546] [Clang][Sema] Defer instantiation of exception specification until after partial ordering when determining primary template (#82417) Consider the following: ``` struct A { static constexpr bool x = true; }; template void f(T, U) noexcept(T::y); // #1, error: no member named 'y' in 'A' template void f(T, U*) noexcept(T::x); // #2 template<> void f(A, int*) noexcept; // explicit specialization of #2 ``` We currently instantiate the exception specification of all candidate function template specializations when deducting template arguments for an explicit specialization, which results in a error despite `#1` not being selected by partial ordering as the most specialized template. According to [except.spec] p13: > An exception specification is considered to be needed when: > - [...] > - the exception specification is compared to that of another declaration (e.g., an explicit specialization or an overriding virtual function); Assuming that "comparing declarations" means "determining whether the declarations correspond and declare the same entity" (per [basic.scope.scope] p4 and [basic.link] p11.1, respectively), the exception specification does _not_ need to be instantiated until _after_ partial ordering, at which point we determine whether the implicitly instantiated specialization and the explicit specialization declare the same entity (the determination of whether two functions/function templates correspond does not consider the exception specifications). 
This patch defers the instantiation of the exception specification until a single function template specialization is selected via partial ordering, matching the behavior of GCC, EDG, and MSVC: see https://godbolt.org/z/Ebb6GTcWE. --- clang/docs/ReleaseNotes.rst | 2 + clang/lib/Sema/SemaTemplate.cpp | 34 +++++++++ clang/lib/Sema/SemaTemplateDeduction.cpp | 14 ++-- clang/test/CXX/except/except.spec/p13.cpp | 74 +++++++++++++++++++ .../SemaTemplate/class-template-noexcept.cpp | 14 +--- 5 files changed, 117 insertions(+), 21 deletions(-) create mode 100644 clang/test/CXX/except/except.spec/p13.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 529dd783ab738..9e67bbb789504 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -281,6 +281,8 @@ Bug Fixes to C++ Support a requires-clause lie at the same depth as those of the surrounding lambda. This, in turn, results in the wrong template argument substitution during constraint checking. (`#78524 `_) +- Clang no longer instantiates the exception specification of discarded candidate function + templates when determining the primary template of an explicit specialization. Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index c95c08c0422ed..e91033dd88689 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -9709,6 +9709,40 @@ bool Sema::CheckFunctionTemplateSpecialization( // Ignore access information; it doesn't figure into redeclaration checking. FunctionDecl *Specialization = cast(*Result); + // C++23 [except.spec]p13: + // An exception specification is considered to be needed when: + // - [...] + // - the exception specification is compared to that of another declaration + // (e.g., an explicit specialization or an overriding virtual function); + // - [...] 
+ // + // The exception specification of a defaulted function is evaluated as + // described above only when needed; similarly, the noexcept-specifier of a + // specialization of a function template or member function of a class + // template is instantiated only when needed. + // + // The standard doesn't specify what the "comparison with another declaration" + // entails, nor the exact circumstances in which it occurs. Moreover, it does + // not state which properties of an explicit specialization must match the + // primary template. + // + // We assume that an explicit specialization must correspond with (per + // [basic.scope.scope]p4) and declare the same entity as (per [basic.link]p8) + // the declaration produced by substitution into the function template. + // + // Since the determination whether two function declarations correspond does + // not consider exception specification, we only need to instantiate it once + // we determine the primary template when comparing types per + // [basic.link]p11.1. + auto *SpecializationFPT = + Specialization->getType()->castAs(); + // If the function has a dependent exception specification, resolve it after + // we have selected the primary template so we can check whether it matches. 
+ if (getLangOpts().CPlusPlus17 && + isUnresolvedExceptionSpec(SpecializationFPT->getExceptionSpecType()) && + !ResolveExceptionSpec(FD->getLocation(), SpecializationFPT)) + return true; + FunctionTemplateSpecializationInfo *SpecInfo = Specialization->getTemplateSpecializationInfo(); assert(SpecInfo && "Function template specialization info missing?"); diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 47cc22310c4ee..563491f76f547 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -4632,11 +4632,9 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( Info.getLocation())) return TemplateDeductionResult::MiscellaneousDeductionFailure; - // If the function has a dependent exception specification, resolve it now, - // so we can check that the exception specification matches. auto *SpecializationFPT = Specialization->getType()->castAs(); - if (getLangOpts().CPlusPlus17 && + if (IsAddressOfFunction && getLangOpts().CPlusPlus17 && isUnresolvedExceptionSpec(SpecializationFPT->getExceptionSpecType()) && !ResolveExceptionSpec(Info.getLocation(), SpecializationFPT)) return TemplateDeductionResult::MiscellaneousDeductionFailure; @@ -4662,11 +4660,11 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( // specialization with respect to arguments of compatible pointer to function // types, template argument deduction fails. if (!ArgFunctionType.isNull()) { - if (IsAddressOfFunction - ? !isSameOrCompatibleFunctionType( - Context.getCanonicalType(SpecializationType), - Context.getCanonicalType(ArgFunctionType)) - : !Context.hasSameType(SpecializationType, ArgFunctionType)) { + if (IsAddressOfFunction ? 
!isSameOrCompatibleFunctionType( + Context.getCanonicalType(SpecializationType), + Context.getCanonicalType(ArgFunctionType)) + : !Context.hasSameFunctionTypeIgnoringExceptionSpec( + SpecializationType, ArgFunctionType)) { Info.FirstArg = TemplateArgument(SpecializationType); Info.SecondArg = TemplateArgument(ArgFunctionType); return TemplateDeductionResult::NonDeducedMismatch; diff --git a/clang/test/CXX/except/except.spec/p13.cpp b/clang/test/CXX/except/except.spec/p13.cpp new file mode 100644 index 0000000000000..61cdb74f21ec5 --- /dev/null +++ b/clang/test/CXX/except/except.spec/p13.cpp @@ -0,0 +1,74 @@ +// RUN: %clang_cc1 -fexceptions -fcxx-exceptions -fsyntax-only -verify %s + +struct A { + static constexpr bool x = true; +}; + +namespace N0 { + +template +void f(T, U) noexcept(T::y); // #1 + +template // #2 +void f(T, U*) noexcept(T::x); + +// Deduction should succeed for both candidates, and #2 should be selected as the primary template. +// Only the exception specification of #2 should be instantiated. +template<> +void f(A, int*) noexcept; + +} + +namespace N1 { + +template +void f(T, U) noexcept(T::x); // #1 + +template +void f(T, U*) noexcept(T::y); // #2 +// expected-error@-1 {{no member named 'y' in 'A'}} + +// Deduction should succeed for both candidates, and #2 should be selected as the primary template. +// Only the exception specification of #2 should be instantiated. 
+template<> +void f(A, int*) noexcept; // expected-error {{exception specification in declaration does not match previous declaration}} + // expected-note@-1 {{in instantiation of exception specification for 'f' requested here}} + // expected-note@-2 {{previous declaration is here}} +} + +namespace N2 { + +template +void f(T, U) noexcept(T::x); + +template +void f(T, U*) noexcept(T::x); + +template +void f(T, U**) noexcept(T::y); // expected-error {{no member named 'y' in 'A'}} + +template +void f(T, U***) noexcept(T::x); + +template<> +void f(A, int*) noexcept; // expected-note {{previous declaration is here}} + +template<> +void f(A, int*); // expected-error {{'f' is missing exception specification 'noexcept'}} + +template<> +void f(A, int**) noexcept; // expected-error {{exception specification in declaration does not match previous declaration}} + // expected-note@-1 {{in instantiation of exception specification for 'f' requested here}} + // expected-note@-2 {{previous declaration is here}} + +// FIXME: Exception specification is currently set to EST_None if instantiation fails. +template<> +void f(A, int**); + +template<> +void f(A, int***) noexcept; // expected-note {{previous declaration is here}} + +template<> +void f(A, int***); // expected-error {{'f' is missing exception specification 'noexcept'}} + +} diff --git a/clang/test/SemaTemplate/class-template-noexcept.cpp b/clang/test/SemaTemplate/class-template-noexcept.cpp index 5c4ac090f3166..14d2e36bc0bfa 100644 --- a/clang/test/SemaTemplate/class-template-noexcept.cpp +++ b/clang/test/SemaTemplate/class-template-noexcept.cpp @@ -2,9 +2,7 @@ // RUN: %clang_cc1 -std=c++11 -verify %s // RUN: %clang_cc1 -std=c++17 -verify %s // RUN: %clang_cc1 -std=c++1z -verify %s -#if __cplusplus >= 201703 -// expected-no-diagnostics -#endif + class A { public: static const char X; @@ -14,19 +12,9 @@ const char A::X = 0; template void func() noexcept(U::X); template -#if __cplusplus >= 201703 -void foo(void(B...) 
noexcept(x)) {} -#else void foo(void(B...) noexcept(x)) {} // expected-note{{candidate template ignored}} -#endif void bar() { -#if __cplusplus >= 201703 - foo(func); -#else foo(func); // expected-error{{no matching function for call}} -#endif } - - From 969d7ecf0b1d51e04e774b0695ffc1d04af81bde Mon Sep 17 00:00:00 2001 From: Francesco Petrogalli Date: Mon, 26 Feb 2024 16:01:52 +0100 Subject: [PATCH 326/546] [llvm][CodeGen] Add ValueType v3i1. [NFCI] (#82338) --- llvm/include/llvm/CodeGen/ValueTypes.td | 403 +++++++++--------- llvm/lib/CodeGen/ValueTypes.cpp | 2 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 + 3 files changed, 205 insertions(+), 201 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index 1054738306530..900b30d9b0249 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -78,212 +78,213 @@ def ppcf128 : VTFP<128, 16>; // PPC 128-bit floating point value def v1i1 : VTVec<1, i1, 17>; // 1 x i1 vector value def v2i1 : VTVec<2, i1, 18>; // 2 x i1 vector value -def v4i1 : VTVec<4, i1, 19>; // 4 x i1 vector value -def v8i1 : VTVec<8, i1, 20>; // 8 x i1 vector value -def v16i1 : VTVec<16, i1, 21>; // 16 x i1 vector value -def v32i1 : VTVec<32, i1, 22>; // 32 x i1 vector value -def v64i1 : VTVec<64, i1, 23>; // 64 x i1 vector value -def v128i1 : VTVec<128, i1, 24>; // 128 x i1 vector value -def v256i1 : VTVec<256, i1, 25>; // 256 x i1 vector value -def v512i1 : VTVec<512, i1, 26>; // 512 x i1 vector value -def v1024i1 : VTVec<1024, i1, 27>; // 1024 x i1 vector value -def v2048i1 : VTVec<2048, i1, 28>; // 2048 x i1 vector value - -def v128i2 : VTVec<128, i2, 29>; // 128 x i2 vector value -def v256i2 : VTVec<256, i2, 30>; // 256 x i2 vector value - -def v64i4 : VTVec<64, i4, 31>; // 64 x i4 vector value -def v128i4 : VTVec<128, i4, 32>; // 128 x i4 vector value - -def v1i8 : VTVec<1, i8, 33>; // 1 x i8 vector value -def v2i8 : VTVec<2, i8, 34>; // 2 x 
i8 vector value -def v3i8 : VTVec<3, i8, 35>; // 3 x i8 vector value -def v4i8 : VTVec<4, i8, 36>; // 4 x i8 vector value -def v8i8 : VTVec<8, i8, 37>; // 8 x i8 vector value -def v16i8 : VTVec<16, i8, 38>; // 16 x i8 vector value -def v32i8 : VTVec<32, i8, 39>; // 32 x i8 vector value -def v64i8 : VTVec<64, i8, 40>; // 64 x i8 vector value -def v128i8 : VTVec<128, i8, 41>; // 128 x i8 vector value -def v256i8 : VTVec<256, i8, 42>; // 256 x i8 vector value -def v512i8 : VTVec<512, i8, 43>; // 512 x i8 vector value -def v1024i8 : VTVec<1024, i8, 44>; // 1024 x i8 vector value - -def v1i16 : VTVec<1, i16, 45>; // 1 x i16 vector value -def v2i16 : VTVec<2, i16, 46>; // 2 x i16 vector value -def v3i16 : VTVec<3, i16, 47>; // 3 x i16 vector value -def v4i16 : VTVec<4, i16, 48>; // 4 x i16 vector value -def v8i16 : VTVec<8, i16, 49>; // 8 x i16 vector value -def v16i16 : VTVec<16, i16, 50>; // 16 x i16 vector value -def v32i16 : VTVec<32, i16, 51>; // 32 x i16 vector value -def v64i16 : VTVec<64, i16, 52>; // 64 x i16 vector value -def v128i16 : VTVec<128, i16, 53>; // 128 x i16 vector value -def v256i16 : VTVec<256, i16, 54>; // 256 x i16 vector value -def v512i16 : VTVec<512, i16, 55>; // 512 x i16 vector value - -def v1i32 : VTVec<1, i32, 56>; // 1 x i32 vector value -def v2i32 : VTVec<2, i32, 57>; // 2 x i32 vector value -def v3i32 : VTVec<3, i32, 58>; // 3 x i32 vector value -def v4i32 : VTVec<4, i32, 59>; // 4 x i32 vector value -def v5i32 : VTVec<5, i32, 60>; // 5 x i32 vector value -def v6i32 : VTVec<6, i32, 61>; // 6 x f32 vector value -def v7i32 : VTVec<7, i32, 62>; // 7 x f32 vector value -def v8i32 : VTVec<8, i32, 63>; // 8 x i32 vector value -def v9i32 : VTVec<9, i32, 64>; // 9 x i32 vector value -def v10i32 : VTVec<10, i32, 65>; // 10 x i32 vector value -def v11i32 : VTVec<11, i32, 66>; // 11 x i32 vector value -def v12i32 : VTVec<12, i32, 67>; // 12 x i32 vector value -def v16i32 : VTVec<16, i32, 68>; // 16 x i32 vector value -def v32i32 : VTVec<32, i32, 
69>; // 32 x i32 vector value -def v64i32 : VTVec<64, i32, 70>; // 64 x i32 vector value -def v128i32 : VTVec<128, i32, 71>; // 128 x i32 vector value -def v256i32 : VTVec<256, i32, 72>; // 256 x i32 vector value -def v512i32 : VTVec<512, i32, 73>; // 512 x i32 vector value -def v1024i32 : VTVec<1024, i32, 74>; // 1024 x i32 vector value -def v2048i32 : VTVec<2048, i32, 75>; // 2048 x i32 vector value - -def v1i64 : VTVec<1, i64, 76>; // 1 x i64 vector value -def v2i64 : VTVec<2, i64, 77>; // 2 x i64 vector value -def v3i64 : VTVec<3, i64, 78>; // 3 x i64 vector value -def v4i64 : VTVec<4, i64, 79>; // 4 x i64 vector value -def v8i64 : VTVec<8, i64, 80>; // 8 x i64 vector value -def v16i64 : VTVec<16, i64, 81>; // 16 x i64 vector value -def v32i64 : VTVec<32, i64, 82>; // 32 x i64 vector value -def v64i64 : VTVec<64, i64, 83>; // 64 x i64 vector value -def v128i64 : VTVec<128, i64, 84>; // 128 x i64 vector value -def v256i64 : VTVec<256, i64, 85>; // 256 x i64 vector value - -def v1i128 : VTVec<1, i128, 86>; // 1 x i128 vector value - -def v1f16 : VTVec<1, f16, 87>; // 1 x f16 vector value -def v2f16 : VTVec<2, f16, 88>; // 2 x f16 vector value -def v3f16 : VTVec<3, f16, 89>; // 3 x f16 vector value -def v4f16 : VTVec<4, f16, 90>; // 4 x f16 vector value -def v8f16 : VTVec<8, f16, 91>; // 8 x f16 vector value -def v16f16 : VTVec<16, f16, 92>; // 16 x f16 vector value -def v32f16 : VTVec<32, f16, 93>; // 32 x f16 vector value -def v64f16 : VTVec<64, f16, 94>; // 64 x f16 vector value -def v128f16 : VTVec<128, f16, 95>; // 128 x f16 vector value -def v256f16 : VTVec<256, f16, 96>; // 256 x f16 vector value -def v512f16 : VTVec<512, f16, 97>; // 512 x f16 vector value - -def v2bf16 : VTVec<2, bf16, 98>; // 2 x bf16 vector value -def v3bf16 : VTVec<3, bf16, 99>; // 3 x bf16 vector value -def v4bf16 : VTVec<4, bf16, 100>; // 4 x bf16 vector value -def v8bf16 : VTVec<8, bf16, 101>; // 8 x bf16 vector value -def v16bf16 : VTVec<16, bf16, 102>; // 16 x bf16 vector value 
-def v32bf16 : VTVec<32, bf16, 103>; // 32 x bf16 vector value -def v64bf16 : VTVec<64, bf16, 104>; // 64 x bf16 vector value -def v128bf16 : VTVec<128, bf16, 105>; // 128 x bf16 vector value - -def v1f32 : VTVec<1, f32, 106>; // 1 x f32 vector value -def v2f32 : VTVec<2, f32, 107>; // 2 x f32 vector value -def v3f32 : VTVec<3, f32, 108>; // 3 x f32 vector value -def v4f32 : VTVec<4, f32, 109>; // 4 x f32 vector value -def v5f32 : VTVec<5, f32, 110>; // 5 x f32 vector value -def v6f32 : VTVec<6, f32, 111>; // 6 x f32 vector value -def v7f32 : VTVec<7, f32, 112>; // 7 x f32 vector value -def v8f32 : VTVec<8, f32, 113>; // 8 x f32 vector value -def v9f32 : VTVec<9, f32, 114>; // 9 x f32 vector value -def v10f32 : VTVec<10, f32, 115>; // 10 x f32 vector value -def v11f32 : VTVec<11, f32, 116>; // 11 x f32 vector value -def v12f32 : VTVec<12, f32, 117>; // 12 x f32 vector value -def v16f32 : VTVec<16, f32, 118>; // 16 x f32 vector value -def v32f32 : VTVec<32, f32, 119>; // 32 x f32 vector value -def v64f32 : VTVec<64, f32, 120>; // 64 x f32 vector value -def v128f32 : VTVec<128, f32, 121>; // 128 x f32 vector value -def v256f32 : VTVec<256, f32, 122>; // 256 x f32 vector value -def v512f32 : VTVec<512, f32, 123>; // 512 x f32 vector value -def v1024f32 : VTVec<1024, f32, 124>; // 1024 x f32 vector value -def v2048f32 : VTVec<2048, f32, 125>; // 2048 x f32 vector value - -def v1f64 : VTVec<1, f64, 126>; // 1 x f64 vector value -def v2f64 : VTVec<2, f64, 127>; // 2 x f64 vector value -def v3f64 : VTVec<3, f64, 128>; // 3 x f64 vector value -def v4f64 : VTVec<4, f64, 129>; // 4 x f64 vector value -def v8f64 : VTVec<8, f64, 130>; // 8 x f64 vector value -def v16f64 : VTVec<16, f64, 131>; // 16 x f64 vector value -def v32f64 : VTVec<32, f64, 132>; // 32 x f64 vector value -def v64f64 : VTVec<64, f64, 133>; // 64 x f64 vector value -def v128f64 : VTVec<128, f64, 134>; // 128 x f64 vector value -def v256f64 : VTVec<256, f64, 135>; // 256 x f64 vector value - -def nxv1i1 : 
VTScalableVec<1, i1, 136>; // n x 1 x i1 vector value -def nxv2i1 : VTScalableVec<2, i1, 137>; // n x 2 x i1 vector value -def nxv4i1 : VTScalableVec<4, i1, 138>; // n x 4 x i1 vector value -def nxv8i1 : VTScalableVec<8, i1, 139>; // n x 8 x i1 vector value -def nxv16i1 : VTScalableVec<16, i1, 140>; // n x 16 x i1 vector value -def nxv32i1 : VTScalableVec<32, i1, 141>; // n x 32 x i1 vector value -def nxv64i1 : VTScalableVec<64, i1, 142>; // n x 64 x i1 vector value - -def nxv1i8 : VTScalableVec<1, i8, 143>; // n x 1 x i8 vector value -def nxv2i8 : VTScalableVec<2, i8, 144>; // n x 2 x i8 vector value -def nxv4i8 : VTScalableVec<4, i8, 145>; // n x 4 x i8 vector value -def nxv8i8 : VTScalableVec<8, i8, 146>; // n x 8 x i8 vector value -def nxv16i8 : VTScalableVec<16, i8, 147>; // n x 16 x i8 vector value -def nxv32i8 : VTScalableVec<32, i8, 148>; // n x 32 x i8 vector value -def nxv64i8 : VTScalableVec<64, i8, 149>; // n x 64 x i8 vector value - -def nxv1i16 : VTScalableVec<1, i16, 150>; // n x 1 x i16 vector value -def nxv2i16 : VTScalableVec<2, i16, 151>; // n x 2 x i16 vector value -def nxv4i16 : VTScalableVec<4, i16, 152>; // n x 4 x i16 vector value -def nxv8i16 : VTScalableVec<8, i16, 153>; // n x 8 x i16 vector value -def nxv16i16 : VTScalableVec<16, i16, 154>; // n x 16 x i16 vector value -def nxv32i16 : VTScalableVec<32, i16, 155>; // n x 32 x i16 vector value - -def nxv1i32 : VTScalableVec<1, i32, 156>; // n x 1 x i32 vector value -def nxv2i32 : VTScalableVec<2, i32, 157>; // n x 2 x i32 vector value -def nxv4i32 : VTScalableVec<4, i32, 158>; // n x 4 x i32 vector value -def nxv8i32 : VTScalableVec<8, i32, 159>; // n x 8 x i32 vector value -def nxv16i32 : VTScalableVec<16, i32, 160>; // n x 16 x i32 vector value -def nxv32i32 : VTScalableVec<32, i32, 161>; // n x 32 x i32 vector value - -def nxv1i64 : VTScalableVec<1, i64, 162>; // n x 1 x i64 vector value -def nxv2i64 : VTScalableVec<2, i64, 163>; // n x 2 x i64 vector value -def nxv4i64 : 
VTScalableVec<4, i64, 164>; // n x 4 x i64 vector value -def nxv8i64 : VTScalableVec<8, i64, 165>; // n x 8 x i64 vector value -def nxv16i64 : VTScalableVec<16, i64, 166>; // n x 16 x i64 vector value -def nxv32i64 : VTScalableVec<32, i64, 167>; // n x 32 x i64 vector value - -def nxv1f16 : VTScalableVec<1, f16, 168>; // n x 1 x f16 vector value -def nxv2f16 : VTScalableVec<2, f16, 169>; // n x 2 x f16 vector value -def nxv4f16 : VTScalableVec<4, f16, 170>; // n x 4 x f16 vector value -def nxv8f16 : VTScalableVec<8, f16, 171>; // n x 8 x f16 vector value -def nxv16f16 : VTScalableVec<16, f16, 172>; // n x 16 x f16 vector value -def nxv32f16 : VTScalableVec<32, f16, 173>; // n x 32 x f16 vector value - -def nxv1bf16 : VTScalableVec<1, bf16, 174>; // n x 1 x bf16 vector value -def nxv2bf16 : VTScalableVec<2, bf16, 175>; // n x 2 x bf16 vector value -def nxv4bf16 : VTScalableVec<4, bf16, 176>; // n x 4 x bf16 vector value -def nxv8bf16 : VTScalableVec<8, bf16, 177>; // n x 8 x bf16 vector value -def nxv16bf16 : VTScalableVec<16, bf16, 178>; // n x 16 x bf16 vector value -def nxv32bf16 : VTScalableVec<32, bf16, 179>; // n x 32 x bf16 vector value - -def nxv1f32 : VTScalableVec<1, f32, 180>; // n x 1 x f32 vector value -def nxv2f32 : VTScalableVec<2, f32, 181>; // n x 2 x f32 vector value -def nxv4f32 : VTScalableVec<4, f32, 182>; // n x 4 x f32 vector value -def nxv8f32 : VTScalableVec<8, f32, 183>; // n x 8 x f32 vector value -def nxv16f32 : VTScalableVec<16, f32, 184>; // n x 16 x f32 vector value - -def nxv1f64 : VTScalableVec<1, f64, 185>; // n x 1 x f64 vector value -def nxv2f64 : VTScalableVec<2, f64, 186>; // n x 2 x f64 vector value -def nxv4f64 : VTScalableVec<4, f64, 187>; // n x 4 x f64 vector value -def nxv8f64 : VTScalableVec<8, f64, 188>; // n x 8 x f64 vector value - -def x86mmx : ValueType<64, 189>; // X86 MMX value -def FlagVT : ValueType<0, 190> { // Pre-RA sched glue +def v3i1 : VTVec<3, i1, 19>; // 3 x i1 vector value +def v4i1 : VTVec<4, i1, 20>; 
// 4 x i1 vector value +def v8i1 : VTVec<8, i1, 21>; // 8 x i1 vector value +def v16i1 : VTVec<16, i1, 22>; // 16 x i1 vector value +def v32i1 : VTVec<32, i1, 23>; // 32 x i1 vector value +def v64i1 : VTVec<64, i1, 24>; // 64 x i1 vector value +def v128i1 : VTVec<128, i1, 25>; // 128 x i1 vector value +def v256i1 : VTVec<256, i1, 26>; // 256 x i1 vector value +def v512i1 : VTVec<512, i1, 27>; // 512 x i1 vector value +def v1024i1 : VTVec<1024, i1, 28>; // 1024 x i1 vector value +def v2048i1 : VTVec<2048, i1, 29>; // 2048 x i1 vector value + +def v128i2 : VTVec<128, i2, 30>; // 128 x i2 vector value +def v256i2 : VTVec<256, i2, 31>; // 256 x i2 vector value + +def v64i4 : VTVec<64, i4, 32>; // 64 x i4 vector value +def v128i4 : VTVec<128, i4, 33>; // 128 x i4 vector value + +def v1i8 : VTVec<1, i8, 34>; // 1 x i8 vector value +def v2i8 : VTVec<2, i8, 35>; // 2 x i8 vector value +def v3i8 : VTVec<3, i8, 36>; // 3 x i8 vector value +def v4i8 : VTVec<4, i8, 37>; // 4 x i8 vector value +def v8i8 : VTVec<8, i8, 38>; // 8 x i8 vector value +def v16i8 : VTVec<16, i8, 39>; // 16 x i8 vector value +def v32i8 : VTVec<32, i8, 40>; // 32 x i8 vector value +def v64i8 : VTVec<64, i8, 41>; // 64 x i8 vector value +def v128i8 : VTVec<128, i8, 42>; // 128 x i8 vector value +def v256i8 : VTVec<256, i8, 43>; // 256 x i8 vector value +def v512i8 : VTVec<512, i8, 44>; // 512 x i8 vector value +def v1024i8 : VTVec<1024, i8, 45>; // 1024 x i8 vector value + +def v1i16 : VTVec<1, i16, 46>; // 1 x i16 vector value +def v2i16 : VTVec<2, i16, 47>; // 2 x i16 vector value +def v3i16 : VTVec<3, i16, 48>; // 3 x i16 vector value +def v4i16 : VTVec<4, i16, 49>; // 4 x i16 vector value +def v8i16 : VTVec<8, i16, 50>; // 8 x i16 vector value +def v16i16 : VTVec<16, i16, 51>; // 16 x i16 vector value +def v32i16 : VTVec<32, i16, 52>; // 32 x i16 vector value +def v64i16 : VTVec<64, i16, 53>; // 64 x i16 vector value +def v128i16 : VTVec<128, i16, 54>; // 128 x i16 vector value +def v256i16 : 
VTVec<256, i16, 55>; // 256 x i16 vector value +def v512i16 : VTVec<512, i16, 56>; // 512 x i16 vector value + +def v1i32 : VTVec<1, i32, 57>; // 1 x i32 vector value +def v2i32 : VTVec<2, i32, 58>; // 2 x i32 vector value +def v3i32 : VTVec<3, i32, 59>; // 3 x i32 vector value +def v4i32 : VTVec<4, i32, 60>; // 4 x i32 vector value +def v5i32 : VTVec<5, i32, 61>; // 5 x i32 vector value +def v6i32 : VTVec<6, i32, 62>; // 6 x f32 vector value +def v7i32 : VTVec<7, i32, 63>; // 7 x f32 vector value +def v8i32 : VTVec<8, i32, 64>; // 8 x i32 vector value +def v9i32 : VTVec<9, i32, 65>; // 9 x i32 vector value +def v10i32 : VTVec<10, i32, 66>; // 10 x i32 vector value +def v11i32 : VTVec<11, i32, 67>; // 11 x i32 vector value +def v12i32 : VTVec<12, i32, 68>; // 12 x i32 vector value +def v16i32 : VTVec<16, i32, 69>; // 16 x i32 vector value +def v32i32 : VTVec<32, i32, 70>; // 32 x i32 vector value +def v64i32 : VTVec<64, i32, 71>; // 64 x i32 vector value +def v128i32 : VTVec<128, i32, 72>; // 128 x i32 vector value +def v256i32 : VTVec<256, i32, 73>; // 256 x i32 vector value +def v512i32 : VTVec<512, i32, 74>; // 512 x i32 vector value +def v1024i32 : VTVec<1024, i32, 75>; // 1024 x i32 vector value +def v2048i32 : VTVec<2048, i32, 76>; // 2048 x i32 vector value + +def v1i64 : VTVec<1, i64, 77>; // 1 x i64 vector value +def v2i64 : VTVec<2, i64, 78>; // 2 x i64 vector value +def v3i64 : VTVec<3, i64, 79>; // 3 x i64 vector value +def v4i64 : VTVec<4, i64, 80>; // 4 x i64 vector value +def v8i64 : VTVec<8, i64, 81>; // 8 x i64 vector value +def v16i64 : VTVec<16, i64, 82>; // 16 x i64 vector value +def v32i64 : VTVec<32, i64, 83>; // 32 x i64 vector value +def v64i64 : VTVec<64, i64, 84>; // 64 x i64 vector value +def v128i64 : VTVec<128, i64, 85>; // 128 x i64 vector value +def v256i64 : VTVec<256, i64, 86>; // 256 x i64 vector value + +def v1i128 : VTVec<1, i128, 87>; // 1 x i128 vector value + +def v1f16 : VTVec<1, f16, 88>; // 1 x f16 vector value +def v2f16 : 
VTVec<2, f16, 89>; // 2 x f16 vector value +def v3f16 : VTVec<3, f16, 90>; // 3 x f16 vector value +def v4f16 : VTVec<4, f16, 91>; // 4 x f16 vector value +def v8f16 : VTVec<8, f16, 92>; // 8 x f16 vector value +def v16f16 : VTVec<16, f16, 93>; // 16 x f16 vector value +def v32f16 : VTVec<32, f16, 94>; // 32 x f16 vector value +def v64f16 : VTVec<64, f16, 95>; // 64 x f16 vector value +def v128f16 : VTVec<128, f16, 96>; // 128 x f16 vector value +def v256f16 : VTVec<256, f16, 97>; // 256 x f16 vector value +def v512f16 : VTVec<512, f16, 98>; // 512 x f16 vector value + +def v2bf16 : VTVec<2, bf16, 99>; // 2 x bf16 vector value +def v3bf16 : VTVec<3, bf16, 100>; // 3 x bf16 vector value +def v4bf16 : VTVec<4, bf16, 101>; // 4 x bf16 vector value +def v8bf16 : VTVec<8, bf16, 102>; // 8 x bf16 vector value +def v16bf16 : VTVec<16, bf16, 103>; // 16 x bf16 vector value +def v32bf16 : VTVec<32, bf16, 104>; // 32 x bf16 vector value +def v64bf16 : VTVec<64, bf16, 105>; // 64 x bf16 vector value +def v128bf16 : VTVec<128, bf16, 106>; // 128 x bf16 vector value + +def v1f32 : VTVec<1, f32, 107>; // 1 x f32 vector value +def v2f32 : VTVec<2, f32, 108>; // 2 x f32 vector value +def v3f32 : VTVec<3, f32, 109>; // 3 x f32 vector value +def v4f32 : VTVec<4, f32, 110>; // 4 x f32 vector value +def v5f32 : VTVec<5, f32, 111>; // 5 x f32 vector value +def v6f32 : VTVec<6, f32, 112>; // 6 x f32 vector value +def v7f32 : VTVec<7, f32, 113>; // 7 x f32 vector value +def v8f32 : VTVec<8, f32, 114>; // 8 x f32 vector value +def v9f32 : VTVec<9, f32, 115>; // 9 x f32 vector value +def v10f32 : VTVec<10, f32, 116>; // 10 x f32 vector value +def v11f32 : VTVec<11, f32, 117>; // 11 x f32 vector value +def v12f32 : VTVec<12, f32, 118>; // 12 x f32 vector value +def v16f32 : VTVec<16, f32, 119>; // 16 x f32 vector value +def v32f32 : VTVec<32, f32, 120>; // 32 x f32 vector value +def v64f32 : VTVec<64, f32, 121>; // 64 x f32 vector value +def v128f32 : VTVec<128, f32, 122>; // 128 x f32 
vector value +def v256f32 : VTVec<256, f32, 123>; // 256 x f32 vector value +def v512f32 : VTVec<512, f32, 124>; // 512 x f32 vector value +def v1024f32 : VTVec<1024, f32, 125>; // 1024 x f32 vector value +def v2048f32 : VTVec<2048, f32, 126>; // 2048 x f32 vector value + +def v1f64 : VTVec<1, f64, 127>; // 1 x f64 vector value +def v2f64 : VTVec<2, f64, 128>; // 2 x f64 vector value +def v3f64 : VTVec<3, f64, 129>; // 3 x f64 vector value +def v4f64 : VTVec<4, f64, 130>; // 4 x f64 vector value +def v8f64 : VTVec<8, f64, 131>; // 8 x f64 vector value +def v16f64 : VTVec<16, f64, 132>; // 16 x f64 vector value +def v32f64 : VTVec<32, f64, 133>; // 32 x f64 vector value +def v64f64 : VTVec<64, f64, 134>; // 64 x f64 vector value +def v128f64 : VTVec<128, f64, 135>; // 128 x f64 vector value +def v256f64 : VTVec<256, f64, 136>; // 256 x f64 vector value + +def nxv1i1 : VTScalableVec<1, i1, 137>; // n x 1 x i1 vector value +def nxv2i1 : VTScalableVec<2, i1, 138>; // n x 2 x i1 vector value +def nxv4i1 : VTScalableVec<4, i1, 139>; // n x 4 x i1 vector value +def nxv8i1 : VTScalableVec<8, i1, 140>; // n x 8 x i1 vector value +def nxv16i1 : VTScalableVec<16, i1, 141>; // n x 16 x i1 vector value +def nxv32i1 : VTScalableVec<32, i1, 142>; // n x 32 x i1 vector value +def nxv64i1 : VTScalableVec<64, i1, 143>; // n x 64 x i1 vector value + +def nxv1i8 : VTScalableVec<1, i8, 144>; // n x 1 x i8 vector value +def nxv2i8 : VTScalableVec<2, i8, 145>; // n x 2 x i8 vector value +def nxv4i8 : VTScalableVec<4, i8, 146>; // n x 4 x i8 vector value +def nxv8i8 : VTScalableVec<8, i8, 147>; // n x 8 x i8 vector value +def nxv16i8 : VTScalableVec<16, i8, 148>; // n x 16 x i8 vector value +def nxv32i8 : VTScalableVec<32, i8, 149>; // n x 32 x i8 vector value +def nxv64i8 : VTScalableVec<64, i8, 150>; // n x 64 x i8 vector value + +def nxv1i16 : VTScalableVec<1, i16, 151>; // n x 1 x i16 vector value +def nxv2i16 : VTScalableVec<2, i16, 152>; // n x 2 x i16 vector value +def nxv4i16 : 
VTScalableVec<4, i16, 153>; // n x 4 x i16 vector value +def nxv8i16 : VTScalableVec<8, i16, 154>; // n x 8 x i16 vector value +def nxv16i16 : VTScalableVec<16, i16, 155>; // n x 16 x i16 vector value +def nxv32i16 : VTScalableVec<32, i16, 156>; // n x 32 x i16 vector value + +def nxv1i32 : VTScalableVec<1, i32, 157>; // n x 1 x i32 vector value +def nxv2i32 : VTScalableVec<2, i32, 158>; // n x 2 x i32 vector value +def nxv4i32 : VTScalableVec<4, i32, 159>; // n x 4 x i32 vector value +def nxv8i32 : VTScalableVec<8, i32, 160>; // n x 8 x i32 vector value +def nxv16i32 : VTScalableVec<16, i32, 161>; // n x 16 x i32 vector value +def nxv32i32 : VTScalableVec<32, i32, 162>; // n x 32 x i32 vector value + +def nxv1i64 : VTScalableVec<1, i64, 163>; // n x 1 x i64 vector value +def nxv2i64 : VTScalableVec<2, i64, 164>; // n x 2 x i64 vector value +def nxv4i64 : VTScalableVec<4, i64, 165>; // n x 4 x i64 vector value +def nxv8i64 : VTScalableVec<8, i64, 166>; // n x 8 x i64 vector value +def nxv16i64 : VTScalableVec<16, i64, 167>; // n x 16 x i64 vector value +def nxv32i64 : VTScalableVec<32, i64, 168>; // n x 32 x i64 vector value + +def nxv1f16 : VTScalableVec<1, f16, 169>; // n x 1 x f16 vector value +def nxv2f16 : VTScalableVec<2, f16, 170>; // n x 2 x f16 vector value +def nxv4f16 : VTScalableVec<4, f16, 171>; // n x 4 x f16 vector value +def nxv8f16 : VTScalableVec<8, f16, 172>; // n x 8 x f16 vector value +def nxv16f16 : VTScalableVec<16, f16, 173>; // n x 16 x f16 vector value +def nxv32f16 : VTScalableVec<32, f16, 174>; // n x 32 x f16 vector value + +def nxv1bf16 : VTScalableVec<1, bf16, 175>; // n x 1 x bf16 vector value +def nxv2bf16 : VTScalableVec<2, bf16, 176>; // n x 2 x bf16 vector value +def nxv4bf16 : VTScalableVec<4, bf16, 177>; // n x 4 x bf16 vector value +def nxv8bf16 : VTScalableVec<8, bf16, 178>; // n x 8 x bf16 vector value +def nxv16bf16 : VTScalableVec<16, bf16, 179>; // n x 16 x bf16 vector value +def nxv32bf16 : VTScalableVec<32, bf16, 180>; 
// n x 32 x bf16 vector value + +def nxv1f32 : VTScalableVec<1, f32, 181>; // n x 1 x f32 vector value +def nxv2f32 : VTScalableVec<2, f32, 182>; // n x 2 x f32 vector value +def nxv4f32 : VTScalableVec<4, f32, 183>; // n x 4 x f32 vector value +def nxv8f32 : VTScalableVec<8, f32, 184>; // n x 8 x f32 vector value +def nxv16f32 : VTScalableVec<16, f32, 185>; // n x 16 x f32 vector value + +def nxv1f64 : VTScalableVec<1, f64, 186>; // n x 1 x f64 vector value +def nxv2f64 : VTScalableVec<2, f64, 187>; // n x 2 x f64 vector value +def nxv4f64 : VTScalableVec<4, f64, 188>; // n x 4 x f64 vector value +def nxv8f64 : VTScalableVec<8, f64, 189>; // n x 8 x f64 vector value + +def x86mmx : ValueType<64, 190>; // X86 MMX value +def FlagVT : ValueType<0, 191> { // Pre-RA sched glue let LLVMName = "Glue"; } -def isVoid : ValueType<0, 191>; // Produces no value -def untyped : ValueType<8, 192> { // Produces an untyped value +def isVoid : ValueType<0, 192>; // Produces no value +def untyped : ValueType<8, 193> { // Produces an untyped value let LLVMName = "Untyped"; } -def funcref : ValueType<0, 193>; // WebAssembly's funcref type -def externref : ValueType<0, 194>; // WebAssembly's externref type -def x86amx : ValueType<8192, 195>; // X86 AMX value -def i64x8 : ValueType<512, 196>; // 8 Consecutive GPRs (AArch64) +def funcref : ValueType<0, 194>; // WebAssembly's funcref type +def externref : ValueType<0, 195>; // WebAssembly's externref type +def x86amx : ValueType<8192, 196>; // X86 AMX value +def i64x8 : ValueType<512, 197>; // 8 Consecutive GPRs (AArch64) def aarch64svcount - : ValueType<16, 197>; // AArch64 predicate-as-counter -def spirvbuiltin : ValueType<0, 198>; // SPIR-V's builtin type + : ValueType<16, 198>; // AArch64 predicate-as-counter +def spirvbuiltin : ValueType<0, 199>; // SPIR-V's builtin type def token : ValueType<0, 248>; // TokenTy def MetadataVT : ValueType<0, 249> { // Metadata diff --git a/llvm/lib/CodeGen/ValueTypes.cpp 
b/llvm/lib/CodeGen/ValueTypes.cpp index 731fcabaee402..5dcabdb6a9e06 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -232,6 +232,8 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { return FixedVectorType::get(Type::getInt1Ty(Context), 1); case MVT::v2i1: return FixedVectorType::get(Type::getInt1Ty(Context), 2); + case MVT::v3i1: + return FixedVectorType::get(Type::getInt1Ty(Context), 3); case MVT::v4i1: return FixedVectorType::get(Type::getInt1Ty(Context), 4); case MVT::v8i1: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 528257ead585e..e8cc87e52e65a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -313,6 +313,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand); setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand); setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand); setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand); setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand); From 1253e535bd421b955cce1c67ed4304f2ae9bcdfd Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Mon, 26 Feb 2024 15:20:55 +0000 Subject: [PATCH 327/546] [RemoveDIs] Use iterators for moving PHIs in loop-unroll-and-jam (#83003) With no debug intrinsics, correctly identifying the start of a block with iterators becomes important. We need to use the iterator-returning methods here in loop-unroll-and-jam where we're shifting PHIs around. Otherwise they can be inserted after debug-info records, leading to debug-info attached to PHIs, which is ill formed. 
Fixes #83000 --- llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp index 3c06a6e47a303..26b8c790f2a06 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -473,9 +473,9 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount, }; // Move all the phis from Src into Dest auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) { - Instruction *insertPoint = Dest->getFirstNonPHI(); + BasicBlock::iterator insertPoint = Dest->getFirstNonPHIIt(); while (PHINode *Phi = dyn_cast(Src->begin())) - Phi->moveBefore(insertPoint); + Phi->moveBefore(*Dest, insertPoint); }; // Update the PHI values outside the loop to point to the last block From b5048700fc31f3bf6dd32ace7730815d4cfef411 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 26 Feb 2024 13:15:51 +0100 Subject: [PATCH 328/546] [clang][Interp] Fix lvalue CompoundLiteralExprs We need to leave a pointer on the stack for them, even if their type is primitive. --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 15 +++++++++++++-- clang/test/AST/Interp/c.c | 3 +++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index e96afb1078cc7..a71b6e82817e4 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -1679,13 +1679,24 @@ bool ByteCodeExprGen::VisitCompoundLiteralExpr( std::optional T = classify(E->getType()); if (E->isFileScope()) { + // Avoid creating a variable if this is a primitive RValue anyway. 
+ if (T && !E->isLValue()) + return this->delegate(Init); + if (std::optional GlobalIndex = P.createGlobal(E)) { - if (classify(E->getType())) - return this->visit(Init); if (!this->emitGetPtrGlobal(*GlobalIndex, E)) return false; + + if (T) { + if (!this->visit(Init)) + return false; + return this->emitInitGlobal(*T, *GlobalIndex, E); + } + return this->visitInitializer(Init); } + + return false; } // Otherwise, use a local variable. diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c index b67fe93417058..a6244c3af202a 100644 --- a/clang/test/AST/Interp/c.c +++ b/clang/test/AST/Interp/c.c @@ -21,6 +21,9 @@ _Static_assert(!!1.0, ""); // pedantic-ref-warning {{not an integer constant exp // pedantic-expected-warning {{not an integer constant expression}} _Static_assert(!!1, ""); +_Static_assert(!(_Bool){(void*)0}, ""); // pedantic-ref-warning {{not an integer constant expression}} \ + // pedantic-expected-warning {{not an integer constant expression}} + int a = (1 == 1 ? 5 : 3); _Static_assert(a == 5, ""); // all-error {{not an integral constant expression}} From 252f1cdebfffd846afe969d3f6e4684ed39536ad Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Mon, 26 Feb 2024 09:45:09 -0600 Subject: [PATCH 329/546] [lldb][test] Remove vendored packages `unittest2` and `progress` (#82670) The `unittest2` package is unused since 5b386158aacac4b41126983a5379d36ed413d0ea. The `progress` package was only used internally by `unittest2`, so it can be deleted as well. 
--- .../Python/module/progress/progress.py | 181 --- .../module/unittest2/unittest2/__init__.py | 78 - .../module/unittest2/unittest2/__main__.py | 10 - .../Python/module/unittest2/unittest2/case.py | 1169 -------------- .../module/unittest2/unittest2/collector.py | 10 - .../unittest2/unittest2/compatibility.py | 67 - .../module/unittest2/unittest2/loader.py | 339 ---- .../Python/module/unittest2/unittest2/main.py | 257 --- .../module/unittest2/unittest2/result.py | 197 --- .../module/unittest2/unittest2/runner.py | 206 --- .../module/unittest2/unittest2/signals.py | 63 - .../module/unittest2/unittest2/suite.py | 286 ---- .../unittest2/unittest2/test/__init__.py | 1 - .../module/unittest2/unittest2/test/dummy.py | 0 .../unittest2/unittest2/test/support.py | 189 --- .../unittest2/test/test_assertions.py | 269 ---- .../unittest2/unittest2/test/test_break.py | 258 --- .../unittest2/unittest2/test/test_case.py | 1244 --------------- .../unittest2/test/test_discovery.py | 392 ----- .../unittest2/test/test_functiontestcase.py | 148 -- .../unittest2/unittest2/test/test_loader.py | 1380 ----------------- .../unittest2/test/test_new_tests.py | 52 - .../unittest2/unittest2/test/test_program.py | 251 --- .../unittest2/unittest2/test/test_result.py | 426 ----- .../unittest2/unittest2/test/test_runner.py | 136 -- .../unittest2/unittest2/test/test_setups.py | 596 ------- .../unittest2/unittest2/test/test_skipping.py | 154 -- .../unittest2/unittest2/test/test_suite.py | 363 ----- .../unittest2/test/test_unittest2_with.py | 148 -- .../Python/module/unittest2/unittest2/util.py | 105 -- 30 files changed, 8975 deletions(-) delete mode 100644 lldb/third_party/Python/module/progress/progress.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/__init__.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/__main__.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/case.py delete mode 100644 
lldb/third_party/Python/module/unittest2/unittest2/collector.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/compatibility.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/loader.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/main.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/result.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/runner.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/signals.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/suite.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/test/__init__.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/test/dummy.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/test/support.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/test/test_assertions.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/test/test_break.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/test/test_case.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/test/test_discovery.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/test/test_functiontestcase.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/test/test_loader.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/test/test_new_tests.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/test/test_program.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/test/test_result.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/test/test_runner.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/test/test_setups.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/test/test_skipping.py delete mode 
100644 lldb/third_party/Python/module/unittest2/unittest2/test/test_suite.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/test/test_unittest2_with.py delete mode 100644 lldb/third_party/Python/module/unittest2/unittest2/util.py diff --git a/lldb/third_party/Python/module/progress/progress.py b/lldb/third_party/Python/module/progress/progress.py deleted file mode 100644 index f844b9800c019..0000000000000 --- a/lldb/third_party/Python/module/progress/progress.py +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/env python - -import use_lldb_suite - -import sys -import time - - -class ProgressBar(object): - """ProgressBar class holds the options of the progress bar. - The options are: - start State from which start the progress. For example, if start is - 5 and the end is 10, the progress of this state is 50% - end State in which the progress has terminated. - width -- - fill String to use for "filled" used to represent the progress - blank String to use for "filled" used to represent remaining space. 
- format Format - incremental - """ - light_block = chr(0x2591).encode("utf-8") - solid_block = chr(0x2588).encode("utf-8") - solid_right_arrow = chr(0x25BA).encode("utf-8") - - def __init__(self, - start=0, - end=10, - width=12, - fill=chr(0x25C9).encode("utf-8"), - blank=chr(0x25CC).encode("utf-8"), - marker=chr(0x25CE).encode("utf-8"), - format='[%(fill)s%(marker)s%(blank)s] %(progress)s%%', - incremental=True): - super(ProgressBar, self).__init__() - - self.start = start - self.end = end - self.width = width - self.fill = fill - self.blank = blank - self.marker = marker - self.format = format - self.incremental = incremental - self.step = 100 / float(width) # fix - self.reset() - - def __add__(self, increment): - increment = self._get_progress(increment) - if 100 > self.progress + increment: - self.progress += increment - else: - self.progress = 100 - return self - - def complete(self): - self.progress = 100 - return self - - def __str__(self): - progressed = int(self.progress / self.step) # fix - fill = progressed * self.fill - blank = (self.width - progressed) * self.blank - return self.format % { - 'fill': fill, - 'blank': blank, - 'marker': self.marker, - 'progress': int( - self.progress)} - - __repr__ = __str__ - - def _get_progress(self, increment): - return float(increment * 100) / self.end - - def reset(self): - """Resets the current progress to the start point""" - self.progress = self._get_progress(self.start) - return self - - -class AnimatedProgressBar(ProgressBar): - """Extends ProgressBar to allow you to use it straighforward on a script. - Accepts an extra keyword argument named `stdout` (by default use sys.stdout) - and may be any file-object to which send the progress status. 
- """ - - def __init__(self, - start=0, - end=10, - width=12, - fill=chr(0x25C9).encode("utf-8"), - blank=chr(0x25CC).encode("utf-8"), - marker=chr(0x25CE).encode("utf-8"), - format='[%(fill)s%(marker)s%(blank)s] %(progress)s%%', - incremental=True, - stdout=sys.stdout): - super( - AnimatedProgressBar, - self).__init__( - start, - end, - width, - fill, - blank, - marker, - format, - incremental) - self.stdout = stdout - - def show_progress(self): - if hasattr(self.stdout, 'isatty') and self.stdout.isatty(): - self.stdout.write('\r') - else: - self.stdout.write('\n') - self.stdout.write(str(self)) - self.stdout.flush() - - -class ProgressWithEvents(AnimatedProgressBar): - """Extends AnimatedProgressBar to allow you to track a set of events that - cause the progress to move. For instance, in a deletion progress bar, you - can track files that were nuked and files that the user doesn't have access to - """ - - def __init__(self, - start=0, - end=10, - width=12, - fill=chr(0x25C9).encode("utf-8"), - blank=chr(0x25CC).encode("utf-8"), - marker=chr(0x25CE).encode("utf-8"), - format='[%(fill)s%(marker)s%(blank)s] %(progress)s%%', - incremental=True, - stdout=sys.stdout): - super( - ProgressWithEvents, - self).__init__( - start, - end, - width, - fill, - blank, - marker, - format, - incremental, - stdout) - self.events = {} - - def add_event(self, event): - if event in self.events: - self.events[event] += 1 - else: - self.events[event] = 1 - - def show_progress(self): - isatty = hasattr(self.stdout, 'isatty') and self.stdout.isatty() - if isatty: - self.stdout.write('\r') - else: - self.stdout.write('\n') - self.stdout.write(str(self)) - if len(self.events) == 0: - return - self.stdout.write('\n') - for key in list(self.events.keys()): - self.stdout.write(str(key) + ' = ' + str(self.events[key]) + ' ') - if isatty: - self.stdout.write('\033[1A') - self.stdout.flush() - - -if __name__ == '__main__': - p = AnimatedProgressBar(end=200, width=200) - - while True: - p + 5 - 
p.show_progress() - time.sleep(0.3) - if p.progress == 100: - break - print() # new line diff --git a/lldb/third_party/Python/module/unittest2/unittest2/__init__.py b/lldb/third_party/Python/module/unittest2/unittest2/__init__.py deleted file mode 100644 index 14fea5a2599ee..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/__init__.py +++ /dev/null @@ -1,78 +0,0 @@ -""" -unittest2 - -unittest2 is a backport of the new features added to the unittest testing -framework in Python 2.7. It is tested to run on Python 2.4 - 2.6. - -To use unittest2 instead of unittest simply replace ``import unittest`` with -``import unittest2``. - - -Copyright (c) 1999-2003 Steve Purcell -Copyright (c) 2003-2010 Python Software Foundation -This module is free software, and you may redistribute it and/or modify -it under the same terms as Python itself, so long as this copyright message -and disclaimer are retained in their original form. - -IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, -SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF -THIS CODE, EVEN IF THE AUTHOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGE. - -THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE. THE CODE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, -AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, -SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
-""" - -import sys - -if sys.version_info[0] >= 3: - # Python 3 doesn't have the builtin `cmp` function anymore - cmp_ = lambda x, y: (x > y) - (x < y) -else: - cmp_ = cmp - -reversed_cmp_ = lambda x, y: -cmp_(x, y) - -__all__ = ['TestResult', 'TestCase', 'TestSuite', - 'TextTestRunner', 'TestLoader', 'FunctionTestCase', 'main', - 'defaultTestLoader', 'SkipTest', 'skip', 'skipIf', 'skipUnless', - 'expectedFailure', 'TextTestResult', '__version__', 'collector'] - -__version__ = '0.5.1' - -# Expose obsolete functions for backwards compatibility -__all__.extend(['getTestCaseNames', 'makeSuite', 'findTestCases']) - - -from unittest2.collector import collector -from unittest2.result import TestResult -from unittest2.case import ( - TestCase, FunctionTestCase, SkipTest, skip, skipIf, - skipUnless, expectedFailure -) -from unittest2.suite import BaseTestSuite, TestSuite -from unittest2.loader import ( - TestLoader, defaultTestLoader, makeSuite, getTestCaseNames, - findTestCases -) -from unittest2.main import TestProgram, main, main_ -from unittest2.runner import TextTestRunner, TextTestResult - -try: - from unittest2.signals import ( - installHandler, registerResult, removeResult, removeHandler - ) -except ImportError: - # Compatibility with platforms that don't have the signal module - pass -else: - __all__.extend(['installHandler', 'registerResult', 'removeResult', - 'removeHandler']) - -# deprecated -_TextTestResult = TextTestResult - -__unittest = True diff --git a/lldb/third_party/Python/module/unittest2/unittest2/__main__.py b/lldb/third_party/Python/module/unittest2/unittest2/__main__.py deleted file mode 100644 index 04ed982df0fbe..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/__main__.py +++ /dev/null @@ -1,10 +0,0 @@ -"""Main entry point""" - -import sys -if sys.argv[0].endswith("__main__.py"): - sys.argv[0] = "unittest2" - -__unittest = True - -from unittest2.main import main_ -main_() diff --git 
a/lldb/third_party/Python/module/unittest2/unittest2/case.py b/lldb/third_party/Python/module/unittest2/unittest2/case.py deleted file mode 100644 index a24b9af98f40b..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/case.py +++ /dev/null @@ -1,1169 +0,0 @@ -"""Test case implementation""" - -import sys -import difflib -import pprint -import re -import unittest -import warnings - -from unittest2 import result -from unittest2.util import ( - safe_repr, safe_str, strclass, - unorderable_list_difference -) - -from unittest2.compatibility import wraps - -__unittest = True - - -DIFF_OMITTED = ('\nDiff is %s characters long. ' - 'Set self.maxDiff to None to see it.') - - -class SkipTest(Exception): - """ - Raise this exception in a test to skip it. - - Usually you can use TestResult.skip() or one of the skipping decorators - instead of raising this directly. - """ - - -class _ExpectedFailure(Exception): - """ - Raise this when a test is expected to fail. - - This is an implementation detail. - """ - - def __init__(self, exc_info, bugnumber=None): - # can't use super because Python 2.4 exceptions are old style - Exception.__init__(self) - self.exc_info = exc_info - self.bugnumber = bugnumber - - -class _UnexpectedSuccess(Exception): - """ - The test was supposed to fail, but it didn't! - """ - - def __init__(self, exc_info, bugnumber=None): - # can't use super because Python 2.4 exceptions are old style - Exception.__init__(self) - self.exc_info = exc_info - self.bugnumber = bugnumber - - -def _id(obj): - return obj - - -def skip(reason): - """ - Unconditionally skip a test. 
- """ - def decorator(test_item): - if not ( - isinstance( - test_item, - type) and issubclass( - test_item, - TestCase)): - @wraps(test_item) - def skip_wrapper(*args, **kwargs): - raise SkipTest(reason) - test_item = skip_wrapper - - test_item.__unittest_skip__ = True - test_item.__unittest_skip_why__ = reason - return test_item - return decorator - - -def skipIf(condition, reason): - """ - Skip a test if the condition is true. - """ - if condition: - return skip(reason) - return _id - - -def skipUnless(condition, reason): - """ - Skip a test unless the condition is true. - """ - if not condition: - return skip(reason) - return _id - - -def expectedFailure(bugnumber=None): - if callable(bugnumber): - @wraps(bugnumber) - def expectedFailure_easy_wrapper(*args, **kwargs): - try: - bugnumber(*args, **kwargs) - except Exception: - raise _ExpectedFailure(sys.exc_info(), None) - raise _UnexpectedSuccess(sys.exc_info(), None) - return expectedFailure_easy_wrapper - else: - def expectedFailure_impl(func): - @wraps(func) - def wrapper(*args, **kwargs): - try: - func(*args, **kwargs) - except Exception: - raise _ExpectedFailure(sys.exc_info(), bugnumber) - raise _UnexpectedSuccess(sys.exc_info(), bugnumber) - return wrapper - return expectedFailure_impl - - -class _AssertRaisesContext(object): - """A context manager used to implement TestCase.assertRaises* methods.""" - - def __init__(self, expected, test_case, expected_regexp=None): - self.expected = expected - self.failureException = test_case.failureException - self.expected_regexp = expected_regexp - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, tb): - if exc_type is None: - try: - exc_name = self.expected.__name__ - except AttributeError: - exc_name = str(self.expected) - raise self.failureException( - "%s not raised" % (exc_name,)) - if not issubclass(exc_type, self.expected): - # let unexpected exceptions pass through - return False - self.exception = exc_value # store for later 
retrieval - if self.expected_regexp is None: - return True - - expected_regexp = self.expected_regexp - if isinstance(expected_regexp, str): - expected_regexp = re.compile(expected_regexp) - if not expected_regexp.search(str(exc_value)): - raise self.failureException( - '"%s" does not match "%s"' % - (expected_regexp.pattern, str(exc_value))) - return True - - -class _TypeEqualityDict(object): - - def __init__(self, testcase): - self.testcase = testcase - self._store = {} - - def __setitem__(self, key, value): - self._store[key] = value - - def __getitem__(self, key): - value = self._store[key] - if isinstance(value, str): - return getattr(self.testcase, value) - return value - - def get(self, key, default=None): - if key in self._store: - return self[key] - return default - - -class TestCase(unittest.TestCase): - """A class whose instances are single test cases. - - By default, the test code itself should be placed in a method named - 'runTest'. - - If the fixture may be used for many test cases, create as - many test methods as are needed. When instantiating such a TestCase - subclass, specify in the constructor arguments the name of the test method - that the instance is to execute. - - Test authors should subclass TestCase for their own tests. Construction - and deconstruction of the test's environment ('fixture') can be - implemented by overriding the 'setUp' and 'tearDown' methods respectively. - - If it is necessary to override the __init__ method, the base class - __init__ method must always be called. It is important that subclasses - should not change the signature of their __init__ method, since instances - of the classes are instantiated automatically by parts of the framework - in order to be run. 
- """ - - # This attribute determines which exception will be raised when - # the instance's assertion methods fail; test methods raising this - # exception will be deemed to have 'failed' rather than 'errored' - - failureException = AssertionError - - # This attribute sets the maximum length of a diff in failure messages - # by assert methods using difflib. It is looked up as an instance attribute - # so can be configured by individual tests if required. - - maxDiff = 80 * 8 - - # This attribute determines whether long messages (including repr of - # objects used in assert methods) will be printed on failure in *addition* - # to any explicit message passed. - - longMessage = True - - # Attribute used by TestSuite for classSetUp - - _classSetupFailed = False - - def __init__(self, methodName='runTest'): - """Create an instance of the class that will use the named test - method when executed. Raises a ValueError if the instance does - not have a method with the specified name. - """ - self._testMethodName = methodName - self._resultForDoCleanups = None - try: - testMethod = getattr(self, methodName) - except AttributeError: - raise ValueError("no such test method in %s: %s" % - (self.__class__, methodName)) - self._testMethodDoc = testMethod.__doc__ - self._cleanups = [] - - # Map types to custom assertEqual functions that will compare - # instances of said type in more detail to generate a more useful - # error message. - self._type_equality_funcs = _TypeEqualityDict(self) - self.addTypeEqualityFunc(dict, 'assertDictEqual') - self.addTypeEqualityFunc(list, 'assertListEqual') - self.addTypeEqualityFunc(tuple, 'assertTupleEqual') - self.addTypeEqualityFunc(set, 'assertSetEqual') - self.addTypeEqualityFunc(frozenset, 'assertSetEqual') - self.addTypeEqualityFunc(str, 'assertMultiLineEqual') - - def addTypeEqualityFunc(self, typeobj, function): - """Add a type specific assertEqual style function to compare a type. 
- - This method is for use by TestCase subclasses that need to register - their own type equality functions to provide nicer error messages. - - Args: - typeobj: The data type to call this function on when both values - are of the same type in assertEqual(). - function: The callable taking two arguments and an optional - msg= argument that raises self.failureException with a - useful error message when the two arguments are not equal. - """ - self._type_equality_funcs[typeobj] = function - - def addCleanup(self, function, *args, **kwargs): - """Add a function, with arguments, to be called when the test is - completed. Functions added are called on a LIFO basis and are - called after tearDown on test failure or success. - - Cleanup items are called even if setUp fails (unlike tearDown).""" - self._cleanups.append((function, args, kwargs)) - - def setUp(self): - "Hook method for setting up the test fixture before exercising it." - - @classmethod - def setUpClass(cls): - "Hook method for setting up class fixture before running tests in the class." - - @classmethod - def tearDownClass(cls): - "Hook method for deconstructing the class fixture after running all tests in the class." - - def tearDown(self): - "Hook method for deconstructing the test fixture after testing it." - - def countTestCases(self): - return 1 - - def defaultTestResult(self): - return result.TestResult() - - def shortDescription(self): - """Returns a one-line description of the test, or None if no - description has been provided. - - The default implementation of this method returns the first line of - the specified test method's docstring. 
- """ - doc = self._testMethodDoc - return doc and doc.split("\n")[0].strip() or None - - def id(self): - return "%s.%s" % (strclass(self.__class__), self._testMethodName) - - def __eq__(self, other): - if not isinstance(self, type(other)): - return NotImplemented - - return self._testMethodName == other._testMethodName - - def __ne__(self, other): - return not self == other - - def __hash__(self): - return hash((type(self), self._testMethodName)) - - def __str__(self): - return "%s (%s)" % (self._testMethodName, strclass(self.__class__)) - - def __repr__(self): - return "<%s testMethod=%s>" % \ - (strclass(self.__class__), self._testMethodName) - - def _addSkip(self, result, reason): - addSkip = getattr(result, 'addSkip', None) - if addSkip is not None: - addSkip(self, reason) - else: - warnings.warn( - "Use of a TestResult without an addSkip method is deprecated", - DeprecationWarning, - 2) - result.addSuccess(self) - - def run(self, result=None): - orig_result = result - if result is None: - result = self.defaultTestResult() - startTestRun = getattr(result, 'startTestRun', None) - if startTestRun is not None: - startTestRun() - - self._resultForDoCleanups = result - result.startTest(self) - - testMethod = getattr(self, self._testMethodName) - - if (getattr(self.__class__, "__unittest_skip__", False) or - getattr(testMethod, "__unittest_skip__", False)): - # If the class or method was skipped. 
- try: - skip_why = ( - getattr( - self.__class__, - '__unittest_skip_why__', - '') or getattr( - testMethod, - '__unittest_skip_why__', - '')) - self._addSkip(result, skip_why) - finally: - result.stopTest(self) - return - try: - success = False - try: - self.setUp() - except SkipTest as e: - self._addSkip(result, str(e)) - except Exception: - result.addError(self, sys.exc_info()) - else: - success = self.runMethod(testMethod, result) - - try: - self.tearDown() - except Exception: - result.addCleanupError(self, sys.exc_info()) - success = False - - self.dumpSessionInfo() - - cleanUpSuccess = self.doCleanups() - success = success and cleanUpSuccess - if success: - result.addSuccess(self) - finally: - result.stopTest(self) - if orig_result is None: - stopTestRun = getattr(result, 'stopTestRun', None) - if stopTestRun is not None: - stopTestRun() - - def runMethod(self, testMethod, result): - """Runs the test method and catches any exception that might be thrown. - - This is factored out of TestCase.run() to ensure that any exception - thrown during the test goes out of scope before tearDown. Otherwise, an - exception could hold references to Python objects that are bound to - SB objects and prevent them from being deleted in time. 
- """ - try: - testMethod() - except self.failureException: - result.addFailure(self, sys.exc_info()) - except _ExpectedFailure as e: - addExpectedFailure = getattr(result, 'addExpectedFailure', None) - if addExpectedFailure is not None: - addExpectedFailure(self, e.exc_info, e.bugnumber) - else: - warnings.warn( - "Use of a TestResult without an addExpectedFailure method is deprecated", - DeprecationWarning) - result.addSuccess(self) - except _UnexpectedSuccess as x: - addUnexpectedSuccess = getattr( - result, 'addUnexpectedSuccess', None) - if addUnexpectedSuccess is not None: - addUnexpectedSuccess(self, x.bugnumber) - else: - warnings.warn( - "Use of a TestResult without an addUnexpectedSuccess method is deprecated", - DeprecationWarning) - result.addFailure(self, sys.exc_info()) - except SkipTest as e: - self._addSkip(result, str(e)) - except Exception: - result.addError(self, sys.exc_info()) - else: - return True - return False - - def doCleanups(self): - """Execute all cleanup functions. Normally called for you after - tearDown.""" - result = self._resultForDoCleanups - ok = True - while self._cleanups: - function, args, kwargs = self._cleanups.pop(-1) - try: - function(*args, **kwargs) - except Exception: - ok = False - result.addError(self, sys.exc_info()) - return ok - - def __call__(self, *args, **kwds): - return self.run(*args, **kwds) - - def debug(self): - """Run the test without collecting errors in a TestResult""" - self.setUp() - getattr(self, self._testMethodName)() - self.tearDown() - while self._cleanups: - function, args, kwargs = self._cleanups.pop(-1) - function(*args, **kwargs) - - def skipTest(self, reason): - """Skip this test.""" - raise SkipTest(reason) - - def fail(self, msg=None): - """Fail immediately, with the given message.""" - raise self.failureException(msg) - - def assertFalse(self, expr, msg=None): - "Fail the test if the expression is true." 
- if expr: - msg = self._formatMessage(msg, "%s is not False" % safe_repr(expr)) - raise self.failureException(msg) - - def assertTrue(self, expr, msg=None): - """Fail the test unless the expression is true.""" - if not expr: - msg = self._formatMessage(msg, "%s is not True" % safe_repr(expr)) - raise self.failureException(msg) - - def _formatMessage(self, msg, standardMsg): - """Honour the longMessage attribute when generating failure messages. - If longMessage is False this means: - * Use only an explicit message if it is provided - * Otherwise use the standard message for the assert - - If longMessage is True: - * Use the standard message - * If an explicit message is provided, plus ' : ' and the explicit message - """ - if not self.longMessage: - return msg or standardMsg - if msg is None: - return standardMsg - try: - return '%s : %s' % (standardMsg, msg) - except UnicodeDecodeError: - return '%s : %s' % (safe_str(standardMsg), safe_str(msg)) - - def assertRaises(self, excClass, callableObj=None, *args, **kwargs): - """Fail unless an exception of class excClass is thrown - by callableObj when invoked with arguments args and keyword - arguments kwargs. If a different type of exception is - thrown, it will not be caught, and the test case will be - deemed to have suffered an error, exactly as for an - unexpected exception. - - If called with callableObj omitted or None, will return a - context object used like this:: - - with self.assertRaises(SomeException): - do_something() - - The context manager keeps a reference to the exception as - the 'exception' attribute. 
This allows you to inspect the - exception after the assertion:: - - with self.assertRaises(SomeException) as cm: - do_something() - the_exception = cm.exception - self.assertEqual(the_exception.error_code, 3) - """ - if callableObj is None: - return _AssertRaisesContext(excClass, self) - try: - callableObj(*args, **kwargs) - except excClass: - return - - if hasattr(excClass, '__name__'): - excName = excClass.__name__ - else: - excName = str(excClass) - raise self.failureException("%s not raised" % excName) - - def _getAssertEqualityFunc(self, first, second): - """Get a detailed comparison function for the types of the two args. - - Returns: A callable accepting (first, second, msg=None) that will - raise a failure exception if first != second with a useful human - readable error message for those types. - """ - # - # NOTE(gregory.p.smith): I considered isinstance(first, type(second)) - # and vice versa. I opted for the conservative approach in case - # subclasses are not intended to be compared in detail to their super - # class instances using a type equality func. This means testing - # subtypes won't automagically use the detailed comparison. Callers - # should use their type specific assertSpamEqual method to compare - # subclasses if the detailed comparison is desired and appropriate. - # See the discussion in http://bugs.python.org/issue2578. - # - if isinstance(first, type(second)): - asserter = self._type_equality_funcs.get(type(first)) - if asserter is not None: - return asserter - - return self._baseAssertEqual - - def _baseAssertEqual(self, first, second, msg=None): - """The default assertEqual implementation, not type specific.""" - if not first == second: - standardMsg = '%s != %s' % (safe_repr(first), safe_repr(second)) - msg = self._formatMessage(msg, standardMsg) - raise self.failureException(msg) - - def assertEqual(self, first, second, msg=None): - """Fail if the two objects are unequal as determined by the '==' - operator. 
- """ - assertion_func = self._getAssertEqualityFunc(first, second) - assertion_func(first, second, msg=msg) - - def assertNotEqual(self, first, second, msg=None): - """Fail if the two objects are equal as determined by the '==' - operator. - """ - if not first != second: - msg = self._formatMessage(msg, '%s == %s' % (safe_repr(first), - safe_repr(second))) - raise self.failureException(msg) - - def assertAlmostEqual( - self, - first, - second, - places=None, - msg=None, - delta=None): - """Fail if the two objects are unequal as determined by their - difference rounded to the given number of decimal places - (default 7) and comparing to zero, or by comparing that the - between the two objects is more than the given delta. - - Note that decimal places (from zero) are usually not the same - as significant digits (measured from the most signficant digit). - - If the two objects compare equal then they will automatically - compare almost equal. - """ - if first == second: - # shortcut - return - if delta is not None and places is not None: - raise TypeError("specify delta or places not both") - - if delta is not None: - if abs(first - second) <= delta: - return - - standardMsg = '%s != %s within %s delta' % (safe_repr(first), - safe_repr(second), - safe_repr(delta)) - else: - if places is None: - places = 7 - - if round(abs(second - first), places) == 0: - return - - standardMsg = '%s != %s within %r places' % (safe_repr(first), - safe_repr(second), - places) - msg = self._formatMessage(msg, standardMsg) - raise self.failureException(msg) - - def assertNotAlmostEqual( - self, - first, - second, - places=None, - msg=None, - delta=None): - """Fail if the two objects are equal as determined by their - difference rounded to the given number of decimal places - (default 7) and comparing to zero, or by comparing that the - between the two objects is less than the given delta. 
- - Note that decimal places (from zero) are usually not the same - as significant digits (measured from the most signficant digit). - - Objects that are equal automatically fail. - """ - if delta is not None and places is not None: - raise TypeError("specify delta or places not both") - if delta is not None: - if not (first == second) and abs(first - second) > delta: - return - standardMsg = '%s == %s within %s delta' % (safe_repr(first), - safe_repr(second), - safe_repr(delta)) - else: - if places is None: - places = 7 - if not (first == second) and round( - abs(second - first), places) != 0: - return - standardMsg = '%s == %s within %r places' % (safe_repr(first), - safe_repr(second), - places) - - msg = self._formatMessage(msg, standardMsg) - raise self.failureException(msg) - - # Synonyms for assertion methods - - # The plurals are undocumented. Keep them that way to discourage use. - # Do not add more. Do not remove. - # Going through a deprecation cycle on these would annoy many people. - assertEquals = assertEqual - assertNotEquals = assertNotEqual - assertAlmostEquals = assertAlmostEqual - assertNotAlmostEquals = assertNotAlmostEqual - assert_ = assertTrue - - # These fail* assertion method names are pending deprecation and will - # be a DeprecationWarning in 3.2; http://bugs.python.org/issue2578 - def _deprecate(original_func): - def deprecated_func(*args, **kwargs): - warnings.warn( - ('Please use %s instead.' 
% original_func.__name__), - PendingDeprecationWarning, 2) - return original_func(*args, **kwargs) - return deprecated_func - - failUnlessEqual = _deprecate(assertEqual) - failIfEqual = _deprecate(assertNotEqual) - failUnlessAlmostEqual = _deprecate(assertAlmostEqual) - failIfAlmostEqual = _deprecate(assertNotAlmostEqual) - failUnless = _deprecate(assertTrue) - failUnlessRaises = _deprecate(assertRaises) - failIf = _deprecate(assertFalse) - - def assertSequenceEqual(self, seq1, seq2, - msg=None, seq_type=None, max_diff=80 * 8): - """An equality assertion for ordered sequences (like lists and tuples). - - For the purposes of this function, a valid ordered sequence type is one - which can be indexed, has a length, and has an equality operator. - - Args: - seq1: The first sequence to compare. - seq2: The second sequence to compare. - seq_type: The expected datatype of the sequences, or None if no - datatype should be enforced. - msg: Optional message to use on failure instead of a list of - differences. - max_diff: Maximum size off the diff, larger diffs are not shown - """ - if seq_type is not None: - seq_type_name = seq_type.__name__ - if not isinstance(seq1, seq_type): - raise self.failureException('First sequence is not a %s: %s' - % (seq_type_name, safe_repr(seq1))) - if not isinstance(seq2, seq_type): - raise self.failureException('Second sequence is not a %s: %s' - % (seq_type_name, safe_repr(seq2))) - else: - seq_type_name = "sequence" - - differing = None - try: - len1 = len(seq1) - except (TypeError, NotImplementedError): - differing = 'First %s has no length. Non-sequence?' % ( - seq_type_name) - - if differing is None: - try: - len2 = len(seq2) - except (TypeError, NotImplementedError): - differing = 'Second %s has no length. Non-sequence?' % ( - seq_type_name) - - if differing is None: - if seq1 == seq2: - return - - seq1_repr = repr(seq1) - seq2_repr = repr(seq2) - if len(seq1_repr) > 30: - seq1_repr = seq1_repr[:30] + '...' 
- if len(seq2_repr) > 30: - seq2_repr = seq2_repr[:30] + '...' - elements = (seq_type_name.capitalize(), seq1_repr, seq2_repr) - differing = '%ss differ: %s != %s\n' % elements - - for i in range(min(len1, len2)): - try: - item1 = seq1[i] - except (TypeError, IndexError, NotImplementedError): - differing += ('\nUnable to index element %d of first %s\n' % - (i, seq_type_name)) - break - - try: - item2 = seq2[i] - except (TypeError, IndexError, NotImplementedError): - differing += ('\nUnable to index element %d of second %s\n' % - (i, seq_type_name)) - break - - if item1 != item2: - differing += ('\nFirst differing element %d:\n%s\n%s\n' % - (i, item1, item2)) - break - else: - if (len1 == len2 and seq_type is None and - not isinstance(seq1, type(seq2))): - # The sequences are the same, but have differing types. - return - - if len1 > len2: - differing += ('\nFirst %s contains %d additional ' - 'elements.\n' % (seq_type_name, len1 - len2)) - try: - differing += ('First extra element %d:\n%s\n' % - (len2, seq1[len2])) - except (TypeError, IndexError, NotImplementedError): - differing += ('Unable to index element %d ' - 'of first %s\n' % (len2, seq_type_name)) - elif len1 < len2: - differing += ('\nSecond %s contains %d additional ' - 'elements.\n' % (seq_type_name, len2 - len1)) - try: - differing += ('First extra element %d:\n%s\n' % - (len1, seq2[len1])) - except (TypeError, IndexError, NotImplementedError): - differing += ('Unable to index element %d ' - 'of second %s\n' % (len1, seq_type_name)) - standardMsg = differing - diffMsg = '\n' + '\n'.join( - difflib.ndiff(pprint.pformat(seq1).splitlines(), - pprint.pformat(seq2).splitlines())) - - standardMsg = self._truncateMessage(standardMsg, diffMsg) - msg = self._formatMessage(msg, standardMsg) - self.fail(msg) - - def _truncateMessage(self, message, diff): - max_diff = self.maxDiff - if max_diff is None or len(diff) <= max_diff: - return message + diff - return message + (DIFF_OMITTED % len(diff)) - - def 
assertListEqual(self, list1, list2, msg=None): - """A list-specific equality assertion. - - Args: - list1: The first list to compare. - list2: The second list to compare. - msg: Optional message to use on failure instead of a list of - differences. - - """ - self.assertSequenceEqual(list1, list2, msg, seq_type=list) - - def assertTupleEqual(self, tuple1, tuple2, msg=None): - """A tuple-specific equality assertion. - - Args: - tuple1: The first tuple to compare. - tuple2: The second tuple to compare. - msg: Optional message to use on failure instead of a list of - differences. - """ - self.assertSequenceEqual(tuple1, tuple2, msg, seq_type=tuple) - - def assertSetEqual(self, set1, set2, msg=None): - """A set-specific equality assertion. - - Args: - set1: The first set to compare. - set2: The second set to compare. - msg: Optional message to use on failure instead of a list of - differences. - - assertSetEqual uses ducktyping to support - different types of sets, and is optimized for sets specifically - (parameters must support a difference method). 
- """ - try: - difference1 = set1.difference(set2) - except TypeError as e: - self.fail('invalid type when attempting set difference: %s' % e) - except AttributeError as e: - self.fail('first argument does not support set difference: %s' % e) - - try: - difference2 = set2.difference(set1) - except TypeError as e: - self.fail('invalid type when attempting set difference: %s' % e) - except AttributeError as e: - self.fail( - 'second argument does not support set difference: %s' % - e) - - if not (difference1 or difference2): - return - - lines = [] - if difference1: - lines.append('Items in the first set but not the second:') - for item in difference1: - lines.append(repr(item)) - if difference2: - lines.append('Items in the second set but not the first:') - for item in difference2: - lines.append(repr(item)) - - standardMsg = '\n'.join(lines) - self.fail(self._formatMessage(msg, standardMsg)) - - def assertIn(self, member, container, msg=None): - """Just like self.assertTrue(a in b), but with a nicer default message.""" - if member not in container: - standardMsg = '%s not found in %s' % (safe_repr(member), - safe_repr(container)) - self.fail(self._formatMessage(msg, standardMsg)) - - def assertNotIn(self, member, container, msg=None): - """Just like self.assertTrue(a not in b), but with a nicer default message.""" - if member in container: - standardMsg = '%s unexpectedly found in %s' % ( - safe_repr(member), safe_repr(container)) - self.fail(self._formatMessage(msg, standardMsg)) - - def assertIs(self, expr1, expr2, msg=None): - """Just like self.assertTrue(a is b), but with a nicer default message.""" - if expr1 is not expr2: - standardMsg = '%s is not %s' % (safe_repr(expr1), safe_repr(expr2)) - self.fail(self._formatMessage(msg, standardMsg)) - - def assertIsNot(self, expr1, expr2, msg=None): - """Just like self.assertTrue(a is not b), but with a nicer default message.""" - if expr1 is expr2: - standardMsg = 'unexpectedly identical: %s' % (safe_repr(expr1),) - 
self.fail(self._formatMessage(msg, standardMsg)) - - def assertDictEqual(self, d1, d2, msg=None): - self.assert_( - isinstance( - d1, - dict), - 'First argument is not a dictionary') - self.assert_( - isinstance( - d2, - dict), - 'Second argument is not a dictionary') - - if d1 != d2: - standardMsg = '%s != %s' % ( - safe_repr(d1, True), safe_repr(d2, True)) - diff = ('\n' + '\n'.join(difflib.ndiff( - pprint.pformat(d1).splitlines(), - pprint.pformat(d2).splitlines()))) - standardMsg = self._truncateMessage(standardMsg, diff) - self.fail(self._formatMessage(msg, standardMsg)) - - def assertDictContainsSubset(self, expected, actual, msg=None): - """Checks whether actual is a superset of expected.""" - missing = [] - mismatched = [] - for key, value in expected.iteritems(): - if key not in actual: - missing.append(key) - elif value != actual[key]: - mismatched.append('%s, expected: %s, actual: %s' % - (safe_repr(key), safe_repr(value), - safe_repr(actual[key]))) - - if not (missing or mismatched): - return - - standardMsg = '' - if missing: - standardMsg = 'Missing: %s' % ','.join(safe_repr(m) for m in - missing) - if mismatched: - if standardMsg: - standardMsg += '; ' - standardMsg += 'Mismatched values: %s' % ','.join(mismatched) - - self.fail(self._formatMessage(msg, standardMsg)) - - def assertItemsEqual(self, expected_seq, actual_seq, msg=None): - """An unordered sequence specific comparison. It asserts that - expected_seq and actual_seq contain the same elements. It is - the equivalent of:: - - self.assertEqual(sorted(expected_seq), sorted(actual_seq)) - - Raises with an error message listing which elements of expected_seq - are missing from actual_seq and vice versa if any. - - Asserts that each element has the same count in both sequences. - Example: - - [0, 1, 1] and [1, 0, 1] compare equal. - - [0, 0, 1] and [0, 1] compare unequal. 
- """ - try: - expected = sorted(expected_seq) - actual = sorted(actual_seq) - except TypeError: - # Unsortable items (example: set(), complex(), ...) - expected = list(expected_seq) - actual = list(actual_seq) - missing, unexpected = unorderable_list_difference( - expected, actual, ignore_duplicate=False - ) - else: - return self.assertSequenceEqual(expected, actual, msg=msg) - - errors = [] - if missing: - errors.append('Expected, but missing:\n %s' % - safe_repr(missing)) - if unexpected: - errors.append('Unexpected, but present:\n %s' % - safe_repr(unexpected)) - if errors: - standardMsg = '\n'.join(errors) - self.fail(self._formatMessage(msg, standardMsg)) - - def assertMultiLineEqual(self, first, second, msg=None): - """Assert that two multi-line strings are equal.""" - self.assert_(isinstance(first, str), ( - 'First argument is not a string')) - self.assert_(isinstance(second, str), ( - 'Second argument is not a string')) - - if first != second: - standardMsg = '%s != %s' % ( - safe_repr(first, True), safe_repr(second, True)) - diff = '\n' + ''.join(difflib.ndiff(first.splitlines(True), - second.splitlines(True))) - standardMsg = self._truncateMessage(standardMsg, diff) - self.fail(self._formatMessage(msg, standardMsg)) - - def assertLess(self, a, b, msg=None): - """Just like self.assertTrue(a < b), but with a nicer default message.""" - if not a < b: - standardMsg = '%s not less than %s' % (safe_repr(a), safe_repr(b)) - self.fail(self._formatMessage(msg, standardMsg)) - - def assertLessEqual(self, a, b, msg=None): - """Just like self.assertTrue(a <= b), but with a nicer default message.""" - if not a <= b: - standardMsg = '%s not less than or equal to %s' % ( - safe_repr(a), safe_repr(b)) - self.fail(self._formatMessage(msg, standardMsg)) - - def assertGreater(self, a, b, msg=None): - """Just like self.assertTrue(a > b), but with a nicer default message.""" - if not a > b: - standardMsg = '%s not greater than %s' % ( - safe_repr(a), safe_repr(b)) - 
self.fail(self._formatMessage(msg, standardMsg)) - - def assertGreaterEqual(self, a, b, msg=None): - """Just like self.assertTrue(a >= b), but with a nicer default message.""" - if not a >= b: - standardMsg = '%s not greater than or equal to %s' % ( - safe_repr(a), safe_repr(b)) - self.fail(self._formatMessage(msg, standardMsg)) - - def assertIsNone(self, obj, msg=None): - """Same as self.assertTrue(obj is None), with a nicer default message.""" - if obj is not None: - standardMsg = '%s is not None' % (safe_repr(obj),) - self.fail(self._formatMessage(msg, standardMsg)) - - def assertIsNotNone(self, obj, msg=None): - """Included for symmetry with assertIsNone.""" - if obj is None: - standardMsg = 'unexpectedly None' - self.fail(self._formatMessage(msg, standardMsg)) - - def assertIsInstance(self, obj, cls, msg=None): - """Same as self.assertTrue(isinstance(obj, cls)), with a nicer - default message.""" - if not isinstance(obj, cls): - standardMsg = '%s is not an instance of %r' % (safe_repr(obj), cls) - self.fail(self._formatMessage(msg, standardMsg)) - - def assertNotIsInstance(self, obj, cls, msg=None): - """Included for symmetry with assertIsInstance.""" - if isinstance(obj, cls): - standardMsg = '%s is an instance of %r' % (safe_repr(obj), cls) - self.fail(self._formatMessage(msg, standardMsg)) - - def assertRaisesRegexp(self, expected_exception, expected_regexp, - callable_obj=None, *args, **kwargs): - """Asserts that the message in a raised exception matches a regexp. - - Args: - expected_exception: Exception class expected to be raised. - expected_regexp: Regexp (re pattern object or string) expected - to be found in error message. - callable_obj: Function to be called. - args: Extra args. - kwargs: Extra kwargs. 
- """ - if callable_obj is None: - return _AssertRaisesContext( - expected_exception, self, expected_regexp) - try: - callable_obj(*args, **kwargs) - except expected_exception as exc_value: - if isinstance(expected_regexp, str): - expected_regexp = re.compile(expected_regexp) - if not expected_regexp.search(str(exc_value)): - raise self.failureException( - '"%s" does not match "%s"' % - (expected_regexp.pattern, str(exc_value))) - else: - if hasattr(expected_exception, '__name__'): - excName = expected_exception.__name__ - else: - excName = str(expected_exception) - raise self.failureException("%s not raised" % excName) - - def assertRegexpMatches(self, text, expected_regexp, msg=None): - """Fail the test unless the text matches the regular expression.""" - if isinstance(expected_regexp, str): - expected_regexp = re.compile(expected_regexp) - if not expected_regexp.search(text): - msg = msg or "Regexp didn't match" - msg = '%s: %r not found in %r' % ( - msg, expected_regexp.pattern, text) - raise self.failureException(msg) - - def assertNotRegexpMatches(self, text, unexpected_regexp, msg=None): - """Fail the test if the text matches the regular expression.""" - if isinstance(unexpected_regexp, str): - unexpected_regexp = re.compile(unexpected_regexp) - match = unexpected_regexp.search(text) - if match: - msg = msg or "Regexp matched" - msg = '%s: %r matches %r in %r' % (msg, - text[match.start():match.end()], - unexpected_regexp.pattern, - text) - raise self.failureException(msg) - - -class FunctionTestCase(TestCase): - """A test case that wraps a test function. - - This is useful for slipping pre-existing test functions into the - unittest framework. Optionally, set-up and tidy-up functions can be - supplied. As with TestCase, the tidy-up ('tearDown') function will - always be called if the set-up ('setUp') function ran successfully. 
- """ - - def __init__(self, testFunc, setUp=None, tearDown=None, description=None): - super(FunctionTestCase, self).__init__() - self._setUpFunc = setUp - self._tearDownFunc = tearDown - self._testFunc = testFunc - self._description = description - - def setUp(self): - if self._setUpFunc is not None: - self._setUpFunc() - - def tearDown(self): - if self._tearDownFunc is not None: - self._tearDownFunc() - - def runTest(self): - self._testFunc() - - def id(self): - return self._testFunc.__name__ - - def __eq__(self, other): - if not isinstance(other, self.__class__): - return NotImplemented - - return self._setUpFunc == other._setUpFunc and \ - self._tearDownFunc == other._tearDownFunc and \ - self._testFunc == other._testFunc and \ - self._description == other._description - - def __ne__(self, other): - return not self == other - - def __hash__(self): - return hash((type(self), self._setUpFunc, self._tearDownFunc, - self._testFunc, self._description)) - - def __str__(self): - return "%s (%s)" % (strclass(self.__class__), - self._testFunc.__name__) - - def __repr__(self): - return "<%s testFunc=%s>" % (strclass(self.__class__), - self._testFunc) - - def shortDescription(self): - if self._description is not None: - return self._description - doc = self._testFunc.__doc__ - return doc and doc.split("\n")[0].strip() or None diff --git a/lldb/third_party/Python/module/unittest2/unittest2/collector.py b/lldb/third_party/Python/module/unittest2/unittest2/collector.py deleted file mode 100644 index b9013e8888850..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/collector.py +++ /dev/null @@ -1,10 +0,0 @@ -import os -import sys -from unittest2.loader import defaultTestLoader - - -def collector(): - # import __main__ triggers code re-execution - __main__ = sys.modules['__main__'] - setupDir = os.path.abspath(os.path.dirname(__main__.__file__)) - return defaultTestLoader.discover(setupDir) diff --git 
a/lldb/third_party/Python/module/unittest2/unittest2/compatibility.py b/lldb/third_party/Python/module/unittest2/unittest2/compatibility.py deleted file mode 100644 index 3adcdc4b35bd8..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/compatibility.py +++ /dev/null @@ -1,67 +0,0 @@ -import os -import sys - -try: - from functools import wraps -except ImportError: - # only needed for Python 2.4 - def wraps(_): - def _wraps(func): - return func - return _wraps - -__unittest = True - - -def _relpath_nt(path, start=os.path.curdir): - """Return a relative version of a path""" - - if not path: - raise ValueError("no path specified") - start_list = os.path.abspath(start).split(os.path.sep) - path_list = os.path.abspath(path).split(os.path.sep) - if start_list[0].lower() != path_list[0].lower(): - unc_path, rest = os.path.splitunc(path) - unc_start, rest = os.path.splitunc(start) - if bool(unc_path) ^ bool(unc_start): - raise ValueError("Cannot mix UNC and non-UNC paths (%s and %s)" - % (path, start)) - else: - raise ValueError("path is on drive %s, start on drive %s" - % (path_list[0], start_list[0])) - # Work out how much of the filepath is shared by start and path. - for i in range(min(len(start_list), len(path_list))): - if start_list[i].lower() != path_list[i].lower(): - break - else: - i += 1 - - rel_list = [os.path.pardir] * (len(start_list) - i) + path_list[i:] - if not rel_list: - return os.path.curdir - return os.path.join(*rel_list) - -# default to posixpath definition - - -def _relpath_posix(path, start=os.path.curdir): - """Return a relative version of a path""" - - if not path: - raise ValueError("no path specified") - - start_list = os.path.abspath(start).split(os.path.sep) - path_list = os.path.abspath(path).split(os.path.sep) - - # Work out how much of the filepath is shared by start and path. 
- i = len(os.path.commonprefix([start_list, path_list])) - - rel_list = [os.path.pardir] * (len(start_list) - i) + path_list[i:] - if not rel_list: - return os.path.curdir - return os.path.join(*rel_list) - -if os.path is sys.modules.get('ntpath'): - relpath = _relpath_nt -else: - relpath = _relpath_posix diff --git a/lldb/third_party/Python/module/unittest2/unittest2/loader.py b/lldb/third_party/Python/module/unittest2/unittest2/loader.py deleted file mode 100644 index 87edddca40281..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/loader.py +++ /dev/null @@ -1,339 +0,0 @@ -"""Loading unittests.""" - -import functools -import os -import re -import sys -import traceback -import types -import unittest - -from fnmatch import fnmatch - -from unittest2 import case, suite, cmp_ - -try: - from os.path import relpath -except ImportError: - from unittest2.compatibility import relpath - -__unittest = True - -# what about .pyc or .pyo (etc) -# we would need to avoid loading the same tests multiple times -# from '.py', '.pyc' *and* '.pyo' -VALID_MODULE_NAME = re.compile(r'[_a-z]\w*\.py$', re.IGNORECASE) - - -def _make_failed_import_test(name, suiteClass): - message = 'Failed to import test module: %s' % name - if hasattr(traceback, 'format_exc'): - # Python 2.3 compatibility - # format_exc returns two frames of discover.py as well - message += '\n%s' % traceback.format_exc() - return _make_failed_test('ModuleImportFailure', name, ImportError(message), - suiteClass) - - -def _make_failed_load_tests(name, exception, suiteClass): - return _make_failed_test('LoadTestsFailure', name, exception, suiteClass) - - -def _make_failed_test(classname, methodname, exception, suiteClass): - def testFailure(self): - raise exception - attrs = {methodname: testFailure} - TestClass = type(classname, (case.TestCase,), attrs) - return suiteClass((TestClass(methodname),)) - - -class TestLoader(unittest.TestLoader): - """ - This class is responsible for loading tests according 
to various criteria - and returning them wrapped in a TestSuite - """ - - def __init__(self): - self.testMethodPrefix = 'test' - self.sortTestMethodsUsing = cmp_ - self.suiteClass = suite.TestSuite - self._top_level_dir = None - - def loadTestsFromTestCase(self, testCaseClass): - """Return a suite of all tests cases contained in testCaseClass""" - if issubclass(testCaseClass, suite.TestSuite): - raise TypeError("Test cases should not be derived from TestSuite." - " Maybe you meant to derive from TestCase?") - testCaseNames = self.getTestCaseNames(testCaseClass) - if not testCaseNames and hasattr(testCaseClass, 'runTest'): - testCaseNames = ['runTest'] - loaded_suite = self.suiteClass(map(testCaseClass, testCaseNames)) - return loaded_suite - - def loadTestsFromModule(self, module, use_load_tests=True): - """Return a suite of all tests cases contained in the given module""" - tests = [] - for name in dir(module): - obj = getattr(module, name) - if isinstance(obj, type) and issubclass(obj, unittest.TestCase): - tests.append(self.loadTestsFromTestCase(obj)) - - load_tests = getattr(module, 'load_tests', None) - tests = self.suiteClass(tests) - if use_load_tests and load_tests is not None: - try: - return load_tests(self, tests, None) - except Exception as e: - return _make_failed_load_tests(module.__name__, e, - self.suiteClass) - return tests - - def loadTestsFromName(self, name, module=None): - """Return a suite of all tests cases given a string specifier. - - The name may resolve either to a module, a test case class, a - test method within a test case class, or a callable object which - returns a TestCase or TestSuite instance. - - The method optionally resolves the names relative to a given module. 
- """ - parts = name.split('.') - if module is None: - parts_copy = parts[:] - while parts_copy: - try: - module = __import__('.'.join(parts_copy)) - break - except ImportError: - del parts_copy[-1] - if not parts_copy: - raise - parts = parts[1:] - obj = module - for part in parts: - parent, obj = obj, getattr(obj, part) - - if isinstance(obj, types.ModuleType): - return self.loadTestsFromModule(obj) - elif isinstance(obj, type) and issubclass(obj, unittest.TestCase): - return self.loadTestsFromTestCase(obj) - elif (isinstance(obj, (types.MethodType, types.FunctionType)) and - isinstance(parent, type) and - issubclass(parent, case.TestCase)): - return self.suiteClass([parent(obj.__name__)]) - elif isinstance(obj, unittest.TestSuite): - return obj - elif hasattr(obj, '__call__'): - test = obj() - if isinstance(test, unittest.TestSuite): - return test - elif isinstance(test, unittest.TestCase): - return self.suiteClass([test]) - else: - raise TypeError("calling %s returned %s, not a test" % - (obj, test)) - else: - raise TypeError("don't know how to make test from: %s" % obj) - - def loadTestsFromNames(self, names, module=None): - """Return a suite of all tests cases found using the given sequence - of string specifiers. See 'loadTestsFromName()'. 
- """ - suites = [self.loadTestsFromName(name, module) for name in names] - return self.suiteClass(suites) - - def getTestCaseNames(self, testCaseClass): - """Return a sorted sequence of method names found within testCaseClass - """ - def isTestMethod(attrname, testCaseClass=testCaseClass, - prefix=self.testMethodPrefix): - return attrname.startswith(prefix) and \ - hasattr(getattr(testCaseClass, attrname), '__call__') - testFnNames = list(filter(isTestMethod, dir(testCaseClass))) - if self.sortTestMethodsUsing: - testFnNames.sort( - key=functools.cmp_to_key( - self.sortTestMethodsUsing)) - return testFnNames - - def discover(self, start_dir, pattern='test*.py', top_level_dir=None): - """Find and return all test modules from the specified start - directory, recursing into subdirectories to find them. Only test files - that match the pattern will be loaded. (Using shell style pattern - matching.) - - All test modules must be importable from the top level of the project. - If the start directory is not the top level directory then the top - level directory must be specified separately. - - If a test package name (directory with '__init__.py') matches the - pattern then the package will be checked for a 'load_tests' function. If - this exists then it will be called with loader, tests, pattern. - - If load_tests exists then discovery does *not* recurse into the package, - load_tests is responsible for loading all tests in the package. - - The pattern is deliberately not stored as a loader attribute so that - packages can continue discovery themselves. top_level_dir is stored so - load_tests does not need to pass this argument in to loader.discover(). 
- """ - set_implicit_top = False - if top_level_dir is None and self._top_level_dir is not None: - # make top_level_dir optional if called from load_tests in a - # package - top_level_dir = self._top_level_dir - elif top_level_dir is None: - set_implicit_top = True - top_level_dir = start_dir - - top_level_dir = os.path.abspath(top_level_dir) - - if top_level_dir not in sys.path: - # all test modules must be importable from the top level directory - # should we *unconditionally* put the start directory in first - # in sys.path to minimise likelihood of conflicts between installed - # modules and development versions? - sys.path.insert(0, top_level_dir) - self._top_level_dir = top_level_dir - - is_not_importable = False - if os.path.isdir(os.path.abspath(start_dir)): - start_dir = os.path.abspath(start_dir) - if start_dir != top_level_dir: - is_not_importable = not os.path.isfile( - os.path.join(start_dir, '__init__.py')) - else: - # support for discovery from dotted module names - try: - __import__(start_dir) - except ImportError: - is_not_importable = True - else: - the_module = sys.modules[start_dir] - top_part = start_dir.split('.')[0] - start_dir = os.path.abspath( - os.path.dirname((the_module.__file__))) - if set_implicit_top: - self._top_level_dir = os.path.abspath(os.path.dirname( - os.path.dirname(sys.modules[top_part].__file__))) - sys.path.remove(top_level_dir) - - if is_not_importable: - raise ImportError( - 'Start directory is not importable: %r' % - start_dir) - - tests = list(self._find_tests(start_dir, pattern)) - return self.suiteClass(tests) - - def _get_name_from_path(self, path): - path = os.path.splitext(os.path.normpath(path))[0] - - _relpath = relpath(path, self._top_level_dir) - assert not os.path.isabs(_relpath), "Path must be within the project" - assert not _relpath.startswith('..'), "Path must be within the project" - - name = _relpath.replace(os.path.sep, '.') - return name - - def _get_module_from_name(self, name): - __import__(name) - 
return sys.modules[name] - - def _match_path(self, path, full_path, pattern): - # override this method to use alternative matching strategy - return fnmatch(path, pattern) - - def _find_tests(self, start_dir, pattern): - """Used by discovery. Yields test suites it loads.""" - paths = os.listdir(start_dir) - - for path in paths: - full_path = os.path.join(start_dir, path) - if os.path.isfile(full_path): - if not VALID_MODULE_NAME.match(path): - # valid Python identifiers only - continue - if not self._match_path(path, full_path, pattern): - continue - # if the test file matches, load it - name = self._get_name_from_path(full_path) - try: - module = self._get_module_from_name(name) - except: - yield _make_failed_import_test(name, self.suiteClass) - else: - mod_file = os.path.abspath( - getattr(module, '__file__', full_path)) - realpath = os.path.splitext(mod_file)[0] - fullpath_noext = os.path.splitext(full_path)[0] - if realpath.lower() != fullpath_noext.lower(): - module_dir = os.path.dirname(realpath) - mod_name = os.path.splitext( - os.path.basename(full_path))[0] - expected_dir = os.path.dirname(full_path) - msg = ( - "%r module incorrectly imported from %r. Expected %r. 
" - "Is this module globally installed?") - raise ImportError(msg % - (mod_name, module_dir, expected_dir)) - yield self.loadTestsFromModule(module) - elif os.path.isdir(full_path): - if not os.path.isfile(os.path.join(full_path, '__init__.py')): - continue - - load_tests = None - tests = None - if fnmatch(path, pattern): - # only check load_tests if the package directory itself - # matches the filter - name = self._get_name_from_path(full_path) - package = self._get_module_from_name(name) - load_tests = getattr(package, 'load_tests', None) - tests = self.loadTestsFromModule( - package, use_load_tests=False) - - if load_tests is None: - if tests is not None: - # tests loaded from package file - yield tests - # recurse into the package - for test in self._find_tests(full_path, pattern): - yield test - else: - try: - yield load_tests(self, tests, pattern) - except Exception as e: - yield _make_failed_load_tests(package.__name__, e, - self.suiteClass) - -defaultTestLoader = TestLoader() - - -def _makeLoader(prefix, sortUsing, suiteClass=None): - loader = TestLoader() - loader.sortTestMethodsUsing = sortUsing - loader.testMethodPrefix = prefix - if suiteClass: - loader.suiteClass = suiteClass - return loader - - -def getTestCaseNames(testCaseClass, prefix, sortUsing=cmp_): - return _makeLoader(prefix, sortUsing).getTestCaseNames(testCaseClass) - - -def makeSuite(testCaseClass, prefix='test', sortUsing=cmp_, - suiteClass=suite.TestSuite): - return _makeLoader( - prefix, - sortUsing, - suiteClass).loadTestsFromTestCase(testCaseClass) - - -def findTestCases(module, prefix='test', sortUsing=cmp_, - suiteClass=suite.TestSuite): - return _makeLoader( - prefix, - sortUsing, - suiteClass).loadTestsFromModule(module) diff --git a/lldb/third_party/Python/module/unittest2/unittest2/main.py b/lldb/third_party/Python/module/unittest2/unittest2/main.py deleted file mode 100644 index 0d3c5b8bd6c4d..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/main.py +++ 
/dev/null @@ -1,257 +0,0 @@ -"""Unittest main program""" - -import sys -import os -import types - -from unittest2 import loader, runner -try: - from unittest2.signals import installHandler -except ImportError: - installHandler = None - -__unittest = True - -FAILFAST = " -f, --failfast Stop on first failure\n" -CATCHBREAK = " -c, --catch Catch control-C and display results\n" -BUFFEROUTPUT = " -b, --buffer Buffer stdout and stderr during test runs\n" - -USAGE_AS_MAIN = """\ -Usage: %(progName)s [options] [tests] - -Options: - -h, --help Show this message - -v, --verbose Verbose output - -q, --quiet Minimal output -%(failfast)s%(catchbreak)s%(buffer)s -Examples: - %(progName)s test_module - run tests from test_module - %(progName)s test_module.TestClass - run tests from - test_module.TestClass - %(progName)s test_module.TestClass.test_method - run specified test method - -[tests] can be a list of any number of test modules, classes and test -methods. - -Alternative Usage: %(progName)s discover [options] - -Options: - -v, --verbose Verbose output -%(failfast)s%(catchbreak)s%(buffer)s -s directory Directory to start discovery ('.' default) - -p pattern Pattern to match test files ('test*.py' default) - -t directory Top level directory of project (default to - start directory) - -For test discovery all test modules must be importable from the top -level directory of the project. -""" - -USAGE_FROM_MODULE = """\ -Usage: %(progName)s [options] [test] [...] 
- -Options: - -h, --help Show this message - -v, --verbose Verbose output - -q, --quiet Minimal output -%(failfast)s%(catchbreak)s%(buffer)s -Examples: - %(progName)s - run default set of tests - %(progName)s MyTestSuite - run suite 'MyTestSuite' - %(progName)s MyTestCase.testSomething - run MyTestCase.testSomething - %(progName)s MyTestCase - run all 'test*' test methods - in MyTestCase -""" - - -class TestProgram(object): - """A command-line program that runs a set of tests; this is primarily - for making test modules conveniently executable. - """ - USAGE = USAGE_FROM_MODULE - - # defaults for testing - failfast = catchbreak = buffer = progName = None - - def __init__(self, module='__main__', defaultTest=None, - argv=None, testRunner=None, - testLoader=loader.defaultTestLoader, exit=True, - verbosity=1, failfast=None, catchbreak=None, buffer=None): - if isinstance(module, str): - self.module = __import__(module) - for part in module.split('.')[1:]: - self.module = getattr(self.module, part) - else: - self.module = module - if argv is None: - argv = sys.argv - - self.exit = exit - self.verbosity = verbosity - self.failfast = failfast - self.catchbreak = catchbreak - self.buffer = buffer - self.defaultTest = defaultTest - self.testRunner = testRunner - self.testLoader = testLoader - self.progName = os.path.basename(argv[0]) - self.parseArgs(argv) - self.runTests() - - def usageExit(self, msg=None): - if msg: - print(msg) - usage = {'progName': self.progName, 'catchbreak': '', 'failfast': '', - 'buffer': ''} - if self.failfast: - usage['failfast'] = FAILFAST - if self.catchbreak and installHandler is not None: - usage['catchbreak'] = CATCHBREAK - if self.buffer: - usage['buffer'] = BUFFEROUTPUT - print(self.USAGE % usage) - sys.exit(2) - - def parseArgs(self, argv): - if len(argv) > 1 and argv[1].lower() == 'discover': - self._do_discovery(argv[2:]) - return - - import getopt - long_opts = ['help', 'verbose', 'quiet', 'failfast', 'catch', 'buffer'] - try: - 
options, args = getopt.getopt(argv[1:], 'hHvqfcb', long_opts) - for opt, value in options: - if opt in ('-h', '-H', '--help'): - self.usageExit() - if opt in ('-q', '--quiet'): - self.verbosity = 0 - if opt in ('-v', '--verbose'): - self.verbosity = 2 - if opt in ('-f', '--failfast'): - if self.failfast is None: - self.failfast = True - # Should this raise an exception if -f is not valid? - if opt in ('-c', '--catch'): - if self.catchbreak is None and installHandler is not None: - self.catchbreak = True - # Should this raise an exception if -c is not valid? - if opt in ('-b', '--buffer'): - if self.buffer is None: - self.buffer = True - # Should this raise an exception if -b is not valid? - if len(args) == 0 and self.defaultTest is None: - # createTests will load tests from self.module - self.testNames = None - elif len(args) > 0: - self.testNames = args - if __name__ == '__main__': - # to support python -m unittest ... - self.module = None - else: - self.testNames = (self.defaultTest,) - self.createTests() - except getopt.error as msg: - self.usageExit(msg) - - def createTests(self): - if self.testNames is None: - self.test = self.testLoader.loadTestsFromModule(self.module) - else: - self.test = self.testLoader.loadTestsFromNames(self.testNames, - self.module) - - def _do_discovery(self, argv, Loader=loader.TestLoader): - # handle command line args for test discovery - self.progName = '%s discover' % self.progName - import optparse - parser = optparse.OptionParser() - parser.prog = self.progName - parser.add_option('-v', '--verbose', dest='verbose', default=False, - help='Verbose output', action='store_true') - if self.failfast: - parser.add_option( - '-f', - '--failfast', - dest='failfast', - default=False, - help='Stop on first fail or error', - action='store_true') - if self.catchbreak and installHandler is not None: - parser.add_option( - '-c', - '--catch', - dest='catchbreak', - default=False, - help='Catch ctrl-C and display results so far', - 
action='store_true') - if self.buffer: - parser.add_option('-b', '--buffer', dest='buffer', default=False, - help='Buffer stdout and stderr during tests', - action='store_true') - parser.add_option('-s', '--start-directory', dest='start', default='.', - help="Directory to start discovery ('.' default)") - parser.add_option( - '-p', - '--pattern', - dest='pattern', - default='test*.py', - help="Pattern to match tests ('test*.py' default)") - parser.add_option( - '-t', - '--top-level-directory', - dest='top', - default=None, - help='Top level directory of project (defaults to start directory)') - - options, args = parser.parse_args(argv) - if len(args) > 3: - self.usageExit() - - for name, value in zip(('start', 'pattern', 'top'), args): - setattr(options, name, value) - - # only set options from the parsing here - # if they weren't set explicitly in the constructor - if self.failfast is None: - self.failfast = options.failfast - if self.catchbreak is None and installHandler is not None: - self.catchbreak = options.catchbreak - if self.buffer is None: - self.buffer = options.buffer - - if options.verbose: - self.verbosity = 2 - - start_dir = options.start - pattern = options.pattern - top_level_dir = options.top - - loader = Loader() - self.test = loader.discover(start_dir, pattern, top_level_dir) - - def runTests(self): - if self.catchbreak: - installHandler() - if self.testRunner is None: - self.testRunner = runner.TextTestRunner - if isinstance(self.testRunner, (type, types.ClassType)): - try: - testRunner = self.testRunner(verbosity=self.verbosity, - failfast=self.failfast, - buffer=self.buffer) - except TypeError: - # didn't accept the verbosity, buffer or failfast arguments - testRunner = self.testRunner() - else: - # it is assumed to be a TestRunner instance - testRunner = self.testRunner - self.result = testRunner.run(self.test) - if self.exit: - sys.exit(not self.result.wasSuccessful()) - -main = TestProgram - - -def main_(): - TestProgram.USAGE = 
USAGE_AS_MAIN - main(module=None) diff --git a/lldb/third_party/Python/module/unittest2/unittest2/result.py b/lldb/third_party/Python/module/unittest2/unittest2/result.py deleted file mode 100644 index 97eb4fa851481..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/result.py +++ /dev/null @@ -1,197 +0,0 @@ -"""Test result object""" - -import use_lldb_suite - -import io -import sys -import traceback -import unittest - -from unittest2 import util -from unittest2.compatibility import wraps - -__unittest = True - - -def failfast(method): - @wraps(method) - def inner(self, *args, **kw): - if getattr(self, 'failfast', False): - self.stop() - return method(self, *args, **kw) - return inner - - -STDOUT_LINE = '\nStdout:\n%s' -STDERR_LINE = '\nStderr:\n%s' - - -class TestResult(unittest.TestResult): - """Holder for test result information. - - Test results are automatically managed by the TestCase and TestSuite - classes, and do not need to be explicitly manipulated by writers of tests. - - Each instance holds the total number of tests run, and collections of - failures and errors that occurred among those test runs. The collections - contain tuples of (testcase, exceptioninfo), where exceptioninfo is the - formatted traceback of the error that occurred. 
- """ - _previousTestClass = None - _moduleSetUpFailed = False - - def __init__(self): - self.failfast = False - self.failures = [] - self.passes = [] - self.errors = [] - self.cleanup_errors = [] - self.testsRun = 0 - self.skipped = [] - self.expectedFailures = [] - self.unexpectedSuccesses = [] - self.shouldStop = False - self.buffer = False - self._stdout_buffer = None - self._stderr_buffer = None - self._original_stdout = sys.stdout - self._original_stderr = sys.stderr - self._mirrorOutput = False - - def startTest(self, test): - "Called when the given test is about to be run" - self.testsRun += 1 - self._mirrorOutput = False - if self.buffer: - if self._stderr_buffer is None: - self._stderr_buffer = io.StringIO() - self._stdout_buffer = io.StringIO() - sys.stdout = self._stdout_buffer - sys.stderr = self._stderr_buffer - - def startTestRun(self): - """Called once before any tests are executed. - - See startTest for a method called before each test. - """ - - def stopTest(self, test): - """Called when the given test has been run""" - if self.buffer: - if self._mirrorOutput: - output = sys.stdout.getvalue() - error = sys.stderr.getvalue() - if output: - if not output.endswith('\n'): - output += '\n' - self._original_stdout.write(STDOUT_LINE % output) - if error: - if not error.endswith('\n'): - error += '\n' - self._original_stderr.write(STDERR_LINE % error) - - sys.stdout = self._original_stdout - sys.stderr = self._original_stderr - self._stdout_buffer.seek(0) - self._stdout_buffer.truncate() - self._stderr_buffer.seek(0) - self._stderr_buffer.truncate() - self._mirrorOutput = False - - def stopTestRun(self): - """Called once after all tests are executed. - - See stopTest for a method called after each test. - """ - - @failfast - def addError(self, test, err): - """Called when an error has occurred. 'err' is a tuple of values as - returned by sys.exc_info(). 
- """ - self.errors.append((test, self._exc_info_to_string(err, test))) - self._mirrorOutput = True - - def addCleanupError(self, test, err): - """Called when an error has occurred during cleanup. 'err' is a tuple of - values as returned by sys.exc_info(). - """ - self.cleanup_errors.append((test, self._exc_info_to_string(err, test))) - self._mirrorOutput = True - - @failfast - def addFailure(self, test, err): - """Called when an error has occurred. 'err' is a tuple of values as - returned by sys.exc_info().""" - self.failures.append((test, self._exc_info_to_string(err, test))) - self._mirrorOutput = True - - def addSuccess(self, test): - "Called when a test has completed successfully" - self.passes.append(test) - pass - - def addSkip(self, test, reason): - """Called when a test is skipped.""" - self.skipped.append((test, reason)) - - def addExpectedFailure(self, test, err, bugnumber): - """Called when an expected failure/error occured.""" - self.expectedFailures.append( - (test, self._exc_info_to_string(err, test))) - - @failfast - def addUnexpectedSuccess(self, test, bugnumber): - """Called when a test was expected to fail, but succeed.""" - self.unexpectedSuccesses.append(test) - - def wasSuccessful(self): - "Tells whether or not this result was a success" - return (len(self.failures) + - len(self.errors) + - len(self.unexpectedSuccesses) == 0) - - def stop(self): - "Indicates that the tests should be aborted" - self.shouldStop = True - - def _exc_info_to_string(self, err, test): - """Converts a sys.exc_info()-style tuple of values into a string.""" - exctype, value, tb = err - # Skip test runner traceback levels - while tb and self._is_relevant_tb_level(tb): - tb = tb.tb_next - if exctype is test.failureException: - # Skip assert*() traceback levels - length = self._count_relevant_tb_levels(tb) - msgLines = traceback.format_exception(exctype, value, tb, length) - else: - msgLines = traceback.format_exception(exctype, value, tb) - - if self.buffer: - output = 
sys.stdout.getvalue() - error = sys.stderr.getvalue() - if output: - if not output.endswith('\n'): - output += '\n' - msgLines.append(STDOUT_LINE % output) - if error: - if not error.endswith('\n'): - error += '\n' - msgLines.append(STDERR_LINE % error) - return ''.join(msgLines) - - def _is_relevant_tb_level(self, tb): - return '__unittest' in tb.tb_frame.f_globals - - def _count_relevant_tb_levels(self, tb): - length = 0 - while tb and not self._is_relevant_tb_level(tb): - length += 1 - tb = tb.tb_next - return length - - def __repr__(self): - return "<%s run=%i errors=%i failures=%i>" % \ - (util.strclass(self.__class__), self.testsRun, len(self.errors), - len(self.failures)) diff --git a/lldb/third_party/Python/module/unittest2/unittest2/runner.py b/lldb/third_party/Python/module/unittest2/unittest2/runner.py deleted file mode 100644 index 93fc104a58619..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/runner.py +++ /dev/null @@ -1,206 +0,0 @@ -"""Running tests""" - -import sys -import time -import unittest -import progress - -from unittest2 import result - -try: - from unittest2.signals import registerResult -except ImportError: - def registerResult(_): - pass - -__unittest = True - - -class _WritelnDecorator(object): - """Used to decorate file-like objects with a handy 'writeln' method""" - - def __init__(self, stream): - self.stream = stream - - def __getattr__(self, attr): - if attr in ('stream', '__getstate__'): - raise AttributeError(attr) - return getattr(self.stream, attr) - - def writeln(self, arg=None): - if arg: - self.write(arg) - self.write('\n') # text-mode streams translate to \r\n if needed - - -class TextTestResult(result.TestResult): - """A test result class that can print formatted text results to a stream. - - Used by TextTestRunner. 
- """ - separator1 = '=' * 70 - separator2 = '-' * 70 - - def __init__(self, stream, descriptions, verbosity): - super(TextTestResult, self).__init__() - self.stream = stream - self.showAll = verbosity > 1 - self.dots = verbosity == 1 - self.descriptions = descriptions - self.progressbar = None - - if self.dots: - self.stream.writeln( - ".=success F=fail E=error s=skipped x=expected-fail u=unexpected-success") - self.stream.writeln("") - self.stream.flush() - - def getDescription(self, test): - doc_first_line = test.shortDescription() - if self.descriptions and doc_first_line: - return '\n'.join((str(test), doc_first_line)) - else: - return str(test) - - def startTest(self, test): - super(TextTestResult, self).startTest(test) - if self.showAll: - self.stream.write(self.getDescription(test)) - self.stream.write(" ... ") - self.stream.flush() - - def newTestResult(self, test, result_short, result_long): - if self.showAll: - self.stream.writeln(result_long) - elif self.progressbar: - self.progressbar.__add__(1) - self.progressbar.add_event(result_short) - self.progressbar.show_progress() - elif self.dots: - self.stream.write(result_short) - self.stream.flush() - - def addSuccess(self, test): - super(TextTestResult, self).addSuccess(test) - if self.progressbar: - self.newTestResult(test, "ok", "ok") - else: - self.newTestResult(test, ".", "ok") - - def addError(self, test, err): - super(TextTestResult, self).addError(test, err) - self.newTestResult(test, "E", "ERROR") - - def addFailure(self, test, err): - super(TextTestResult, self).addFailure(test, err) - self.newTestResult(test, "F", "FAILURE") - - def addSkip(self, test, reason): - super(TextTestResult, self).addSkip(test, reason) - self.newTestResult(test, "s", "skipped %r" % (reason,)) - - def addExpectedFailure(self, test, err, bugnumber): - super(TextTestResult, self).addExpectedFailure(test, err, bugnumber) - self.newTestResult(test, "x", "expected failure") - - def addUnexpectedSuccess(self, test, bugnumber): 
- super(TextTestResult, self).addUnexpectedSuccess(test, bugnumber) - self.newTestResult(test, "u", "unexpected success") - - def printErrors(self): - if self.progressbar: - self.progressbar.complete() - self.progressbar.show_progress() - if self.dots or self.showAll: - self.stream.writeln() - self.printErrorList('ERROR', self.errors) - self.printErrorList('FAIL', self.failures) - - def printErrorList(self, flavour, errors): - for test, err in errors: - self.stream.writeln(self.separator1) - self.stream.writeln("%s: %s" % - (flavour, self.getDescription(test))) - self.stream.writeln(self.separator2) - self.stream.writeln("%s" % err) - - def stopTestRun(self): - super(TextTestResult, self).stopTestRun() - self.printErrors() - - -class TextTestRunner(unittest.TextTestRunner): - """A test runner class that displays results in textual form. - - It prints out the names of tests as they are run, errors as they - occur, and a summary of the results at the end of the test run. - """ - resultclass = TextTestResult - - def __init__(self, stream=sys.stderr, descriptions=True, verbosity=1, - failfast=False, buffer=False, resultclass=None): - self.stream = _WritelnDecorator(stream) - self.descriptions = descriptions - self.verbosity = verbosity - self.failfast = failfast - self.buffer = buffer - if resultclass is not None: - self.resultclass = resultclass - - def _makeResult(self): - return self.resultclass(self.stream, self.descriptions, self.verbosity) - - def run(self, test): - "Run the given test case or test suite." 
- result = self._makeResult() - result.failfast = self.failfast - result.buffer = self.buffer - registerResult(result) - - startTime = time.time() - startTestRun = getattr(result, 'startTestRun', None) - if startTestRun is not None: - startTestRun() - try: - test(result) - finally: - stopTestRun = getattr(result, 'stopTestRun', None) - if stopTestRun is not None: - stopTestRun() - else: - result.printErrors() - stopTime = time.time() - timeTaken = stopTime - startTime - if hasattr(result, 'separator2'): - self.stream.writeln(result.separator2) - run = result.testsRun - self.stream.writeln("Ran %d test%s in %.3fs" % - (run, run != 1 and "s" or "", timeTaken)) - self.stream.writeln() - - expectedFails = unexpectedSuccesses = skipped = passed = failed = errored = 0 - try: - results = map(len, (result.expectedFailures, - result.unexpectedSuccesses, - result.skipped, - result.passes, - result.failures, - result.errors)) - expectedFails, unexpectedSuccesses, skipped, passed, failed, errored = results - except AttributeError: - pass - infos = [] - infos.append("%d passes" % passed) - infos.append("%d failures" % failed) - infos.append("%d errors" % errored) - infos.append("%d skipped" % skipped) - infos.append("%d expected failures" % expectedFails) - infos.append("%d unexpected successes" % unexpectedSuccesses) - self.stream.write("RESULT: ") - if not result.wasSuccessful(): - self.stream.write("FAILED") - else: - self.stream.write("PASSED") - - self.stream.writeln(" (%s)" % (", ".join(infos),)) - return result diff --git a/lldb/third_party/Python/module/unittest2/unittest2/signals.py b/lldb/third_party/Python/module/unittest2/unittest2/signals.py deleted file mode 100644 index dc0dc3daf3bd2..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/signals.py +++ /dev/null @@ -1,63 +0,0 @@ -import signal -import weakref - -from unittest2.compatibility import wraps - -__unittest = True - - -class _InterruptHandler(object): - - def __init__(self, 
default_handler): - self.called = False - self.default_handler = default_handler - - def __call__(self, signum, frame): - installed_handler = signal.getsignal(signal.SIGINT) - if installed_handler is not self: - # if we aren't the installed handler, then delegate immediately - # to the default handler - self.default_handler(signum, frame) - - if self.called: - self.default_handler(signum, frame) - self.called = True - for result in _results.keys(): - result.stop() - -_results = weakref.WeakKeyDictionary() - - -def registerResult(result): - _results[result] = 1 - - -def removeResult(result): - return bool(_results.pop(result, None)) - -_interrupt_handler = None - - -def installHandler(): - global _interrupt_handler - if _interrupt_handler is None: - default_handler = signal.getsignal(signal.SIGINT) - _interrupt_handler = _InterruptHandler(default_handler) - signal.signal(signal.SIGINT, _interrupt_handler) - - -def removeHandler(method=None): - if method is not None: - @wraps(method) - def inner(*args, **kwargs): - initial = signal.getsignal(signal.SIGINT) - removeHandler() - try: - return method(*args, **kwargs) - finally: - signal.signal(signal.SIGINT, initial) - return inner - - global _interrupt_handler - if _interrupt_handler is not None: - signal.signal(signal.SIGINT, _interrupt_handler.default_handler) diff --git a/lldb/third_party/Python/module/unittest2/unittest2/suite.py b/lldb/third_party/Python/module/unittest2/unittest2/suite.py deleted file mode 100644 index f2554447cc91a..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/suite.py +++ /dev/null @@ -1,286 +0,0 @@ -"""TestSuite""" - -import sys -import unittest -from unittest2 import case, util - -__unittest = True - - -class BaseTestSuite(unittest.TestSuite): - """A simple test suite that doesn't provide class or module shared fixtures. 
- """ - - def __init__(self, tests=()): - self._tests = [] - self.addTests(tests) - - def __repr__(self): - return "<%s tests=%s>" % (util.strclass(self.__class__), list(self)) - - def __eq__(self, other): - if not isinstance(other, self.__class__): - return NotImplemented - return list(self) == list(other) - - def __ne__(self, other): - return not self == other - - # Can't guarantee hash invariant, so flag as unhashable - __hash__ = None - - def __iter__(self): - return iter(self._tests) - - def countTestCases(self): - cases = 0 - for test in self: - cases += test.countTestCases() - return cases - - def addTest(self, test): - # sanity checks - if not hasattr(test, '__call__'): - raise TypeError("%r is not callable" % (repr(test),)) - if isinstance(test, type) and issubclass(test, - (case.TestCase, TestSuite)): - raise TypeError("TestCases and TestSuites must be instantiated " - "before passing them to addTest()") - self._tests.append(test) - - def addTests(self, tests): - if isinstance(tests, str): - raise TypeError("tests must be an iterable of tests, not a string") - for test in tests: - self.addTest(test) - - def run(self, result): - for test in self: - if result.shouldStop: - break - test(result) - return result - - def __call__(self, *args, **kwds): - return self.run(*args, **kwds) - - def debug(self): - """Run the tests without collecting errors in a TestResult""" - for test in self: - test.debug() - - -class TestSuite(BaseTestSuite): - """A test suite is a composite test consisting of a number of TestCases. - - For use, create an instance of TestSuite, then add test case instances. - When all tests have been added, the suite can be passed to a test - runner, such as TextTestRunner. It will run the individual test cases - in the order in which they were added, aggregating the results. When - subclassing, do not forget to call the base class constructor. 
- """ - - def run(self, result): - self._wrapped_run(result) - self._tearDownPreviousClass(None, result) - self._handleModuleTearDown(result) - return result - - def debug(self): - """Run the tests without collecting errors in a TestResult""" - debug = _DebugResult() - self._wrapped_run(debug, True) - self._tearDownPreviousClass(None, debug) - self._handleModuleTearDown(debug) - - ################################ - # private methods - def _wrapped_run(self, result, debug=False): - for test in self: - if result.shouldStop: - break - - if _isnotsuite(test): - self._tearDownPreviousClass(test, result) - self._handleModuleFixture(test, result) - self._handleClassSetUp(test, result) - result._previousTestClass = test.__class__ - - if (getattr(test.__class__, '_classSetupFailed', False) or - getattr(result, '_moduleSetUpFailed', False)): - continue - - if hasattr(test, '_wrapped_run'): - test._wrapped_run(result, debug) - elif not debug: - test(result) - else: - test.debug() - - def _handleClassSetUp(self, test, result): - previousClass = getattr(result, '_previousTestClass', None) - currentClass = test.__class__ - if currentClass == previousClass: - return - if result._moduleSetUpFailed: - return - if getattr(currentClass, "__unittest_skip__", False): - return - - try: - currentClass._classSetupFailed = False - except TypeError: - # test may actually be a function - # so its class will be a builtin-type - pass - - setUpClass = getattr(currentClass, 'setUpClass', None) - if setUpClass is not None: - try: - setUpClass() - except Exception as e: - if isinstance(result, _DebugResult): - raise - currentClass._classSetupFailed = True - className = util.strclass(currentClass) - errorName = 'setUpClass (%s)' % className - self._addClassOrModuleLevelException(result, e, errorName) - - def _get_previous_module(self, result): - previousModule = None - previousClass = getattr(result, '_previousTestClass', None) - if previousClass is not None: - previousModule = 
previousClass.__module__ - return previousModule - - def _handleModuleFixture(self, test, result): - previousModule = self._get_previous_module(result) - currentModule = test.__class__.__module__ - if currentModule == previousModule: - return - - self._handleModuleTearDown(result) - - result._moduleSetUpFailed = False - try: - module = sys.modules[currentModule] - except KeyError: - return - setUpModule = getattr(module, 'setUpModule', None) - if setUpModule is not None: - try: - setUpModule() - except Exception as e: - if isinstance(result, _DebugResult): - raise - result._moduleSetUpFailed = True - errorName = 'setUpModule (%s)' % currentModule - self._addClassOrModuleLevelException(result, e, errorName) - - def _addClassOrModuleLevelException(self, result, exception, errorName): - error = _ErrorHolder(errorName) - addSkip = getattr(result, 'addSkip', None) - if addSkip is not None and isinstance(exception, case.SkipTest): - addSkip(error, str(exception)) - else: - result.addError(error, sys.exc_info()) - - def _handleModuleTearDown(self, result): - previousModule = self._get_previous_module(result) - if previousModule is None: - return - if result._moduleSetUpFailed: - return - - try: - module = sys.modules[previousModule] - except KeyError: - return - - tearDownModule = getattr(module, 'tearDownModule', None) - if tearDownModule is not None: - try: - tearDownModule() - except Exception as e: - if isinstance(result, _DebugResult): - raise - errorName = 'tearDownModule (%s)' % previousModule - self._addClassOrModuleLevelException(result, e, errorName) - - def _tearDownPreviousClass(self, test, result): - previousClass = getattr(result, '_previousTestClass', None) - currentClass = test.__class__ - if currentClass == previousClass: - return - if getattr(previousClass, '_classSetupFailed', False): - return - if getattr(result, '_moduleSetUpFailed', False): - return - if getattr(previousClass, "__unittest_skip__", False): - return - - tearDownClass = 
getattr(previousClass, 'tearDownClass', None) - if tearDownClass is not None: - try: - tearDownClass() - except Exception as e: - if isinstance(result, _DebugResult): - raise - className = util.strclass(previousClass) - errorName = 'tearDownClass (%s)' % className - self._addClassOrModuleLevelException(result, e, errorName) - - -class _ErrorHolder(object): - """ - Placeholder for a TestCase inside a result. As far as a TestResult - is concerned, this looks exactly like a unit test. Used to insert - arbitrary errors into a test suite run. - """ - # Inspired by the ErrorHolder from Twisted: - # http://twistedmatrix.com/trac/browser/trunk/twisted/trial/runner.py - - # attribute used by TestResult._exc_info_to_string - failureException = None - - def __init__(self, description): - self.description = description - - def id(self): - return self.description - - def shortDescription(self): - return None - - def __repr__(self): - return "" % (self.description,) - - def __str__(self): - return self.id() - - def run(self, result): - # could call result.addError(...) - but this test-like object - # shouldn't be run anyway - pass - - def __call__(self, result): - return self.run(result) - - def countTestCases(self): - return 0 - - -def _isnotsuite(test): - "A crude way to tell apart testcases and suites with duck-typing" - try: - iter(test) - except TypeError: - return True - return False - - -class _DebugResult(object): - "Used by the TestSuite to hold previous class when running in debug." 
- _previousTestClass = None - _moduleSetUpFailed = False - shouldStop = False diff --git a/lldb/third_party/Python/module/unittest2/unittest2/test/__init__.py b/lldb/third_party/Python/module/unittest2/unittest2/test/__init__.py deleted file mode 100644 index 792d6005489eb..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/test/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# diff --git a/lldb/third_party/Python/module/unittest2/unittest2/test/dummy.py b/lldb/third_party/Python/module/unittest2/unittest2/test/dummy.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/lldb/third_party/Python/module/unittest2/unittest2/test/support.py b/lldb/third_party/Python/module/unittest2/unittest2/test/support.py deleted file mode 100644 index 5c9bcf2749634..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/test/support.py +++ /dev/null @@ -1,189 +0,0 @@ -import sys -import warnings - -import unittest2 - - -def resultFactory(*_): - return unittest2.TestResult() - - -class OldTestResult(object): - """An object honouring TestResult before startTestRun/stopTestRun.""" - - def __init__(self, *_): - self.failures = [] - self.errors = [] - self.testsRun = 0 - self.shouldStop = False - - def startTest(self, test): - pass - - def stopTest(self, test): - pass - - def addError(self, test, err): - self.errors.append((test, err)) - - def addFailure(self, test, err): - self.failures.append((test, err)) - - def addSuccess(self, test): - pass - - def wasSuccessful(self): - return True - - def printErrors(self): - pass - - -class LoggingResult(unittest2.TestResult): - - def __init__(self, log): - self._events = log - super(LoggingResult, self).__init__() - - def startTest(self, test): - self._events.append('startTest') - super(LoggingResult, self).startTest(test) - - def startTestRun(self): - self._events.append('startTestRun') - super(LoggingResult, self).startTestRun() - - def stopTest(self, test): - self._events.append('stopTest') - 
super(LoggingResult, self).stopTest(test) - - def stopTestRun(self): - self._events.append('stopTestRun') - super(LoggingResult, self).stopTestRun() - - def addFailure(self, *args): - self._events.append('addFailure') - super(LoggingResult, self).addFailure(*args) - - def addSuccess(self, *args): - self._events.append('addSuccess') - super(LoggingResult, self).addSuccess(*args) - - def addError(self, *args): - self._events.append('addError') - super(LoggingResult, self).addError(*args) - - def addSkip(self, *args): - self._events.append('addSkip') - super(LoggingResult, self).addSkip(*args) - - def addExpectedFailure(self, *args): - self._events.append('addExpectedFailure') - super(LoggingResult, self).addExpectedFailure(*args) - - def addUnexpectedSuccess(self, *args): - self._events.append('addUnexpectedSuccess') - super(LoggingResult, self).addUnexpectedSuccess(*args) - - -class EqualityMixin(object): - """Used as a mixin for TestCase""" - - # Check for a valid __eq__ implementation - def test_eq(self): - for obj_1, obj_2 in self.eq_pairs: - self.assertEqual(obj_1, obj_2) - self.assertEqual(obj_2, obj_1) - - # Check for a valid __ne__ implementation - def test_ne(self): - for obj_1, obj_2 in self.ne_pairs: - self.assertNotEqual(obj_1, obj_2) - self.assertNotEqual(obj_2, obj_1) - - -class HashingMixin(object): - """Used as a mixin for TestCase""" - - # Check for a valid __hash__ implementation - def test_hash(self): - for obj_1, obj_2 in self.eq_pairs: - try: - if not hash(obj_1) == hash(obj_2): - self.fail("%r and %r do not hash equal" % (obj_1, obj_2)) - except KeyboardInterrupt: - raise - except Exception as e: - self.fail("Problem hashing %r and %r: %s" % (obj_1, obj_2, e)) - - for obj_1, obj_2 in self.ne_pairs: - try: - if hash(obj_1) == hash(obj_2): - self.fail("%s and %s hash equal, but shouldn't" % - (obj_1, obj_2)) - except KeyboardInterrupt: - raise - except Exception as e: - self.fail("Problem hashing %s and %s: %s" % (obj_1, obj_2, e)) - - -# copied 
from Python 2.6 -try: - from warnings import catch_warnings -except ImportError: - class catch_warnings(object): - - def __init__(self, record=False, module=None): - self._record = record - self._module = sys.modules['warnings'] - self._entered = False - - def __repr__(self): - args = [] - if self._record: - args.append("record=True") - name = type(self).__name__ - return "%s(%s)" % (name, ", ".join(args)) - - def __enter__(self): - if self._entered: - raise RuntimeError("Cannot enter %r twice" % self) - self._entered = True - self._filters = self._module.filters - self._module.filters = self._filters[:] - self._showwarning = self._module.showwarning - if self._record: - log = [] - - def showwarning(*args, **kwargs): - log.append(WarningMessage(*args, **kwargs)) - self._module.showwarning = showwarning - return log - else: - return None - - def __exit__(self, *exc_info): - if not self._entered: - raise RuntimeError( - "Cannot exit %r without entering first" % - self) - self._module.filters = self._filters - self._module.showwarning = self._showwarning - - class WarningMessage(object): - _WARNING_DETAILS = ( - "message", - "category", - "filename", - "lineno", - "file", - "line") - - def __init__(self, message, category, filename, lineno, file=None, - line=None): - local_values = locals() - for attr in self._WARNING_DETAILS: - setattr(self, attr, local_values[attr]) - self._category_name = None - if category.__name__: - self._category_name = category.__name__ diff --git a/lldb/third_party/Python/module/unittest2/unittest2/test/test_assertions.py b/lldb/third_party/Python/module/unittest2/unittest2/test/test_assertions.py deleted file mode 100644 index c96aaafa082e2..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/test/test_assertions.py +++ /dev/null @@ -1,269 +0,0 @@ -import datetime - -import unittest2 - - -class Test_Assertions(unittest2.TestCase): - - def test_AlmostEqual(self): - self.assertAlmostEqual(1.00000001, 1.0) - 
self.assertNotAlmostEqual(1.0000001, 1.0) - self.assertRaises(self.failureException, - self.assertAlmostEqual, 1.0000001, 1.0) - self.assertRaises(self.failureException, - self.assertNotAlmostEqual, 1.00000001, 1.0) - - self.assertAlmostEqual(1.1, 1.0, places=0) - self.assertRaises(self.failureException, - self.assertAlmostEqual, 1.1, 1.0, places=1) - - self.assertAlmostEqual(0, .1 + .1j, places=0) - self.assertNotAlmostEqual(0, .1 + .1j, places=1) - self.assertRaises(self.failureException, - self.assertAlmostEqual, 0, .1 + .1j, places=1) - self.assertRaises(self.failureException, - self.assertNotAlmostEqual, 0, .1 + .1j, places=0) - - try: - self.assertAlmostEqual(float('inf'), float('inf')) - self.assertRaises(self.failureException, self.assertNotAlmostEqual, - float('inf'), float('inf')) - except ValueError: - # float('inf') is invalid on Windows in Python 2.4 / 2.5 - x = object() - self.assertAlmostEqual(x, x) - self.assertRaises(self.failureException, self.assertNotAlmostEqual, - x, x) - - def test_AmostEqualWithDelta(self): - self.assertAlmostEqual(1.1, 1.0, delta=0.5) - self.assertAlmostEqual(1.0, 1.1, delta=0.5) - self.assertNotAlmostEqual(1.1, 1.0, delta=0.05) - self.assertNotAlmostEqual(1.0, 1.1, delta=0.05) - - self.assertRaises(self.failureException, self.assertAlmostEqual, - 1.1, 1.0, delta=0.05) - self.assertRaises(self.failureException, self.assertNotAlmostEqual, - 1.1, 1.0, delta=0.5) - - self.assertRaises(TypeError, self.assertAlmostEqual, - 1.1, 1.0, places=2, delta=2) - self.assertRaises(TypeError, self.assertNotAlmostEqual, - 1.1, 1.0, places=2, delta=2) - - first = datetime.datetime.now() - second = first + datetime.timedelta(seconds=10) - self.assertAlmostEqual(first, second, - delta=datetime.timedelta(seconds=20)) - self.assertNotAlmostEqual(first, second, - delta=datetime.timedelta(seconds=5)) - - def testAssertNotRegexpMatches(self): - self.assertNotRegexpMatches('Ala ma kota', r'r+') - try: - self.assertNotRegexpMatches('Ala ma kota', 
r'k.t', 'Message') - except self.failureException as e: - self.assertIn("'kot'", e.args[0]) - self.assertIn('Message', e.args[0]) - else: - self.fail('assertNotRegexpMatches should have failed.') - - -class TestLongMessage(unittest2.TestCase): - """Test that the individual asserts honour longMessage. - This actually tests all the message behaviour for - asserts that use longMessage.""" - - def setUp(self): - class TestableTestFalse(unittest2.TestCase): - longMessage = False - failureException = self.failureException - - def testTest(self): - pass - - class TestableTestTrue(unittest2.TestCase): - longMessage = True - failureException = self.failureException - - def testTest(self): - pass - - self.testableTrue = TestableTestTrue('testTest') - self.testableFalse = TestableTestFalse('testTest') - - def testDefault(self): - self.assertTrue(unittest2.TestCase.longMessage) - - def test_formatMsg(self): - self.assertEquals( - self.testableFalse._formatMessage( - None, "foo"), "foo") - self.assertEquals( - self.testableFalse._formatMessage( - "foo", "bar"), "foo") - - self.assertEquals(self.testableTrue._formatMessage(None, "foo"), "foo") - self.assertEquals( - self.testableTrue._formatMessage( - "foo", "bar"), "bar : foo") - - # This blows up if _formatMessage uses string concatenation - self.testableTrue._formatMessage(object(), 'foo') - - def assertMessages(self, methodName, args, errors): - def getMethod(i): - useTestableFalse = i < 2 - if useTestableFalse: - test = self.testableFalse - else: - test = self.testableTrue - return getattr(test, methodName) - - for i, expected_regexp in enumerate(errors): - testMethod = getMethod(i) - kwargs = {} - withMsg = i % 2 - if withMsg: - kwargs = {"msg": "oops"} - - self.assertRaisesRegexp(self.failureException, - expected_regexp, - lambda: testMethod(*args, **kwargs)) - - def testAssertTrue(self): - self.assertMessages('assertTrue', (False,), - ["^False is not True$", "^oops$", "^False is not True$", - "^False is not True : 
oops$"]) - - def testAssertFalse(self): - self.assertMessages('assertFalse', (True,), - ["^True is not False$", "^oops$", "^True is not False$", - "^True is not False : oops$"]) - - def testNotEqual(self): - self.assertMessages('assertNotEqual', (1, 1), - ["^1 == 1$", "^oops$", "^1 == 1$", - "^1 == 1 : oops$"]) - - def testAlmostEqual(self): - self.assertMessages('assertAlmostEqual', - (1, - 2), - ["^1 != 2 within 7 places$", - "^oops$", - "^1 != 2 within 7 places$", - "^1 != 2 within 7 places : oops$"]) - - def testNotAlmostEqual(self): - self.assertMessages('assertNotAlmostEqual', - (1, - 1), - ["^1 == 1 within 7 places$", - "^oops$", - "^1 == 1 within 7 places$", - "^1 == 1 within 7 places : oops$"]) - - def test_baseAssertEqual(self): - self.assertMessages( - '_baseAssertEqual', (1, 2), [ - "^1 != 2$", "^oops$", "^1 != 2$", "^1 != 2 : oops$"]) - - def testAssertSequenceEqual(self): - # Error messages are multiline so not testing on full message - # assertTupleEqual and assertListEqual delegate to this method - self.assertMessages('assertSequenceEqual', ([], [None]), - ["\+ \[None\]$", "^oops$", r"\+ \[None\]$", - r"\+ \[None\] : oops$"]) - - def testAssertSetEqual(self): - self.assertMessages('assertSetEqual', (set(), set([None])), - ["None$", "^oops$", "None$", - "None : oops$"]) - - def testAssertIn(self): - self.assertMessages('assertIn', (None, []), - ['^None not found in \[\]$', "^oops$", - '^None not found in \[\]$', - '^None not found in \[\] : oops$']) - - def testAssertNotIn(self): - self.assertMessages('assertNotIn', (None, [None]), - ['^None unexpectedly found in \[None\]$', "^oops$", - '^None unexpectedly found in \[None\]$', - '^None unexpectedly found in \[None\] : oops$']) - - def testAssertDictEqual(self): - self.assertMessages('assertDictEqual', ({}, {'key': 'value'}), - [r"\+ \{'key': 'value'\}$", "^oops$", - "\+ \{'key': 'value'\}$", - "\+ \{'key': 'value'\} : oops$"]) - - def testAssertDictContainsSubset(self): - 
self.assertMessages('assertDictContainsSubset', ({'key': 'value'}, {}), - ["^Missing: 'key'$", "^oops$", - "^Missing: 'key'$", - "^Missing: 'key' : oops$"]) - - def testAssertItemsEqual(self): - self.assertMessages('assertItemsEqual', ([], [None]), - [r"\[None\]$", "^oops$", - r"\[None\]$", - r"\[None\] : oops$"]) - - def testAssertMultiLineEqual(self): - self.assertMessages('assertMultiLineEqual', ("", "foo"), - [r"\+ foo$", "^oops$", - r"\+ foo$", - r"\+ foo : oops$"]) - - def testAssertLess(self): - self.assertMessages('assertLess', (2, 1), - ["^2 not less than 1$", "^oops$", - "^2 not less than 1$", "^2 not less than 1 : oops$"]) - - def testAssertLessEqual(self): - self.assertMessages('assertLessEqual', (2, 1), - ["^2 not less than or equal to 1$", "^oops$", - "^2 not less than or equal to 1$", - "^2 not less than or equal to 1 : oops$"]) - - def testAssertGreater(self): - self.assertMessages('assertGreater', (1, 2), - ["^1 not greater than 2$", "^oops$", - "^1 not greater than 2$", - "^1 not greater than 2 : oops$"]) - - def testAssertGreaterEqual(self): - self.assertMessages('assertGreaterEqual', (1, 2), - ["^1 not greater than or equal to 2$", "^oops$", - "^1 not greater than or equal to 2$", - "^1 not greater than or equal to 2 : oops$"]) - - def testAssertIsNone(self): - self.assertMessages('assertIsNone', ('not None',), - ["^'not None' is not None$", "^oops$", - "^'not None' is not None$", - "^'not None' is not None : oops$"]) - - def testAssertIsNotNone(self): - self.assertMessages('assertIsNotNone', (None,), - ["^unexpectedly None$", "^oops$", - "^unexpectedly None$", - "^unexpectedly None : oops$"]) - - def testAssertIs(self): - self.assertMessages('assertIs', (None, 'foo'), - ["^None is not 'foo'$", "^oops$", - "^None is not 'foo'$", - "^None is not 'foo' : oops$"]) - - def testAssertIsNot(self): - self.assertMessages('assertIsNot', (None, None), - ["^unexpectedly identical: None$", "^oops$", - "^unexpectedly identical: None$", - "^unexpectedly 
identical: None : oops$"]) - - -if __name__ == '__main__': - unittest2.main() diff --git a/lldb/third_party/Python/module/unittest2/unittest2/test/test_break.py b/lldb/third_party/Python/module/unittest2/unittest2/test/test_break.py deleted file mode 100644 index 390718da22ce0..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/test/test_break.py +++ /dev/null @@ -1,258 +0,0 @@ -import gc -import os -import weakref - -from cStringIO import StringIO - -try: - import signal -except ImportError: - signal = None - -import unittest2 - - -class TestBreak(unittest2.TestCase): - - def setUp(self): - self._default_handler = signal.getsignal(signal.SIGINT) - - def tearDown(self): - signal.signal(signal.SIGINT, self._default_handler) - unittest2.signals._results = weakref.WeakKeyDictionary() - unittest2.signals._interrupt_handler = None - - def testInstallHandler(self): - default_handler = signal.getsignal(signal.SIGINT) - unittest2.installHandler() - self.assertNotEqual(signal.getsignal(signal.SIGINT), default_handler) - - try: - pid = os.getpid() - os.kill(pid, signal.SIGINT) - except KeyboardInterrupt: - self.fail("KeyboardInterrupt not handled") - - self.assertTrue(unittest2.signals._interrupt_handler.called) - - def testRegisterResult(self): - result = unittest2.TestResult() - unittest2.registerResult(result) - - for ref in unittest2.signals._results: - if ref is result: - break - elif ref is not result: - self.fail("odd object in result set") - else: - self.fail("result not found") - - def testInterruptCaught(self): - default_handler = signal.getsignal(signal.SIGINT) - - result = unittest2.TestResult() - unittest2.installHandler() - unittest2.registerResult(result) - - self.assertNotEqual(signal.getsignal(signal.SIGINT), default_handler) - - def test(result): - pid = os.getpid() - os.kill(pid, signal.SIGINT) - result.breakCaught = True - self.assertTrue(result.shouldStop) - - try: - test(result) - except KeyboardInterrupt: - 
self.fail("KeyboardInterrupt not handled") - self.assertTrue(result.breakCaught) - - def testSecondInterrupt(self): - result = unittest2.TestResult() - unittest2.installHandler() - unittest2.registerResult(result) - - def test(result): - pid = os.getpid() - os.kill(pid, signal.SIGINT) - result.breakCaught = True - self.assertTrue(result.shouldStop) - os.kill(pid, signal.SIGINT) - self.fail("Second KeyboardInterrupt not raised") - - try: - test(result) - except KeyboardInterrupt: - pass - else: - self.fail("Second KeyboardInterrupt not raised") - self.assertTrue(result.breakCaught) - - def testTwoResults(self): - unittest2.installHandler() - - result = unittest2.TestResult() - unittest2.registerResult(result) - new_handler = signal.getsignal(signal.SIGINT) - - result2 = unittest2.TestResult() - unittest2.registerResult(result2) - self.assertEqual(signal.getsignal(signal.SIGINT), new_handler) - - result3 = unittest2.TestResult() - - def test(result): - pid = os.getpid() - os.kill(pid, signal.SIGINT) - - try: - test(result) - except KeyboardInterrupt: - self.fail("KeyboardInterrupt not handled") - - self.assertTrue(result.shouldStop) - self.assertTrue(result2.shouldStop) - self.assertFalse(result3.shouldStop) - - def testHandlerReplacedButCalled(self): - # If our handler has been replaced (is no longer installed) but is - # called by the *new* handler, then it isn't safe to delay the - # SIGINT and we should immediately delegate to the default handler - unittest2.installHandler() - - handler = signal.getsignal(signal.SIGINT) - - def new_handler(frame, signum): - handler(frame, signum) - signal.signal(signal.SIGINT, new_handler) - - try: - pid = os.getpid() - os.kill(pid, signal.SIGINT) - except KeyboardInterrupt: - pass - else: - self.fail("replaced but delegated handler doesn't raise interrupt") - - def testRunner(self): - # Creating a TextTestRunner with the appropriate argument should - # register the TextTestResult it creates - runner = 
unittest2.TextTestRunner(stream=StringIO()) - - result = runner.run(unittest2.TestSuite()) - self.assertIn(result, unittest2.signals._results) - - def testWeakReferences(self): - # Calling registerResult on a result should not keep it alive - result = unittest2.TestResult() - unittest2.registerResult(result) - - ref = weakref.ref(result) - del result - - # For non-reference counting implementations - gc.collect() - gc.collect() - self.assertIsNone(ref()) - - def testRemoveResult(self): - result = unittest2.TestResult() - unittest2.registerResult(result) - - unittest2.installHandler() - self.assertTrue(unittest2.removeResult(result)) - - # Should this raise an error instead? - self.assertFalse(unittest2.removeResult(unittest2.TestResult())) - - try: - pid = os.getpid() - os.kill(pid, signal.SIGINT) - except KeyboardInterrupt: - pass - - self.assertFalse(result.shouldStop) - - def testMainInstallsHandler(self): - failfast = object() - test = object() - verbosity = object() - result = object() - default_handler = signal.getsignal(signal.SIGINT) - - class FakeRunner(object): - initArgs = [] - runArgs = [] - - def __init__(self, *args, **kwargs): - self.initArgs.append((args, kwargs)) - - def run(self, test): - self.runArgs.append(test) - return result - - class Program(unittest2.TestProgram): - - def __init__(self, catchbreak): - self.exit = False - self.verbosity = verbosity - self.failfast = failfast - self.catchbreak = catchbreak - self.testRunner = FakeRunner - self.test = test - self.result = None - - p = Program(False) - p.runTests() - - self.assertEqual(FakeRunner.initArgs, [((), {'verbosity': verbosity, - 'failfast': failfast, - 'buffer': None})]) - self.assertEqual(FakeRunner.runArgs, [test]) - self.assertEqual(p.result, result) - - self.assertEqual(signal.getsignal(signal.SIGINT), default_handler) - - FakeRunner.initArgs = [] - FakeRunner.runArgs = [] - p = Program(True) - p.runTests() - - self.assertEqual(FakeRunner.initArgs, [((), {'verbosity': verbosity, - 
'failfast': failfast, - 'buffer': None})]) - self.assertEqual(FakeRunner.runArgs, [test]) - self.assertEqual(p.result, result) - - self.assertNotEqual(signal.getsignal(signal.SIGINT), default_handler) - - def testRemoveHandler(self): - default_handler = signal.getsignal(signal.SIGINT) - unittest2.installHandler() - unittest2.removeHandler() - self.assertEqual(signal.getsignal(signal.SIGINT), default_handler) - - # check that calling removeHandler multiple times has no ill-effect - unittest2.removeHandler() - self.assertEqual(signal.getsignal(signal.SIGINT), default_handler) - - def testRemoveHandlerAsDecorator(self): - default_handler = signal.getsignal(signal.SIGINT) - unittest2.installHandler() - - @unittest2.removeHandler - def test(): - self.assertEqual(signal.getsignal(signal.SIGINT), default_handler) - - test() - self.assertNotEqual(signal.getsignal(signal.SIGINT), default_handler) - - -# Should also skip some tests on Jython -skipper = unittest2.skipUnless(hasattr(os, 'kill') and signal is not None, - "test uses os.kill(...) 
and the signal module") -TestBreak = skipper(TestBreak) - -if __name__ == '__main__': - unittest2.main() diff --git a/lldb/third_party/Python/module/unittest2/unittest2/test/test_case.py b/lldb/third_party/Python/module/unittest2/unittest2/test/test_case.py deleted file mode 100644 index 795fa39be7350..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/test/test_case.py +++ /dev/null @@ -1,1244 +0,0 @@ -import difflib -import pprint -import re - -from copy import deepcopy - -import unittest2 - -from unittest2.test.support import ( - OldTestResult, EqualityMixin, HashingMixin, LoggingResult -) - - -class MyException(Exception): - pass - - -class Test(object): - "Keep these TestCase classes out of the main namespace" - - class Foo(unittest2.TestCase): - - def runTest(self): pass - - def test1(self): pass - - class Bar(Foo): - - def test2(self): pass - - class LoggingTestCase(unittest2.TestCase): - """A test case which logs its calls.""" - - def __init__(self, events): - super(Test.LoggingTestCase, self).__init__('test') - self.events = events - - def setUp(self): - self.events.append('setUp') - - def test(self): - self.events.append('test') - - def tearDown(self): - self.events.append('tearDown') - - -class TestCleanUp(unittest2.TestCase): - - def testCleanUp(self): - class TestableTest(unittest2.TestCase): - - def testNothing(self): - pass - - test = TestableTest('testNothing') - self.assertEqual(test._cleanups, []) - - cleanups = [] - - def cleanup1(*args, **kwargs): - cleanups.append((1, args, kwargs)) - - def cleanup2(*args, **kwargs): - cleanups.append((2, args, kwargs)) - - test.addCleanup(cleanup1, 1, 2, 3, four='hello', five='goodbye') - test.addCleanup(cleanup2) - - self.assertEqual( - test._cleanups, [ - (cleanup1, (1, 2, 3), dict( - four='hello', five='goodbye')), (cleanup2, (), {})]) - - result = test.doCleanups() - self.assertTrue(result) - - self.assertEqual( - cleanups, [ - (2, (), {}), (1, (1, 2, 3), dict( - four='hello', 
five='goodbye'))]) - - def testCleanUpWithErrors(self): - class TestableTest(unittest2.TestCase): - - def testNothing(self): - pass - - class MockResult(object): - errors = [] - - def addError(self, test, exc_info): - self.errors.append((test, exc_info)) - - result = MockResult() - test = TestableTest('testNothing') - test._resultForDoCleanups = result - - exc1 = Exception('foo') - exc2 = Exception('bar') - - def cleanup1(): - raise exc1 - - def cleanup2(): - raise exc2 - - test.addCleanup(cleanup1) - test.addCleanup(cleanup2) - - self.assertFalse(test.doCleanups()) - - (test1, (Type1, instance1, _)), (test2, - (Type2, instance2, _)) = reversed(MockResult.errors) - self.assertEqual((test1, Type1, instance1), (test, Exception, exc1)) - self.assertEqual((test2, Type2, instance2), (test, Exception, exc2)) - - def testCleanupInRun(self): - blowUp = False - ordering = [] - - class TestableTest(unittest2.TestCase): - - def setUp(self): - ordering.append('setUp') - if blowUp: - raise Exception('foo') - - def testNothing(self): - ordering.append('test') - - def tearDown(self): - ordering.append('tearDown') - - test = TestableTest('testNothing') - - def cleanup1(): - ordering.append('cleanup1') - - def cleanup2(): - ordering.append('cleanup2') - test.addCleanup(cleanup1) - test.addCleanup(cleanup2) - - def success(some_test): - self.assertEqual(some_test, test) - ordering.append('success') - - result = unittest2.TestResult() - result.addSuccess = success - - test.run(result) - self.assertEqual(ordering, ['setUp', 'test', 'tearDown', - 'cleanup2', 'cleanup1', 'success']) - - blowUp = True - ordering = [] - test = TestableTest('testNothing') - test.addCleanup(cleanup1) - test.run(result) - self.assertEqual(ordering, ['setUp', 'cleanup1']) - - def testTestCaseDebugExecutesCleanups(self): - ordering = [] - - class TestableTest(unittest2.TestCase): - - def setUp(self): - ordering.append('setUp') - self.addCleanup(cleanup1) - - def testNothing(self): - ordering.append('test') - - 
def tearDown(self): - ordering.append('tearDown') - - test = TestableTest('testNothing') - - def cleanup1(): - ordering.append('cleanup1') - test.addCleanup(cleanup2) - - def cleanup2(): - ordering.append('cleanup2') - - test.debug() - self.assertEqual( - ordering, [ - 'setUp', 'test', 'tearDown', 'cleanup1', 'cleanup2']) - - -class Test_TestCase(unittest2.TestCase, EqualityMixin, HashingMixin): - - # Set up attributes used by inherited tests - ################################################################ - - # Used by HashingMixin.test_hash and EqualityMixin.test_eq - eq_pairs = [(Test.Foo('test1'), Test.Foo('test1'))] - - # Used by EqualityMixin.test_ne - ne_pairs = [(Test.Foo('test1'), Test.Foo('runTest')), - (Test.Foo('test1'), Test.Bar('test1')), - (Test.Foo('test1'), Test.Bar('test2'))] - - ################################################################ - # /Set up attributes used by inherited tests - - # "class TestCase([methodName])" - # ... - # "Each instance of TestCase will run a single test method: the - # method named methodName." - # ... - # "methodName defaults to "runTest"." - # - # Make sure it really is optional, and that it defaults to the proper - # thing. - def test_init__no_test_name(self): - class Test(unittest2.TestCase): - - def runTest(self): raise MyException() - - def test(self): pass - - self.assertEqual(Test().id()[-13:], '.Test.runTest') - - # "class TestCase([methodName])" - # ... - # "Each instance of TestCase will run a single test method: the - # method named methodName." - def test_init__test_name__valid(self): - class Test(unittest2.TestCase): - - def runTest(self): raise MyException() - - def test(self): pass - - self.assertEqual(Test('test').id()[-10:], '.Test.test') - - # "class unittest2.TestCase([methodName])" - # ... - # "Each instance of TestCase will run a single test method: the - # method named methodName." 
- def test_init__test_name__invalid(self): - class Test(unittest2.TestCase): - - def runTest(self): raise MyException() - - def test(self): pass - - try: - Test('testfoo') - except ValueError: - pass - else: - self.fail("Failed to raise ValueError") - - # "Return the number of tests represented by the this test object. For - # TestCase instances, this will always be 1" - def test_countTestCases(self): - class Foo(unittest2.TestCase): - - def test(self): pass - - self.assertEqual(Foo('test').countTestCases(), 1) - - # "Return the default type of test result object to be used to run this - # test. For TestCase instances, this will always be - # unittest2.TestResult; subclasses of TestCase should - # override this as necessary." - def test_defaultTestResult(self): - class Foo(unittest2.TestCase): - - def runTest(self): - pass - - result = Foo().defaultTestResult() - self.assertEqual(type(result), unittest2.TestResult) - - # "When a setUp() method is defined, the test runner will run that method - # prior to each test. Likewise, if a tearDown() method is defined, the - # test runner will invoke that method after each test. In the example, - # setUp() was used to create a fresh sequence for each test." - # - # Make sure the proper call order is maintained, even if setUp() raises - # an exception. - def test_run_call_order__error_in_setUp(self): - events = [] - result = LoggingResult(events) - - class Foo(Test.LoggingTestCase): - - def setUp(self): - super(Foo, self).setUp() - raise RuntimeError('raised by Foo.setUp') - - Foo(events).run(result) - expected = ['startTest', 'setUp', 'addError', 'stopTest'] - self.assertEqual(events, expected) - - # "With a temporary result stopTestRun is called when setUp errors. 
- def test_run_call_order__error_in_setUp_default_result(self): - events = [] - - class Foo(Test.LoggingTestCase): - - def defaultTestResult(self): - return LoggingResult(self.events) - - def setUp(self): - super(Foo, self).setUp() - raise RuntimeError('raised by Foo.setUp') - - Foo(events).run() - expected = ['startTestRun', 'startTest', 'setUp', 'addError', - 'stopTest', 'stopTestRun'] - self.assertEqual(events, expected) - - # "When a setUp() method is defined, the test runner will run that method - # prior to each test. Likewise, if a tearDown() method is defined, the - # test runner will invoke that method after each test. In the example, - # setUp() was used to create a fresh sequence for each test." - # - # Make sure the proper call order is maintained, even if the test raises - # an error (as opposed to a failure). - def test_run_call_order__error_in_test(self): - events = [] - result = LoggingResult(events) - - class Foo(Test.LoggingTestCase): - - def test(self): - super(Foo, self).test() - raise RuntimeError('raised by Foo.test') - - expected = ['startTest', 'setUp', 'test', 'addError', 'tearDown', - 'stopTest'] - Foo(events).run(result) - self.assertEqual(events, expected) - - # "With a default result, an error in the test still results in stopTestRun - # being called." - def test_run_call_order__error_in_test_default_result(self): - events = [] - - class Foo(Test.LoggingTestCase): - - def defaultTestResult(self): - return LoggingResult(self.events) - - def test(self): - super(Foo, self).test() - raise RuntimeError('raised by Foo.test') - - expected = ['startTestRun', 'startTest', 'setUp', 'test', 'addError', - 'tearDown', 'stopTest', 'stopTestRun'] - Foo(events).run() - self.assertEqual(events, expected) - - # "When a setUp() method is defined, the test runner will run that method - # prior to each test. Likewise, if a tearDown() method is defined, the - # test runner will invoke that method after each test. 
In the example, - # setUp() was used to create a fresh sequence for each test." - # - # Make sure the proper call order is maintained, even if the test signals - # a failure (as opposed to an error). - def test_run_call_order__failure_in_test(self): - events = [] - result = LoggingResult(events) - - class Foo(Test.LoggingTestCase): - - def test(self): - super(Foo, self).test() - self.fail('raised by Foo.test') - - expected = ['startTest', 'setUp', 'test', 'addFailure', 'tearDown', - 'stopTest'] - Foo(events).run(result) - self.assertEqual(events, expected) - - # "When a test fails with a default result stopTestRun is still called." - def test_run_call_order__failure_in_test_default_result(self): - - class Foo(Test.LoggingTestCase): - - def defaultTestResult(self): - return LoggingResult(self.events) - - def test(self): - super(Foo, self).test() - self.fail('raised by Foo.test') - - expected = ['startTestRun', 'startTest', 'setUp', 'test', 'addFailure', - 'tearDown', 'stopTest', 'stopTestRun'] - events = [] - Foo(events).run() - self.assertEqual(events, expected) - - # "When a setUp() method is defined, the test runner will run that method - # prior to each test. Likewise, if a tearDown() method is defined, the - # test runner will invoke that method after each test. In the example, - # setUp() was used to create a fresh sequence for each test." - # - # Make sure the proper call order is maintained, even if tearDown() raises - # an exception. - def test_run_call_order__error_in_tearDown(self): - events = [] - result = LoggingResult(events) - - class Foo(Test.LoggingTestCase): - - def tearDown(self): - super(Foo, self).tearDown() - raise RuntimeError('raised by Foo.tearDown') - - Foo(events).run(result) - expected = ['startTest', 'setUp', 'test', 'tearDown', 'addError', - 'stopTest'] - self.assertEqual(events, expected) - - # "When tearDown errors with a default result stopTestRun is still called." 
- def test_run_call_order__error_in_tearDown_default_result(self): - - class Foo(Test.LoggingTestCase): - - def defaultTestResult(self): - return LoggingResult(self.events) - - def tearDown(self): - super(Foo, self).tearDown() - raise RuntimeError('raised by Foo.tearDown') - - events = [] - Foo(events).run() - expected = ['startTestRun', 'startTest', 'setUp', 'test', 'tearDown', - 'addError', 'stopTest', 'stopTestRun'] - self.assertEqual(events, expected) - - # "TestCase.run() still works when the defaultTestResult is a TestResult - # that does not support startTestRun and stopTestRun. - def test_run_call_order_default_result(self): - - class Foo(unittest2.TestCase): - - def defaultTestResult(self): - return OldTestResult() - - def test(self): - pass - - Foo('test').run() - - # "This class attribute gives the exception raised by the test() method. - # If a test framework needs to use a specialized exception, possibly to - # carry additional information, it must subclass this exception in - # order to ``play fair'' with the framework. The initial value of this - # attribute is AssertionError" - def test_failureException__default(self): - class Foo(unittest2.TestCase): - - def test(self): - pass - - self.assertTrue(Foo('test').failureException is AssertionError) - - # "This class attribute gives the exception raised by the test() method. - # If a test framework needs to use a specialized exception, possibly to - # carry additional information, it must subclass this exception in - # order to ``play fair'' with the framework." 
- # - # Make sure TestCase.run() respects the designated failureException - def test_failureException__subclassing__explicit_raise(self): - events = [] - result = LoggingResult(events) - - class Foo(unittest2.TestCase): - - def test(self): - raise RuntimeError() - - failureException = RuntimeError - - self.assertTrue(Foo('test').failureException is RuntimeError) - - Foo('test').run(result) - expected = ['startTest', 'addFailure', 'stopTest'] - self.assertEqual(events, expected) - - # "This class attribute gives the exception raised by the test() method. - # If a test framework needs to use a specialized exception, possibly to - # carry additional information, it must subclass this exception in - # order to ``play fair'' with the framework." - # - # Make sure TestCase.run() respects the designated failureException - def test_failureException__subclassing__implicit_raise(self): - events = [] - result = LoggingResult(events) - - class Foo(unittest2.TestCase): - - def test(self): - self.fail("foo") - - failureException = RuntimeError - - self.assertTrue(Foo('test').failureException is RuntimeError) - - Foo('test').run(result) - expected = ['startTest', 'addFailure', 'stopTest'] - self.assertEqual(events, expected) - - # "The default implementation does nothing." - def test_setUp(self): - class Foo(unittest2.TestCase): - - def runTest(self): - pass - - # ... and nothing should happen - Foo().setUp() - - # "The default implementation does nothing." - def test_tearDown(self): - class Foo(unittest2.TestCase): - - def runTest(self): - pass - - # ... and nothing should happen - Foo().tearDown() - - # "Return a string identifying the specific test case." - # - # Because of the vague nature of the docs, I'm not going to lock this - # test down too much. 
Really all that can be asserted is that the id() - # will be a string (either 8-byte or unicode -- again, because the docs - # just say "string") - def test_id(self): - class Foo(unittest2.TestCase): - - def runTest(self): - pass - - self.assertIsInstance(Foo().id(), str) - - # "If result is omitted or None, a temporary result object is created - # and used, but is not made available to the caller. As TestCase owns the - # temporary result startTestRun and stopTestRun are called. - - def test_run__uses_defaultTestResult(self): - events = [] - - class Foo(unittest2.TestCase): - - def test(self): - events.append('test') - - def defaultTestResult(self): - return LoggingResult(events) - - # Make run() find a result object on its own - Foo('test').run() - - expected = ['startTestRun', 'startTest', 'test', 'addSuccess', - 'stopTest', 'stopTestRun'] - self.assertEqual(events, expected) - - def testShortDescriptionWithoutDocstring(self): - self.assertIsNone(self.shortDescription()) - - def testShortDescriptionWithOneLineDocstring(self): - """Tests shortDescription() for a method with a docstring.""" - self.assertEqual( - self.shortDescription(), - 'Tests shortDescription() for a method with a docstring.') - - def testShortDescriptionWithMultiLineDocstring(self): - """Tests shortDescription() for a method with a longer docstring. - - This method ensures that only the first line of a docstring is - returned used in the short description, no matter how long the - whole thing is. 
- """ - self.assertEqual( - self.shortDescription(), - 'Tests shortDescription() for a method with a longer ' - 'docstring.') - - def testAddTypeEqualityFunc(self): - class SadSnake(object): - """Dummy class for test_addTypeEqualityFunc.""" - s1, s2 = SadSnake(), SadSnake() - self.assertNotEqual(s1, s2) - - def AllSnakesCreatedEqual(a, b, msg=None): - return type(a) is type(b) is SadSnake - self.addTypeEqualityFunc(SadSnake, AllSnakesCreatedEqual) - self.assertEqual(s1, s2) - # No this doesn't clean up and remove the SadSnake equality func - # from this TestCase instance but since its a local nothing else - # will ever notice that. - - def testAssertIs(self): - thing = object() - self.assertIs(thing, thing) - self.assertRaises( - self.failureException, - self.assertIs, - thing, - object()) - - def testAssertIsNot(self): - thing = object() - self.assertIsNot(thing, object()) - self.assertRaises( - self.failureException, - self.assertIsNot, - thing, - thing) - - def testAssertIsInstance(self): - thing = [] - self.assertIsInstance(thing, list) - self.assertRaises(self.failureException, self.assertIsInstance, - thing, dict) - - def testAssertNotIsInstance(self): - thing = [] - self.assertNotIsInstance(thing, dict) - self.assertRaises(self.failureException, self.assertNotIsInstance, - thing, list) - - def testAssertIn(self): - animals = {'monkey': 'banana', 'cow': 'grass', 'seal': 'fish'} - - self.assertIn('a', 'abc') - self.assertIn(2, [1, 2, 3]) - self.assertIn('monkey', animals) - - self.assertNotIn('d', 'abc') - self.assertNotIn(0, [1, 2, 3]) - self.assertNotIn('otter', animals) - - self.assertRaises(self.failureException, self.assertIn, 'x', 'abc') - self.assertRaises(self.failureException, self.assertIn, 4, [1, 2, 3]) - self.assertRaises(self.failureException, self.assertIn, 'elephant', - animals) - - self.assertRaises(self.failureException, self.assertNotIn, 'c', 'abc') - self.assertRaises( - self.failureException, self.assertNotIn, 1, [ - 1, 2, 3]) - 
self.assertRaises(self.failureException, self.assertNotIn, 'cow', - animals) - - def testAssertDictContainsSubset(self): - self.assertDictContainsSubset({}, {}) - self.assertDictContainsSubset({}, {'a': 1}) - self.assertDictContainsSubset({'a': 1}, {'a': 1}) - self.assertDictContainsSubset({'a': 1}, {'a': 1, 'b': 2}) - self.assertDictContainsSubset({'a': 1, 'b': 2}, {'a': 1, 'b': 2}) - - self.assertRaises(unittest2.TestCase.failureException, - self.assertDictContainsSubset, {'a': 2}, {'a': 1}, - '.*Mismatched values:.*') - - self.assertRaises(unittest2.TestCase.failureException, - self.assertDictContainsSubset, {'c': 1}, {'a': 1}, - '.*Missing:.*') - - self.assertRaises(unittest2.TestCase.failureException, - self.assertDictContainsSubset, {'a': 1, 'c': 1}, - {'a': 1}, '.*Missing:.*') - - self.assertRaises(unittest2.TestCase.failureException, - self.assertDictContainsSubset, {'a': 1, 'c': 1}, - {'a': 1}, '.*Missing:.*Mismatched values:.*') - - self.assertRaises(self.failureException, - self.assertDictContainsSubset, {1: "one"}, {}) - - def testAssertEqual(self): - equal_pairs = [ - ((), ()), - ({}, {}), - ([], []), - (set(), set()), - (frozenset(), frozenset())] - for a, b in equal_pairs: - # This mess of try excepts is to test the assertEqual behavior - # itself. 
- try: - self.assertEqual(a, b) - except self.failureException: - self.fail('assertEqual(%r, %r) failed' % (a, b)) - try: - self.assertEqual(a, b, msg='foo') - except self.failureException: - self.fail('assertEqual(%r, %r) with msg= failed' % (a, b)) - try: - self.assertEqual(a, b, 'foo') - except self.failureException: - self.fail('assertEqual(%r, %r) with third parameter failed' % - (a, b)) - - unequal_pairs = [ - ((), []), - ({}, set()), - (set([4, 1]), frozenset([4, 2])), - (frozenset([4, 5]), set([2, 3])), - (set([3, 4]), set([5, 4]))] - for a, b in unequal_pairs: - self.assertRaises(self.failureException, self.assertEqual, a, b) - self.assertRaises(self.failureException, self.assertEqual, a, b, - 'foo') - self.assertRaises(self.failureException, self.assertEqual, a, b, - msg='foo') - - def testEquality(self): - self.assertListEqual([], []) - self.assertTupleEqual((), ()) - self.assertSequenceEqual([], ()) - - a = [0, 'a', []] - b = [] - self.assertRaises(unittest2.TestCase.failureException, - self.assertListEqual, a, b) - self.assertRaises(unittest2.TestCase.failureException, - self.assertListEqual, tuple(a), tuple(b)) - self.assertRaises(unittest2.TestCase.failureException, - self.assertSequenceEqual, a, tuple(b)) - - b.extend(a) - self.assertListEqual(a, b) - self.assertTupleEqual(tuple(a), tuple(b)) - self.assertSequenceEqual(a, tuple(b)) - self.assertSequenceEqual(tuple(a), b) - - self.assertRaises(self.failureException, self.assertListEqual, - a, tuple(b)) - self.assertRaises(self.failureException, self.assertTupleEqual, - tuple(a), b) - self.assertRaises(self.failureException, self.assertListEqual, None, b) - self.assertRaises(self.failureException, self.assertTupleEqual, None, - tuple(b)) - self.assertRaises(self.failureException, self.assertSequenceEqual, - None, tuple(b)) - self.assertRaises(self.failureException, self.assertListEqual, 1, 1) - self.assertRaises(self.failureException, self.assertTupleEqual, 1, 1) - 
self.assertRaises(self.failureException, self.assertSequenceEqual, - 1, 1) - - self.assertDictEqual({}, {}) - - c = {'x': 1} - d = {} - self.assertRaises(unittest2.TestCase.failureException, - self.assertDictEqual, c, d) - - d.update(c) - self.assertDictEqual(c, d) - - d['x'] = 0 - self.assertRaises(unittest2.TestCase.failureException, - self.assertDictEqual, c, d, 'These are unequal') - - self.assertRaises(self.failureException, self.assertDictEqual, None, d) - self.assertRaises(self.failureException, self.assertDictEqual, [], d) - self.assertRaises(self.failureException, self.assertDictEqual, 1, 1) - - def testAssertItemsEqual(self): - self.assertItemsEqual([1, 2, 3], [3, 2, 1]) - self.assertItemsEqual(['foo', 'bar', 'baz'], ['bar', 'baz', 'foo']) - self.assertRaises(self.failureException, self.assertItemsEqual, - [10], [10, 11]) - self.assertRaises(self.failureException, self.assertItemsEqual, - [10, 11], [10]) - self.assertRaises(self.failureException, self.assertItemsEqual, - [10, 11, 10], [10, 11]) - - # Test that sequences of unhashable objects can be tested for sameness: - self.assertItemsEqual([[1, 2], [3, 4]], [[3, 4], [1, 2]]) - - self.assertItemsEqual([{'a': 1}, {'b': 2}], [{'b': 2}, {'a': 1}]) - self.assertRaises(self.failureException, self.assertItemsEqual, - [[1]], [[2]]) - - # Test unsortable objects - self.assertItemsEqual([2j, None], [None, 2j]) - self.assertRaises(self.failureException, self.assertItemsEqual, - [2j, None], [None, 3j]) - - def testAssertSetEqual(self): - set1 = set() - set2 = set() - self.assertSetEqual(set1, set2) - - self.assertRaises( - self.failureException, - self.assertSetEqual, - None, - set2) - self.assertRaises(self.failureException, self.assertSetEqual, [], set2) - self.assertRaises( - self.failureException, - self.assertSetEqual, - set1, - None) - self.assertRaises(self.failureException, self.assertSetEqual, set1, []) - - set1 = set(['a']) - set2 = set() - self.assertRaises( - self.failureException, - 
self.assertSetEqual, - set1, - set2) - - set1 = set(['a']) - set2 = set(['a']) - self.assertSetEqual(set1, set2) - - set1 = set(['a']) - set2 = set(['a', 'b']) - self.assertRaises( - self.failureException, - self.assertSetEqual, - set1, - set2) - - set1 = set(['a']) - set2 = frozenset(['a', 'b']) - self.assertRaises( - self.failureException, - self.assertSetEqual, - set1, - set2) - - set1 = set(['a', 'b']) - set2 = frozenset(['a', 'b']) - self.assertSetEqual(set1, set2) - - set1 = set() - set2 = "foo" - self.assertRaises( - self.failureException, - self.assertSetEqual, - set1, - set2) - self.assertRaises( - self.failureException, - self.assertSetEqual, - set2, - set1) - - # make sure any string formatting is tuple-safe - set1 = set([(0, 1), (2, 3)]) - set2 = set([(4, 5)]) - self.assertRaises( - self.failureException, - self.assertSetEqual, - set1, - set2) - - def testInequality(self): - # Try ints - self.assertGreater(2, 1) - self.assertGreaterEqual(2, 1) - self.assertGreaterEqual(1, 1) - self.assertLess(1, 2) - self.assertLessEqual(1, 2) - self.assertLessEqual(1, 1) - self.assertRaises(self.failureException, self.assertGreater, 1, 2) - self.assertRaises(self.failureException, self.assertGreater, 1, 1) - self.assertRaises(self.failureException, self.assertGreaterEqual, 1, 2) - self.assertRaises(self.failureException, self.assertLess, 2, 1) - self.assertRaises(self.failureException, self.assertLess, 1, 1) - self.assertRaises(self.failureException, self.assertLessEqual, 2, 1) - - # Try Floats - self.assertGreater(1.1, 1.0) - self.assertGreaterEqual(1.1, 1.0) - self.assertGreaterEqual(1.0, 1.0) - self.assertLess(1.0, 1.1) - self.assertLessEqual(1.0, 1.1) - self.assertLessEqual(1.0, 1.0) - self.assertRaises(self.failureException, self.assertGreater, 1.0, 1.1) - self.assertRaises(self.failureException, self.assertGreater, 1.0, 1.0) - self.assertRaises( - self.failureException, - self.assertGreaterEqual, - 1.0, - 1.1) - self.assertRaises(self.failureException, 
self.assertLess, 1.1, 1.0) - self.assertRaises(self.failureException, self.assertLess, 1.0, 1.0) - self.assertRaises( - self.failureException, - self.assertLessEqual, - 1.1, - 1.0) - - # Try Strings - self.assertGreater('bug', 'ant') - self.assertGreaterEqual('bug', 'ant') - self.assertGreaterEqual('ant', 'ant') - self.assertLess('ant', 'bug') - self.assertLessEqual('ant', 'bug') - self.assertLessEqual('ant', 'ant') - self.assertRaises( - self.failureException, - self.assertGreater, - 'ant', - 'bug') - self.assertRaises( - self.failureException, - self.assertGreater, - 'ant', - 'ant') - self.assertRaises( - self.failureException, - self.assertGreaterEqual, - 'ant', - 'bug') - self.assertRaises(self.failureException, self.assertLess, 'bug', 'ant') - self.assertRaises(self.failureException, self.assertLess, 'ant', 'ant') - self.assertRaises( - self.failureException, - self.assertLessEqual, - 'bug', - 'ant') - - # Try Unicode - self.assertGreater(u'bug', u'ant') - self.assertGreaterEqual(u'bug', u'ant') - self.assertGreaterEqual(u'ant', u'ant') - self.assertLess(u'ant', u'bug') - self.assertLessEqual(u'ant', u'bug') - self.assertLessEqual(u'ant', u'ant') - self.assertRaises( - self.failureException, - self.assertGreater, - u'ant', - u'bug') - self.assertRaises( - self.failureException, - self.assertGreater, - u'ant', - u'ant') - self.assertRaises( - self.failureException, - self.assertGreaterEqual, - u'ant', - u'bug') - self.assertRaises( - self.failureException, - self.assertLess, - u'bug', - u'ant') - self.assertRaises( - self.failureException, - self.assertLess, - u'ant', - u'ant') - self.assertRaises( - self.failureException, - self.assertLessEqual, - u'bug', - u'ant') - - # Try Mixed String/Unicode - self.assertGreater('bug', u'ant') - self.assertGreater(u'bug', 'ant') - self.assertGreaterEqual('bug', u'ant') - self.assertGreaterEqual(u'bug', 'ant') - self.assertGreaterEqual('ant', u'ant') - self.assertGreaterEqual(u'ant', 'ant') - self.assertLess('ant', u'bug') 
- self.assertLess(u'ant', 'bug') - self.assertLessEqual('ant', u'bug') - self.assertLessEqual(u'ant', 'bug') - self.assertLessEqual('ant', u'ant') - self.assertLessEqual(u'ant', 'ant') - self.assertRaises( - self.failureException, - self.assertGreater, - 'ant', - u'bug') - self.assertRaises( - self.failureException, - self.assertGreater, - u'ant', - 'bug') - self.assertRaises( - self.failureException, - self.assertGreater, - 'ant', - u'ant') - self.assertRaises( - self.failureException, - self.assertGreater, - u'ant', - 'ant') - self.assertRaises( - self.failureException, - self.assertGreaterEqual, - 'ant', - u'bug') - self.assertRaises( - self.failureException, - self.assertGreaterEqual, - u'ant', - 'bug') - self.assertRaises( - self.failureException, - self.assertLess, - 'bug', - u'ant') - self.assertRaises( - self.failureException, - self.assertLess, - u'bug', - 'ant') - self.assertRaises( - self.failureException, - self.assertLess, - 'ant', - u'ant') - self.assertRaises( - self.failureException, - self.assertLess, - u'ant', - 'ant') - self.assertRaises( - self.failureException, - self.assertLessEqual, - 'bug', - u'ant') - self.assertRaises( - self.failureException, - self.assertLessEqual, - u'bug', - 'ant') - - def testAssertMultiLineEqual(self): - sample_text = """\ -http://www.python.org/doc/2.3/lib/module-unittest.html -test case - A test case is the smallest unit of testing. [...] -""" - revised_sample_text = """\ -http://www.python.org/doc/2.4.1/lib/module-unittest.html -test case - A test case is the smallest unit of testing. [...] You may provide your - own implementation that does not subclass from TestCase, of course. -""" - sample_text_error = """\ -- http://www.python.org/doc/2.3/lib/module-unittest.html -? ^ -+ http://www.python.org/doc/2.4.1/lib/module-unittest.html -? ^^^ - test case -- A test case is the smallest unit of testing. [...] -+ A test case is the smallest unit of testing. [...] You may provide your -? 
+++++++++++++++++++++ -+ own implementation that does not subclass from TestCase, of course. -""" - self.maxDiff = None - for type_changer in (lambda x: x, lambda x: x.decode('utf8')): - try: - self.assertMultiLineEqual(type_changer(sample_text), - type_changer(revised_sample_text)) - except self.failureException as e: - # need to remove the first line of the error message - error = str(e).encode('utf8').split('\n', 1)[1] - - # assertMultiLineEqual is hooked up as the default for - # unicode strings - so we can't use it for this check - self.assertTrue(sample_text_error == error) - - def testAssertSequenceEqualMaxDiff(self): - self.assertEqual(self.maxDiff, 80 * 8) - seq1 = 'a' + 'x' * 80**2 - seq2 = 'b' + 'x' * 80**2 - diff = '\n'.join(difflib.ndiff(pprint.pformat(seq1).splitlines(), - pprint.pformat(seq2).splitlines())) - # the +1 is the leading \n added by assertSequenceEqual - omitted = unittest2.case.DIFF_OMITTED % (len(diff) + 1,) - - self.maxDiff = len(diff) // 2 - try: - self.assertSequenceEqual(seq1, seq2) - except self.failureException as e: - msg = e.args[0] - else: - self.fail('assertSequenceEqual did not fail.') - self.assertTrue(len(msg) < len(diff)) - self.assertIn(omitted, msg) - - self.maxDiff = len(diff) * 2 - try: - self.assertSequenceEqual(seq1, seq2) - except self.failureException as e: - msg = e.args[0] - else: - self.fail('assertSequenceEqual did not fail.') - self.assertTrue(len(msg) > len(diff)) - self.assertNotIn(omitted, msg) - - self.maxDiff = None - try: - self.assertSequenceEqual(seq1, seq2) - except self.failureException as e: - msg = e.args[0] - else: - self.fail('assertSequenceEqual did not fail.') - self.assertTrue(len(msg) > len(diff)) - self.assertNotIn(omitted, msg) - - def testTruncateMessage(self): - self.maxDiff = 1 - message = self._truncateMessage('foo', 'bar') - omitted = unittest2.case.DIFF_OMITTED % len('bar') - self.assertEqual(message, 'foo' + omitted) - - self.maxDiff = None - message = self._truncateMessage('foo', 
'bar') - self.assertEqual(message, 'foobar') - - self.maxDiff = 4 - message = self._truncateMessage('foo', 'bar') - self.assertEqual(message, 'foobar') - - def testAssertDictEqualTruncates(self): - test = unittest2.TestCase('assertEqual') - - def truncate(msg, diff): - return 'foo' - test._truncateMessage = truncate - try: - test.assertDictEqual({}, {1: 0}) - except self.failureException as e: - self.assertEqual(str(e), 'foo') - else: - self.fail('assertDictEqual did not fail') - - def testAssertMultiLineEqualTruncates(self): - test = unittest2.TestCase('assertEqual') - - def truncate(msg, diff): - return 'foo' - test._truncateMessage = truncate - try: - test.assertMultiLineEqual('foo', 'bar') - except self.failureException as e: - self.assertEqual(str(e), 'foo') - else: - self.fail('assertMultiLineEqual did not fail') - - def testAssertIsNone(self): - self.assertIsNone(None) - self.assertRaises(self.failureException, self.assertIsNone, False) - self.assertIsNotNone('DjZoPloGears on Rails') - self.assertRaises(self.failureException, self.assertIsNotNone, None) - - def testAssertRegexpMatches(self): - self.assertRegexpMatches('asdfabasdf', r'ab+') - self.assertRaises(self.failureException, self.assertRegexpMatches, - 'saaas', r'aaaa') - - def testAssertRaisesRegexp(self): - class ExceptionMock(Exception): - pass - - def Stub(): - raise ExceptionMock('We expect') - - self.assertRaisesRegexp(ExceptionMock, re.compile('expect$'), Stub) - self.assertRaisesRegexp(ExceptionMock, 'expect$', Stub) - self.assertRaisesRegexp(ExceptionMock, u'expect$', Stub) - - def testAssertNotRaisesRegexp(self): - self.assertRaisesRegexp( - self.failureException, '^Exception not raised$', - self.assertRaisesRegexp, Exception, re.compile('x'), - lambda: None) - self.assertRaisesRegexp( - self.failureException, '^Exception not raised$', - self.assertRaisesRegexp, Exception, 'x', - lambda: None) - self.assertRaisesRegexp( - self.failureException, '^Exception not raised$', - 
self.assertRaisesRegexp, Exception, u'x', - lambda: None) - - def testAssertRaisesRegexpMismatch(self): - def Stub(): - raise Exception('Unexpected') - - self.assertRaisesRegexp( - self.failureException, - r'"\^Expected\$" does not match "Unexpected"', - self.assertRaisesRegexp, Exception, '^Expected$', - Stub) - self.assertRaisesRegexp( - self.failureException, - r'"\^Expected\$" does not match "Unexpected"', - self.assertRaisesRegexp, Exception, u'^Expected$', - Stub) - self.assertRaisesRegexp( - self.failureException, - r'"\^Expected\$" does not match "Unexpected"', - self.assertRaisesRegexp, Exception, - re.compile('^Expected$'), Stub) - - def testSynonymAssertMethodNames(self): - """Test undocumented method name synonyms. - - Please do not use these methods names in your own code. - - This test confirms their continued existence and functionality - in order to avoid breaking existing code. - """ - self.assertNotEquals(3, 5) - self.assertEquals(3, 3) - self.assertAlmostEquals(2.0, 2.0) - self.assertNotAlmostEquals(3.0, 5.0) - self.assert_(True) - - def testDeepcopy(self): - # Issue: 5660 - class TestableTest(unittest2.TestCase): - - def testNothing(self): - pass - - test = TestableTest('testNothing') - - # This shouldn't blow up - deepcopy(test) - - -if __name__ == "__main__": - unittest2.main() diff --git a/lldb/third_party/Python/module/unittest2/unittest2/test/test_discovery.py b/lldb/third_party/Python/module/unittest2/unittest2/test/test_discovery.py deleted file mode 100644 index 70bc588e5ac56..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/test/test_discovery.py +++ /dev/null @@ -1,392 +0,0 @@ -import os -import re -import sys - -import unittest2 - - -class TestDiscovery(unittest2.TestCase): - - # Heavily mocked tests so I can avoid hitting the filesystem - def test_get_name_from_path(self): - loader = unittest2.TestLoader() - - loader._top_level_dir = '/foo' - name = loader._get_name_from_path('/foo/bar/baz.py') - 
self.assertEqual(name, 'bar.baz') - - if not __debug__: - # asserts are off - return - - self.assertRaises(AssertionError, - loader._get_name_from_path, - '/bar/baz.py') - - def test_find_tests(self): - loader = unittest2.TestLoader() - - original_listdir = os.listdir - - def restore_listdir(): - os.listdir = original_listdir - original_isfile = os.path.isfile - - def restore_isfile(): - os.path.isfile = original_isfile - original_isdir = os.path.isdir - - def restore_isdir(): - os.path.isdir = original_isdir - - path_lists = [['test1.py', 'test2.py', 'not_a_test.py', 'test_dir', - 'test.foo', 'test-not-a-module.py', 'another_dir'], - ['test3.py', 'test4.py', ]] - os.listdir = lambda path: path_lists.pop(0) - self.addCleanup(restore_listdir) - - def isdir(path): - return path.endswith('dir') - os.path.isdir = isdir - self.addCleanup(restore_isdir) - - def isfile(path): - # another_dir is not a package and so shouldn't be recursed into - return not path.endswith('dir') and not 'another_dir' in path - os.path.isfile = isfile - self.addCleanup(restore_isfile) - - loader._get_module_from_name = lambda path: path + ' module' - loader.loadTestsFromModule = lambda module: module + ' tests' - - top_level = os.path.abspath('/foo') - loader._top_level_dir = top_level - suite = list(loader._find_tests(top_level, 'test*.py')) - - expected = [name + ' module tests' for name in - ('test1', 'test2')] - expected.extend([('test_dir.%s' % name) + ' module tests' for name in - ('test3', 'test4')]) - self.assertEqual(suite, expected) - - def test_find_tests_with_package(self): - loader = unittest2.TestLoader() - - original_listdir = os.listdir - - def restore_listdir(): - os.listdir = original_listdir - original_isfile = os.path.isfile - - def restore_isfile(): - os.path.isfile = original_isfile - original_isdir = os.path.isdir - - def restore_isdir(): - os.path.isdir = original_isdir - - directories = ['a_directory', 'test_directory', 'test_directory2'] - path_lists = [directories, 
[], [], []] - os.listdir = lambda path: path_lists.pop(0) - self.addCleanup(restore_listdir) - - os.path.isdir = lambda path: True - self.addCleanup(restore_isdir) - - os.path.isfile = lambda path: os.path.basename(path) not in directories - self.addCleanup(restore_isfile) - - class Module(object): - paths = [] - load_tests_args = [] - - def __init__(self, path): - self.path = path - self.paths.append(path) - if os.path.basename(path) == 'test_directory': - def load_tests(loader, tests, pattern): - self.load_tests_args.append((loader, tests, pattern)) - return 'load_tests' - self.load_tests = load_tests - - def __eq__(self, other): - return self.path == other.path - - # Silence py3k warning - __hash__ = None - - loader._get_module_from_name = lambda name: Module(name) - - def loadTestsFromModule(module, use_load_tests): - if use_load_tests: - raise self.failureException( - 'use_load_tests should be False for packages') - return module.path + ' module tests' - loader.loadTestsFromModule = loadTestsFromModule - - loader._top_level_dir = '/foo' - # this time no '.py' on the pattern so that it can match - # a test package - suite = list(loader._find_tests('/foo', 'test*')) - - # We should have loaded tests from the test_directory package by calling load_tests - # and directly from the test_directory2 package - self.assertEqual(suite, - ['load_tests', 'test_directory2' + ' module tests']) - self.assertEqual(Module.paths, ['test_directory', 'test_directory2']) - - # load_tests should have been called once with loader, tests and - # pattern - self.assertEqual( - Module.load_tests_args, [ - (loader, 'test_directory' + ' module tests', 'test*')]) - - def test_discover(self): - loader = unittest2.TestLoader() - - original_isfile = os.path.isfile - original_isdir = os.path.isdir - - def restore_isfile(): - os.path.isfile = original_isfile - - os.path.isfile = lambda path: False - self.addCleanup(restore_isfile) - - orig_sys_path = sys.path[:] - - def restore_path(): - 
sys.path[:] = orig_sys_path - self.addCleanup(restore_path) - - full_path = os.path.abspath(os.path.normpath('/foo')) - self.assertRaises(ImportError, - loader.discover, - '/foo/bar', top_level_dir='/foo') - - self.assertEqual(loader._top_level_dir, full_path) - self.assertIn(full_path, sys.path) - - os.path.isfile = lambda path: True - os.path.isdir = lambda path: True - - def restore_isdir(): - os.path.isdir = original_isdir - self.addCleanup(restore_isdir) - - _find_tests_args = [] - - def _find_tests(start_dir, pattern): - _find_tests_args.append((start_dir, pattern)) - return ['tests'] - loader._find_tests = _find_tests - loader.suiteClass = str - - suite = loader.discover('/foo/bar/baz', 'pattern', '/foo/bar') - - top_level_dir = os.path.abspath(os.path.normpath('/foo/bar')) - start_dir = os.path.abspath(os.path.normpath('/foo/bar/baz')) - self.assertEqual(suite, "['tests']") - self.assertEqual(loader._top_level_dir, top_level_dir) - self.assertEqual(_find_tests_args, [(start_dir, 'pattern')]) - self.assertIn(top_level_dir, sys.path) - - def test_discover_with_modules_that_fail_to_import(self): - loader = unittest2.TestLoader() - - listdir = os.listdir - os.listdir = lambda _: ['test_this_does_not_exist.py'] - isfile = os.path.isfile - os.path.isfile = lambda _: True - orig_sys_path = sys.path[:] - - def restore(): - os.path.isfile = isfile - os.listdir = listdir - sys.path[:] = orig_sys_path - self.addCleanup(restore) - - suite = loader.discover('.') - self.assertIn(os.getcwd(), sys.path) - self.assertEqual(suite.countTestCases(), 1) - test = list(list(suite)[0])[0] # extract test from suite - - self.assertRaises(ImportError, - lambda: test.test_this_does_not_exist()) - - def test_command_line_handling_parseArgs(self): - # Haha - take that uninstantiable class - program = object.__new__(unittest2.TestProgram) - - args = [] - - def do_discovery(argv): - args.extend(argv) - program._do_discovery = do_discovery - program.parseArgs(['something', 'discover']) - 
self.assertEqual(args, []) - - program.parseArgs(['something', 'discover', 'foo', 'bar']) - self.assertEqual(args, ['foo', 'bar']) - - def test_command_line_handling_do_discovery_too_many_arguments(self): - class Stop(Exception): - pass - - def usageExit(): - raise Stop - - program = object.__new__(unittest2.TestProgram) - program.usageExit = usageExit - - self.assertRaises(Stop, - # too many args - lambda: program._do_discovery(['one', 'two', 'three', 'four'])) - - def test_command_line_handling_do_discovery_calls_loader(self): - program = object.__new__(unittest2.TestProgram) - - class Loader(object): - args = [] - - def discover(self, start_dir, pattern, top_level_dir): - self.args.append((start_dir, pattern, top_level_dir)) - return 'tests' - - program._do_discovery(['-v'], Loader=Loader) - self.assertEqual(program.verbosity, 2) - self.assertEqual(program.test, 'tests') - self.assertEqual(Loader.args, [('.', 'test*.py', None)]) - - Loader.args = [] - program = object.__new__(unittest2.TestProgram) - program._do_discovery(['--verbose'], Loader=Loader) - self.assertEqual(program.test, 'tests') - self.assertEqual(Loader.args, [('.', 'test*.py', None)]) - - Loader.args = [] - program = object.__new__(unittest2.TestProgram) - program._do_discovery([], Loader=Loader) - self.assertEqual(program.test, 'tests') - self.assertEqual(Loader.args, [('.', 'test*.py', None)]) - - Loader.args = [] - program = object.__new__(unittest2.TestProgram) - program._do_discovery(['fish'], Loader=Loader) - self.assertEqual(program.test, 'tests') - self.assertEqual(Loader.args, [('fish', 'test*.py', None)]) - - Loader.args = [] - program = object.__new__(unittest2.TestProgram) - program._do_discovery(['fish', 'eggs'], Loader=Loader) - self.assertEqual(program.test, 'tests') - self.assertEqual(Loader.args, [('fish', 'eggs', None)]) - - Loader.args = [] - program = object.__new__(unittest2.TestProgram) - program._do_discovery(['fish', 'eggs', 'ham'], Loader=Loader) - 
self.assertEqual(program.test, 'tests') - self.assertEqual(Loader.args, [('fish', 'eggs', 'ham')]) - - Loader.args = [] - program = object.__new__(unittest2.TestProgram) - program._do_discovery(['-s', 'fish'], Loader=Loader) - self.assertEqual(program.test, 'tests') - self.assertEqual(Loader.args, [('fish', 'test*.py', None)]) - - Loader.args = [] - program = object.__new__(unittest2.TestProgram) - program._do_discovery(['-t', 'fish'], Loader=Loader) - self.assertEqual(program.test, 'tests') - self.assertEqual(Loader.args, [('.', 'test*.py', 'fish')]) - - Loader.args = [] - program = object.__new__(unittest2.TestProgram) - program._do_discovery(['-p', 'fish'], Loader=Loader) - self.assertEqual(program.test, 'tests') - self.assertEqual(Loader.args, [('.', 'fish', None)]) - self.assertFalse(program.failfast) - self.assertFalse(program.catchbreak) - - args = ['-p', 'eggs', '-s', 'fish', '-v', '-f'] - try: - import signal - except ImportError: - signal = None - else: - args.append('-c') - Loader.args = [] - program = object.__new__(unittest2.TestProgram) - program._do_discovery(args, Loader=Loader) - self.assertEqual(program.test, 'tests') - self.assertEqual(Loader.args, [('fish', 'eggs', None)]) - self.assertEqual(program.verbosity, 2) - self.assertTrue(program.failfast) - if signal is not None: - self.assertTrue(program.catchbreak) - - def test_detect_module_clash(self): - class Module(object): - __file__ = 'bar/foo.py' - sys.modules['foo'] = Module - full_path = os.path.abspath('foo') - original_listdir = os.listdir - original_isfile = os.path.isfile - original_isdir = os.path.isdir - - def cleanup(): - os.listdir = original_listdir - os.path.isfile = original_isfile - os.path.isdir = original_isdir - del sys.modules['foo'] - if full_path in sys.path: - sys.path.remove(full_path) - self.addCleanup(cleanup) - - def listdir(_): - return ['foo.py'] - - def isfile(_): - return True - - def isdir(_): - return True - os.listdir = listdir - os.path.isfile = isfile - 
os.path.isdir = isdir - - loader = unittest2.TestLoader() - - mod_dir = os.path.abspath('bar') - expected_dir = os.path.abspath('foo') - msg = re.escape( - r"'foo' module incorrectly imported from %r. Expected %r. " - "Is this module globally installed?" % - (mod_dir, expected_dir)) - self.assertRaisesRegexp( - ImportError, '^%s$' % msg, loader.discover, - start_dir='foo', pattern='foo.py' - ) - self.assertEqual(sys.path[0], full_path) - - def test_discovery_from_dotted_path(self): - loader = unittest2.TestLoader() - - tests = [self] - expectedPath = os.path.abspath( - os.path.dirname(unittest2.test.__file__)) - - self.wasRun = False - - def _find_tests(start_dir, pattern): - self.wasRun = True - self.assertEqual(start_dir, expectedPath) - return tests - loader._find_tests = _find_tests - suite = loader.discover('unittest2.test') - self.assertTrue(self.wasRun) - self.assertEqual(suite._tests, tests) - - -if __name__ == '__main__': - unittest2.main() diff --git a/lldb/third_party/Python/module/unittest2/unittest2/test/test_functiontestcase.py b/lldb/third_party/Python/module/unittest2/unittest2/test/test_functiontestcase.py deleted file mode 100644 index 9bafb54be5a6e..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/test/test_functiontestcase.py +++ /dev/null @@ -1,148 +0,0 @@ -import unittest2 - -from unittest2.test.support import LoggingResult - - -class Test_FunctionTestCase(unittest2.TestCase): - - # "Return the number of tests represented by the this test object. For - # unittest2.TestCase instances, this will always be 1" - def test_countTestCases(self): - test = unittest2.FunctionTestCase(lambda: None) - - self.assertEqual(test.countTestCases(), 1) - - # "When a setUp() method is defined, the test runner will run that method - # prior to each test. Likewise, if a tearDown() method is defined, the - # test runner will invoke that method after each test. In the example, - # setUp() was used to create a fresh sequence for each test." 
- # - # Make sure the proper call order is maintained, even if setUp() raises - # an exception. - def test_run_call_order__error_in_setUp(self): - events = [] - result = LoggingResult(events) - - def setUp(): - events.append('setUp') - raise RuntimeError('raised by setUp') - - def test(): - events.append('test') - - def tearDown(): - events.append('tearDown') - - expected = ['startTest', 'setUp', 'addError', 'stopTest'] - unittest2.FunctionTestCase(test, setUp, tearDown).run(result) - self.assertEqual(events, expected) - - # "When a setUp() method is defined, the test runner will run that method - # prior to each test. Likewise, if a tearDown() method is defined, the - # test runner will invoke that method after each test. In the example, - # setUp() was used to create a fresh sequence for each test." - # - # Make sure the proper call order is maintained, even if the test raises - # an error (as opposed to a failure). - def test_run_call_order__error_in_test(self): - events = [] - result = LoggingResult(events) - - def setUp(): - events.append('setUp') - - def test(): - events.append('test') - raise RuntimeError('raised by test') - - def tearDown(): - events.append('tearDown') - - expected = ['startTest', 'setUp', 'test', 'addError', 'tearDown', - 'stopTest'] - unittest2.FunctionTestCase(test, setUp, tearDown).run(result) - self.assertEqual(events, expected) - - # "When a setUp() method is defined, the test runner will run that method - # prior to each test. Likewise, if a tearDown() method is defined, the - # test runner will invoke that method after each test. In the example, - # setUp() was used to create a fresh sequence for each test." - # - # Make sure the proper call order is maintained, even if the test signals - # a failure (as opposed to an error). 
- def test_run_call_order__failure_in_test(self): - events = [] - result = LoggingResult(events) - - def setUp(): - events.append('setUp') - - def test(): - events.append('test') - self.fail('raised by test') - - def tearDown(): - events.append('tearDown') - - expected = ['startTest', 'setUp', 'test', 'addFailure', 'tearDown', - 'stopTest'] - unittest2.FunctionTestCase(test, setUp, tearDown).run(result) - self.assertEqual(events, expected) - - # "When a setUp() method is defined, the test runner will run that method - # prior to each test. Likewise, if a tearDown() method is defined, the - # test runner will invoke that method after each test. In the example, - # setUp() was used to create a fresh sequence for each test." - # - # Make sure the proper call order is maintained, even if tearDown() raises - # an exception. - def test_run_call_order__error_in_tearDown(self): - events = [] - result = LoggingResult(events) - - def setUp(): - events.append('setUp') - - def test(): - events.append('test') - - def tearDown(): - events.append('tearDown') - raise RuntimeError('raised by tearDown') - - expected = ['startTest', 'setUp', 'test', 'tearDown', 'addError', - 'stopTest'] - unittest2.FunctionTestCase(test, setUp, tearDown).run(result) - self.assertEqual(events, expected) - - # "Return a string identifying the specific test case." - # - # Because of the vague nature of the docs, I'm not going to lock this - # test down too much. Really all that can be asserted is that the id() - # will be a string (either 8-byte or unicode -- again, because the docs - # just say "string") - def test_id(self): - test = unittest2.FunctionTestCase(lambda: None) - - self.assertIsInstance(test.id(), str) - - # "Returns a one-line description of the test, or None if no description - # has been provided. The default implementation of this method returns - # the first line of the test method's docstring, if available, or None." 
- def test_shortDescription__no_docstring(self): - test = unittest2.FunctionTestCase(lambda: None) - - self.assertEqual(test.shortDescription(), None) - - # "Returns a one-line description of the test, or None if no description - # has been provided. The default implementation of this method returns - # the first line of the test method's docstring, if available, or None." - def test_shortDescription__singleline_docstring(self): - desc = "this tests foo" - test = unittest2.FunctionTestCase(lambda: None, description=desc) - - self.assertEqual(test.shortDescription(), "this tests foo") - - -if __name__ == '__main__': - unittest2.main() diff --git a/lldb/third_party/Python/module/unittest2/unittest2/test/test_loader.py b/lldb/third_party/Python/module/unittest2/unittest2/test/test_loader.py deleted file mode 100644 index 45fea56aad026..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/test/test_loader.py +++ /dev/null @@ -1,1380 +0,0 @@ -import sys -import types - -import unittest2 - - -class Test_TestLoader(unittest2.TestCase): - - # Tests for TestLoader.loadTestsFromTestCase - ################################################################ - - # "Return a suite of all tests cases contained in the TestCase-derived - # class testCaseClass" - def test_loadTestsFromTestCase(self): - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - - def foo_bar(self): pass - - tests = unittest2.TestSuite([Foo('test_1'), Foo('test_2')]) - - loader = unittest2.TestLoader() - self.assertEqual(loader.loadTestsFromTestCase(Foo), tests) - - # "Return a suite of all tests cases contained in the TestCase-derived - # class testCaseClass" - # - # Make sure it does the right thing even if no tests were found - def test_loadTestsFromTestCase__no_matches(self): - class Foo(unittest2.TestCase): - - def foo_bar(self): pass - - empty_suite = unittest2.TestSuite() - - loader = unittest2.TestLoader() - 
self.assertEqual(loader.loadTestsFromTestCase(Foo), empty_suite) - - # "Return a suite of all tests cases contained in the TestCase-derived - # class testCaseClass" - # - # What happens if loadTestsFromTestCase() is given an object - # that isn't a subclass of TestCase? Specifically, what happens - # if testCaseClass is a subclass of TestSuite? - # - # This is checked for specifically in the code, so we better add a - # test for it. - def test_loadTestsFromTestCase__TestSuite_subclass(self): - class NotATestCase(unittest2.TestSuite): - pass - - loader = unittest2.TestLoader() - try: - loader.loadTestsFromTestCase(NotATestCase) - except TypeError: - pass - else: - self.fail('Should raise TypeError') - - # "Return a suite of all tests cases contained in the TestCase-derived - # class testCaseClass" - # - # Make sure loadTestsFromTestCase() picks up the default test method - # name (as specified by TestCase), even though the method name does - # not match the default TestLoader.testMethodPrefix string - def test_loadTestsFromTestCase__default_method_name(self): - class Foo(unittest2.TestCase): - - def runTest(self): - pass - - loader = unittest2.TestLoader() - # This has to be false for the test to succeed - self.assertFalse('runTest'.startswith(loader.testMethodPrefix)) - - suite = loader.loadTestsFromTestCase(Foo) - self.assertIsInstance(suite, loader.suiteClass) - self.assertEqual(list(suite), [Foo('runTest')]) - - ################################################################ - # /Tests for TestLoader.loadTestsFromTestCase - - # Tests for TestLoader.loadTestsFromModule - ################################################################ - - # "This method searches `module` for classes derived from TestCase" - def test_loadTestsFromModule__TestCase_subclass(self): - m = types.ModuleType('m') - - class MyTestCase(unittest2.TestCase): - - def test(self): - pass - m.testcase_1 = MyTestCase - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromModule(m) - 
self.assertIsInstance(suite, loader.suiteClass) - - expected = [loader.suiteClass([MyTestCase('test')])] - self.assertEqual(list(suite), expected) - - # "This method searches `module` for classes derived from TestCase" - # - # What happens if no tests are found (no TestCase instances)? - def test_loadTestsFromModule__no_TestCase_instances(self): - m = types.ModuleType('m') - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromModule(m) - self.assertIsInstance(suite, loader.suiteClass) - self.assertEqual(list(suite), []) - - # "This method searches `module` for classes derived from TestCase" - # - # What happens if no tests are found (TestCases instances, but no tests)? - def test_loadTestsFromModule__no_TestCase_tests(self): - m = types.ModuleType('m') - - class MyTestCase(unittest2.TestCase): - pass - m.testcase_1 = MyTestCase - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromModule(m) - self.assertIsInstance(suite, loader.suiteClass) - - self.assertEqual(list(suite), [loader.suiteClass()]) - - # "This method searches `module` for classes derived from TestCase"s - # - # What happens if loadTestsFromModule() is given something other - # than a module? - # - # XXX Currently, it succeeds anyway. This flexibility - # should either be documented or loadTestsFromModule() should - # raise a TypeError - # - # XXX Certain people are using this behaviour. We'll add a test for it - def test_loadTestsFromModule__not_a_module(self): - class MyTestCase(unittest2.TestCase): - - def test(self): - pass - - class NotAModule(object): - test_2 = MyTestCase - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromModule(NotAModule) - - reference = [unittest2.TestSuite([MyTestCase('test')])] - self.assertEqual(list(suite), reference) - - # Check that loadTestsFromModule honors (or not) a module - # with a load_tests function. 
- def test_loadTestsFromModule__load_tests(self): - m = types.ModuleType('m') - - class MyTestCase(unittest2.TestCase): - - def test(self): - pass - m.testcase_1 = MyTestCase - - load_tests_args = [] - - def load_tests(loader, tests, pattern): - self.assertIsInstance(tests, unittest2.TestSuite) - load_tests_args.extend((loader, tests, pattern)) - return tests - m.load_tests = load_tests - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromModule(m) - self.assertIsInstance(suite, unittest2.TestSuite) - self.assertEquals(load_tests_args, [loader, suite, None]) - - load_tests_args = [] - suite = loader.loadTestsFromModule(m, use_load_tests=False) - self.assertEquals(load_tests_args, []) - - def test_loadTestsFromModule__faulty_load_tests(self): - m = types.ModuleType('m') - - def load_tests(loader, tests, pattern): - raise TypeError('some failure') - m.load_tests = load_tests - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromModule(m) - self.assertIsInstance(suite, unittest2.TestSuite) - self.assertEqual(suite.countTestCases(), 1) - test = list(suite)[0] - - self.assertRaisesRegexp(TypeError, "some failure", test.m) - - ################################################################ - # /Tests for TestLoader.loadTestsFromModule() - - # Tests for TestLoader.loadTestsFromName() - ################################################################ - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # - # Is ValueError raised in response to an empty name? 
- def test_loadTestsFromName__empty_name(self): - loader = unittest2.TestLoader() - - try: - loader.loadTestsFromName('') - except ValueError as e: - self.assertEqual(str(e), "Empty module name") - else: - self.fail("TestLoader.loadTestsFromName failed to raise ValueError") - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # - # What happens when the name contains invalid characters? - def test_loadTestsFromName__malformed_name(self): - loader = unittest2.TestLoader() - - # XXX Should this raise ValueError or ImportError? - try: - loader.loadTestsFromName('abc () //') - except ValueError: - pass - except ImportError: - pass - else: - self.fail("TestLoader.loadTestsFromName failed to raise ValueError") - - # "The specifier name is a ``dotted name'' that may resolve ... to a - # module" - # - # What happens when a module by that name can't be found? - def test_loadTestsFromName__unknown_module_name(self): - loader = unittest2.TestLoader() - - try: - loader.loadTestsFromName('sdasfasfasdf') - except ImportError as e: - self.assertEqual(str(e), "No module named sdasfasfasdf") - else: - self.fail("TestLoader.loadTestsFromName failed to raise ImportError") - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # - # What happens when the module is found, but the attribute can't? 
- def test_loadTestsFromName__unknown_attr_name(self): - loader = unittest2.TestLoader() - - try: - loader.loadTestsFromName('unittest2.sdasfasfasdf') - except AttributeError as e: - self.assertEqual( - str(e), "'module' object has no attribute 'sdasfasfasdf'") - else: - self.fail( - "TestLoader.loadTestsFromName failed to raise AttributeError") - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # - # What happens when we provide the module, but the attribute can't be - # found? - def test_loadTestsFromName__relative_unknown_name(self): - loader = unittest2.TestLoader() - - try: - loader.loadTestsFromName('sdasfasfasdf', unittest2) - except AttributeError as e: - self.assertEqual( - str(e), "'module' object has no attribute 'sdasfasfasdf'") - else: - self.fail( - "TestLoader.loadTestsFromName failed to raise AttributeError") - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # ... - # "The method optionally resolves name relative to the given module" - # - # Does loadTestsFromName raise ValueError when passed an empty - # name relative to a provided module? 
- # - # XXX Should probably raise a ValueError instead of an AttributeError - def test_loadTestsFromName__relative_empty_name(self): - loader = unittest2.TestLoader() - - try: - loader.loadTestsFromName('', unittest2) - except AttributeError: - pass - else: - self.fail("Failed to raise AttributeError") - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # ... - # "The method optionally resolves name relative to the given module" - # - # What happens when an impossible name is given, relative to the provided - # `module`? - def test_loadTestsFromName__relative_malformed_name(self): - loader = unittest2.TestLoader() - - # XXX Should this raise AttributeError or ValueError? - try: - loader.loadTestsFromName('abc () //', unittest2) - except ValueError: - pass - except AttributeError: - pass - else: - self.fail("TestLoader.loadTestsFromName failed to raise ValueError") - - # "The method optionally resolves name relative to the given module" - # - # Does loadTestsFromName raise TypeError when the `module` argument - # isn't a module object? 
- # - # XXX Accepts the not-a-module object, ignorning the object's type - # This should raise an exception or the method name should be changed - # - # XXX Some people are relying on this, so keep it for now - def test_loadTestsFromName__relative_not_a_module(self): - class MyTestCase(unittest2.TestCase): - - def test(self): - pass - - class NotAModule(object): - test_2 = MyTestCase - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromName('test_2', NotAModule) - - reference = [MyTestCase('test')] - self.assertEqual(list(suite), reference) - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # - # Does it raise an exception if the name resolves to an invalid - # object? - def test_loadTestsFromName__relative_bad_object(self): - m = types.ModuleType('m') - m.testcase_1 = object() - - loader = unittest2.TestLoader() - try: - loader.loadTestsFromName('testcase_1', m) - except TypeError: - pass - else: - self.fail("Should have raised TypeError") - - # "The specifier name is a ``dotted name'' that may - # resolve either to ... a test case class" - def test_loadTestsFromName__relative_TestCase_subclass(self): - m = types.ModuleType('m') - - class MyTestCase(unittest2.TestCase): - - def test(self): - pass - m.testcase_1 = MyTestCase - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromName('testcase_1', m) - self.assertIsInstance(suite, loader.suiteClass) - self.assertEqual(list(suite), [MyTestCase('test')]) - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." 
- def test_loadTestsFromName__relative_TestSuite(self): - m = types.ModuleType('m') - - class MyTestCase(unittest2.TestCase): - - def test(self): - pass - m.testsuite = unittest2.TestSuite([MyTestCase('test')]) - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromName('testsuite', m) - self.assertIsInstance(suite, loader.suiteClass) - - self.assertEqual(list(suite), [MyTestCase('test')]) - - # "The specifier name is a ``dotted name'' that may resolve ... to - # ... a test method within a test case class" - def test_loadTestsFromName__relative_testmethod(self): - m = types.ModuleType('m') - - class MyTestCase(unittest2.TestCase): - - def test(self): - pass - m.testcase_1 = MyTestCase - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromName('testcase_1.test', m) - self.assertIsInstance(suite, loader.suiteClass) - - self.assertEqual(list(suite), [MyTestCase('test')]) - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # - # Does loadTestsFromName() raise the proper exception when trying to - # resolve "a test method within a test case class" that doesn't exist - # for the given name (relative to a provided module)? - def test_loadTestsFromName__relative_invalid_testmethod(self): - m = types.ModuleType('m') - - class MyTestCase(unittest2.TestCase): - - def test(self): - pass - m.testcase_1 = MyTestCase - - loader = unittest2.TestLoader() - try: - loader.loadTestsFromName('testcase_1.testfoo', m) - except AttributeError as e: - self.assertEqual( - str(e), "type object 'MyTestCase' has no attribute 'testfoo'") - else: - self.fail("Failed to raise AttributeError") - - # "The specifier name is a ``dotted name'' that may resolve ... to - # ... a callable object which returns a ... 
TestSuite instance" - def test_loadTestsFromName__callable__TestSuite(self): - m = types.ModuleType('m') - testcase_1 = unittest2.FunctionTestCase(lambda: None) - testcase_2 = unittest2.FunctionTestCase(lambda: None) - - def return_TestSuite(): - return unittest2.TestSuite([testcase_1, testcase_2]) - m.return_TestSuite = return_TestSuite - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromName('return_TestSuite', m) - self.assertIsInstance(suite, loader.suiteClass) - self.assertEqual(list(suite), [testcase_1, testcase_2]) - - # "The specifier name is a ``dotted name'' that may resolve ... to - # ... a callable object which returns a TestCase ... instance" - def test_loadTestsFromName__callable__TestCase_instance(self): - m = types.ModuleType('m') - testcase_1 = unittest2.FunctionTestCase(lambda: None) - - def return_TestCase(): - return testcase_1 - m.return_TestCase = return_TestCase - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromName('return_TestCase', m) - self.assertIsInstance(suite, loader.suiteClass) - self.assertEqual(list(suite), [testcase_1]) - - # "The specifier name is a ``dotted name'' that may resolve ... to - # ... a callable object which returns a TestCase ... instance" - #***************************************************************** - # Override the suiteClass attribute to ensure that the suiteClass - #attribute is used - def test_loadTestsFromName__callable__TestCase_instance_ProperSuiteClass( - self): - class SubTestSuite(unittest2.TestSuite): - pass - m = types.ModuleType('m') - testcase_1 = unittest2.FunctionTestCase(lambda: None) - - def return_TestCase(): - return testcase_1 - m.return_TestCase = return_TestCase - - loader = unittest2.TestLoader() - loader.suiteClass = SubTestSuite - suite = loader.loadTestsFromName('return_TestCase', m) - self.assertIsInstance(suite, loader.suiteClass) - self.assertEqual(list(suite), [testcase_1]) - - # "The specifier name is a ``dotted name'' that may resolve ... 
to - # ... a test method within a test case class" - #***************************************************************** - # Override the suiteClass attribute to ensure that the suiteClass - #attribute is used - def test_loadTestsFromName__relative_testmethod_ProperSuiteClass(self): - class SubTestSuite(unittest2.TestSuite): - pass - m = types.ModuleType('m') - - class MyTestCase(unittest2.TestCase): - - def test(self): - pass - m.testcase_1 = MyTestCase - - loader = unittest2.TestLoader() - loader.suiteClass = SubTestSuite - suite = loader.loadTestsFromName('testcase_1.test', m) - self.assertIsInstance(suite, loader.suiteClass) - - self.assertEqual(list(suite), [MyTestCase('test')]) - - # "The specifier name is a ``dotted name'' that may resolve ... to - # ... a callable object which returns a TestCase or TestSuite instance" - # - # What happens if the callable returns something else? - def test_loadTestsFromName__callable__wrong_type(self): - m = types.ModuleType('m') - - def return_wrong(): - return 6 - m.return_wrong = return_wrong - - loader = unittest2.TestLoader() - try: - loader.loadTestsFromName('return_wrong', m) - except TypeError: - pass - else: - self.fail("TestLoader.loadTestsFromName failed to raise TypeError") - - # "The specifier can refer to modules and packages which have not been - # imported; they will be imported as a side-effect" - def test_loadTestsFromName__module_not_loaded(self): - # We're going to try to load this module as a side-effect, so it - # better not be loaded before we try. 
- # - module_name = 'unittest2.test.dummy' - sys.modules.pop(module_name, None) - - loader = unittest2.TestLoader() - try: - suite = loader.loadTestsFromName(module_name) - - self.assertIsInstance(suite, loader.suiteClass) - self.assertEqual(list(suite), []) - - # module should now be loaded, thanks to loadTestsFromName() - self.assertIn(module_name, sys.modules) - finally: - if module_name in sys.modules: - del sys.modules[module_name] - - ################################################################ - # Tests for TestLoader.loadTestsFromName() - - # Tests for TestLoader.loadTestsFromNames() - ################################################################ - - # "Similar to loadTestsFromName(), but takes a sequence of names rather - # than a single name." - # - # What happens if that sequence of names is empty? - def test_loadTestsFromNames__empty_name_list(self): - loader = unittest2.TestLoader() - - suite = loader.loadTestsFromNames([]) - self.assertIsInstance(suite, loader.suiteClass) - self.assertEqual(list(suite), []) - - # "Similar to loadTestsFromName(), but takes a sequence of names rather - # than a single name." - # ... - # "The method optionally resolves name relative to the given module" - # - # What happens if that sequence of names is empty? - # - # XXX Should this raise a ValueError or just return an empty TestSuite? - def test_loadTestsFromNames__relative_empty_name_list(self): - loader = unittest2.TestLoader() - - suite = loader.loadTestsFromNames([], unittest2) - self.assertIsInstance(suite, loader.suiteClass) - self.assertEqual(list(suite), []) - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # - # Is ValueError raised in response to an empty name? 
- def test_loadTestsFromNames__empty_name(self): - loader = unittest2.TestLoader() - - try: - loader.loadTestsFromNames(['']) - except ValueError as e: - self.assertEqual(str(e), "Empty module name") - else: - self.fail("TestLoader.loadTestsFromNames failed to raise ValueError") - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # - # What happens when presented with an impossible module name? - def test_loadTestsFromNames__malformed_name(self): - loader = unittest2.TestLoader() - - # XXX Should this raise ValueError or ImportError? - try: - loader.loadTestsFromNames(['abc () //']) - except ValueError: - pass - except ImportError: - pass - else: - self.fail("TestLoader.loadTestsFromNames failed to raise ValueError") - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # - # What happens when no module can be found for the given name? - def test_loadTestsFromNames__unknown_module_name(self): - loader = unittest2.TestLoader() - - try: - loader.loadTestsFromNames(['sdasfasfasdf']) - except ImportError as e: - self.assertEqual(str(e), "No module named sdasfasfasdf") - else: - self.fail("TestLoader.loadTestsFromNames failed to raise ImportError") - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # - # What happens when the module can be found, but not the attribute? 
- def test_loadTestsFromNames__unknown_attr_name(self): - loader = unittest2.TestLoader() - - try: - loader.loadTestsFromNames(['unittest2.sdasfasfasdf', 'unittest2']) - except AttributeError as e: - self.assertEqual( - str(e), "'module' object has no attribute 'sdasfasfasdf'") - else: - self.fail( - "TestLoader.loadTestsFromNames failed to raise AttributeError") - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # ... - # "The method optionally resolves name relative to the given module" - # - # What happens when given an unknown attribute on a specified `module` - # argument? - def test_loadTestsFromNames__unknown_name_relative_1(self): - loader = unittest2.TestLoader() - - try: - loader.loadTestsFromNames(['sdasfasfasdf'], unittest2) - except AttributeError as e: - self.assertEqual( - str(e), "'module' object has no attribute 'sdasfasfasdf'") - else: - self.fail( - "TestLoader.loadTestsFromName failed to raise AttributeError") - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # ... - # "The method optionally resolves name relative to the given module" - # - # Do unknown attributes (relative to a provided module) still raise an - # exception even in the presence of valid attribute names? 
- def test_loadTestsFromNames__unknown_name_relative_2(self): - loader = unittest2.TestLoader() - - try: - loader.loadTestsFromNames(['TestCase', 'sdasfasfasdf'], unittest2) - except AttributeError as e: - self.assertEqual( - str(e), "'module' object has no attribute 'sdasfasfasdf'") - else: - self.fail( - "TestLoader.loadTestsFromName failed to raise AttributeError") - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # ... - # "The method optionally resolves name relative to the given module" - # - # What happens when faced with the empty string? - # - # XXX This currently raises AttributeError, though ValueError is probably - # more appropriate - def test_loadTestsFromNames__relative_empty_name(self): - loader = unittest2.TestLoader() - - try: - loader.loadTestsFromNames([''], unittest2) - except AttributeError: - pass - else: - self.fail("Failed to raise ValueError") - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # ... - # "The method optionally resolves name relative to the given module" - # - # What happens when presented with an impossible attribute name? - def test_loadTestsFromNames__relative_malformed_name(self): - loader = unittest2.TestLoader() - - # XXX Should this raise AttributeError or ValueError? - try: - loader.loadTestsFromNames(['abc () //'], unittest2) - except AttributeError: - pass - except ValueError: - pass - else: - self.fail("TestLoader.loadTestsFromNames failed to raise ValueError") - - # "The method optionally resolves name relative to the given module" - # - # Does loadTestsFromNames() make sure the provided `module` is in fact - # a module? 
- # - # XXX This validation is currently not done. This flexibility should - # either be documented or a TypeError should be raised. - def test_loadTestsFromNames__relative_not_a_module(self): - class MyTestCase(unittest2.TestCase): - - def test(self): - pass - - class NotAModule(object): - test_2 = MyTestCase - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromNames(['test_2'], NotAModule) - - reference = [unittest2.TestSuite([MyTestCase('test')])] - self.assertEqual(list(suite), reference) - - # "The specifier name is a ``dotted name'' that may resolve either to - # a module, a test case class, a TestSuite instance, a test method - # within a test case class, or a callable object which returns a - # TestCase or TestSuite instance." - # - # Does it raise an exception if the name resolves to an invalid - # object? - def test_loadTestsFromNames__relative_bad_object(self): - m = types.ModuleType('m') - m.testcase_1 = object() - - loader = unittest2.TestLoader() - try: - loader.loadTestsFromNames(['testcase_1'], m) - except TypeError: - pass - else: - self.fail("Should have raised TypeError") - - # "The specifier name is a ``dotted name'' that may resolve ... to - # ... a test case class" - def test_loadTestsFromNames__relative_TestCase_subclass(self): - m = types.ModuleType('m') - - class MyTestCase(unittest2.TestCase): - - def test(self): - pass - m.testcase_1 = MyTestCase - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromNames(['testcase_1'], m) - self.assertIsInstance(suite, loader.suiteClass) - - expected = loader.suiteClass([MyTestCase('test')]) - self.assertEqual(list(suite), [expected]) - - # "The specifier name is a ``dotted name'' that may resolve ... to - # ... 
a TestSuite instance" - def test_loadTestsFromNames__relative_TestSuite(self): - m = types.ModuleType('m') - - class MyTestCase(unittest2.TestCase): - - def test(self): - pass - m.testsuite = unittest2.TestSuite([MyTestCase('test')]) - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromNames(['testsuite'], m) - self.assertIsInstance(suite, loader.suiteClass) - - self.assertEqual(list(suite), [m.testsuite]) - - # "The specifier name is a ``dotted name'' that may resolve ... to ... a - # test method within a test case class" - def test_loadTestsFromNames__relative_testmethod(self): - m = types.ModuleType('m') - - class MyTestCase(unittest2.TestCase): - - def test(self): - pass - m.testcase_1 = MyTestCase - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromNames(['testcase_1.test'], m) - self.assertIsInstance(suite, loader.suiteClass) - - ref_suite = unittest2.TestSuite([MyTestCase('test')]) - self.assertEqual(list(suite), [ref_suite]) - - # "The specifier name is a ``dotted name'' that may resolve ... to ... a - # test method within a test case class" - # - # Does the method gracefully handle names that initially look like they - # resolve to "a test method within a test case class" but don't? - def test_loadTestsFromNames__relative_invalid_testmethod(self): - m = types.ModuleType('m') - - class MyTestCase(unittest2.TestCase): - - def test(self): - pass - m.testcase_1 = MyTestCase - - loader = unittest2.TestLoader() - try: - loader.loadTestsFromNames(['testcase_1.testfoo'], m) - except AttributeError as e: - self.assertEqual( - str(e), "type object 'MyTestCase' has no attribute 'testfoo'") - else: - self.fail("Failed to raise AttributeError") - - # "The specifier name is a ``dotted name'' that may resolve ... to - # ... a callable object which returns a ... 
TestSuite instance" - def test_loadTestsFromNames__callable__TestSuite(self): - m = types.ModuleType('m') - testcase_1 = unittest2.FunctionTestCase(lambda: None) - testcase_2 = unittest2.FunctionTestCase(lambda: None) - - def return_TestSuite(): - return unittest2.TestSuite([testcase_1, testcase_2]) - m.return_TestSuite = return_TestSuite - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromNames(['return_TestSuite'], m) - self.assertIsInstance(suite, loader.suiteClass) - - expected = unittest2.TestSuite([testcase_1, testcase_2]) - self.assertEqual(list(suite), [expected]) - - # "The specifier name is a ``dotted name'' that may resolve ... to - # ... a callable object which returns a TestCase ... instance" - def test_loadTestsFromNames__callable__TestCase_instance(self): - m = types.ModuleType('m') - testcase_1 = unittest2.FunctionTestCase(lambda: None) - - def return_TestCase(): - return testcase_1 - m.return_TestCase = return_TestCase - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromNames(['return_TestCase'], m) - self.assertIsInstance(suite, loader.suiteClass) - - ref_suite = unittest2.TestSuite([testcase_1]) - self.assertEqual(list(suite), [ref_suite]) - - # "The specifier name is a ``dotted name'' that may resolve ... to - # ... a callable object which returns a TestCase or TestSuite instance" - # - # Are staticmethods handled correctly? - def test_loadTestsFromNames__callable__call_staticmethod(self): - m = types.ModuleType('m') - - class Test1(unittest2.TestCase): - - def test(self): - pass - - testcase_1 = Test1('test') - - class Foo(unittest2.TestCase): - - @staticmethod - def foo(): - return testcase_1 - m.Foo = Foo - - loader = unittest2.TestLoader() - suite = loader.loadTestsFromNames(['Foo.foo'], m) - self.assertIsInstance(suite, loader.suiteClass) - - ref_suite = unittest2.TestSuite([testcase_1]) - self.assertEqual(list(suite), [ref_suite]) - - # "The specifier name is a ``dotted name'' that may resolve ... to - # ... 
a callable object which returns a TestCase or TestSuite instance" - # - # What happens when the callable returns something else? - def test_loadTestsFromNames__callable__wrong_type(self): - m = types.ModuleType('m') - - def return_wrong(): - return 6 - m.return_wrong = return_wrong - - loader = unittest2.TestLoader() - try: - loader.loadTestsFromNames(['return_wrong'], m) - except TypeError: - pass - else: - self.fail("TestLoader.loadTestsFromNames failed to raise TypeError") - - # "The specifier can refer to modules and packages which have not been - # imported; they will be imported as a side-effect" - def test_loadTestsFromNames__module_not_loaded(self): - # We're going to try to load this module as a side-effect, so it - # better not be loaded before we try. - # - module_name = 'unittest2.test.dummy' - sys.modules.pop(module_name, None) - - loader = unittest2.TestLoader() - try: - suite = loader.loadTestsFromNames([module_name]) - - self.assertIsInstance(suite, loader.suiteClass) - self.assertEqual(list(suite), [unittest2.TestSuite()]) - - # module should now be loaded, thanks to loadTestsFromName() - self.assertIn(module_name, sys.modules) - finally: - if module_name in sys.modules: - del sys.modules[module_name] - - ################################################################ - # /Tests for TestLoader.loadTestsFromNames() - - # Tests for TestLoader.getTestCaseNames() - ################################################################ - - # "Return a sorted sequence of method names found within testCaseClass" - # - # Test.foobar is defined to make sure getTestCaseNames() respects - # loader.testMethodPrefix - def test_getTestCaseNames(self): - class Test(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - - def foobar(self): pass - - loader = unittest2.TestLoader() - - self.assertEqual(loader.getTestCaseNames(Test), ['test_1', 'test_2']) - - # "Return a sorted sequence of method names found within testCaseClass" - # - # Does 
getTestCaseNames() behave appropriately if no tests are found? - def test_getTestCaseNames__no_tests(self): - class Test(unittest2.TestCase): - - def foobar(self): pass - - loader = unittest2.TestLoader() - - self.assertEqual(loader.getTestCaseNames(Test), []) - - # "Return a sorted sequence of method names found within testCaseClass" - # - # Are not-TestCases handled gracefully? - # - # XXX This should raise a TypeError, not return a list - # - # XXX It's too late in the 2.5 release cycle to fix this, but it should - # probably be revisited for 2.6 - def test_getTestCaseNames__not_a_TestCase(self): - class BadCase(int): - - def test_foo(self): - pass - - loader = unittest2.TestLoader() - names = loader.getTestCaseNames(BadCase) - - self.assertEqual(names, ['test_foo']) - - # "Return a sorted sequence of method names found within testCaseClass" - # - # Make sure inherited names are handled. - # - # TestP.foobar is defined to make sure getTestCaseNames() respects - # loader.testMethodPrefix - def test_getTestCaseNames__inheritance(self): - class TestP(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - - def foobar(self): pass - - class TestC(TestP): - - def test_1(self): pass - - def test_3(self): pass - - loader = unittest2.TestLoader() - - names = ['test_1', 'test_2', 'test_3'] - self.assertEqual(loader.getTestCaseNames(TestC), names) - - ################################################################ - # /Tests for TestLoader.getTestCaseNames() - - # Tests for TestLoader.testMethodPrefix - ################################################################ - - # "String giving the prefix of method names which will be interpreted as - # test methods" - # - # Implicit in the documentation is that testMethodPrefix is respected by - # all loadTestsFrom* methods. 
- def test_testMethodPrefix__loadTestsFromTestCase(self): - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - - def foo_bar(self): pass - - tests_1 = unittest2.TestSuite([Foo('foo_bar')]) - tests_2 = unittest2.TestSuite([Foo('test_1'), Foo('test_2')]) - - loader = unittest2.TestLoader() - loader.testMethodPrefix = 'foo' - self.assertEqual(loader.loadTestsFromTestCase(Foo), tests_1) - - loader.testMethodPrefix = 'test' - self.assertEqual(loader.loadTestsFromTestCase(Foo), tests_2) - - # "String giving the prefix of method names which will be interpreted as - # test methods" - # - # Implicit in the documentation is that testMethodPrefix is respected by - # all loadTestsFrom* methods. - def test_testMethodPrefix__loadTestsFromModule(self): - m = types.ModuleType('m') - - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - - def foo_bar(self): pass - m.Foo = Foo - - tests_1 = [unittest2.TestSuite([Foo('foo_bar')])] - tests_2 = [unittest2.TestSuite([Foo('test_1'), Foo('test_2')])] - - loader = unittest2.TestLoader() - loader.testMethodPrefix = 'foo' - self.assertEqual(list(loader.loadTestsFromModule(m)), tests_1) - - loader.testMethodPrefix = 'test' - self.assertEqual(list(loader.loadTestsFromModule(m)), tests_2) - - # "String giving the prefix of method names which will be interpreted as - # test methods" - # - # Implicit in the documentation is that testMethodPrefix is respected by - # all loadTestsFrom* methods. 
- def test_testMethodPrefix__loadTestsFromName(self): - m = types.ModuleType('m') - - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - - def foo_bar(self): pass - m.Foo = Foo - - tests_1 = unittest2.TestSuite([Foo('foo_bar')]) - tests_2 = unittest2.TestSuite([Foo('test_1'), Foo('test_2')]) - - loader = unittest2.TestLoader() - loader.testMethodPrefix = 'foo' - self.assertEqual(loader.loadTestsFromName('Foo', m), tests_1) - - loader.testMethodPrefix = 'test' - self.assertEqual(loader.loadTestsFromName('Foo', m), tests_2) - - # "String giving the prefix of method names which will be interpreted as - # test methods" - # - # Implicit in the documentation is that testMethodPrefix is respected by - # all loadTestsFrom* methods. - def test_testMethodPrefix__loadTestsFromNames(self): - m = types.ModuleType('m') - - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - - def foo_bar(self): pass - m.Foo = Foo - - tests_1 = unittest2.TestSuite([unittest2.TestSuite([Foo('foo_bar')])]) - tests_2 = unittest2.TestSuite([Foo('test_1'), Foo('test_2')]) - tests_2 = unittest2.TestSuite([tests_2]) - - loader = unittest2.TestLoader() - loader.testMethodPrefix = 'foo' - self.assertEqual(loader.loadTestsFromNames(['Foo'], m), tests_1) - - loader.testMethodPrefix = 'test' - self.assertEqual(loader.loadTestsFromNames(['Foo'], m), tests_2) - - # "The default value is 'test'" - def test_testMethodPrefix__default_value(self): - loader = unittest2.TestLoader() - self.assertTrue(loader.testMethodPrefix == 'test') - - ################################################################ - # /Tests for TestLoader.testMethodPrefix - - # Tests for TestLoader.sortTestMethodsUsing - ################################################################ - - # "Function to be used to compare method names when sorting them in - # getTestCaseNames() and all the loadTestsFromX() methods" - def 
test_sortTestMethodsUsing__loadTestsFromTestCase(self): - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - - loader = unittest2.TestLoader() - loader.sortTestMethodsUsing = unittest2.reversed_cmp_ - - tests = loader.suiteClass([Foo('test_2'), Foo('test_1')]) - self.assertEqual(loader.loadTestsFromTestCase(Foo), tests) - - # "Function to be used to compare method names when sorting them in - # getTestCaseNames() and all the loadTestsFromX() methods" - def test_sortTestMethodsUsing__loadTestsFromModule(self): - m = types.ModuleType('m') - - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - m.Foo = Foo - - loader = unittest2.TestLoader() - loader.sortTestMethodsUsing = unittest2.reversed_cmp_ - - tests = [loader.suiteClass([Foo('test_2'), Foo('test_1')])] - self.assertEqual(list(loader.loadTestsFromModule(m)), tests) - - # "Function to be used to compare method names when sorting them in - # getTestCaseNames() and all the loadTestsFromX() methods" - def test_sortTestMethodsUsing__loadTestsFromName(self): - m = types.ModuleType('m') - - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - m.Foo = Foo - - loader = unittest2.TestLoader() - loader.sortTestMethodsUsing = unittest2.reversed_cmp_ - - tests = loader.suiteClass([Foo('test_2'), Foo('test_1')]) - self.assertEqual(loader.loadTestsFromName('Foo', m), tests) - - # "Function to be used to compare method names when sorting them in - # getTestCaseNames() and all the loadTestsFromX() methods" - def test_sortTestMethodsUsing__loadTestsFromNames(self): - m = types.ModuleType('m') - - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - m.Foo = Foo - - loader = unittest2.TestLoader() - loader.sortTestMethodsUsing = unittest2.reversed_cmp_ - - tests = [loader.suiteClass([Foo('test_2'), Foo('test_1')])] - self.assertEqual(list(loader.loadTestsFromNames(['Foo'], m)), tests) - - # 
"Function to be used to compare method names when sorting them in - # getTestCaseNames()" - # - # Does it actually affect getTestCaseNames()? - def test_sortTestMethodsUsing__getTestCaseNames(self): - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - - loader = unittest2.TestLoader() - loader.sortTestMethodsUsing = unittest2.reversed_cmp_ - - test_names = ['test_2', 'test_1'] - self.assertEqual(loader.getTestCaseNames(Foo), test_names) - - # "The default value is the built-in cmp() function" - def test_sortTestMethodsUsing__default_value(self): - loader = unittest2.TestLoader() - self.assertTrue(loader.sortTestMethodsUsing is unittest2.cmp_) - - # "it can be set to None to disable the sort." - # - # XXX How is this different from reassigning cmp? Are the tests returned - # in a random order or something? This behaviour should die - def test_sortTestMethodsUsing__None(self): - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - - loader = unittest2.TestLoader() - loader.sortTestMethodsUsing = None - - test_names = ['test_2', 'test_1'] - self.assertEqual(set(loader.getTestCaseNames(Foo)), set(test_names)) - - ################################################################ - # /Tests for TestLoader.sortTestMethodsUsing - - # Tests for TestLoader.suiteClass - ################################################################ - - # "Callable object that constructs a test suite from a list of tests." - def test_suiteClass__loadTestsFromTestCase(self): - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - - def foo_bar(self): pass - - tests = [Foo('test_1'), Foo('test_2')] - - loader = unittest2.TestLoader() - loader.suiteClass = list - self.assertEqual(loader.loadTestsFromTestCase(Foo), tests) - - # It is implicit in the documentation for TestLoader.suiteClass that - # all TestLoader.loadTestsFrom* methods respect it. 
Let's make sure - def test_suiteClass__loadTestsFromModule(self): - m = types.ModuleType('m') - - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - - def foo_bar(self): pass - m.Foo = Foo - - tests = [[Foo('test_1'), Foo('test_2')]] - - loader = unittest2.TestLoader() - loader.suiteClass = list - self.assertEqual(loader.loadTestsFromModule(m), tests) - - # It is implicit in the documentation for TestLoader.suiteClass that - # all TestLoader.loadTestsFrom* methods respect it. Let's make sure - def test_suiteClass__loadTestsFromName(self): - m = types.ModuleType('m') - - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - - def foo_bar(self): pass - m.Foo = Foo - - tests = [Foo('test_1'), Foo('test_2')] - - loader = unittest2.TestLoader() - loader.suiteClass = list - self.assertEqual(loader.loadTestsFromName('Foo', m), tests) - - # It is implicit in the documentation for TestLoader.suiteClass that - # all TestLoader.loadTestsFrom* methods respect it. 
Let's make sure - def test_suiteClass__loadTestsFromNames(self): - m = types.ModuleType('m') - - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - - def foo_bar(self): pass - m.Foo = Foo - - tests = [[Foo('test_1'), Foo('test_2')]] - - loader = unittest2.TestLoader() - loader.suiteClass = list - self.assertEqual(loader.loadTestsFromNames(['Foo'], m), tests) - - # "The default value is the TestSuite class" - def test_suiteClass__default_value(self): - loader = unittest2.TestLoader() - self.assertTrue(loader.suiteClass is unittest2.TestSuite) - - -if __name__ == '__main__': - unittest2.main() diff --git a/lldb/third_party/Python/module/unittest2/unittest2/test/test_new_tests.py b/lldb/third_party/Python/module/unittest2/unittest2/test/test_new_tests.py deleted file mode 100644 index 0b456c08927ff..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/test/test_new_tests.py +++ /dev/null @@ -1,52 +0,0 @@ -from cStringIO import StringIO - -import unittest -import unittest2 - -from unittest2.test.support import resultFactory - - -class TestUnittest(unittest2.TestCase): - - def assertIsSubclass(self, actual, klass): - self.assertTrue(issubclass(actual, klass), "Not a subclass.") - - def testInheritance(self): - self.assertIsSubclass(unittest2.TestCase, unittest.TestCase) - self.assertIsSubclass(unittest2.TestResult, unittest.TestResult) - self.assertIsSubclass(unittest2.TestSuite, unittest.TestSuite) - self.assertIsSubclass( - unittest2.TextTestRunner, - unittest.TextTestRunner) - self.assertIsSubclass(unittest2.TestLoader, unittest.TestLoader) - self.assertIsSubclass(unittest2.TextTestResult, unittest.TestResult) - - def test_new_runner_old_case(self): - runner = unittest2.TextTestRunner(resultclass=resultFactory, - stream=StringIO()) - - class Test(unittest.TestCase): - - def testOne(self): - pass - suite = unittest2.TestSuite((Test('testOne'),)) - result = runner.run(suite) - self.assertEqual(result.testsRun, 1) - 
self.assertEqual(len(result.errors), 0) - - def test_old_runner_new_case(self): - runner = unittest.TextTestRunner(stream=StringIO()) - - class Test(unittest2.TestCase): - - def testOne(self): - self.assertDictEqual({}, {}) - - suite = unittest.TestSuite((Test('testOne'),)) - result = runner.run(suite) - self.assertEqual(result.testsRun, 1) - self.assertEqual(len(result.errors), 0) - - -if __name__ == '__main__': - unittest2.main() diff --git a/lldb/third_party/Python/module/unittest2/unittest2/test/test_program.py b/lldb/third_party/Python/module/unittest2/unittest2/test/test_program.py deleted file mode 100644 index eb864871e16f8..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/test/test_program.py +++ /dev/null @@ -1,251 +0,0 @@ -from cStringIO import StringIO - -import sys -import unittest2 - -hasInstallHandler = hasattr(unittest2, 'installHandler') - - -class Test_TestProgram(unittest2.TestCase): - - # Horrible white box test - def testNoExit(self): - result = object() - test = object() - - class FakeRunner(object): - - def run(self, test): - self.test = test - return result - - runner = FakeRunner() - - oldParseArgs = unittest2.TestProgram.parseArgs - - def restoreParseArgs(): - unittest2.TestProgram.parseArgs = oldParseArgs - unittest2.TestProgram.parseArgs = lambda *args: None - self.addCleanup(restoreParseArgs) - - def removeTest(): - del unittest2.TestProgram.test - unittest2.TestProgram.test = test - self.addCleanup(removeTest) - - program = unittest2.TestProgram( - testRunner=runner, exit=False, verbosity=2) - - self.assertEqual(program.result, result) - self.assertEqual(runner.test, test) - self.assertEqual(program.verbosity, 2) - - class FooBar(unittest2.TestCase): - - def testPass(self): - assert True - - def testFail(self): - assert False - - class FooBarLoader(unittest2.TestLoader): - """Test loader that returns a suite containing FooBar.""" - - def loadTestsFromModule(self, module): - return self.suiteClass( - 
[self.loadTestsFromTestCase(Test_TestProgram.FooBar)]) - - def test_NonExit(self): - program = unittest2.main( - exit=False, - argv=["foobar"], - testRunner=unittest2.TextTestRunner( - stream=StringIO()), - testLoader=self.FooBarLoader()) - self.assertTrue(hasattr(program, 'result')) - - def test_Exit(self): - self.assertRaises( - SystemExit, - unittest2.main, - argv=["foobar"], - testRunner=unittest2.TextTestRunner(stream=StringIO()), - exit=True, - testLoader=self.FooBarLoader()) - - def test_ExitAsDefault(self): - self.assertRaises( - SystemExit, - unittest2.main, - argv=["foobar"], - testRunner=unittest2.TextTestRunner(stream=StringIO()), - testLoader=self.FooBarLoader()) - - -class InitialisableProgram(unittest2.TestProgram): - exit = False - result = None - verbosity = 1 - defaultTest = None - testRunner = None - testLoader = unittest2.defaultTestLoader - progName = 'test' - test = 'test' - - def __init__(self, *args): - pass - -RESULT = object() - - -class FakeRunner(object): - initArgs = None - test = None - raiseError = False - - def __init__(self, **kwargs): - FakeRunner.initArgs = kwargs - if FakeRunner.raiseError: - FakeRunner.raiseError = False - raise TypeError - - def run(self, test): - FakeRunner.test = test - return RESULT - - -class TestCommandLineArgs(unittest2.TestCase): - - def setUp(self): - self.program = InitialisableProgram() - self.program.createTests = lambda: None - FakeRunner.initArgs = None - FakeRunner.test = None - FakeRunner.raiseError = False - - def testHelpAndUnknown(self): - program = self.program - - def usageExit(msg=None): - program.msg = msg - program.exit = True - program.usageExit = usageExit - - for opt in '-h', '-H', '--help': - program.exit = False - program.parseArgs([None, opt]) - self.assertTrue(program.exit) - self.assertIsNone(program.msg) - - program.parseArgs([None, '-$']) - self.assertTrue(program.exit) - self.assertIsNotNone(program.msg) - - def testVerbosity(self): - program = self.program - - for opt in '-q', 
'--quiet': - program.verbosity = 1 - program.parseArgs([None, opt]) - self.assertEqual(program.verbosity, 0) - - for opt in '-v', '--verbose': - program.verbosity = 1 - program.parseArgs([None, opt]) - self.assertEqual(program.verbosity, 2) - - def testBufferCatchFailfast(self): - program = self.program - for arg, attr in (('buffer', 'buffer'), ('failfast', 'failfast'), - ('catch', 'catchbreak')): - if attr == 'catch' and not hasInstallHandler: - continue - - short_opt = '-%s' % arg[0] - long_opt = '--%s' % arg - for opt in short_opt, long_opt: - setattr(program, attr, None) - - program.parseArgs([None, opt]) - self.assertTrue(getattr(program, attr)) - - for opt in short_opt, long_opt: - not_none = object() - setattr(program, attr, not_none) - - program.parseArgs([None, opt]) - self.assertEqual(getattr(program, attr), not_none) - - def testRunTestsRunnerClass(self): - program = self.program - - program.testRunner = FakeRunner - program.verbosity = 'verbosity' - program.failfast = 'failfast' - program.buffer = 'buffer' - - program.runTests() - - self.assertEqual(FakeRunner.initArgs, {'verbosity': 'verbosity', - 'failfast': 'failfast', - 'buffer': 'buffer'}) - self.assertEqual(FakeRunner.test, 'test') - self.assertIs(program.result, RESULT) - - def testRunTestsRunnerInstance(self): - program = self.program - - program.testRunner = FakeRunner() - FakeRunner.initArgs = None - - program.runTests() - - # A new FakeRunner should not have been instantiated - self.assertIsNone(FakeRunner.initArgs) - - self.assertEqual(FakeRunner.test, 'test') - self.assertIs(program.result, RESULT) - - def testRunTestsOldRunnerClass(self): - program = self.program - - FakeRunner.raiseError = True - program.testRunner = FakeRunner - program.verbosity = 'verbosity' - program.failfast = 'failfast' - program.buffer = 'buffer' - program.test = 'test' - - program.runTests() - - # If initialising raises a type error it should be retried - # without the new keyword arguments - 
self.assertEqual(FakeRunner.initArgs, {}) - self.assertEqual(FakeRunner.test, 'test') - self.assertIs(program.result, RESULT) - - def testCatchBreakInstallsHandler(self): - module = sys.modules['unittest2.main'] - original = module.installHandler - - def restore(): - module.installHandler = original - self.addCleanup(restore) - - self.installed = False - - def fakeInstallHandler(): - self.installed = True - module.installHandler = fakeInstallHandler - - program = self.program - program.catchbreak = True - - program.testRunner = FakeRunner - - program.runTests() - self.assertTrue(self.installed) - - -if __name__ == '__main__': - unittest2.main() diff --git a/lldb/third_party/Python/module/unittest2/unittest2/test/test_result.py b/lldb/third_party/Python/module/unittest2/unittest2/test/test_result.py deleted file mode 100644 index 9b64e8259558d..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/test/test_result.py +++ /dev/null @@ -1,426 +0,0 @@ -import sys -import textwrap -from StringIO import StringIO - -import unittest2 - - -class Test_TestResult(unittest2.TestCase): - # Note: there are not separate tests for TestResult.wasSuccessful(), - # TestResult.errors, TestResult.failures, TestResult.testsRun or - # TestResult.shouldStop because these only have meaning in terms of - # other TestResult methods. - # - # Accordingly, tests for the aforenamed attributes are incorporated - # in with the tests for the defining methods. 
- ################################################################ - - def test_init(self): - result = unittest2.TestResult() - - self.assertTrue(result.wasSuccessful()) - self.assertEqual(len(result.errors), 0) - self.assertEqual(len(result.failures), 0) - self.assertEqual(result.testsRun, 0) - self.assertEqual(result.shouldStop, False) - self.assertIsNone(result._stdout_buffer) - self.assertIsNone(result._stderr_buffer) - - # "This method can be called to signal that the set of tests being - # run should be aborted by setting the TestResult's shouldStop - # attribute to True." - def test_stop(self): - result = unittest2.TestResult() - - result.stop() - - self.assertEqual(result.shouldStop, True) - - # "Called when the test case test is about to be run. The default - # implementation simply increments the instance's testsRun counter." - def test_startTest(self): - class Foo(unittest2.TestCase): - - def test_1(self): - pass - - test = Foo('test_1') - - result = unittest2.TestResult() - - result.startTest(test) - - self.assertTrue(result.wasSuccessful()) - self.assertEqual(len(result.errors), 0) - self.assertEqual(len(result.failures), 0) - self.assertEqual(result.testsRun, 1) - self.assertEqual(result.shouldStop, False) - - result.stopTest(test) - - # "Called after the test case test has been executed, regardless of - # the outcome. The default implementation does nothing." 
- def test_stopTest(self): - class Foo(unittest2.TestCase): - - def test_1(self): - pass - - test = Foo('test_1') - - result = unittest2.TestResult() - - result.startTest(test) - - self.assertTrue(result.wasSuccessful()) - self.assertEqual(len(result.errors), 0) - self.assertEqual(len(result.failures), 0) - self.assertEqual(result.testsRun, 1) - self.assertEqual(result.shouldStop, False) - - result.stopTest(test) - - # Same tests as above; make sure nothing has changed - self.assertTrue(result.wasSuccessful()) - self.assertEqual(len(result.errors), 0) - self.assertEqual(len(result.failures), 0) - self.assertEqual(result.testsRun, 1) - self.assertEqual(result.shouldStop, False) - - # "Called before and after tests are run. The default implementation does nothing." - def test_startTestRun_stopTestRun(self): - result = unittest2.TestResult() - result.startTestRun() - result.stopTestRun() - - # "addSuccess(test)" - # ... - # "Called when the test case test succeeds" - # ... - # "wasSuccessful() - Returns True if all tests run so far have passed, - # otherwise returns False" - # ... - # "testsRun - The total number of tests run so far." - # ... - # "errors - A list containing 2-tuples of TestCase instances and - # formatted tracebacks. Each tuple represents a test which raised an - # unexpected exception. Contains formatted - # tracebacks instead of sys.exc_info() results." - # ... - # "failures - A list containing 2-tuples of TestCase instances and - # formatted tracebacks. Each tuple represents a test where a failure was - # explicitly signalled using the TestCase.fail*() or TestCase.assert*() - # methods. Contains formatted tracebacks instead - # of sys.exc_info() results." 
- def test_addSuccess(self): - class Foo(unittest2.TestCase): - - def test_1(self): - pass - - test = Foo('test_1') - - result = unittest2.TestResult() - - result.startTest(test) - result.addSuccess(test) - result.stopTest(test) - - self.assertTrue(result.wasSuccessful()) - self.assertEqual(len(result.errors), 0) - self.assertEqual(len(result.failures), 0) - self.assertEqual(result.testsRun, 1) - self.assertEqual(result.shouldStop, False) - - # "addFailure(test, err)" - # ... - # "Called when the test case test signals a failure. err is a tuple of - # the form returned by sys.exc_info(): (type, value, traceback)" - # ... - # "wasSuccessful() - Returns True if all tests run so far have passed, - # otherwise returns False" - # ... - # "testsRun - The total number of tests run so far." - # ... - # "errors - A list containing 2-tuples of TestCase instances and - # formatted tracebacks. Each tuple represents a test which raised an - # unexpected exception. Contains formatted - # tracebacks instead of sys.exc_info() results." - # ... - # "failures - A list containing 2-tuples of TestCase instances and - # formatted tracebacks. Each tuple represents a test where a failure was - # explicitly signalled using the TestCase.fail*() or TestCase.assert*() - # methods. Contains formatted tracebacks instead - # of sys.exc_info() results." 
- def test_addFailure(self): - class Foo(unittest2.TestCase): - - def test_1(self): - pass - - test = Foo('test_1') - try: - test.fail("foo") - except: - exc_info_tuple = sys.exc_info() - - result = unittest2.TestResult() - - result.startTest(test) - result.addFailure(test, exc_info_tuple) - result.stopTest(test) - - self.assertFalse(result.wasSuccessful()) - self.assertEqual(len(result.errors), 0) - self.assertEqual(len(result.failures), 1) - self.assertEqual(result.testsRun, 1) - self.assertEqual(result.shouldStop, False) - - test_case, formatted_exc = result.failures[0] - self.assertTrue(test_case is test) - self.assertIsInstance(formatted_exc, str) - - # "addError(test, err)" - # ... - # "Called when the test case test raises an unexpected exception err - # is a tuple of the form returned by sys.exc_info(): - # (type, value, traceback)" - # ... - # "wasSuccessful() - Returns True if all tests run so far have passed, - # otherwise returns False" - # ... - # "testsRun - The total number of tests run so far." - # ... - # "errors - A list containing 2-tuples of TestCase instances and - # formatted tracebacks. Each tuple represents a test which raised an - # unexpected exception. Contains formatted - # tracebacks instead of sys.exc_info() results." - # ... - # "failures - A list containing 2-tuples of TestCase instances and - # formatted tracebacks. Each tuple represents a test where a failure was - # explicitly signalled using the TestCase.fail*() or TestCase.assert*() - # methods. Contains formatted tracebacks instead - # of sys.exc_info() results." 
- def test_addError(self): - class Foo(unittest2.TestCase): - - def test_1(self): - pass - - test = Foo('test_1') - try: - raise TypeError() - except: - exc_info_tuple = sys.exc_info() - - result = unittest2.TestResult() - - result.startTest(test) - result.addError(test, exc_info_tuple) - result.stopTest(test) - - self.assertFalse(result.wasSuccessful()) - self.assertEqual(len(result.errors), 1) - self.assertEqual(len(result.failures), 0) - self.assertEqual(result.testsRun, 1) - self.assertEqual(result.shouldStop, False) - - test_case, formatted_exc = result.errors[0] - self.assertTrue(test_case is test) - self.assertIsInstance(formatted_exc, str) - - def testGetDescriptionWithoutDocstring(self): - result = unittest2.TextTestResult(None, True, 1) - self.assertEqual( - result.getDescription(self), - 'testGetDescriptionWithoutDocstring (' + __name__ + - '.Test_TestResult)') - - def testGetDescriptionWithOneLineDocstring(self): - """Tests getDescription() for a method with a docstring.""" - result = unittest2.TextTestResult(None, True, 1) - self.assertEqual( - result.getDescription(self), - ('testGetDescriptionWithOneLineDocstring ' - '(' + __name__ + '.Test_TestResult)\n' - 'Tests getDescription() for a method with a docstring.')) - - def testGetDescriptionWithMultiLineDocstring(self): - """Tests getDescription() for a method with a longer docstring. - The second line of the docstring. 
- """ - result = unittest2.TextTestResult(None, True, 1) - self.assertEqual( - result.getDescription(self), - ('testGetDescriptionWithMultiLineDocstring ' - '(' + __name__ + '.Test_TestResult)\n' - 'Tests getDescription() for a method with a longer ' - 'docstring.')) - - def testStackFrameTrimming(self): - class Frame(object): - - class tb_frame(object): - f_globals = {} - result = unittest2.TestResult() - self.assertFalse(result._is_relevant_tb_level(Frame)) - - Frame.tb_frame.f_globals['__unittest'] = True - self.assertTrue(result._is_relevant_tb_level(Frame)) - - def testFailFast(self): - result = unittest2.TestResult() - result._exc_info_to_string = lambda *_: '' - result.failfast = True - result.addError(None, None) - self.assertTrue(result.shouldStop) - - result = unittest2.TestResult() - result._exc_info_to_string = lambda *_: '' - result.failfast = True - result.addFailure(None, None) - self.assertTrue(result.shouldStop) - - result = unittest2.TestResult() - result._exc_info_to_string = lambda *_: '' - result.failfast = True - result.addUnexpectedSuccess(None) - self.assertTrue(result.shouldStop) - - def testFailFastSetByRunner(self): - runner = unittest2.TextTestRunner(stream=StringIO(), failfast=True) - self.testRan = False - - def test(result): - self.testRan = True - self.assertTrue(result.failfast) - runner.run(test) - self.assertTrue(self.testRan) - - -class TestOutputBuffering(unittest2.TestCase): - - def setUp(self): - self._real_out = sys.stdout - self._real_err = sys.stderr - - def tearDown(self): - sys.stdout = self._real_out - sys.stderr = self._real_err - - def testBufferOutputOff(self): - real_out = self._real_out - real_err = self._real_err - - result = unittest2.TestResult() - self.assertFalse(result.buffer) - - self.assertIs(real_out, sys.stdout) - self.assertIs(real_err, sys.stderr) - - result.startTest(self) - - self.assertIs(real_out, sys.stdout) - self.assertIs(real_err, sys.stderr) - - def testBufferOutputStartTestAddSuccess(self): - 
real_out = self._real_out - real_err = self._real_err - - result = unittest2.TestResult() - self.assertFalse(result.buffer) - - result.buffer = True - - self.assertIs(real_out, sys.stdout) - self.assertIs(real_err, sys.stderr) - - result.startTest(self) - - self.assertIsNot(real_out, sys.stdout) - self.assertIsNot(real_err, sys.stderr) - self.assertIsInstance(sys.stdout, StringIO) - self.assertIsInstance(sys.stderr, StringIO) - self.assertIsNot(sys.stdout, sys.stderr) - - out_stream = sys.stdout - err_stream = sys.stderr - - result._original_stdout = StringIO() - result._original_stderr = StringIO() - - print('foo') - print('bar', file=sys.stderr) - - self.assertEqual(out_stream.getvalue(), 'foo\n') - self.assertEqual(err_stream.getvalue(), 'bar\n') - - self.assertEqual(result._original_stdout.getvalue(), '') - self.assertEqual(result._original_stderr.getvalue(), '') - - result.addSuccess(self) - result.stopTest(self) - - self.assertIs(sys.stdout, result._original_stdout) - self.assertIs(sys.stderr, result._original_stderr) - - self.assertEqual(result._original_stdout.getvalue(), '') - self.assertEqual(result._original_stderr.getvalue(), '') - - self.assertEqual(out_stream.getvalue(), '') - self.assertEqual(err_stream.getvalue(), '') - - def getStartedResult(self): - result = unittest2.TestResult() - result.buffer = True - result.startTest(self) - return result - - def testBufferOutputAddErrorOrFailure(self): - for message_attr, add_attr, include_error in [ - ('errors', 'addError', True), - ('failures', 'addFailure', False), - ('errors', 'addError', True), - ('failures', 'addFailure', False) - ]: - result = self.getStartedResult() - result._original_stderr = StringIO() - result._original_stdout = StringIO() - - print('foo') - if include_error: - print('bar', file=sys.stderr) - - addFunction = getattr(result, add_attr) - addFunction(self, (None, None, None)) - result.stopTest(self) - - result_list = getattr(result, message_attr) - self.assertEqual(len(result_list), 
1) - - test, message = result_list[0] - expectedOutMessage = textwrap.dedent(""" - Stdout: - foo - """) - expectedErrMessage = '' - if include_error: - expectedErrMessage = textwrap.dedent(""" - Stderr: - bar - """) - expectedFullMessage = 'None\n%s%s' % ( - expectedOutMessage, expectedErrMessage) - - self.assertIs(test, self) - self.assertEqual( - result._original_stdout.getvalue(), - expectedOutMessage) - self.assertEqual( - result._original_stderr.getvalue(), - expectedErrMessage) - self.assertMultiLineEqual(message, expectedFullMessage) - - -if __name__ == '__main__': - unittest2.main() diff --git a/lldb/third_party/Python/module/unittest2/unittest2/test/test_runner.py b/lldb/third_party/Python/module/unittest2/unittest2/test/test_runner.py deleted file mode 100644 index cfdf1326fa390..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/test/test_runner.py +++ /dev/null @@ -1,136 +0,0 @@ -import pickle - -from cStringIO import StringIO -from unittest2.test.support import LoggingResult, OldTestResult - -import unittest2 - - -class Test_TextTestRunner(unittest2.TestCase): - """Tests for TextTestRunner.""" - - def test_init(self): - runner = unittest2.TextTestRunner() - self.assertFalse(runner.failfast) - self.assertFalse(runner.buffer) - self.assertEqual(runner.verbosity, 1) - self.assertTrue(runner.descriptions) - self.assertEqual(runner.resultclass, unittest2.TextTestResult) - - def testBufferAndFailfast(self): - class Test(unittest2.TestCase): - - def testFoo(self): - pass - result = unittest2.TestResult() - runner = unittest2.TextTestRunner(stream=StringIO(), failfast=True, - buffer=True) - # Use our result object - runner._makeResult = lambda: result - runner.run(Test('testFoo')) - - self.assertTrue(result.failfast) - self.assertTrue(result.buffer) - - def testRunnerRegistersResult(self): - class Test(unittest2.TestCase): - - def testFoo(self): - pass - originalRegisterResult = unittest2.runner.registerResult - - def cleanup(): - 
unittest2.runner.registerResult = originalRegisterResult - self.addCleanup(cleanup) - - result = unittest2.TestResult() - runner = unittest2.TextTestRunner(stream=StringIO()) - # Use our result object - runner._makeResult = lambda: result - - self.wasRegistered = 0 - - def fakeRegisterResult(thisResult): - self.wasRegistered += 1 - self.assertEqual(thisResult, result) - unittest2.runner.registerResult = fakeRegisterResult - - runner.run(unittest2.TestSuite()) - self.assertEqual(self.wasRegistered, 1) - - def test_works_with_result_without_startTestRun_stopTestRun(self): - class OldTextResult(OldTestResult): - - def __init__(self, *_): - super(OldTextResult, self).__init__() - separator2 = '' - - def printErrors(self): - pass - - runner = unittest2.TextTestRunner(stream=StringIO(), - resultclass=OldTextResult) - runner.run(unittest2.TestSuite()) - - def test_startTestRun_stopTestRun_called(self): - class LoggingTextResult(LoggingResult): - separator2 = '' - - def printErrors(self): - pass - - class LoggingRunner(unittest2.TextTestRunner): - - def __init__(self, events): - super(LoggingRunner, self).__init__(StringIO()) - self._events = events - - def _makeResult(self): - return LoggingTextResult(self._events) - - events = [] - runner = LoggingRunner(events) - runner.run(unittest2.TestSuite()) - expected = ['startTestRun', 'stopTestRun'] - self.assertEqual(events, expected) - - def test_pickle_unpickle(self): - # Issue #7197: a TextTestRunner should be (un)pickleable. This is - # required by test_multiprocessing under Windows (in verbose mode). - import StringIO - # cStringIO objects are not pickleable, but StringIO objects are. - stream = StringIO.StringIO("foo") - runner = unittest2.TextTestRunner(stream) - for protocol in range(pickle.HIGHEST_PROTOCOL + 1): - s = pickle.dumps(runner, protocol=protocol) - obj = pickle.loads(s) - # StringIO objects never compare equal, a cheap test instead. 
- self.assertEqual(obj.stream.getvalue(), stream.getvalue()) - - def test_resultclass(self): - def MockResultClass(*args): - return args - STREAM = object() - DESCRIPTIONS = object() - VERBOSITY = object() - runner = unittest2.TextTestRunner(STREAM, DESCRIPTIONS, VERBOSITY, - resultclass=MockResultClass) - self.assertEqual(runner.resultclass, MockResultClass) - - expectedresult = (runner.stream, DESCRIPTIONS, VERBOSITY) - self.assertEqual(runner._makeResult(), expectedresult) - - def test_oldresult(self): - class Test(unittest2.TestCase): - - def testFoo(self): - pass - runner = unittest2.TextTestRunner(resultclass=OldTestResult, - stream=StringIO()) - # This will raise an exception if TextTestRunner can't handle old - # test result objects - runner.run(Test('testFoo')) - - -if __name__ == '__main__': - unittest2.main() diff --git a/lldb/third_party/Python/module/unittest2/unittest2/test/test_setups.py b/lldb/third_party/Python/module/unittest2/unittest2/test/test_setups.py deleted file mode 100644 index 73526894aaae9..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/test/test_setups.py +++ /dev/null @@ -1,596 +0,0 @@ -import sys - -from cStringIO import StringIO - -import unittest2 -from unittest2.test.support import resultFactory - - -class TestSetups(unittest2.TestCase): - - def getRunner(self): - return unittest2.TextTestRunner(resultclass=resultFactory, - stream=StringIO()) - - def runTests(self, *cases): - suite = unittest2.TestSuite() - for case in cases: - tests = unittest2.defaultTestLoader.loadTestsFromTestCase(case) - suite.addTests(tests) - - runner = self.getRunner() - - # creating a nested suite exposes some potential bugs - realSuite = unittest2.TestSuite() - realSuite.addTest(suite) - # adding empty suites to the end exposes potential bugs - suite.addTest(unittest2.TestSuite()) - realSuite.addTest(unittest2.TestSuite()) - return runner.run(realSuite) - - def test_setup_class(self): - class Test(unittest2.TestCase): - 
setUpCalled = 0 - - @classmethod - def setUpClass(cls): - Test.setUpCalled += 1 - unittest2.TestCase.setUpClass() - - def test_one(self): - pass - - def test_two(self): - pass - - result = self.runTests(Test) - - self.assertEqual(Test.setUpCalled, 1) - self.assertEqual(result.testsRun, 2) - self.assertEqual(len(result.errors), 0) - - def test_teardown_class(self): - class Test(unittest2.TestCase): - tearDownCalled = 0 - - @classmethod - def tearDownClass(cls): - Test.tearDownCalled += 1 - unittest2.TestCase.tearDownClass() - - def test_one(self): - pass - - def test_two(self): - pass - - result = self.runTests(Test) - - self.assertEqual(Test.tearDownCalled, 1) - self.assertEqual(result.testsRun, 2) - self.assertEqual(len(result.errors), 0) - - def test_teardown_class_two_classes(self): - class Test(unittest2.TestCase): - tearDownCalled = 0 - - @classmethod - def tearDownClass(cls): - Test.tearDownCalled += 1 - unittest2.TestCase.tearDownClass() - - def test_one(self): - pass - - def test_two(self): - pass - - class Test2(unittest2.TestCase): - tearDownCalled = 0 - - @classmethod - def tearDownClass(cls): - Test2.tearDownCalled += 1 - unittest2.TestCase.tearDownClass() - - def test_one(self): - pass - - def test_two(self): - pass - - result = self.runTests(Test, Test2) - - self.assertEqual(Test.tearDownCalled, 1) - self.assertEqual(Test2.tearDownCalled, 1) - self.assertEqual(result.testsRun, 4) - self.assertEqual(len(result.errors), 0) - - def test_error_in_setupclass(self): - class BrokenTest(unittest2.TestCase): - - @classmethod - def setUpClass(cls): - raise TypeError('foo') - - def test_one(self): - pass - - def test_two(self): - pass - - result = self.runTests(BrokenTest) - - self.assertEqual(result.testsRun, 0) - self.assertEqual(len(result.errors), 1) - error, _ = result.errors[0] - self.assertEqual(str(error), - 'setUpClass (%s.BrokenTest)' % __name__) - - def test_error_in_teardown_class(self): - class Test(unittest2.TestCase): - tornDown = 0 - - 
@classmethod - def tearDownClass(cls): - Test.tornDown += 1 - raise TypeError('foo') - - def test_one(self): - pass - - def test_two(self): - pass - - class Test2(unittest2.TestCase): - tornDown = 0 - - @classmethod - def tearDownClass(cls): - Test2.tornDown += 1 - raise TypeError('foo') - - def test_one(self): - pass - - def test_two(self): - pass - - result = self.runTests(Test, Test2) - self.assertEqual(result.testsRun, 4) - self.assertEqual(len(result.errors), 2) - self.assertEqual(Test.tornDown, 1) - self.assertEqual(Test2.tornDown, 1) - - error, _ = result.errors[0] - self.assertEqual(str(error), - 'tearDownClass (%s.Test)' % __name__) - - def test_class_not_torndown_when_setup_fails(self): - class Test(unittest2.TestCase): - tornDown = False - - @classmethod - def setUpClass(cls): - raise TypeError - - @classmethod - def tearDownClass(cls): - Test.tornDown = True - raise TypeError('foo') - - def test_one(self): - pass - - self.runTests(Test) - self.assertFalse(Test.tornDown) - - def test_class_not_setup_or_torndown_when_skipped(self): - class Test(unittest2.TestCase): - classSetUp = False - tornDown = False - - @classmethod - def setUpClass(cls): - Test.classSetUp = True - - @classmethod - def tearDownClass(cls): - Test.tornDown = True - - def test_one(self): - pass - - Test = unittest2.skip("hop")(Test) - self.runTests(Test) - self.assertFalse(Test.classSetUp) - self.assertFalse(Test.tornDown) - - def test_setup_teardown_order_with_pathological_suite(self): - results = [] - - class Module1(object): - - @staticmethod - def setUpModule(): - results.append('Module1.setUpModule') - - @staticmethod - def tearDownModule(): - results.append('Module1.tearDownModule') - - class Module2(object): - - @staticmethod - def setUpModule(): - results.append('Module2.setUpModule') - - @staticmethod - def tearDownModule(): - results.append('Module2.tearDownModule') - - class Test1(unittest2.TestCase): - - @classmethod - def setUpClass(cls): - results.append('setup 1') - - 
@classmethod - def tearDownClass(cls): - results.append('teardown 1') - - def testOne(self): - results.append('Test1.testOne') - - def testTwo(self): - results.append('Test1.testTwo') - - class Test2(unittest2.TestCase): - - @classmethod - def setUpClass(cls): - results.append('setup 2') - - @classmethod - def tearDownClass(cls): - results.append('teardown 2') - - def testOne(self): - results.append('Test2.testOne') - - def testTwo(self): - results.append('Test2.testTwo') - - class Test3(unittest2.TestCase): - - @classmethod - def setUpClass(cls): - results.append('setup 3') - - @classmethod - def tearDownClass(cls): - results.append('teardown 3') - - def testOne(self): - results.append('Test3.testOne') - - def testTwo(self): - results.append('Test3.testTwo') - - Test1.__module__ = Test2.__module__ = 'Module' - Test3.__module__ = 'Module2' - sys.modules['Module'] = Module1 - sys.modules['Module2'] = Module2 - - first = unittest2.TestSuite((Test1('testOne'),)) - second = unittest2.TestSuite((Test1('testTwo'),)) - third = unittest2.TestSuite((Test2('testOne'),)) - fourth = unittest2.TestSuite((Test2('testTwo'),)) - fifth = unittest2.TestSuite((Test3('testOne'),)) - sixth = unittest2.TestSuite((Test3('testTwo'),)) - suite = unittest2.TestSuite( - (first, second, third, fourth, fifth, sixth)) - - runner = self.getRunner() - result = runner.run(suite) - self.assertEqual(result.testsRun, 6) - self.assertEqual(len(result.errors), 0) - - self.assertEqual(results, - ['Module1.setUpModule', 'setup 1', - 'Test1.testOne', 'Test1.testTwo', 'teardown 1', - 'setup 2', 'Test2.testOne', 'Test2.testTwo', - 'teardown 2', 'Module1.tearDownModule', - 'Module2.setUpModule', 'setup 3', - 'Test3.testOne', 'Test3.testTwo', - 'teardown 3', 'Module2.tearDownModule']) - - def test_setup_module(self): - class Module(object): - moduleSetup = 0 - - @staticmethod - def setUpModule(): - Module.moduleSetup += 1 - - class Test(unittest2.TestCase): - - def test_one(self): - pass - - def 
test_two(self): - pass - Test.__module__ = 'Module' - sys.modules['Module'] = Module - - result = self.runTests(Test) - self.assertEqual(Module.moduleSetup, 1) - self.assertEqual(result.testsRun, 2) - self.assertEqual(len(result.errors), 0) - - def test_error_in_setup_module(self): - class Module(object): - moduleSetup = 0 - moduleTornDown = 0 - - @staticmethod - def setUpModule(): - Module.moduleSetup += 1 - raise TypeError('foo') - - @staticmethod - def tearDownModule(): - Module.moduleTornDown += 1 - - class Test(unittest2.TestCase): - classSetUp = False - classTornDown = False - - @classmethod - def setUpClass(cls): - Test.classSetUp = True - - @classmethod - def tearDownClass(cls): - Test.classTornDown = True - - def test_one(self): - pass - - def test_two(self): - pass - - class Test2(unittest2.TestCase): - - def test_one(self): - pass - - def test_two(self): - pass - Test.__module__ = 'Module' - Test2.__module__ = 'Module' - sys.modules['Module'] = Module - - result = self.runTests(Test, Test2) - self.assertEqual(Module.moduleSetup, 1) - self.assertEqual(Module.moduleTornDown, 0) - self.assertEqual(result.testsRun, 0) - self.assertFalse(Test.classSetUp) - self.assertFalse(Test.classTornDown) - self.assertEqual(len(result.errors), 1) - error, _ = result.errors[0] - self.assertEqual(str(error), 'setUpModule (Module)') - - def test_testcase_with_missing_module(self): - class Test(unittest2.TestCase): - - def test_one(self): - pass - - def test_two(self): - pass - Test.__module__ = 'Module' - sys.modules.pop('Module', None) - - result = self.runTests(Test) - self.assertEqual(result.testsRun, 2) - - def test_teardown_module(self): - class Module(object): - moduleTornDown = 0 - - @staticmethod - def tearDownModule(): - Module.moduleTornDown += 1 - - class Test(unittest2.TestCase): - - def test_one(self): - pass - - def test_two(self): - pass - Test.__module__ = 'Module' - sys.modules['Module'] = Module - - result = self.runTests(Test) - 
self.assertEqual(Module.moduleTornDown, 1) - self.assertEqual(result.testsRun, 2) - self.assertEqual(len(result.errors), 0) - - def test_error_in_teardown_module(self): - class Module(object): - moduleTornDown = 0 - - @staticmethod - def tearDownModule(): - Module.moduleTornDown += 1 - raise TypeError('foo') - - class Test(unittest2.TestCase): - classSetUp = False - classTornDown = False - - @classmethod - def setUpClass(cls): - Test.classSetUp = True - - @classmethod - def tearDownClass(cls): - Test.classTornDown = True - - def test_one(self): - pass - - def test_two(self): - pass - - class Test2(unittest2.TestCase): - - def test_one(self): - pass - - def test_two(self): - pass - Test.__module__ = 'Module' - Test2.__module__ = 'Module' - sys.modules['Module'] = Module - - result = self.runTests(Test, Test2) - self.assertEqual(Module.moduleTornDown, 1) - self.assertEqual(result.testsRun, 4) - self.assertTrue(Test.classSetUp) - self.assertTrue(Test.classTornDown) - self.assertEqual(len(result.errors), 1) - error, _ = result.errors[0] - self.assertEqual(str(error), 'tearDownModule (Module)') - - def test_skiptest_in_setupclass(self): - class Test(unittest2.TestCase): - - @classmethod - def setUpClass(cls): - raise unittest2.SkipTest('foo') - - def test_one(self): - pass - - def test_two(self): - pass - - result = self.runTests(Test) - self.assertEqual(result.testsRun, 0) - self.assertEqual(len(result.errors), 0) - self.assertEqual(len(result.skipped), 1) - skipped = result.skipped[0][0] - self.assertEqual(str(skipped), 'setUpClass (%s.Test)' % __name__) - - def test_skiptest_in_setupmodule(self): - class Test(unittest2.TestCase): - - def test_one(self): - pass - - def test_two(self): - pass - - class Module(object): - - @staticmethod - def setUpModule(): - raise unittest2.SkipTest('foo') - - Test.__module__ = 'Module' - sys.modules['Module'] = Module - - result = self.runTests(Test) - self.assertEqual(result.testsRun, 0) - self.assertEqual(len(result.errors), 0) - 
self.assertEqual(len(result.skipped), 1) - skipped = result.skipped[0][0] - self.assertEqual(str(skipped), 'setUpModule (Module)') - - def test_suite_debug_executes_setups_and_teardowns(self): - ordering = [] - - class Module(object): - - @staticmethod - def setUpModule(): - ordering.append('setUpModule') - - @staticmethod - def tearDownModule(): - ordering.append('tearDownModule') - - class Test(unittest2.TestCase): - - @classmethod - def setUpClass(cls): - ordering.append('setUpClass') - - @classmethod - def tearDownClass(cls): - ordering.append('tearDownClass') - - def test_something(self): - ordering.append('test_something') - - Test.__module__ = 'Module' - sys.modules['Module'] = Module - - suite = unittest2.defaultTestLoader.loadTestsFromTestCase(Test) - suite.debug() - expectedOrder = [ - 'setUpModule', - 'setUpClass', - 'test_something', - 'tearDownClass', - 'tearDownModule'] - self.assertEqual(ordering, expectedOrder) - - def test_suite_debug_propagates_exceptions(self): - class Module(object): - - @staticmethod - def setUpModule(): - if phase == 0: - raise Exception('setUpModule') - - @staticmethod - def tearDownModule(): - if phase == 1: - raise Exception('tearDownModule') - - class Test(unittest2.TestCase): - - @classmethod - def setUpClass(cls): - if phase == 2: - raise Exception('setUpClass') - - @classmethod - def tearDownClass(cls): - if phase == 3: - raise Exception('tearDownClass') - - def test_something(self): - if phase == 4: - raise Exception('test_something') - - Test.__module__ = 'Module' - sys.modules['Module'] = Module - - _suite = unittest2.defaultTestLoader.loadTestsFromTestCase(Test) - suite = unittest2.TestSuite() - - # nesting a suite again exposes a bug in the initial implementation - suite.addTest(_suite) - messages = ( - 'setUpModule', - 'tearDownModule', - 'setUpClass', - 'tearDownClass', - 'test_something') - for phase, msg in enumerate(messages): - self.assertRaisesRegexp(Exception, msg, suite.debug) diff --git 
a/lldb/third_party/Python/module/unittest2/unittest2/test/test_skipping.py b/lldb/third_party/Python/module/unittest2/unittest2/test/test_skipping.py deleted file mode 100644 index 6ad12e988cb4d..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/test/test_skipping.py +++ /dev/null @@ -1,154 +0,0 @@ -from unittest2.test.support import LoggingResult - -import unittest2 - - -class Test_TestSkipping(unittest2.TestCase): - - def test_skipping(self): - class Foo(unittest2.TestCase): - - def test_skip_me(self): - self.skipTest("skip") - events = [] - result = LoggingResult(events) - test = Foo("test_skip_me") - test.run(result) - self.assertEqual(events, ['startTest', 'addSkip', 'stopTest']) - self.assertEqual(result.skipped, [(test, "skip")]) - - # Try letting setUp skip the test now. - class Foo(unittest2.TestCase): - - def setUp(self): - self.skipTest("testing") - - def test_nothing(self): pass - events = [] - result = LoggingResult(events) - test = Foo("test_nothing") - test.run(result) - self.assertEqual(events, ['startTest', 'addSkip', 'stopTest']) - self.assertEqual(result.skipped, [(test, "testing")]) - self.assertEqual(result.testsRun, 1) - - def test_skipping_decorators(self): - op_table = ((unittest2.skipUnless, False, True), - (unittest2.skipIf, True, False)) - for deco, do_skip, dont_skip in op_table: - class Foo(unittest2.TestCase): - - @deco(do_skip, "testing") - def test_skip(self): - pass - - @deco(dont_skip, "testing") - def test_dont_skip(self): - pass - - test_do_skip = Foo("test_skip") - test_dont_skip = Foo("test_dont_skip") - suite = unittest2.TestSuite([test_do_skip, test_dont_skip]) - events = [] - result = LoggingResult(events) - suite.run(result) - self.assertEqual(len(result.skipped), 1) - expected = ['startTest', 'addSkip', 'stopTest', - 'startTest', 'addSuccess', 'stopTest'] - self.assertEqual(events, expected) - self.assertEqual(result.testsRun, 2) - self.assertEqual(result.skipped, [(test_do_skip, "testing")]) - 
self.assertTrue(result.wasSuccessful()) - - def test_skip_class(self): - class Foo(unittest2.TestCase): - - def test_1(self): - record.append(1) - - # was originally a class decorator... - Foo = unittest2.skip("testing")(Foo) - record = [] - result = unittest2.TestResult() - test = Foo("test_1") - suite = unittest2.TestSuite([test]) - suite.run(result) - self.assertEqual(result.skipped, [(test, "testing")]) - self.assertEqual(record, []) - - def test_expected_failure(self): - class Foo(unittest2.TestCase): - - @unittest2.expectedFailure - def test_die(self): - self.fail("help me!") - events = [] - result = LoggingResult(events) - test = Foo("test_die") - test.run(result) - self.assertEqual(events, - ['startTest', 'addExpectedFailure', 'stopTest']) - self.assertEqual(result.expectedFailures[0][0], test) - self.assertTrue(result.wasSuccessful()) - - def test_unexpected_success(self): - class Foo(unittest2.TestCase): - - @unittest2.expectedFailure - def test_die(self): - pass - events = [] - result = LoggingResult(events) - test = Foo("test_die") - test.run(result) - self.assertEqual(events, - ['startTest', 'addUnexpectedSuccess', 'stopTest']) - self.assertFalse(result.failures) - self.assertEqual(result.unexpectedSuccesses, [test]) - self.assertTrue(result.wasSuccessful()) - - def test_skip_doesnt_run_setup(self): - class Foo(unittest2.TestCase): - wasSetUp = False - wasTornDown = False - - def setUp(self): - Foo.wasSetUp = True - - def tornDown(self): - Foo.wasTornDown = True - - @unittest2.skip('testing') - def test_1(self): - pass - - result = unittest2.TestResult() - test = Foo("test_1") - suite = unittest2.TestSuite([test]) - suite.run(result) - self.assertEqual(result.skipped, [(test, "testing")]) - self.assertFalse(Foo.wasSetUp) - self.assertFalse(Foo.wasTornDown) - - def test_decorated_skip(self): - def decorator(func): - def inner(*a): - return func(*a) - return inner - - class Foo(unittest2.TestCase): - - @decorator - @unittest2.skip('testing') - def 
test_1(self): - pass - - result = unittest2.TestResult() - test = Foo("test_1") - suite = unittest2.TestSuite([test]) - suite.run(result) - self.assertEqual(result.skipped, [(test, "testing")]) - - -if __name__ == '__main__': - unittest2.main() diff --git a/lldb/third_party/Python/module/unittest2/unittest2/test/test_suite.py b/lldb/third_party/Python/module/unittest2/unittest2/test/test_suite.py deleted file mode 100644 index 570a9c6b17a65..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/test/test_suite.py +++ /dev/null @@ -1,363 +0,0 @@ -from unittest2.test.support import EqualityMixin, LoggingResult - -import sys -import unittest2 - - -class Test(object): - - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - - def test_3(self): pass - - def runTest(self): pass - - -def _mk_TestSuite(*names): - return unittest2.TestSuite(Test.Foo(n) for n in names) - - -class Test_TestSuite(unittest2.TestCase, EqualityMixin): - - # Set up attributes needed by inherited tests - ################################################################ - - # Used by EqualityMixin.test_eq - eq_pairs = [(unittest2.TestSuite(), unittest2.TestSuite()), - (unittest2.TestSuite(), unittest2.TestSuite([])), - (_mk_TestSuite('test_1'), _mk_TestSuite('test_1'))] - - # Used by EqualityMixin.test_ne - ne_pairs = [(unittest2.TestSuite(), _mk_TestSuite('test_1')), - (unittest2.TestSuite([]), _mk_TestSuite('test_1')), - (_mk_TestSuite('test_1', 'test_2'), _mk_TestSuite('test_1', 'test_3')), - (_mk_TestSuite('test_1'), _mk_TestSuite('test_2'))] - - ################################################################ - # /Set up attributes needed by inherited tests - - # Tests for TestSuite.__init__ - ################################################################ - - # "class TestSuite([tests])" - # - # The tests iterable should be optional - def test_init__tests_optional(self): - suite = unittest2.TestSuite() - - 
self.assertEqual(suite.countTestCases(), 0) - - # "class TestSuite([tests])" - # ... - # "If tests is given, it must be an iterable of individual test cases - # or other test suites that will be used to build the suite initially" - # - # TestSuite should deal with empty tests iterables by allowing the - # creation of an empty suite - def test_init__empty_tests(self): - suite = unittest2.TestSuite([]) - - self.assertEqual(suite.countTestCases(), 0) - - # "class TestSuite([tests])" - # ... - # "If tests is given, it must be an iterable of individual test cases - # or other test suites that will be used to build the suite initially" - # - # TestSuite should allow any iterable to provide tests - def test_init__tests_from_any_iterable(self): - def tests(): - yield unittest2.FunctionTestCase(lambda: None) - yield unittest2.FunctionTestCase(lambda: None) - - suite_1 = unittest2.TestSuite(tests()) - self.assertEqual(suite_1.countTestCases(), 2) - - suite_2 = unittest2.TestSuite(suite_1) - self.assertEqual(suite_2.countTestCases(), 2) - - suite_3 = unittest2.TestSuite(set(suite_1)) - self.assertEqual(suite_3.countTestCases(), 2) - - # "class TestSuite([tests])" - # ... - # "If tests is given, it must be an iterable of individual test cases - # or other test suites that will be used to build the suite initially" - # - # Does TestSuite() also allow other TestSuite() instances to be present - # in the tests iterable? 
- def test_init__TestSuite_instances_in_tests(self): - def tests(): - ftc = unittest2.FunctionTestCase(lambda: None) - yield unittest2.TestSuite([ftc]) - yield unittest2.FunctionTestCase(lambda: None) - - suite = unittest2.TestSuite(tests()) - self.assertEqual(suite.countTestCases(), 2) - - ################################################################ - # /Tests for TestSuite.__init__ - - # Container types should support the iter protocol - def test_iter(self): - test1 = unittest2.FunctionTestCase(lambda: None) - test2 = unittest2.FunctionTestCase(lambda: None) - suite = unittest2.TestSuite((test1, test2)) - - self.assertEqual(list(suite), [test1, test2]) - - # "Return the number of tests represented by the this test object. - # ...this method is also implemented by the TestSuite class, which can - # return larger [greater than 1] values" - # - # Presumably an empty TestSuite returns 0? - def test_countTestCases_zero_simple(self): - suite = unittest2.TestSuite() - - self.assertEqual(suite.countTestCases(), 0) - - # "Return the number of tests represented by the this test object. - # ...this method is also implemented by the TestSuite class, which can - # return larger [greater than 1] values" - # - # Presumably an empty TestSuite (even if it contains other empty - # TestSuite instances) returns 0? - def test_countTestCases_zero_nested(self): - class Test1(unittest2.TestCase): - - def test(self): - pass - - suite = unittest2.TestSuite([unittest2.TestSuite()]) - - self.assertEqual(suite.countTestCases(), 0) - - # "Return the number of tests represented by the this test object. 
- # ...this method is also implemented by the TestSuite class, which can - # return larger [greater than 1] values" - def test_countTestCases_simple(self): - test1 = unittest2.FunctionTestCase(lambda: None) - test2 = unittest2.FunctionTestCase(lambda: None) - suite = unittest2.TestSuite((test1, test2)) - - self.assertEqual(suite.countTestCases(), 2) - - # "Return the number of tests represented by the this test object. - # ...this method is also implemented by the TestSuite class, which can - # return larger [greater than 1] values" - # - # Make sure this holds for nested TestSuite instances, too - def test_countTestCases_nested(self): - class Test1(unittest2.TestCase): - - def test1(self): pass - - def test2(self): pass - - test2 = unittest2.FunctionTestCase(lambda: None) - test3 = unittest2.FunctionTestCase(lambda: None) - child = unittest2.TestSuite((Test1('test2'), test2)) - parent = unittest2.TestSuite((test3, child, Test1('test1'))) - - self.assertEqual(parent.countTestCases(), 4) - - # "Run the tests associated with this suite, collecting the result into - # the test result object passed as result." - # - # And if there are no tests? What then? - def test_run__empty_suite(self): - events = [] - result = LoggingResult(events) - - suite = unittest2.TestSuite() - - suite.run(result) - - self.assertEqual(events, []) - - # "Note that unlike TestCase.run(), TestSuite.run() requires the - # "result object to be passed in." - def test_run__requires_result(self): - suite = unittest2.TestSuite() - - try: - suite.run() - except TypeError: - pass - else: - self.fail("Failed to raise TypeError") - - # "Run the tests associated with this suite, collecting the result into - # the test result object passed as result." 
- def test_run(self): - events = [] - result = LoggingResult(events) - - class LoggingCase(unittest2.TestCase): - - def run(self, result): - events.append('run %s' % self._testMethodName) - - def test1(self): pass - - def test2(self): pass - - tests = [LoggingCase('test1'), LoggingCase('test2')] - - unittest2.TestSuite(tests).run(result) - - self.assertEqual(events, ['run test1', 'run test2']) - - # "Add a TestCase ... to the suite" - def test_addTest__TestCase(self): - class Foo(unittest2.TestCase): - - def test(self): pass - - test = Foo('test') - suite = unittest2.TestSuite() - - suite.addTest(test) - - self.assertEqual(suite.countTestCases(), 1) - self.assertEqual(list(suite), [test]) - - # "Add a ... TestSuite to the suite" - def test_addTest__TestSuite(self): - class Foo(unittest2.TestCase): - - def test(self): pass - - suite_2 = unittest2.TestSuite([Foo('test')]) - - suite = unittest2.TestSuite() - suite.addTest(suite_2) - - self.assertEqual(suite.countTestCases(), 1) - self.assertEqual(list(suite), [suite_2]) - - # "Add all the tests from an iterable of TestCase and TestSuite - # instances to this test suite." - # - # "This is equivalent to iterating over tests, calling addTest() for - # each element" - def test_addTests(self): - class Foo(unittest2.TestCase): - - def test_1(self): pass - - def test_2(self): pass - - test_1 = Foo('test_1') - test_2 = Foo('test_2') - inner_suite = unittest2.TestSuite([test_2]) - - def gen(): - yield test_1 - yield test_2 - yield inner_suite - - suite_1 = unittest2.TestSuite() - suite_1.addTests(gen()) - - self.assertEqual(list(suite_1), list(gen())) - - # "This is equivalent to iterating over tests, calling addTest() for - # each element" - suite_2 = unittest2.TestSuite() - for t in gen(): - suite_2.addTest(t) - - self.assertEqual(suite_1, suite_2) - - # "Add all the tests from an iterable of TestCase and TestSuite - # instances to this test suite." - # - # What happens if it doesn't get an iterable? 
- def test_addTest__noniterable(self): - suite = unittest2.TestSuite() - - try: - suite.addTests(5) - except TypeError: - pass - else: - self.fail("Failed to raise TypeError") - - def test_addTest__noncallable(self): - suite = unittest2.TestSuite() - self.assertRaises(TypeError, suite.addTest, 5) - - def test_addTest__casesuiteclass(self): - suite = unittest2.TestSuite() - self.assertRaises(TypeError, suite.addTest, Test_TestSuite) - self.assertRaises(TypeError, suite.addTest, unittest2.TestSuite) - - def test_addTests__string(self): - suite = unittest2.TestSuite() - self.assertRaises(TypeError, suite.addTests, "foo") - - def test_function_in_suite(self): - def f(_): - pass - suite = unittest2.TestSuite() - suite.addTest(f) - - # when the bug is fixed this line will not crash - suite.run(unittest2.TestResult()) - - def test_basetestsuite(self): - class Test(unittest2.TestCase): - wasSetUp = False - wasTornDown = False - - @classmethod - def setUpClass(cls): - cls.wasSetUp = True - - @classmethod - def tearDownClass(cls): - cls.wasTornDown = True - - def testPass(self): - pass - - def testFail(self): - fail - - class Module(object): - wasSetUp = False - wasTornDown = False - - @staticmethod - def setUpModule(): - Module.wasSetUp = True - - @staticmethod - def tearDownModule(): - Module.wasTornDown = True - - Test.__module__ = 'Module' - sys.modules['Module'] = Module - self.addCleanup(sys.modules.pop, 'Module') - - suite = unittest2.BaseTestSuite() - suite.addTests([Test('testPass'), Test('testFail')]) - self.assertEqual(suite.countTestCases(), 2) - - result = unittest2.TestResult() - suite.run(result) - self.assertFalse(Module.wasSetUp) - self.assertFalse(Module.wasTornDown) - self.assertFalse(Test.wasSetUp) - self.assertFalse(Test.wasTornDown) - self.assertEqual(len(result.errors), 1) - self.assertEqual(len(result.failures), 0) - self.assertEqual(result.testsRun, 2) - -if __name__ == '__main__': - unittest2.main() diff --git 
a/lldb/third_party/Python/module/unittest2/unittest2/test/test_unittest2_with.py b/lldb/third_party/Python/module/unittest2/unittest2/test/test_unittest2_with.py deleted file mode 100644 index 7544757ecc2ae..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/test/test_unittest2_with.py +++ /dev/null @@ -1,148 +0,0 @@ -import unittest2 -from unittest2.test.support import OldTestResult, catch_warnings - -import warnings -# needed to enable the deprecation warnings -warnings.simplefilter('default') - - -class TestWith(unittest2.TestCase): - """Tests that use the with statement live in this - module so that all other tests can be run with Python 2.4. - """ - - def testAssertRaisesExcValue(self): - class ExceptionMock(Exception): - pass - - def Stub(foo): - raise ExceptionMock(foo) - v = "particular value" - - ctx = self.assertRaises(ExceptionMock) - with ctx: - Stub(v) - e = ctx.exception - self.assertIsInstance(e, ExceptionMock) - self.assertEqual(e.args[0], v) - - def test_assertRaises(self): - def _raise(e): - raise e - self.assertRaises(KeyError, _raise, KeyError) - self.assertRaises(KeyError, _raise, KeyError("key")) - try: - self.assertRaises(KeyError, lambda: None) - except self.failureException as e: - self.assertIn("KeyError not raised", e.args) - else: - self.fail("assertRaises() didn't fail") - try: - self.assertRaises(KeyError, _raise, ValueError) - except ValueError: - pass - else: - self.fail("assertRaises() didn't let exception pass through") - with self.assertRaises(KeyError) as cm: - try: - raise KeyError - except Exception as e: - raise - self.assertIs(cm.exception, e) - - with self.assertRaises(KeyError): - raise KeyError("key") - try: - with self.assertRaises(KeyError): - pass - except self.failureException as e: - self.assertIn("KeyError not raised", e.args) - else: - self.fail("assertRaises() didn't fail") - try: - with self.assertRaises(KeyError): - raise ValueError - except ValueError: - pass - else: - 
self.fail("assertRaises() didn't let exception pass through") - - def test_assert_dict_unicode_error(self): - with catch_warnings(record=True): - # This causes a UnicodeWarning due to its craziness - one = ''.join(chr(i) for i in range(255)) - # this used to cause a UnicodeDecodeError constructing the failure - # msg - with self.assertRaises(self.failureException): - self.assertDictContainsSubset({'foo': one}, {'foo': u'\uFFFD'}) - - def test_formatMessage_unicode_error(self): - with catch_warnings(record=True): - # This causes a UnicodeWarning due to its craziness - one = ''.join(chr(i) for i in range(255)) - # this used to cause a UnicodeDecodeError constructing msg - self._formatMessage(one, u'\uFFFD') - - def assertOldResultWarning(self, test, failures): - with catch_warnings(record=True) as log: - result = OldTestResult() - test.run(result) - self.assertEqual(len(result.failures), failures) - warning, = log - self.assertIs(warning.category, DeprecationWarning) - - def test_old_testresult(self): - class Test(unittest2.TestCase): - - def testSkip(self): - self.skipTest('foobar') - - @unittest2.expectedFailure - def testExpectedFail(self): - raise TypeError - - @unittest2.expectedFailure - def testUnexpectedSuccess(self): - pass - - for test_name, should_pass in (('testSkip', True), - ('testExpectedFail', True), - ('testUnexpectedSuccess', False)): - test = Test(test_name) - self.assertOldResultWarning(test, int(not should_pass)) - - def test_old_testresult_setup(self): - class Test(unittest2.TestCase): - - def setUp(self): - self.skipTest('no reason') - - def testFoo(self): - pass - self.assertOldResultWarning(Test('testFoo'), 0) - - def test_old_testresult_class(self): - class Test(unittest2.TestCase): - - def testFoo(self): - pass - Test = unittest2.skip('no reason')(Test) - self.assertOldResultWarning(Test('testFoo'), 0) - - def testPendingDeprecationMethodNames(self): - """Test fail* methods pending deprecation, they will warn in 3.2. 
- - Do not use these methods. They will go away in 3.3. - """ - with catch_warnings(record=True): - self.failIfEqual(3, 5) - self.failUnlessEqual(3, 3) - self.failUnlessAlmostEqual(2.0, 2.0) - self.failIfAlmostEqual(3.0, 5.0) - self.failUnless(True) - self.failUnlessRaises(TypeError, lambda _: 3.14 + u'spam') - self.failIf(False) - - -if __name__ == '__main__': - unittest2.main() diff --git a/lldb/third_party/Python/module/unittest2/unittest2/util.py b/lldb/third_party/Python/module/unittest2/unittest2/util.py deleted file mode 100644 index d059861600ae9..0000000000000 --- a/lldb/third_party/Python/module/unittest2/unittest2/util.py +++ /dev/null @@ -1,105 +0,0 @@ -"""Various utility functions.""" - -__unittest = True - - -_MAX_LENGTH = 80 - - -def safe_repr(obj, short=False): - try: - result = repr(obj) - except Exception: - result = object.__repr__(obj) - if not short or len(result) < _MAX_LENGTH: - return result - return result[:_MAX_LENGTH] + ' [truncated]...' - - -def safe_str(obj): - try: - return str(obj) - except Exception: - return object.__str__(obj) - - -def strclass(cls): - return "%s.%s" % (cls.__module__, cls.__name__) - - -def sorted_list_difference(expected, actual): - """Finds elements in only one or the other of two, sorted input lists. - - Returns a two-element tuple of lists. The first list contains those - elements in the "expected" list but not in the "actual" list, and the - second contains those elements in the "actual" list but not in the - "expected" list. Duplicate elements in either input list are ignored. 
- """ - i = j = 0 - missing = [] - unexpected = [] - while True: - try: - e = expected[i] - a = actual[j] - if e < a: - missing.append(e) - i += 1 - while expected[i] == e: - i += 1 - elif e > a: - unexpected.append(a) - j += 1 - while actual[j] == a: - j += 1 - else: - i += 1 - try: - while expected[i] == e: - i += 1 - finally: - j += 1 - while actual[j] == a: - j += 1 - except IndexError: - missing.extend(expected[i:]) - unexpected.extend(actual[j:]) - break - return missing, unexpected - - -def unorderable_list_difference(expected, actual, ignore_duplicate=False): - """Same behavior as sorted_list_difference but - for lists of unorderable items (like dicts). - - As it does a linear search per item (remove) it - has O(n*n) performance. - """ - missing = [] - unexpected = [] - while expected: - item = expected.pop() - try: - actual.remove(item) - except ValueError: - missing.append(item) - if ignore_duplicate: - for lst in expected, actual: - try: - while True: - lst.remove(item) - except ValueError: - pass - if ignore_duplicate: - while actual: - item = actual.pop() - unexpected.append(item) - try: - while True: - actual.remove(item) - except ValueError: - pass - return missing, unexpected - - # anything left in actual is unexpected - return missing, actual From 264d828ea6399c31c210b67a050fbf084634da6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 26 Feb 2024 16:37:51 +0100 Subject: [PATCH 330/546] [clang][Interp][NFC] Redo returning values from builtin functions Instead of having retInt/retLong/retSizeT/etc., just add retInteger, which takes an APSInt and returns it in form of the given QualType. This makes the code a little neater, but is also necessary since some builtins have a different return type with -fms-extensions. 
--- clang/lib/AST/Interp/InterpBuiltin.cpp | 214 +++++++++-------------- clang/test/CodeGen/ms-intrinsics-other.c | 20 +-- 2 files changed, 91 insertions(+), 143 deletions(-) diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp index 2fb076d7793a8..8f45c789296b6 100644 --- a/clang/lib/AST/Interp/InterpBuiltin.cpp +++ b/clang/lib/AST/Interp/InterpBuiltin.cpp @@ -62,97 +62,32 @@ static APSInt peekToAPSInt(InterpStack &Stk, PrimType T, size_t Offset = 0) { return R; } -/// Pushes \p Val to the stack, as a target-dependent 'int'. -static void pushInt(InterpState &S, int32_t Val) { - PrimType IntType = getIntPrimType(S); - if (IntType == PT_Sint32) - S.Stk.push>(Integral<32, true>::from(Val)); - else if (IntType == PT_Sint16) - S.Stk.push>(Integral<16, true>::from(Val)); - else - llvm_unreachable("Int isn't 16 or 32 bit?"); -} - -static void pushAPSInt(InterpState &S, const APSInt &Val) { - bool Signed = Val.isSigned(); - - if (Signed) { - switch (Val.getBitWidth()) { - case 64: - S.Stk.push>( - Integral<64, true>::from(Val.getSExtValue())); - break; - case 32: - S.Stk.push>( - Integral<32, true>::from(Val.getSExtValue())); - break; - case 16: - S.Stk.push>( - Integral<16, true>::from(Val.getSExtValue())); - break; - case 8: - S.Stk.push>( - Integral<8, true>::from(Val.getSExtValue())); - break; - default: - llvm_unreachable("Invalid integer bitwidth"); - } - return; - } - - // Unsigned. - switch (Val.getBitWidth()) { - case 64: - S.Stk.push>( - Integral<64, false>::from(Val.getZExtValue())); - break; - case 32: - S.Stk.push>( - Integral<32, false>::from(Val.getZExtValue())); - break; - case 16: - S.Stk.push>( - Integral<16, false>::from(Val.getZExtValue())); - break; - case 8: - S.Stk.push>( - Integral<8, false>::from(Val.getZExtValue())); - break; - default: - llvm_unreachable("Invalid integer bitwidth"); +/// Pushes \p Val on the stack as the type given by \p QT. 
+static void pushInteger(InterpState &S, const APSInt &Val, QualType QT) { + assert(QT->isSignedIntegerOrEnumerationType() || + QT->isUnsignedIntegerOrEnumerationType()); + std::optional T = S.getContext().classify(QT); + assert(T); + + if (QT->isSignedIntegerOrEnumerationType()) { + int64_t V = Val.getSExtValue(); + INT_TYPE_SWITCH(*T, { S.Stk.push(T::from(V)); }); + } else { + assert(QT->isUnsignedIntegerOrEnumerationType()); + uint64_t V = Val.getZExtValue(); + INT_TYPE_SWITCH(*T, { S.Stk.push(T::from(V)); }); } } -/// Pushes \p Val to the stack, as a target-dependent 'long'. -static void pushLong(InterpState &S, int64_t Val) { - PrimType LongType = getLongPrimType(S); - if (LongType == PT_Sint64) - S.Stk.push>(Integral<64, true>::from(Val)); - else if (LongType == PT_Sint32) - S.Stk.push>(Integral<32, true>::from(Val)); - else if (LongType == PT_Sint16) - S.Stk.push>(Integral<16, true>::from(Val)); +template +static void pushInteger(InterpState &S, T Val, QualType QT) { + if constexpr (std::is_same_v) + pushInteger(S, APSInt(Val, !std::is_signed_v), QT); else - llvm_unreachable("Long isn't 16, 32 or 64 bit?"); -} - -static void pushSizeT(InterpState &S, uint64_t Val) { - const TargetInfo &TI = S.getCtx().getTargetInfo(); - unsigned SizeTWidth = TI.getTypeWidth(TI.getSizeType()); - - switch (SizeTWidth) { - case 64: - S.Stk.push>(Integral<64, false>::from(Val)); - break; - case 32: - S.Stk.push>(Integral<32, false>::from(Val)); - break; - case 16: - S.Stk.push>(Integral<16, false>::from(Val)); - break; - default: - llvm_unreachable("We don't handle this size_t size."); - } + pushInteger(S, + APSInt(APInt(sizeof(T) * 8, Val, std::is_signed_v), + !std::is_signed_v), + QT); } static void assignInteger(Pointer &Dest, PrimType ValueT, const APSInt &Value) { @@ -188,7 +123,8 @@ static bool retPrimValue(InterpState &S, CodePtr OpPC, APValue &Result, } static bool interp__builtin_strcmp(InterpState &S, CodePtr OpPC, - const InterpFrame *Frame) { + const InterpFrame 
*Frame, + const CallExpr *Call) { const Pointer &A = getParam(Frame, 0); const Pointer &B = getParam(Frame, 1); @@ -222,12 +158,13 @@ static bool interp__builtin_strcmp(InterpState &S, CodePtr OpPC, break; } - pushInt(S, Result); + pushInteger(S, Result, Call->getType()); return true; } static bool interp__builtin_strlen(InterpState &S, CodePtr OpPC, - const InterpFrame *Frame) { + const InterpFrame *Frame, + const CallExpr *Call) { const Pointer &StrPtr = getParam(Frame, 0); if (!CheckArray(S, OpPC, StrPtr)) @@ -253,7 +190,8 @@ static bool interp__builtin_strlen(InterpState &S, CodePtr OpPC, break; } - pushSizeT(S, Len); + pushInteger(S, Len, Call->getType()); + return true; } @@ -383,68 +321,71 @@ static bool interp__builtin_fmax(InterpState &S, CodePtr OpPC, /// take a float, double, long double, etc. /// But for us, that's all a Floating anyway. static bool interp__builtin_isnan(InterpState &S, CodePtr OpPC, - const InterpFrame *Frame, const Function *F) { + const InterpFrame *Frame, const Function *F, + const CallExpr *Call) { const Floating &Arg = S.Stk.peek(); - pushInt(S, Arg.isNan()); + pushInteger(S, Arg.isNan(), Call->getType()); return true; } static bool interp__builtin_issignaling(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, - const Function *F) { + const Function *F, + const CallExpr *Call) { const Floating &Arg = S.Stk.peek(); - pushInt(S, Arg.isSignaling()); + pushInteger(S, Arg.isSignaling(), Call->getType()); return true; } static bool interp__builtin_isinf(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const Function *F, - bool CheckSign) { + bool CheckSign, const CallExpr *Call) { const Floating &Arg = S.Stk.peek(); bool IsInf = Arg.isInf(); if (CheckSign) - pushInt(S, IsInf ? (Arg.isNegative() ? -1 : 1) : 0); + pushInteger(S, IsInf ? (Arg.isNegative() ? 
-1 : 1) : 0, Call->getType()); else - pushInt(S, Arg.isInf()); + pushInteger(S, Arg.isInf(), Call->getType()); return true; } static bool interp__builtin_isfinite(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, - const Function *F) { + const Function *F, const CallExpr *Call) { const Floating &Arg = S.Stk.peek(); - pushInt(S, Arg.isFinite()); + pushInteger(S, Arg.isFinite(), Call->getType()); return true; } static bool interp__builtin_isnormal(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, - const Function *F) { + const Function *F, const CallExpr *Call) { const Floating &Arg = S.Stk.peek(); - pushInt(S, Arg.isNormal()); + pushInteger(S, Arg.isNormal(), Call->getType()); return true; } static bool interp__builtin_issubnormal(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, - const Function *F) { + const Function *F, + const CallExpr *Call) { const Floating &Arg = S.Stk.peek(); - pushInt(S, Arg.isDenormal()); + pushInteger(S, Arg.isDenormal(), Call->getType()); return true; } static bool interp__builtin_iszero(InterpState &S, CodePtr OpPC, - const InterpFrame *Frame, - const Function *F) { + const InterpFrame *Frame, const Function *F, + const CallExpr *Call) { const Floating &Arg = S.Stk.peek(); - pushInt(S, Arg.isZero()); + pushInteger(S, Arg.isZero(), Call->getType()); return true; } @@ -461,7 +402,7 @@ static bool interp__builtin_isfpclass(InterpState &S, CodePtr OpPC, int32_t Result = static_cast((F.classify() & FPClassArg).getZExtValue()); - pushInt(S, Result); + pushInteger(S, Result, Call->getType()); return true; } @@ -469,7 +410,8 @@ static bool interp__builtin_isfpclass(InterpState &S, CodePtr OpPC, /// Five int values followed by one floating value. 
static bool interp__builtin_fpclassify(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, - const Function *Func) { + const Function *Func, + const CallExpr *Call) { const Floating &Val = S.Stk.peek(); unsigned Index; @@ -495,7 +437,7 @@ static bool interp__builtin_fpclassify(InterpState &S, CodePtr OpPC, align(primSize(PT_Float)) + ((1 + (4 - Index)) * align(IntSize)); APSInt I = peekToAPSInt(S.Stk, getIntPrimType(S), Offset); - pushInt(S, I.getZExtValue()); + pushInteger(S, I, Call->getType()); return true; } @@ -517,9 +459,12 @@ static bool interp__builtin_popcount(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const Function *Func, const CallExpr *Call) { + + Func->getDecl()->dump(); + PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); APSInt Val = peekToAPSInt(S.Stk, ArgT); - pushInt(S, Val.popcount()); + pushInteger(S, APSInt(APInt(32, Val.popcount())), Call->getType()); return true; } @@ -528,7 +473,7 @@ static bool interp__builtin_parity(InterpState &S, CodePtr OpPC, const Function *Func, const CallExpr *Call) { PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); APSInt Val = peekToAPSInt(S.Stk, ArgT); - pushInt(S, Val.popcount() % 2); + pushInteger(S, Val.popcount() % 2, Call->getType()); return true; } @@ -537,7 +482,7 @@ static bool interp__builtin_clrsb(InterpState &S, CodePtr OpPC, const Function *Func, const CallExpr *Call) { PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); APSInt Val = peekToAPSInt(S.Stk, ArgT); - pushInt(S, Val.getBitWidth() - Val.getSignificantBits()); + pushInteger(S, Val.getBitWidth() - Val.getSignificantBits(), Call->getType()); return true; } @@ -547,7 +492,8 @@ static bool interp__builtin_bitreverse(InterpState &S, CodePtr OpPC, const CallExpr *Call) { PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); APSInt Val = peekToAPSInt(S.Stk, ArgT); - pushAPSInt(S, APSInt(Val.reverseBits(), /*IsUnsigned=*/true)); + // pushAPSInt(S, 
APSInt(Val.reverseBits(), /*IsUnsigned=*/true)); + pushInteger(S, Val.reverseBits(), Call->getType()); return true; } @@ -562,7 +508,7 @@ static bool interp__builtin_classify_type(InterpState &S, CodePtr OpPC, GCCTypeClass ResultClass = EvaluateBuiltinClassifyType(Arg->getType(), S.getLangOpts()); int32_t ReturnVal = static_cast(ResultClass); - pushInt(S, ReturnVal); + pushInteger(S, ReturnVal, Call->getType()); return true; } @@ -582,7 +528,7 @@ static bool interp__builtin_expect(InterpState &S, CodePtr OpPC, Offset += align(primSize(PT_Float)); APSInt Val = peekToAPSInt(S.Stk, ArgT, Offset); - pushLong(S, Val.getSExtValue()); + pushInteger(S, Val, Call->getType()); return true; } @@ -605,7 +551,8 @@ static bool interp__builtin_rotate(InterpState &S, CodePtr OpPC, Result = APSInt(Value.rotl(Amount.urem(Value.getBitWidth())), /*IsUnsigned=*/true); - pushAPSInt(S, Result); + // pushAPSInt(S, Result); + pushInteger(S, Result, Call->getType()); return true; } @@ -616,7 +563,7 @@ static bool interp__builtin_ffs(InterpState &S, CodePtr OpPC, APSInt Value = peekToAPSInt(S.Stk, ArgT); uint64_t N = Value.countr_zero(); - pushInt(S, N == Value.getBitWidth() ? 0 : N + 1); + pushInteger(S, N == Value.getBitWidth() ? 
0 : N + 1, Call->getType()); return true; } @@ -659,7 +606,7 @@ static bool interp__builtin_eh_return_data_regno(InterpState &S, CodePtr OpPC, int Result = S.getCtx().getTargetInfo().getEHDataRegisterNumber(Arg.getZExtValue()); - pushInt(S, Result); + pushInteger(S, Result, Call->getType()); return true; } @@ -837,7 +784,8 @@ static bool interp__builtin_carryop(InterpState &S, CodePtr OpPC, CarryOutPtr.initialize(); assert(Call->getType() == Call->getArg(0)->getType()); - pushAPSInt(S, Result); + // pushAPSInt(S, Result); + pushInteger(S, Result, Call->getType()); return true; } @@ -857,7 +805,7 @@ static bool interp__builtin_clz(InterpState &S, CodePtr OpPC, if (ZeroIsUndefined && Val == 0) return false; - pushInt(S, Val.countl_zero()); + pushInteger(S, APSInt(APInt(32, Val.countl_zero())), Call->getType()); return true; } @@ -870,7 +818,7 @@ static bool interp__builtin_ctz(InterpState &S, CodePtr OpPC, if (Val == 0) return false; - pushInt(S, Val.countr_zero()); + pushInteger(S, Val.countr_zero(), Call->getType()); return true; } @@ -972,11 +920,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, case Builtin::BI__assume: break; case Builtin::BI__builtin_strcmp: - if (!interp__builtin_strcmp(S, OpPC, Frame)) + if (!interp__builtin_strcmp(S, OpPC, Frame, Call)) return false; break; case Builtin::BI__builtin_strlen: - if (!interp__builtin_strlen(S, OpPC, Frame)) + if (!interp__builtin_strlen(S, OpPC, Frame, Call)) return false; break; case Builtin::BI__builtin_nan: @@ -1036,38 +984,38 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, break; case Builtin::BI__builtin_isnan: - if (!interp__builtin_isnan(S, OpPC, Frame, F)) + if (!interp__builtin_isnan(S, OpPC, Frame, F, Call)) return false; break; case Builtin::BI__builtin_issignaling: - if (!interp__builtin_issignaling(S, OpPC, Frame, F)) + if (!interp__builtin_issignaling(S, OpPC, Frame, F, Call)) return false; break; case Builtin::BI__builtin_isinf: - if 
(!interp__builtin_isinf(S, OpPC, Frame, F, /*Sign=*/false)) + if (!interp__builtin_isinf(S, OpPC, Frame, F, /*Sign=*/false, Call)) return false; break; case Builtin::BI__builtin_isinf_sign: - if (!interp__builtin_isinf(S, OpPC, Frame, F, /*Sign=*/true)) + if (!interp__builtin_isinf(S, OpPC, Frame, F, /*Sign=*/true, Call)) return false; break; case Builtin::BI__builtin_isfinite: - if (!interp__builtin_isfinite(S, OpPC, Frame, F)) + if (!interp__builtin_isfinite(S, OpPC, Frame, F, Call)) return false; break; case Builtin::BI__builtin_isnormal: - if (!interp__builtin_isnormal(S, OpPC, Frame, F)) + if (!interp__builtin_isnormal(S, OpPC, Frame, F, Call)) return false; break; case Builtin::BI__builtin_issubnormal: - if (!interp__builtin_issubnormal(S, OpPC, Frame, F)) + if (!interp__builtin_issubnormal(S, OpPC, Frame, F, Call)) return false; break; case Builtin::BI__builtin_iszero: - if (!interp__builtin_iszero(S, OpPC, Frame, F)) + if (!interp__builtin_iszero(S, OpPC, Frame, F, Call)) return false; break; case Builtin::BI__builtin_isfpclass: @@ -1075,7 +1023,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, return false; break; case Builtin::BI__builtin_fpclassify: - if (!interp__builtin_fpclassify(S, OpPC, Frame, F)) + if (!interp__builtin_fpclassify(S, OpPC, Frame, F, Call)) return false; break; diff --git a/clang/test/CodeGen/ms-intrinsics-other.c b/clang/test/CodeGen/ms-intrinsics-other.c index 36c40dddcbb4f..189000d3213ac 100644 --- a/clang/test/CodeGen/ms-intrinsics-other.c +++ b/clang/test/CodeGen/ms-intrinsics-other.c @@ -1,38 +1,38 @@ // RUN: %clang_cc1 -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -triple x86_64--darwin -Oz -emit-llvm %s -o - \ +// RUN: -fexperimental-new-constant-interpreter -triple x86_64--darwin -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s // RUN: %clang_cc1 -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -triple x86_64--linux -Oz -emit-llvm %s -o - \ 
+// RUN: -fexperimental-new-constant-interpreter -triple x86_64--linux -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s // RUN: %clang_cc1 -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -triple aarch64--darwin -Oz -emit-llvm %s -o - \ +// RUN: -fexperimental-new-constant-interpreter -triple aarch64--darwin -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefix=CHECK-ARM-ARM64 // RUN: %clang_cc1 -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -triple aarch64--darwin -Oz -emit-llvm %s -o - \ +// RUN: -fexperimental-new-constant-interpreter -triple aarch64--darwin -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefix=CHECK-ARM // RUN: %clang_cc1 -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -triple armv7--darwin -Oz -emit-llvm %s -o - \ +// RUN: -fexperimental-new-constant-interpreter -triple armv7--darwin -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefix=CHECK-ARM // RUN: %clang_cc1 -x c++ -std=c++11 \ // RUN: -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -triple x86_64--darwin -Oz -emit-llvm %s -o - \ +// RUN: -fexperimental-new-constant-interpreter -triple x86_64--darwin -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s // RUN: %clang_cc1 -x c++ -std=c++11 \ // RUN: -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -triple x86_64--linux -Oz -emit-llvm %s -o - \ +// RUN: -fexperimental-new-constant-interpreter -triple x86_64--linux -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s // RUN: %clang_cc1 -x c++ -std=c++11 \ // RUN: -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -triple aarch64--darwin -Oz -emit-llvm %s -o - \ +// RUN: -fexperimental-new-constant-interpreter -triple aarch64--darwin -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefix=CHECK-ARM-ARM64 // RUN: %clang_cc1 -x c++ -std=c++11 \ // RUN: -ffreestanding -fms-extensions 
-Wno-implicit-function-declaration \ -// RUN: -triple aarch64--darwin -Oz -emit-llvm %s -o - \ +// RUN: -fexperimental-new-constant-interpreter -triple aarch64--darwin -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefix=CHECK-ARM // RUN: %clang_cc1 -x c++ -std=c++11 \ // RUN: -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -triple armv7--darwin -Oz -emit-llvm %s -o - \ +// RUN: -fexperimental-new-constant-interpreter -triple armv7--darwin -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefix=CHECK-ARM // LP64 targets use 'long' as 'int' for MS intrinsics (-fms-extensions) From f54004475110bb0a4033261041594266c8296242 Mon Sep 17 00:00:00 2001 From: mmoadeli Date: Mon, 26 Feb 2024 15:49:02 +0000 Subject: [PATCH 331/546] [NVPTX][AMDGPU][CodeGen] Fix `local_space nullptr` handling for NVPTX and local/private `nullptr` value for AMDGPU. (#78759) - Address space cast of nullptr in local_space into a generic_space for the CUDA backend. The reason for this cast was having invalid local memory base address for the associated variable. - In the context of AMD GPU, assigns a NULL value as ~0 for the address spaces of sycl_local and sycl_private to match the ones for opencl_local and opencl_private. 
--- clang/lib/Basic/Targets/AMDGPU.h | 6 +- clang/lib/CodeGen/Targets/NVPTX.cpp | 18 ++ .../CodeGenSYCL/address-space-conversions.cpp | 171 +++++++++--------- .../amd-address-space-conversions.cpp | 128 +++++++++++++ .../cuda-address-space-conversions.cpp | 122 +++++++++++++ 5 files changed, 360 insertions(+), 85 deletions(-) create mode 100644 clang/test/CodeGenSYCL/amd-address-space-conversions.cpp create mode 100644 clang/test/CodeGenSYCL/cuda-address-space-conversions.cpp diff --git a/clang/lib/Basic/Targets/AMDGPU.h b/clang/lib/Basic/Targets/AMDGPU.h index e80589dde0ecb..94d9ba93ed226 100644 --- a/clang/lib/Basic/Targets/AMDGPU.h +++ b/clang/lib/Basic/Targets/AMDGPU.h @@ -414,8 +414,10 @@ class LLVM_LIBRARY_VISIBILITY AMDGPUTargetInfo final : public TargetInfo { // value ~0. uint64_t getNullPointerValue(LangAS AS) const override { // FIXME: Also should handle region. - return (AS == LangAS::opencl_local || AS == LangAS::opencl_private) - ? ~0 : 0; + return (AS == LangAS::opencl_local || AS == LangAS::opencl_private || + AS == LangAS::sycl_local || AS == LangAS::sycl_private) + ? ~0 + : 0; } void setAuxTarget(const TargetInfo *Aux) override; diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index d0dc7c258a03a..8718f1ecf3a7e 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -47,6 +47,10 @@ class NVPTXTargetCodeGenInfo : public TargetCodeGenInfo { CodeGen::CodeGenModule &M) const override; bool shouldEmitStaticExternCAliases() const override; + llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM, + llvm::PointerType *T, + QualType QT) const override; + llvm::Type *getCUDADeviceBuiltinSurfaceDeviceType() const override { // On the device side, surface reference is represented as an object handle // in 64-bit integer. 
@@ -285,6 +289,20 @@ void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV, bool NVPTXTargetCodeGenInfo::shouldEmitStaticExternCAliases() const { return false; } + +llvm::Constant * +NVPTXTargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM, + llvm::PointerType *PT, + QualType QT) const { + auto &Ctx = CGM.getContext(); + if (PT->getAddressSpace() != Ctx.getTargetAddressSpace(LangAS::opencl_local)) + return llvm::ConstantPointerNull::get(PT); + + auto NPT = llvm::PointerType::get( + PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic)); + return llvm::ConstantExpr::getAddrSpaceCast( + llvm::ConstantPointerNull::get(NPT), PT); +} } void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F, diff --git a/clang/test/CodeGenSYCL/address-space-conversions.cpp b/clang/test/CodeGenSYCL/address-space-conversions.cpp index 3933ad375412d..ee3183b74e038 100644 --- a/clang/test/CodeGenSYCL/address-space-conversions.cpp +++ b/clang/test/CodeGenSYCL/address-space-conversions.cpp @@ -1,138 +1,143 @@ // RUN: %clang_cc1 -triple spir64 -fsycl-is-device -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s void bar(int &Data) {} -// CHECK-DAG: define{{.*}} spir_func void @[[RAW_REF:[a-zA-Z0-9_]+]](ptr addrspace(4) noundef align 4 dereferenceable(4) % +// CHECK: define{{.*}} spir_func void @[[RAW_REF:[a-zA-Z0-9_]+]](ptr addrspace(4) noundef align 4 dereferenceable(4) % void bar2(int &Data) {} -// CHECK-DAG: define{{.*}} spir_func void @[[RAW_REF2:[a-zA-Z0-9_]+]](ptr addrspace(4) noundef align 4 dereferenceable(4) % +// CHECK: define{{.*}} spir_func void @[[RAW_REF2:[a-zA-Z0-9_]+]](ptr addrspace(4) noundef align 4 dereferenceable(4) % void bar(__attribute__((opencl_local)) int &Data) {} -// CHECK-DAG: define{{.*}} spir_func void [[LOC_REF:@[a-zA-Z0-9_]+]](ptr addrspace(3) noundef align 4 dereferenceable(4) % +// CHECK: define{{.*}} spir_func void [[LOC_REF:@[a-zA-Z0-9_]+]](ptr addrspace(3) noundef align 4 dereferenceable(4) % void 
foo(int *Data) {} -// CHECK-DAG: define{{.*}} spir_func void @[[RAW_PTR:[a-zA-Z0-9_]+]](ptr addrspace(4) noundef % +// CHECK: define{{.*}} spir_func void @[[RAW_PTR:[a-zA-Z0-9_]+]](ptr addrspace(4) noundef % void foo2(int *Data) {} -// CHECK-DAG: define{{.*}} spir_func void @[[RAW_PTR2:[a-zA-Z0-9_]+]](ptr addrspace(4) noundef % +// CHECK: define{{.*}} spir_func void @[[RAW_PTR2:[a-zA-Z0-9_]+]](ptr addrspace(4) noundef % void foo(__attribute__((opencl_local)) int *Data) {} -// CHECK-DAG: define{{.*}} spir_func void [[LOC_PTR:@[a-zA-Z0-9_]+]](ptr addrspace(3) noundef % +// CHECK: define{{.*}} spir_func void [[LOC_PTR:@[a-zA-Z0-9_]+]](ptr addrspace(3) noundef % template void tmpl(T t) {} // See Check Lines below. void usages() { - // CHECK-DAG: [[GLOB:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1) - // CHECK-DAG: [[GLOB]].ascast = addrspacecast ptr [[GLOB]] to ptr addrspace(4) + int *NoAS; + // CHECK: [[NoAS:%[a-zA-Z0-9]+]] = alloca ptr addrspace(4) __attribute__((opencl_global)) int *GLOB; - // CHECK-DAG: [[LOC:%[a-zA-Z0-9]+]] = alloca ptr addrspace(3) - // CHECK-DAG: [[LOC]].ascast = addrspacecast ptr [[LOC]] to ptr addrspace(4) + // CHECK: [[GLOB:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1) __attribute__((opencl_local)) int *LOC; - // CHECK-DAG: [[NoAS:%[a-zA-Z0-9]+]] = alloca ptr addrspace(4) - // CHECK-DAG: [[NoAS]].ascast = addrspacecast ptr [[NoAS]] to ptr addrspace(4) - int *NoAS; - // CHECK-DAG: [[PRIV:%[a-zA-Z0-9]+]] = alloca ptr - // CHECK-DAG: [[PRIV]].ascast = addrspacecast ptr [[PRIV]] to ptr addrspace(4) + // CHECK: [[LOC:%[a-zA-Z0-9]+]] = alloca ptr addrspace(3) __attribute__((opencl_private)) int *PRIV; - // CHECK-DAG: [[GLOB_DEVICE:%[a-zA-Z0-9]+]] = alloca ptr addrspace(5) + // CHECK: [[PRIV:%[a-zA-Z0-9]+]] = alloca ptr __attribute__((opencl_global_device)) int *GLOBDEVICE; - // CHECK-DAG: [[GLOB_HOST:%[a-zA-Z0-9]+]] = alloca ptr addrspace(6) + // CHECK: [[GLOB_DEVICE:%[a-zA-Z0-9]+]] = alloca ptr addrspace(5) __attribute__((opencl_global_host)) int 
*GLOBHOST; + // CHECK: [[GLOB_HOST:%[a-zA-Z0-9]+]] = alloca ptr addrspace(6) + + // CHECK: [[NoAS]].ascast = addrspacecast ptr [[NoAS]] to ptr addrspace(4) + // CHECK: [[GLOB]].ascast = addrspacecast ptr [[GLOB]] to ptr addrspace(4) + // CHECK: [[LOC]].ascast = addrspacecast ptr [[LOC]] to ptr addrspace(4) + // CHECK: [[PRIV]].ascast = addrspacecast ptr [[PRIV]] to ptr addrspace(4) + LOC = nullptr; + // CHECK: store ptr addrspace(3) null, ptr addrspace(4) [[LOC]].ascast, align 8 + GLOB = nullptr; + // CHECK: store ptr addrspace(1) null, ptr addrspace(4) [[GLOB]].ascast, align 8 // Explicit conversions // From named address spaces to default address space - // CHECK-DAG: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast - // CHECK-DAG: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr addrspace(4) - // CHECK-DAG: store ptr addrspace(4) [[GLOB_CAST]], ptr addrspace(4) [[NoAS]].ascast + // CHECK: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast + // CHECK: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr addrspace(4) + // CHECK: store ptr addrspace(4) [[GLOB_CAST]], ptr addrspace(4) [[NoAS]].ascast NoAS = (int *)GLOB; - // CHECK-DAG: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast - // CHECK-DAG: [[LOC_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD]] to ptr addrspace(4) - // CHECK-DAG: store ptr addrspace(4) [[LOC_CAST]], ptr addrspace(4) [[NoAS]].ascast + // CHECK: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast + // CHECK: [[LOC_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD]] to ptr addrspace(4) + // CHECK: store ptr addrspace(4) [[LOC_CAST]], ptr addrspace(4) [[NoAS]].ascast NoAS = (int *)LOC; - // CHECK-DAG: [[PRIV_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr addrspace(4) [[PRIV]].ascast - // CHECK-DAG: 
[[PRIV_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr [[PRIV_LOAD]] to ptr addrspace(4) - // CHECK-DAG: store ptr addrspace(4) [[PRIV_CAST]], ptr addrspace(4) [[NoAS]].ascast + // CHECK: [[PRIV_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr addrspace(4) [[PRIV]].ascast + // CHECK: [[PRIV_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr [[PRIV_LOAD]] to ptr addrspace(4) + // CHECK: store ptr addrspace(4) [[PRIV_CAST]], ptr addrspace(4) [[NoAS]].ascast NoAS = (int *)PRIV; // From default address space to named address space - // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast - // CHECK-DAG: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(4) [[NoAS_LOAD]] to ptr addrspace(1) - // CHECK-DAG: store ptr addrspace(1) [[NoAS_CAST]], ptr addrspace(4) [[GLOB]].ascast + // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast + // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(4) [[NoAS_LOAD]] to ptr addrspace(1) + // CHECK: store ptr addrspace(1) [[NoAS_CAST]], ptr addrspace(4) [[GLOB]].ascast GLOB = (__attribute__((opencl_global)) int *)NoAS; - // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast - // CHECK-DAG: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(4) [[NoAS_LOAD]] to ptr addrspace(3) - // CHECK-DAG: store ptr addrspace(3) [[NoAS_CAST]], ptr addrspace(4) [[LOC]].ascast + // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast + // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(4) [[NoAS_LOAD]] to ptr addrspace(3) + // CHECK: store ptr addrspace(3) [[NoAS_CAST]], ptr addrspace(4) [[LOC]].ascast LOC = (__attribute__((opencl_local)) int *)NoAS; - // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast - // CHECK-DAG: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(4) [[NoAS_LOAD]] to ptr - // CHECK-DAG: store 
ptr [[NoAS_CAST]], ptr addrspace(4) [[PRIV]].ascast + // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast + // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(4) [[NoAS_LOAD]] to ptr + // CHECK: store ptr [[NoAS_CAST]], ptr addrspace(4) [[PRIV]].ascast PRIV = (__attribute__((opencl_private)) int *)NoAS; // From opencl_global_[host/device] address spaces to opencl_global - // CHECK-DAG: [[GLOBDEVICE_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(5), ptr addrspace(4) [[GLOB_DEVICE]].ascast - // CHECK-DAG: [[GLOBDEVICE_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(5) [[GLOBDEVICE_LOAD]] to ptr addrspace(1) - // CHECK-DAG: store ptr addrspace(1) [[GLOBDEVICE_CAST]], ptr addrspace(4) [[GLOB]].ascast + // CHECK: [[GLOBDEVICE_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(5), ptr addrspace(4) [[GLOB_DEVICE]].ascast + // CHECK: [[GLOBDEVICE_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(5) [[GLOBDEVICE_LOAD]] to ptr addrspace(1) + // CHECK: store ptr addrspace(1) [[GLOBDEVICE_CAST]], ptr addrspace(4) [[GLOB]].ascast GLOB = (__attribute__((opencl_global)) int *)GLOBDEVICE; - // CHECK-DAG: [[GLOBHOST_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(6), ptr addrspace(4) [[GLOB_HOST]].ascast - // CHECK-DAG: [[GLOBHOST_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(6) [[GLOBHOST_LOAD]] to ptr addrspace(1) - // CHECK-DAG: store ptr addrspace(1) [[GLOBHOST_CAST]], ptr addrspace(4) [[GLOB]].ascast + // CHECK: [[GLOBHOST_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(6), ptr addrspace(4) [[GLOB_HOST]].ascast + // CHECK: [[GLOBHOST_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(6) [[GLOBHOST_LOAD]] to ptr addrspace(1) + // CHECK: store ptr addrspace(1) [[GLOBHOST_CAST]], ptr addrspace(4) [[GLOB]].ascast GLOB = (__attribute__((opencl_global)) int *)GLOBHOST; bar(*GLOB); - // CHECK-DAG: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast - // CHECK-DAG: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast 
ptr addrspace(1) [[GLOB_LOAD]] to ptr addrspace(4) - // CHECK-DAG: call spir_func void @[[RAW_REF]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[GLOB_CAST]]) + // CHECK: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast + // CHECK: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr addrspace(4) + // CHECK: call spir_func void @[[RAW_REF]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[GLOB_CAST]]) bar2(*GLOB); - // CHECK-DAG: [[GLOB_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast - // CHECK-DAG: [[GLOB_CAST2:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD2]] to ptr addrspace(4) - // CHECK-DAG: call spir_func void @[[RAW_REF2]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[GLOB_CAST2]]) + // CHECK: [[GLOB_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast + // CHECK: [[GLOB_CAST2:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD2]] to ptr addrspace(4) + // CHECK: call spir_func void @[[RAW_REF2]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[GLOB_CAST2]]) bar(*LOC); - // CHECK-DAG: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast - // CHECK-DAG: call spir_func void [[LOC_REF]](ptr addrspace(3) noundef align 4 dereferenceable(4) [[LOC_LOAD]]) + // CHECK: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast + // CHECK: call spir_func void [[LOC_REF]](ptr addrspace(3) noundef align 4 dereferenceable(4) [[LOC_LOAD]]) bar2(*LOC); - // CHECK-DAG: [[LOC_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast - // CHECK-DAG: [[LOC_CAST2:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD2]] to ptr addrspace(4) - // CHECK-DAG: call spir_func void @[[RAW_REF2]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[LOC_CAST2]]) + // CHECK: [[LOC_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr 
addrspace(4) [[LOC]].ascast + // CHECK: [[LOC_CAST2:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD2]] to ptr addrspace(4) + // CHECK: call spir_func void @[[RAW_REF2]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[LOC_CAST2]]) bar(*NoAS); - // CHECK-DAG: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast - // CHECK-DAG: call spir_func void @[[RAW_REF]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[NoAS_LOAD]]) + // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast + // CHECK: call spir_func void @[[RAW_REF]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[NoAS_LOAD]]) bar2(*NoAS); - // CHECK-DAG: [[NoAS_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast - // CHECK-DAG: call spir_func void @[[RAW_REF2]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[NoAS_LOAD2]]) + // CHECK: [[NoAS_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast + // CHECK: call spir_func void @[[RAW_REF2]](ptr addrspace(4) noundef align 4 dereferenceable(4) [[NoAS_LOAD2]]) foo(GLOB); - // CHECK-DAG: [[GLOB_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast - // CHECK-DAG: [[GLOB_CAST3:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD3]] to ptr addrspace(4) - // CHECK-DAG: call spir_func void @[[RAW_PTR]](ptr addrspace(4) noundef [[GLOB_CAST3]]) + // CHECK: [[GLOB_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast + // CHECK: [[GLOB_CAST3:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD3]] to ptr addrspace(4) + // CHECK: call spir_func void @[[RAW_PTR]](ptr addrspace(4) noundef [[GLOB_CAST3]]) foo2(GLOB); - // CHECK-DAG: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast - // CHECK-DAG: [[GLOB_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD4]] to ptr addrspace(4) - // CHECK-DAG: call 
spir_func void @[[RAW_PTR2]](ptr addrspace(4) noundef [[GLOB_CAST4]]) + // CHECK: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast + // CHECK: [[GLOB_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD4]] to ptr addrspace(4) + // CHECK: call spir_func void @[[RAW_PTR2]](ptr addrspace(4) noundef [[GLOB_CAST4]]) foo(LOC); - // CHECK-DAG: [[LOC_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast - // CHECK-DAG: call spir_func void [[LOC_PTR]](ptr addrspace(3) noundef [[LOC_LOAD3]]) + // CHECK: [[LOC_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast + // CHECK: call spir_func void [[LOC_PTR]](ptr addrspace(3) noundef [[LOC_LOAD3]]) foo2(LOC); - // CHECK-DAG: [[LOC_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast - // CHECK-DAG: [[LOC_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD4]] to ptr addrspace(4) - // CHECK-DAG: call spir_func void @[[RAW_PTR2]](ptr addrspace(4) noundef [[LOC_CAST4]]) + // CHECK: [[LOC_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast + // CHECK: [[LOC_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD4]] to ptr addrspace(4) + // CHECK: call spir_func void @[[RAW_PTR2]](ptr addrspace(4) noundef [[LOC_CAST4]]) foo(NoAS); - // CHECK-DAG: [[NoAS_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast - // CHECK-DAG: call spir_func void @[[RAW_PTR]](ptr addrspace(4) noundef [[NoAS_LOAD3]]) + // CHECK: [[NoAS_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast + // CHECK: call spir_func void @[[RAW_PTR]](ptr addrspace(4) noundef [[NoAS_LOAD3]]) foo2(NoAS); - // CHECK-DAG: [[NoAS_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast - // CHECK-DAG: call spir_func void @[[RAW_PTR2]](ptr addrspace(4) noundef [[NoAS_LOAD4]]) + // CHECK: [[NoAS_LOAD4:%[a-zA-Z0-9]+]] = load ptr 
addrspace(4), ptr addrspace(4) [[NoAS]].ascast + // CHECK: call spir_func void @[[RAW_PTR2]](ptr addrspace(4) noundef [[NoAS_LOAD4]]) // Ensure that we still get 3 different template instantiations. tmpl(GLOB); - // CHECK-DAG: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast - // CHECK-DAG: call spir_func void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef [[GLOB_LOAD4]]) + // CHECK: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr addrspace(4) [[GLOB]].ascast + // CHECK: call spir_func void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef [[GLOB_LOAD4]]) tmpl(LOC); - // CHECK-DAG: [[LOC_LOAD5:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast - // CHECK-DAG: call spir_func void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef [[LOC_LOAD5]]) + // CHECK: [[LOC_LOAD5:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr addrspace(4) [[LOC]].ascast + // CHECK: call spir_func void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef [[LOC_LOAD5]]) tmpl(PRIV); - // CHECK-DAG: [[PRIV_LOAD5:%[a-zA-Z0-9]+]] = load ptr, ptr addrspace(4) [[PRIV]].ascast - // CHECK-DAG: call spir_func void @_Z4tmplIPU3AS0iEvT_(ptr noundef [[PRIV_LOAD5]]) + // CHECK: [[PRIV_LOAD5:%[a-zA-Z0-9]+]] = load ptr, ptr addrspace(4) [[PRIV]].ascast + // CHECK: call spir_func void @_Z4tmplIPU3AS0iEvT_(ptr noundef [[PRIV_LOAD5]]) tmpl(NoAS); - // CHECK-DAG: [[NoAS_LOAD5:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast - // CHECK-DAG: call spir_func void @_Z4tmplIPiEvT_(ptr addrspace(4) noundef [[NoAS_LOAD5]]) + // CHECK: [[NoAS_LOAD5:%[a-zA-Z0-9]+]] = load ptr addrspace(4), ptr addrspace(4) [[NoAS]].ascast + // CHECK: call spir_func void @_Z4tmplIPiEvT_(ptr addrspace(4) noundef [[NoAS_LOAD5]]) } -// CHECK-DAG: define linkonce_odr spir_func void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef % -// CHECK-DAG: define linkonce_odr spir_func void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef % -// CHECK-DAG: define linkonce_odr spir_func 
void @_Z4tmplIPU3AS0iEvT_(ptr noundef % -// CHECK-DAG: define linkonce_odr spir_func void @_Z4tmplIPiEvT_(ptr addrspace(4) noundef % +// CHECK: define linkonce_odr spir_func void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef % +// CHECK: define linkonce_odr spir_func void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef % +// CHECK: define linkonce_odr spir_func void @_Z4tmplIPU3AS0iEvT_(ptr noundef % +// CHECK: define linkonce_odr spir_func void @_Z4tmplIPiEvT_(ptr addrspace(4) noundef % diff --git a/clang/test/CodeGenSYCL/amd-address-space-conversions.cpp b/clang/test/CodeGenSYCL/amd-address-space-conversions.cpp new file mode 100644 index 0000000000000..d316f22096d3d --- /dev/null +++ b/clang/test/CodeGenSYCL/amd-address-space-conversions.cpp @@ -0,0 +1,128 @@ +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fsycl-is-device -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s +void bar(int &Data) {} +// CHECK: define dso_local void @[[RAW_REF:[a-zA-Z0-9_]+]](ptr noundef nonnull align 4 dereferenceable(4) % +void bar2(int &Data) {} +// CHECK: define dso_local void @[[RAW_REF2:[a-zA-Z0-9_]+]](ptr noundef nonnull align 4 dereferenceable(4) % +void bar(__attribute__((opencl_local)) int &Data) {} +// CHECK: define dso_local void @[[LOCAL_REF:[a-zA-Z0-9_]+]](ptr addrspace(3) noundef align 4 dereferenceable(4) % +void foo(int *Data) {} +// CHECK: define dso_local void @[[RAW_PTR:[a-zA-Z0-9_]+]](ptr noundef % +void foo2(int *Data) {} +// CHECK: define dso_local void @[[RAW_PTR2:[a-zA-Z0-9_]+]](ptr noundef % +void foo(__attribute__((opencl_local)) int *Data) {} +// CHECK: define dso_local void @[[LOC_PTR:[a-zA-Z0-9_]+]](ptr addrspace(3) noundef % + +template +void tmpl(T t); +// See Check Lines below. 
+ +void usages() { + int *NoAS; + // CHECK: [[NoAS:%[a-zA-Z0-9]+]] = alloca ptr, align 8, addrspace(5) + __attribute__((opencl_global)) int *GLOB; + // CHECK: [[GLOB:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8, addrspace(5) + __attribute__((opencl_local)) int *LOC; + // CHECK: [[LOC:%[a-zA-Z0-9]+]] = alloca ptr addrspace(3), align 4, addrspace(5) + __attribute__((opencl_private)) int *PRIV; + // CHECK: [[PRIV:%[a-zA-Z0-9]+]] = alloca ptr addrspace(5), align 4, addrspace(5) + __attribute__((opencl_global_device)) int *GLOBDEVICE; + // CHECK: [[GLOB_DEVICE:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8, addrspace(5) + __attribute__((opencl_global_host)) int *GLOBHOST; + // CHECK: [[GLOB_HOST:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8, addrspace(5) + LOC = nullptr; + // CHECK: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr [[LOC]].ascast, align 4 + GLOB = nullptr; + // CHECK: store ptr addrspace(1) null, ptr [[GLOB]].ascast, align 8 + NoAS = (int *)GLOB; + // CHECK: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 + // CHECK: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr + // CHECK: store ptr [[GLOB_CAST]], ptr [[NoAS]].ascast, align 8 + NoAS = (int *)LOC; + // CHECK: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 + // CHECK: [[LOC_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD]] to ptr + // CHECK: store ptr [[LOC_CAST]], ptr [[NoAS]].ascast, align 8 + NoAS = (int *)PRIV; + // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(5), ptr [[PRIV]].ascast, align 4 + // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(5) [[NoAS_LOAD]] to ptr + // CHECK: store ptr %5, ptr [[NoAS]].ascast, align 8 + GLOB = (__attribute__((opencl_global)) int *)NoAS; + // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 + // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr %6 to 
ptr addrspace(1) + // CHECK: store ptr addrspace(1) %7, ptr [[GLOB]].ascast, align 8 + LOC = (__attribute__((opencl_local)) int *)NoAS; + // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 + // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr [[NoAS_LOAD]] to ptr addrspace(3) + // CHECK: store ptr addrspace(3) %9, ptr [[LOC]].ascast, align 4 + PRIV = (__attribute__((opencl_private)) int *)NoAS; + // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 + // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr [[NoAS_LOAD]] to ptr addrspace(5) + // CHECK: store ptr addrspace(5) [[NoAS_CAST]], ptr [[PRIV]].ascast, align 4 + GLOB = (__attribute__((opencl_global)) int *)GLOBDEVICE; + // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]]DEVICE.ascast, align 8 + // CHECK: store ptr addrspace(1) [[NoAS_LOAD]], ptr [[GLOB]].ascast, align 8 + GLOB = (__attribute__((opencl_global)) int *)GLOBHOST; + // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]]HOST.ascast, align 8 + // CHECK: tore ptr addrspace(1) [[NoAS_LOAD]], ptr [[GLOB]].ascast, align 8 + bar(*GLOB); + // CHECK: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 + // CHECK: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr + // CHECK: call void @[[RAW_REF]](ptr noundef nonnull align 4 dereferenceable(4) [[GLOB_CAST]]) + bar2(*GLOB); + // CHECK: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 + // CHECK: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr + // CHECK: call void @[[RAW_REF2]](ptr noundef nonnull align 4 dereferenceable(4) [[GLOB_CAST]]) + bar(*LOC); + // CHECK: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 + // CHECK: call void @_Z3barRU3AS3i(ptr addrspace(3) noundef align 4 dereferenceable(4) [[LOC_LOAD]]) + bar2(*LOC); + // CHECK: 
[[LOC_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 + // CHECK: [[LOC_CAST2:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD2]] to ptr + // CHECK: call void @_Z4bar2Ri(ptr noundef nonnull align 4 dereferenceable(4) [[LOC_CAST2]]) + bar(*NoAS); + // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 + // CHECK: call void @_Z3barRi(ptr noundef nonnull align 4 dereferenceable(4) [[NoAS_LOAD]]) + bar2(*NoAS); + // CHECK: [[NoAS_LOAD2:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 + // CHECK: call void @_Z4bar2Ri(ptr noundef nonnull align 4 dereferenceable(4) [[NoAS_LOAD2]]) + foo(GLOB); + // CHECK: [[GLOB_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 + // CHECK: [[GLOB_CAST3:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD3]] to ptr + // CHECK: call void @[[RAW_PTR]](ptr noundef [[GLOB_CAST3]]) + foo2(GLOB); + // CHECK: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 + // CHECK: [[GLOB_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD4]] to ptr + // CHECK: call void @[[RAW_PTR2]](ptr noundef [[GLOB_CAST4]]) + foo(LOC); + // CHECK: [[LOC_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 + // CHECK: call void @[[LOC_PTR]](ptr addrspace(3) noundef [[LOC_LOAD3]]) + foo2(LOC); + // CHECK: [[LOC_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 + // CHECK: [[LOC_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD4]] to ptr + // CHECK: call void @[[RAW_PTR2]](ptr noundef [[LOC_CAST4]]) + foo(NoAS); + // CHECK: [[NoAS_LOAD3:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 + // CHECK: call void @[[RAW_PTR]](ptr noundef [[NoAS_LOAD3]]) + foo2(NoAS); + // CHECK: [[NoAS_LOAD4:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 + // CHECK: call void @[[RAW_PTR2]](ptr noundef [[NoAS_LOAD4]]) + + // Ensure that we still get 3 different 
template instantiations. + tmpl(GLOB); + // CHECK: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]].ascast, align 8 + // CHECK: call void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef [[GLOB_LOAD4]]) + tmpl(LOC); + // CHECK: [[LOC_LOAD5:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]].ascast, align 4 + // CHECK: call void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef [[LOC_LOAD5]]) + tmpl(PRIV); + // CHECK: [[PRIV_LOAD5:%[a-zA-Z0-9]+]] = load ptr addrspace(5), ptr [[PRIV]].ascast, align 4 + // CHECK: call void @_Z4tmplIPU3AS5iEvT_(ptr addrspace(5) noundef [[PRIV_LOAD5]]) + tmpl(NoAS); + // CHECK: [[NoAS_LOAD5:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]].ascast, align 8 + // CHECK: call void @_Z4tmplIPiEvT_(ptr noundef [[NoAS_LOAD5]]) +} + +// CHECK: declare void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef) +// CHECK: declare void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef) +// CHECK: declare void @_Z4tmplIPU3AS5iEvT_(ptr addrspace(5) noundef) +// CHECK: declare void @_Z4tmplIPiEvT_(ptr noundef) + diff --git a/clang/test/CodeGenSYCL/cuda-address-space-conversions.cpp b/clang/test/CodeGenSYCL/cuda-address-space-conversions.cpp new file mode 100644 index 0000000000000..1875029de0856 --- /dev/null +++ b/clang/test/CodeGenSYCL/cuda-address-space-conversions.cpp @@ -0,0 +1,122 @@ +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsycl-is-device -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s +void bar(int &Data) {} +// CHECK: define dso_local void @[[RAW_REF:[a-zA-Z0-9_]+]](ptr noundef nonnull align 4 dereferenceable(4) % +void bar2(int &Data) {} +// CHECK: define dso_local void @[[RAW_REF2:[a-zA-Z0-9_]+]](ptr noundef nonnull align 4 dereferenceable(4) % +void bar(__attribute__((opencl_local)) int &Data) {} +// CHECK: define dso_local void @[[LOCAL_REF:[a-zA-Z0-9_]+]](ptr addrspace(3) noundef align 4 dereferenceable(4) % +void foo(int *Data) {} +// CHECK: define dso_local void @[[RAW_PTR:[a-zA-Z0-9_]+]](ptr noundef % +void foo2(int *Data) {} 
+// CHECK: define dso_local void @[[RAW_PTR2:[a-zA-Z0-9_]+]](ptr noundef % +void foo(__attribute__((opencl_local)) int *Data) {} +// CHECK: define dso_local void @[[LOC_PTR:[a-zA-Z0-9_]+]](ptr addrspace(3) noundef % + +template +void tmpl(T t); +// See Check Lines below. + +void usages() { + int *NoAS; + // CHECK: [[NoAS:%[a-zA-Z0-9]+]] = alloca ptr, align 8 + __attribute__((opencl_global)) int *GLOB; + // CHECK: [[GLOB:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8 + __attribute__((opencl_local)) int *LOC; + // CHECK: [[LOC:%[a-zA-Z0-9]+]] = alloca ptr addrspace(3), align 8 + __attribute__((opencl_private)) int *PRIV; + // CHECK: [[PRIV:%[a-zA-Z0-9]+]] = alloca ptr, align 8 + __attribute__((opencl_global_device)) int *GLOBDEVICE; + // CHECK: [[GLOB_DEVICE:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8 + __attribute__((opencl_global_host)) int *GLOBHOST; + // CHECK: [[GLOB_HOST:%[a-zA-Z0-9]+]] = alloca ptr addrspace(1), align 8 + LOC = nullptr; + // CHECK: store ptr addrspace(3) addrspacecast (ptr null to ptr addrspace(3)), ptr [[LOC]], align 8 + GLOB = nullptr; + // CHECK: store ptr addrspace(1) null, ptr [[GLOB]], align 8 + NoAS = (int *)GLOB; + // CHECK: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 + // CHECK: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr + // CHECK: store ptr [[GLOB_CAST]], ptr [[NoAS]], align 8 + NoAS = (int *)LOC; + // CHECK: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 + // CHECK: [[LOC_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD]] to ptr + // CHECK: store ptr [[LOC_CAST]], ptr [[NoAS]], align 8 + NoAS = (int *)PRIV; + // CHECK: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[PRIV]], align 8 + // CHECK: store ptr [[LOC_LOAD]], ptr [[NoAS]], align 8 + GLOB = (__attribute__((opencl_global)) int *)NoAS; + // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 + // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = 
addrspacecast ptr [[NoAS_LOAD]] to ptr addrspace(1) + // CHECK: store ptr addrspace(1) [[NoAS_CAST]], ptr [[GLOB]], align 8 + LOC = (__attribute__((opencl_local)) int *)NoAS; + // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 + // CHECK: [[NoAS_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr [[NoAS_LOAD]] to ptr addrspace(3) + // CHECK: store ptr addrspace(3) [[NoAS_CAST]], ptr [[LOC]], align 8 + PRIV = (__attribute__((opencl_private)) int *)NoAS; + // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 + // CHECK: store ptr [[NoAS_LOAD]], ptr [[PRIV]], align 8 + GLOB = (__attribute__((opencl_global)) int *)GLOBDEVICE; + // CHECK: [[GLOBDEVICE_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB_DEVICE]], align 8 + // CHECK: store ptr addrspace(1) [[GLOBDEVICE_LOAD]], ptr %GLOB, align 8 + GLOB = (__attribute__((opencl_global)) int *)GLOBHOST; + // CHECK: [[GLOB_HOST_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB_HOST]], align 8 + // CHECK: store ptr addrspace(1) [[GLOB_HOST_LOAD]], ptr [[GLOB]], align 8 + bar(*GLOB); + // CHECK: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 + // CHECK: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr + // CHECK: call void @[[RAW_REF]](ptr noundef nonnull align 4 dereferenceable(4) [[GLOB_CAST]]) + bar2(*GLOB); + // CHECK: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 + // CHECK: [[GLOB_CAST:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD]] to ptr + // CHECK: call void @[[RAW_REF2]](ptr noundef nonnull align 4 dereferenceable(4) [[GLOB_CAST]]) + bar(*LOC); + // CHECK: [[LOC_LOAD:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 + // CHECK: call void @[[LOCAL_REF]](ptr addrspace(3) noundef align 4 dereferenceable(4) [[LOC_LOAD]]) + bar2(*LOC); + // CHECK: [[LOC_LOAD2:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 + // CHECK: [[LOC_CAST2:%[a-zA-Z0-9]+]] = 
addrspacecast ptr addrspace(3) [[LOC_LOAD2]] to ptr + // CHECK: call void @[[RAW_REF2]](ptr noundef nonnull align 4 dereferenceable(4) [[LOC_CAST2]]) + bar(*NoAS); + // CHECK: [[NoAS_LOAD:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 + // CHECK: call void @[[RAW_REF]](ptr noundef nonnull align 4 dereferenceable(4) [[NoAS_LOAD]]) + bar2(*NoAS); + // CHECK: [[NoAS_LOAD2:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 + // CHECK: call void @[[RAW_REF2]](ptr noundef nonnull align 4 dereferenceable(4) [[NoAS_LOAD2]]) + foo(GLOB); + // CHECK: [[GLOB_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 + // CHECK: [[GLOB_CAST3:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD3]] to ptr + // CHECK: call void @[[RAW_PTR]](ptr noundef [[GLOB_CAST3]]) + foo2(GLOB); + // CHECK: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 + // CHECK: [[GLOB_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(1) [[GLOB_LOAD4]] to ptr + // CHECK: call void @[[RAW_PTR2]](ptr noundef [[GLOB_CAST4]]) + foo(LOC); + // CHECK: [[LOC_LOAD3:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 + // CHECK: call void @[[LOC_PTR]](ptr addrspace(3) noundef [[LOC_LOAD3]]) + foo2(LOC); + // CHECK: [[LOC_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 + // CHECK: [[LOC_CAST4:%[a-zA-Z0-9]+]] = addrspacecast ptr addrspace(3) [[LOC_LOAD4]] to ptr + // CHECK: call void @[[RAW_PTR2]](ptr noundef [[LOC_CAST4]]) + foo(NoAS); + // CHECK: [[NoAS_LOAD3:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 + // CHECK: call void @[[RAW_PTR]](ptr noundef [[NoAS_LOAD3]]) + foo2(NoAS); + // CHECK: [[NoAS_LOAD4:%[a-zA-Z0-9]+]] = load ptr, ptr [[NoAS]], align 8 + // CHECK: call void @[[RAW_PTR2]](ptr noundef [[NoAS_LOAD4]]) + tmpl(GLOB); + // CHECK: [[GLOB_LOAD4:%[a-zA-Z0-9]+]] = load ptr addrspace(1), ptr [[GLOB]], align 8 + // CHECK: call void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef [[GLOB_LOAD4]]) + tmpl(LOC); + // CHECK: 
[[LOC_LOAD5:%[a-zA-Z0-9]+]] = load ptr addrspace(3), ptr [[LOC]], align 8 + // CHECK: call void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef [[LOC_LOAD5]]) + tmpl(PRIV); + // CHECK: [[PRIV_LOAD5:%[a-zA-Z0-9]+]] = load ptr, ptr [[PRIV]], align 8 + // CHECK: call void @_Z4tmplIPiEvT_(ptr noundef [[PRIV_LOAD5]]) + tmpl(NoAS); +// CHECK: %33 = load ptr, ptr %NoAS, align 8 +// CHECK: call void @_Z4tmplIPiEvT_(ptr noundef %33) +} + +// CHECK: declare void @_Z4tmplIPU3AS1iEvT_(ptr addrspace(1) noundef) +// CHECK: declare void @_Z4tmplIPU3AS3iEvT_(ptr addrspace(3) noundef) +// CHECK: declare void @_Z4tmplIPiEvT_(ptr noundef) From ebb64d8370f9f425f10c4b084aa62adc2926e2dd Mon Sep 17 00:00:00 2001 From: Owen Anderson Date: Mon, 26 Feb 2024 10:50:37 -0500 Subject: [PATCH 332/546] [GlobalISel] Make the Combiner insert G_FREEZE when converting G_SELECT to binary operations. (#82733) This is needed because the binary operators (G_OR and G_AND) do not have the poison-suppressing semantics of G_SELECT. 
Fixes https://github.com/llvm/llvm-project/issues/72475 --- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 12 +++++++---- .../AArch64/GlobalISel/combine-select.mir | 21 ++++++++++++------- llvm/test/CodeGen/AArch64/cmp-chains.ll | 15 ++++++++----- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index e8a5c6fedc395..2f18a64ca285b 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6511,7 +6511,8 @@ bool CombinerHelper::tryFoldBoolSelectToLogic(GSelect *Select, B.setInstrAndDebugLoc(*Select); Register Ext = MRI.createGenericVirtualRegister(TrueTy); B.buildZExtOrTrunc(Ext, Cond); - B.buildOr(DstReg, Ext, False, Flags); + auto FreezeFalse = B.buildFreeze(TrueTy, False); + B.buildOr(DstReg, Ext, FreezeFalse, Flags); }; return true; } @@ -6523,7 +6524,8 @@ bool CombinerHelper::tryFoldBoolSelectToLogic(GSelect *Select, B.setInstrAndDebugLoc(*Select); Register Ext = MRI.createGenericVirtualRegister(TrueTy); B.buildZExtOrTrunc(Ext, Cond); - B.buildAnd(DstReg, Ext, True); + auto FreezeTrue = B.buildFreeze(TrueTy, True); + B.buildAnd(DstReg, Ext, FreezeTrue); }; return true; } @@ -6538,7 +6540,8 @@ bool CombinerHelper::tryFoldBoolSelectToLogic(GSelect *Select, // Then an ext to match the destination register. Register Ext = MRI.createGenericVirtualRegister(TrueTy); B.buildZExtOrTrunc(Ext, Inner); - B.buildOr(DstReg, Ext, True, Flags); + auto FreezeTrue = B.buildFreeze(TrueTy, True); + B.buildOr(DstReg, Ext, FreezeTrue, Flags); }; return true; } @@ -6553,7 +6556,8 @@ bool CombinerHelper::tryFoldBoolSelectToLogic(GSelect *Select, // Then an ext to match the destination register. 
Register Ext = MRI.createGenericVirtualRegister(TrueTy); B.buildZExtOrTrunc(Ext, Inner); - B.buildAnd(DstReg, Ext, False); + auto FreezeFalse = B.buildFreeze(TrueTy, False); + B.buildAnd(DstReg, Ext, FreezeFalse); }; return true; } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir index da35d60981ad3..7b73c8cec4774 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir @@ -118,7 +118,8 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2 ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[COPY1]](s64) - ; CHECK-NEXT: %sel:_(s1) = G_OR %c, %f + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %f + ; CHECK-NEXT: %sel:_(s1) = G_OR %c, [[FREEZE]] ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) ; CHECK-NEXT: $w0 = COPY %ext(s32) %0:_(s64) = COPY $x0 @@ -144,7 +145,8 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2 ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[COPY1]](s64) - ; CHECK-NEXT: %sel:_(s1) = G_OR %c, %f + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %f + ; CHECK-NEXT: %sel:_(s1) = G_OR %c, [[FREEZE]] ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) ; CHECK-NEXT: $w0 = COPY %ext(s32) %0:_(s64) = COPY $x0 @@ -171,7 +173,8 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d2 ; CHECK-NEXT: %c:_(<2 x s1>) = G_TRUNC [[COPY]](<2 x s32>) ; CHECK-NEXT: %f:_(<2 x s1>) = G_TRUNC [[COPY1]](<2 x s32>) - ; CHECK-NEXT: %sel:_(<2 x s1>) = G_OR %c, %f + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<2 x s1>) = G_FREEZE %f + ; CHECK-NEXT: %sel:_(<2 x s1>) = G_OR %c, [[FREEZE]] ; CHECK-NEXT: %ext:_(<2 x s32>) = G_ANYEXT %sel(<2 x s1>) ; CHECK-NEXT: $d0 = COPY %ext(<2 x s32>) %0:_(<2 x s32>) = COPY $d0 @@ -199,7 +202,8 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) ; CHECK-NEXT: 
%t:_(s1) = G_TRUNC [[COPY1]](s64) - ; CHECK-NEXT: %sel:_(s1) = G_AND %c, %t + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %t + ; CHECK-NEXT: %sel:_(s1) = G_AND %c, [[FREEZE]] ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) ; CHECK-NEXT: $w0 = COPY %ext(s32) %0:_(s64) = COPY $x0 @@ -226,7 +230,8 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 ; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64) ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64) - ; CHECK-NEXT: %sel:_(s1) = G_AND %c, %t + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %t + ; CHECK-NEXT: %sel:_(s1) = G_AND %c, [[FREEZE]] ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) ; CHECK-NEXT: $w0 = COPY %ext(s32) %0:_(s64) = COPY $x0 @@ -255,7 +260,8 @@ body: | ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64) ; CHECK-NEXT: %one:_(s1) = G_CONSTANT i1 true ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, %one - ; CHECK-NEXT: %sel:_(s1) = G_OR [[XOR]], %t + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %t + ; CHECK-NEXT: %sel:_(s1) = G_OR [[XOR]], [[FREEZE]] ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) ; CHECK-NEXT: $w0 = COPY %ext(s32) %0:_(s64) = COPY $x0 @@ -284,7 +290,8 @@ body: | ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[COPY1]](s64) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, [[C]] - ; CHECK-NEXT: %sel:_(s1) = G_AND [[XOR]], %f + ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %f + ; CHECK-NEXT: %sel:_(s1) = G_AND [[XOR]], [[FREEZE]] ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1) ; CHECK-NEXT: $w0 = COPY %ext(s32) %0:_(s64) = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/cmp-chains.ll b/llvm/test/CodeGen/AArch64/cmp-chains.ll index c4ad84d9fa25b..1d9f39e518593 100644 --- a/llvm/test/CodeGen/AArch64/cmp-chains.ll +++ b/llvm/test/CodeGen/AArch64/cmp-chains.ll @@ -109,7 +109,8 @@ define i32 @cmp_or2(i32 %0, i32 %1, i32 %2, i32 %3) { ; GISEL-NEXT: cset w8, lo ; GISEL-NEXT: cmp w2, w3 ; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: orr w0, w8, w9 +; 
GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret %5 = icmp ult i32 %0, %1 %6 = icmp ne i32 %2, %3 @@ -137,7 +138,8 @@ define i32 @cmp_or3(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) { ; GISEL-NEXT: cmp w4, w5 ; GISEL-NEXT: orr w8, w8, w9 ; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: orr w0, w8, w9 +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret %7 = icmp ult i32 %0, %1 %8 = icmp ugt i32 %2, %3 @@ -171,7 +173,8 @@ define i32 @cmp_or4(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 ; GISEL-NEXT: orr w8, w8, w9 ; GISEL-NEXT: cset w11, eq ; GISEL-NEXT: orr w9, w10, w11 -; GISEL-NEXT: orr w0, w8, w9 +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret %9 = icmp ult i32 %0, %1 %10 = icmp ugt i32 %2, %3 @@ -199,7 +202,8 @@ define i32 @true_or2(i32 %0, i32 %1) { ; GISEL-NEXT: cset w8, ne ; GISEL-NEXT: cmp w1, #0 ; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: orr w0, w8, w9 +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret %3 = icmp ne i32 %0, 0 %4 = icmp ne i32 %1, 0 @@ -227,7 +231,8 @@ define i32 @true_or3(i32 %0, i32 %1, i32 %2) { ; GISEL-NEXT: cmp w2, #0 ; GISEL-NEXT: orr w8, w8, w9 ; GISEL-NEXT: cset w9, ne -; GISEL-NEXT: orr w0, w8, w9 +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret %4 = icmp ne i32 %0, 0 %5 = icmp ne i32 %1, 0 From c27d7085d4d5c640aba4992f5d01c0ffd1da9860 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 26 Feb 2024 16:50:28 +0100 Subject: [PATCH 333/546] [clang][test] Undo an accidental test change This was introduced in 264d828ea6399c31c210b67a050fbf084634da6a. 
--- clang/test/CodeGen/ms-intrinsics-other.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/clang/test/CodeGen/ms-intrinsics-other.c b/clang/test/CodeGen/ms-intrinsics-other.c index 189000d3213ac..36c40dddcbb4f 100644 --- a/clang/test/CodeGen/ms-intrinsics-other.c +++ b/clang/test/CodeGen/ms-intrinsics-other.c @@ -1,38 +1,38 @@ // RUN: %clang_cc1 -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -fexperimental-new-constant-interpreter -triple x86_64--darwin -Oz -emit-llvm %s -o - \ +// RUN: -triple x86_64--darwin -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s // RUN: %clang_cc1 -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -fexperimental-new-constant-interpreter -triple x86_64--linux -Oz -emit-llvm %s -o - \ +// RUN: -triple x86_64--linux -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s // RUN: %clang_cc1 -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -fexperimental-new-constant-interpreter -triple aarch64--darwin -Oz -emit-llvm %s -o - \ +// RUN: -triple aarch64--darwin -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefix=CHECK-ARM-ARM64 // RUN: %clang_cc1 -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -fexperimental-new-constant-interpreter -triple aarch64--darwin -Oz -emit-llvm %s -o - \ +// RUN: -triple aarch64--darwin -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefix=CHECK-ARM // RUN: %clang_cc1 -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -fexperimental-new-constant-interpreter -triple armv7--darwin -Oz -emit-llvm %s -o - \ +// RUN: -triple armv7--darwin -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefix=CHECK-ARM // RUN: %clang_cc1 -x c++ -std=c++11 \ // RUN: -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -fexperimental-new-constant-interpreter -triple x86_64--darwin -Oz -emit-llvm %s -o - \ +// RUN: 
-triple x86_64--darwin -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s // RUN: %clang_cc1 -x c++ -std=c++11 \ // RUN: -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -fexperimental-new-constant-interpreter -triple x86_64--linux -Oz -emit-llvm %s -o - \ +// RUN: -triple x86_64--linux -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s // RUN: %clang_cc1 -x c++ -std=c++11 \ // RUN: -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -fexperimental-new-constant-interpreter -triple aarch64--darwin -Oz -emit-llvm %s -o - \ +// RUN: -triple aarch64--darwin -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefix=CHECK-ARM-ARM64 // RUN: %clang_cc1 -x c++ -std=c++11 \ // RUN: -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -fexperimental-new-constant-interpreter -triple aarch64--darwin -Oz -emit-llvm %s -o - \ +// RUN: -triple aarch64--darwin -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefix=CHECK-ARM // RUN: %clang_cc1 -x c++ -std=c++11 \ // RUN: -ffreestanding -fms-extensions -Wno-implicit-function-declaration \ -// RUN: -fexperimental-new-constant-interpreter -triple armv7--darwin -Oz -emit-llvm %s -o - \ +// RUN: -triple armv7--darwin -Oz -emit-llvm %s -o - \ // RUN: | FileCheck %s --check-prefix=CHECK-ARM // LP64 targets use 'long' as 'int' for MS intrinsics (-fms-extensions) From 2730a5c68c6986bc8f01d047f8f31bcfd9316333 Mon Sep 17 00:00:00 2001 From: Samira Bazuzi Date: Mon, 26 Feb 2024 10:53:33 -0500 Subject: [PATCH 334/546] [clang][dataflow] Skip array types when handling InitListExprs. (#83013) Crashes resulted from single-element InitListExprs for arrays with elements of a record type after #80970. 
--- clang/lib/Analysis/FlowSensitive/Transfer.cpp | 6 +++--- .../Analysis/FlowSensitive/TransferTest.cpp | 17 ++++++++++++++++- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp index fe13e919bddcd..089854264f483 100644 --- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp +++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp @@ -671,9 +671,9 @@ class TransferVisitor : public ConstStmtVisitor { } if (!Type->isStructureOrClassType()) { - // Until array initialization is implemented, we don't need to care about - // cases where `getNumInits() > 1`. - if (S->getNumInits() == 1) + // Until array initialization is implemented, we skip arrays and don't + // need to care about cases where `getNumInits() > 1`. + if (!Type->isArrayType() && S->getNumInits() == 1) propagateValueOrStorageLocation(*S->getInit(0), *S, Env); return; } diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index a65b0446ac781..2be899f5b6da9 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -2367,6 +2367,21 @@ TEST(TransferTest, InitListExprAsXValue) { }); } +TEST(TransferTest, ArrayInitListExprOneRecordElement) { + // This is a crash repro. + std::string Code = R"cc( + struct S {}; + + void target() { S foo[] = {S()}; } + )cc"; + runDataflow( + Code, + [](const llvm::StringMap> &Results, + ASTContext &ASTCtx) { + // Just verify that it doesn't crash. + }); +} + TEST(TransferTest, InitListExprAsUnion) { // This is a crash repro. 
std::string Code = R"cc( @@ -3414,7 +3429,7 @@ TEST(TransferTest, AggregateInitializationFunctionPointer) { struct S { void (*const Field)(); }; - + void target() { S s{nullptr}; } From b2ebd8b89777a1c5ba6acc4ad9f195ea2ad5f0de Mon Sep 17 00:00:00 2001 From: Krasimir Georgiev Date: Mon, 26 Feb 2024 16:58:39 +0100 Subject: [PATCH 335/546] clang serialization unittests: fix some leaks (#82773) No functional changes intended. Fixes some leaks found by running under asan with `--gtest_repeat=2`. --- .../Serialization/ModuleCacheTest.cpp | 19 +++++++++++++++---- .../Serialization/VarDeclConstantInitTest.cpp | 1 + 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/clang/unittests/Serialization/ModuleCacheTest.cpp b/clang/unittests/Serialization/ModuleCacheTest.cpp index c3e347ffec660..a7ca98549b412 100644 --- a/clang/unittests/Serialization/ModuleCacheTest.cpp +++ b/clang/unittests/Serialization/ModuleCacheTest.cpp @@ -88,6 +88,17 @@ class ModuleCacheTest : public ::testing::Test { } )cpp"); } + + std::unique_ptr + createInvocationAndEnableFree(ArrayRef Args, + CreateInvocationOptions Opts) { + std::unique_ptr Invocation = + createInvocation(Args, Opts); + if (Invocation) + Invocation->getFrontendOpts().DisableFree = false; + + return Invocation; + } }; TEST_F(ModuleCacheTest, CachedModuleNewPath) { @@ -106,7 +117,7 @@ TEST_F(ModuleCacheTest, CachedModuleNewPath) { MCPArg.c_str(), "-working-directory", TestDir.c_str(), "test.m"}; std::shared_ptr Invocation = - createInvocation(Args, CIOpts); + createInvocationAndEnableFree(Args, CIOpts); ASSERT_TRUE(Invocation); CompilerInstance Instance; Instance.setDiagnostics(Diags.get()); @@ -129,7 +140,7 @@ TEST_F(ModuleCacheTest, CachedModuleNewPath) { "-Fframeworks", MCPArg.c_str(), "-working-directory", TestDir.c_str(), "test.m"}; std::shared_ptr Invocation2 = - createInvocation(Args2, CIOpts); + createInvocationAndEnableFree(Args2, CIOpts); ASSERT_TRUE(Invocation2); CompilerInstance 
Instance2(Instance.getPCHContainerOperations(), &Instance.getModuleCache()); @@ -156,7 +167,7 @@ TEST_F(ModuleCacheTest, CachedModuleNewPathAllowErrors) { MCPArg.c_str(), "-working-directory", TestDir.c_str(), "test.m"}; std::shared_ptr Invocation = - createInvocation(Args, CIOpts); + createInvocationAndEnableFree(Args, CIOpts); ASSERT_TRUE(Invocation); CompilerInstance Instance; Instance.setDiagnostics(Diags.get()); @@ -173,7 +184,7 @@ TEST_F(ModuleCacheTest, CachedModuleNewPathAllowErrors) { TestDir.c_str(), "-Xclang", "-fallow-pcm-with-compiler-errors", "test.m"}; std::shared_ptr Invocation2 = - createInvocation(Args2, CIOpts); + createInvocationAndEnableFree(Args2, CIOpts); ASSERT_TRUE(Invocation2); CompilerInstance Instance2(Instance.getPCHContainerOperations(), &Instance.getModuleCache()); diff --git a/clang/unittests/Serialization/VarDeclConstantInitTest.cpp b/clang/unittests/Serialization/VarDeclConstantInitTest.cpp index 86ae929e7f17e..7efa1c1d64a96 100644 --- a/clang/unittests/Serialization/VarDeclConstantInitTest.cpp +++ b/clang/unittests/Serialization/VarDeclConstantInitTest.cpp @@ -103,6 +103,7 @@ export namespace Fibonacci std::shared_ptr Invocation = createInvocation(Args, CIOpts); ASSERT_TRUE(Invocation); + Invocation->getFrontendOpts().DisableFree = false; CompilerInstance Instance; Instance.setDiagnostics(Diags.get()); From 82acec15afeb2bcba534a333c89cea33da7ffa47 Mon Sep 17 00:00:00 2001 From: Farzon Lotfi <1802579+farzonl@users.noreply.github.com> Date: Mon, 26 Feb 2024 11:08:59 -0500 Subject: [PATCH 336/546] [HLSL] Implementation of dot intrinsic (#81190) This change implements https://github.com/llvm/llvm-project/issues/70073 HLSL has a dot intrinsic defined here: https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-dot The intrinsic itself is defined as a HLSL_LANG LangBuiltin in Builtins.td. 
This is used to associate all the dot product typdef defined hlsl_intrinsics.h with a single intrinsic check in CGBuiltin.cpp & SemaChecking.cpp. In IntrinsicsDirectX.td we define the llvmIR for the dot product. A few goals were in mind for this IR. First it should operate on only vectors. Second the return type should be the vector element type. Third the second parameter vector should be of the same size as the first parameter. Finally `a dot b` should be the same as `b dot a`. In CGBuiltin.cpp hlsl has built on top of existing clang intrinsics via EmitBuiltinExpr. Dot product though is language specific intrinsic and so is guarded behind getLangOpts().HLSL. The call chain looks like this: EmitBuiltinExpr -> EmitHLSLBuiltinExp EmitHLSLBuiltinExp dot product intrinsics makes a destinction between vectors and scalars. This is because HLSL supports dot product on scalars which simplifies down to multiply. Sema.h & SemaChecking.cpp saw the addition of CheckHLSLBuiltinFunctionCall, a language specific semantic validation that can be expanded for other hlsl specific intrinsics. 
Fixes #70073 --- clang/include/clang/Basic/Builtins.td | 6 + clang/include/clang/Sema/Sema.h | 3 + clang/lib/CodeGen/CGBuiltin.cpp | 51 ++++ clang/lib/CodeGen/CodeGenFunction.h | 1 + clang/lib/Headers/hlsl/hlsl_intrinsics.h | 98 +++++++ clang/lib/Sema/SemaChecking.cpp | 103 ++++++- .../CodeGenHLSL/builtins/dot-builtin.hlsl | 30 ++ clang/test/CodeGenHLSL/builtins/dot.hlsl | 265 ++++++++++++++++++ clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl | 109 +++++++ .../test/SemaHLSL/OverloadResolutionBugs.hlsl | 40 +++ llvm/include/llvm/IR/IntrinsicsDirectX.td | 5 + 11 files changed, 703 insertions(+), 8 deletions(-) create mode 100644 clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl create mode 100644 clang/test/CodeGenHLSL/builtins/dot.hlsl create mode 100644 clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index df74026c5d2d5..e3432f7925ba1 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4524,6 +4524,12 @@ def HLSLCreateHandle : LangBuiltin<"HLSL_LANG"> { let Prototype = "void*(unsigned char)"; } +def HLSLDotProduct : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_dot"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void(...)"; +} + // Builtins for XRay. 
def XRayCustomEvent : Builtin { let Spellings = ["__xray_customevent"]; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index c966a99c51968..ef4b93fac95ce 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -14063,6 +14063,7 @@ class Sema final { bool CheckPPCBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall); bool CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); + bool CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); bool CheckRISCVLMUL(CallExpr *TheCall, unsigned ArgNum); bool CheckRISCVBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall); @@ -14128,6 +14129,8 @@ class Sema final { bool CheckPPCMMAType(QualType Type, SourceLocation TypeLoc); + bool SemaBuiltinVectorMath(CallExpr *TheCall, QualType &Res); + bool SemaBuiltinVectorToScalarMath(CallExpr *TheCall); bool SemaBuiltinElementwiseMath(CallExpr *TheCall); bool SemaBuiltinElementwiseTernaryMath(CallExpr *TheCall); bool PrepareBuiltinElementwiseMathOneArgCall(CallExpr *TheCall); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 734eb5a035ca4..54d7451a9d622 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -44,6 +44,7 @@ #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/IntrinsicsBPF.h" +#include "llvm/IR/IntrinsicsDirectX.h" #include "llvm/IR/IntrinsicsHexagon.h" #include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/IntrinsicsPowerPC.h" @@ -5982,6 +5983,10 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, llvm_unreachable("Bad evaluation kind in EmitBuiltinExpr"); } + // EmitHLSLBuiltinExpr will check getLangOpts().HLSL + if (Value *V = EmitHLSLBuiltinExpr(BuiltinID, E)) + return RValue::get(V); + if (getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice) return EmitHipStdParUnsupportedBuiltin(this, 
FD); @@ -17959,6 +17964,52 @@ llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments, return Arg; } +Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, + const CallExpr *E) { + if (!getLangOpts().HLSL) + return nullptr; + + switch (BuiltinID) { + case Builtin::BI__builtin_hlsl_dot: { + Value *Op0 = EmitScalarExpr(E->getArg(0)); + Value *Op1 = EmitScalarExpr(E->getArg(1)); + llvm::Type *T0 = Op0->getType(); + llvm::Type *T1 = Op1->getType(); + if (!T0->isVectorTy() && !T1->isVectorTy()) { + if (T0->isFloatingPointTy()) + return Builder.CreateFMul(Op0, Op1, "dx.dot"); + + if (T0->isIntegerTy()) + return Builder.CreateMul(Op0, Op1, "dx.dot"); + + // Bools should have been promoted + llvm_unreachable( + "Scalar dot product is only supported on ints and floats."); + } + // A VectorSplat should have happened + assert(T0->isVectorTy() && T1->isVectorTy() && + "Dot product of vector and scalar is not supported."); + + // A vector sext or sitofp should have happened + assert(T0->getScalarType() == T1->getScalarType() && + "Dot product of vectors need the same element types."); + + [[maybe_unused]] auto *VecTy0 = + E->getArg(0)->getType()->getAs(); + [[maybe_unused]] auto *VecTy1 = + E->getArg(1)->getType()->getAs(); + // A HLSLVectorTruncation should have happend + assert(VecTy0->getNumElements() == VecTy1->getNumElements() && + "Dot product requires vectors to be of the same size."); + + return Builder.CreateIntrinsic( + /*ReturnType*/ T0->getScalarType(), Intrinsic::dx_dot, + ArrayRef{Op0, Op1}, nullptr, "dx.dot"); + } break; + } + return nullptr; +} + Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent; diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 92ce0edeaf9e9..b2800f699ff4b 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4405,6 
+4405,7 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Value *EmitX86BuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitPPCBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E); + llvm::Value *EmitHLSLBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitScalarOrConstFoldImmArg(unsigned ICEArguments, unsigned Idx, const CallExpr *E); llvm::Value *EmitSystemZBuiltinExpr(unsigned BuiltinID, const CallExpr *E); diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index f87ac97799796..08e5d981a4a4c 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -179,6 +179,104 @@ double3 cos(double3); _HLSL_BUILTIN_ALIAS(__builtin_elementwise_cos) double4 cos(double4); +//===----------------------------------------------------------------------===// +// dot product builtins +//===----------------------------------------------------------------------===// + +/// \fn K dot(T X, T Y) +/// \brief Return the dot product (a scalar value) of \a X and \a Y. +/// \param X The X input value. +/// \param Y The Y input value. 
+ +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +half dot(half, half); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +half dot(half2, half2); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +half dot(half3, half3); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +half dot(half4, half4); + +#ifdef __HLSL_ENABLE_16_BIT +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +int16_t dot(int16_t, int16_t); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +int16_t dot(int16_t2, int16_t2); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +int16_t dot(int16_t3, int16_t3); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +int16_t dot(int16_t4, int16_t4); + +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +uint16_t dot(uint16_t, uint16_t); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +uint16_t dot(uint16_t2, uint16_t2); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +uint16_t dot(uint16_t3, uint16_t3); +_HLSL_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +uint16_t dot(uint16_t4, uint16_t4); +#endif + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +float dot(float, float); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +float dot(float2, float2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +float dot(float3, float3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +float dot(float4, float4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +double dot(double, double); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +int dot(int, int); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +int dot(int2, int2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +int dot(int3, int3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +int dot(int4, int4); + 
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +uint dot(uint, uint); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +uint dot(uint2, uint2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +uint dot(uint3, uint3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +uint dot(uint4, uint4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +int64_t dot(int64_t, int64_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +int64_t dot(int64_t2, int64_t2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +int64_t dot(int64_t3, int64_t3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +int64_t dot(int64_t4, int64_t4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +uint64_t dot(uint64_t, uint64_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +uint64_t dot(uint64_t2, uint64_t2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +uint64_t dot(uint64_t3, uint64_t3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_dot) +uint64_t dot(uint64_t4, uint64_t4); + //===----------------------------------------------------------------------===// // floor builtins //===----------------------------------------------------------------------===// diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 7fa295ebd9404..984088e345c80 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2120,10 +2120,11 @@ bool Sema::CheckTSBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, // not a valid type, emit an error message and return true. Otherwise return // false. 
static bool checkMathBuiltinElementType(Sema &S, SourceLocation Loc, - QualType Ty) { - if (!Ty->getAs() && !ConstantMatrixType::isValidElementType(Ty)) { + QualType ArgTy, int ArgIndex) { + if (!ArgTy->getAs() && + !ConstantMatrixType::isValidElementType(ArgTy)) { return S.Diag(Loc, diag::err_builtin_invalid_arg_type) - << 1 << /* vector, integer or float ty*/ 0 << Ty; + << ArgIndex << /* vector, integer or float ty*/ 0 << ArgTy; } return false; @@ -2961,6 +2962,9 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, } } + if (getLangOpts().HLSL && CheckHLSLBuiltinFunctionCall(BuiltinID, TheCall)) + return ExprError(); + // Since the target specific builtins for each arch overlap, only check those // of the arch we are compiling for. if (Context.BuiltinInfo.isTSBuiltin(BuiltinID)) { @@ -5161,6 +5165,70 @@ bool Sema::CheckPPCMMAType(QualType Type, SourceLocation TypeLoc) { return false; } +// Helper function for CheckHLSLBuiltinFunctionCall +bool CheckVectorElementCallArgs(Sema *S, CallExpr *TheCall) { + assert(TheCall->getNumArgs() > 1); + ExprResult A = TheCall->getArg(0); + ExprResult B = TheCall->getArg(1); + QualType ArgTyA = A.get()->getType(); + QualType ArgTyB = B.get()->getType(); + auto *VecTyA = ArgTyA->getAs(); + auto *VecTyB = ArgTyB->getAs(); + SourceLocation BuiltinLoc = TheCall->getBeginLoc(); + if (VecTyA == nullptr && VecTyB == nullptr) + return false; + + if (VecTyA && VecTyB) { + bool retValue = false; + if (VecTyA->getElementType() != VecTyB->getElementType()) { + // Note: type promotion is intended to be handeled via the intrinsics + // and not the builtin itself. + S->Diag(TheCall->getBeginLoc(), diag::err_vec_builtin_incompatible_vector) + << TheCall->getDirectCallee() + << SourceRange(A.get()->getBeginLoc(), B.get()->getEndLoc()); + retValue = true; + } + if (VecTyA->getNumElements() != VecTyB->getNumElements()) { + // if we get here a HLSLVectorTruncation is needed. 
+ S->Diag(BuiltinLoc, diag::err_vec_builtin_incompatible_vector) + << TheCall->getDirectCallee() + << SourceRange(TheCall->getArg(0)->getBeginLoc(), + TheCall->getArg(1)->getEndLoc()); + retValue = true; + } + + if (retValue) + TheCall->setType(VecTyA->getElementType()); + + return retValue; + } + + // Note: if we get here one of the args is a scalar which + // requires a VectorSplat on Arg0 or Arg1 + S->Diag(BuiltinLoc, diag::err_vec_builtin_non_vector) + << TheCall->getDirectCallee() + << SourceRange(TheCall->getArg(0)->getBeginLoc(), + TheCall->getArg(1)->getEndLoc()); + return true; +} + +// Note: returning true in this case results in CheckBuiltinFunctionCall +// returning an ExprError +bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { + switch (BuiltinID) { + case Builtin::BI__builtin_hlsl_dot: { + if (checkArgCount(*this, TheCall, 2)) + return true; + if (CheckVectorElementCallArgs(this, TheCall)) + return true; + if (SemaBuiltinVectorToScalarMath(TheCall)) + return true; + break; + } + } + return false; +} + bool Sema::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { // position of memory order and scope arguments in the builtin @@ -19594,7 +19662,7 @@ bool Sema::PrepareBuiltinElementwiseMathOneArgCall(CallExpr *TheCall) { TheCall->setArg(0, A.get()); QualType TyA = A.get()->getType(); - if (checkMathBuiltinElementType(*this, A.get()->getBeginLoc(), TyA)) + if (checkMathBuiltinElementType(*this, A.get()->getBeginLoc(), TyA, 1)) return true; TheCall->setType(TyA); @@ -19602,6 +19670,27 @@ bool Sema::PrepareBuiltinElementwiseMathOneArgCall(CallExpr *TheCall) { } bool Sema::SemaBuiltinElementwiseMath(CallExpr *TheCall) { + QualType Res; + if (SemaBuiltinVectorMath(TheCall, Res)) + return true; + TheCall->setType(Res); + return false; +} + +bool Sema::SemaBuiltinVectorToScalarMath(CallExpr *TheCall) { + QualType Res; + if (SemaBuiltinVectorMath(TheCall, Res)) + return true; + + if (auto *VecTy0 = 
Res->getAs()) + TheCall->setType(VecTy0->getElementType()); + else + TheCall->setType(Res); + + return false; +} + +bool Sema::SemaBuiltinVectorMath(CallExpr *TheCall, QualType &Res) { if (checkArgCount(*this, TheCall, 2)) return true; @@ -19609,8 +19698,7 @@ bool Sema::SemaBuiltinElementwiseMath(CallExpr *TheCall) { ExprResult B = TheCall->getArg(1); // Do standard promotions between the two arguments, returning their common // type. - QualType Res = - UsualArithmeticConversions(A, B, TheCall->getExprLoc(), ACK_Comparison); + Res = UsualArithmeticConversions(A, B, TheCall->getExprLoc(), ACK_Comparison); if (A.isInvalid() || B.isInvalid()) return true; @@ -19622,12 +19710,11 @@ bool Sema::SemaBuiltinElementwiseMath(CallExpr *TheCall) { diag::err_typecheck_call_different_arg_types) << TyA << TyB; - if (checkMathBuiltinElementType(*this, A.get()->getBeginLoc(), TyA)) + if (checkMathBuiltinElementType(*this, A.get()->getBeginLoc(), TyA, 1)) return true; TheCall->setArg(0, A.get()); TheCall->setArg(1, B.get()); - TheCall->setType(Res); return false; } diff --git a/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl new file mode 100644 index 0000000000000..9881dabc3a110 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/dot-builtin.hlsl @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s + +// CHECK-LABEL: builtin_bool_to_float_type_promotion +// CHECK: %conv1 = uitofp i1 %tobool to double +// CHECK: %dx.dot = fmul double %conv, %conv1 +// CHECK: %conv2 = fptrunc double %dx.dot to float +// CHECK: ret float %conv2 +float builtin_bool_to_float_type_promotion ( float p0, bool p1 ) { + return __builtin_hlsl_dot ( p0, p1 ); +} + +// CHECK-LABEL: builtin_bool_to_float_arg1_type_promotion +// CHECK: %conv = uitofp i1 %tobool to double +// CHECK: %conv1 = fpext float %1 to double +// CHECK: %dx.dot 
= fmul double %conv, %conv1 +// CHECK: %conv2 = fptrunc double %dx.dot to float +// CHECK: ret float %conv2 +float builtin_bool_to_float_arg1_type_promotion ( bool p0, float p1 ) { + return __builtin_hlsl_dot ( p0, p1 ); +} + +// CHECK-LABEL: builtin_dot_int_to_float_promotion +// CHECK: %conv = fpext float %0 to double +// CHECK: %conv1 = sitofp i32 %1 to double +// CHECK: dx.dot = fmul double %conv, %conv1 +// CHECK: %conv2 = fptrunc double %dx.dot to float +// CHECK: ret float %conv2 +float builtin_dot_int_to_float_promotion ( float p0, int p1 ) { + return __builtin_hlsl_dot ( p0, p1 ); +} diff --git a/clang/test/CodeGenHLSL/builtins/dot.hlsl b/clang/test/CodeGenHLSL/builtins/dot.hlsl new file mode 100644 index 0000000000000..b2c1bae31d13b --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/dot.hlsl @@ -0,0 +1,265 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF + +#ifdef __HLSL_ENABLE_16_BIT +// NATIVE_HALF: %dx.dot = mul i16 %0, %1 +// NATIVE_HALF: ret i16 %dx.dot +int16_t test_dot_short ( int16_t p0, int16_t p1 ) { + return dot ( p0, p1 ); +} + +// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v2i16(<2 x i16> %0, <2 x i16> %1) +// NATIVE_HALF: ret i16 %dx.dot +int16_t test_dot_short2 ( int16_t2 p0, int16_t2 p1 ) { + return dot ( p0, p1 ); +} + +// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v3i16(<3 x i16> %0, <3 x i16> %1) +// NATIVE_HALF: ret i16 %dx.dot +int16_t test_dot_short3 ( int16_t3 p0, int16_t3 p1 ) { + return dot ( p0, p1 ); +} + +// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v4i16(<4 x i16> %0, <4 x i16> %1) +// NATIVE_HALF: ret i16 %dx.dot +int16_t test_dot_short4 
( int16_t4 p0, int16_t4 p1 ) { + return dot ( p0, p1 ); +} + +// NATIVE_HALF: %dx.dot = mul i16 %0, %1 +// NATIVE_HALF: ret i16 %dx.dot +uint16_t test_dot_ushort ( uint16_t p0, uint16_t p1 ) { + return dot ( p0, p1 ); +} + +// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v2i16(<2 x i16> %0, <2 x i16> %1) +// NATIVE_HALF: ret i16 %dx.dot +uint16_t test_dot_ushort2 ( uint16_t2 p0, uint16_t2 p1 ) { + return dot ( p0, p1 ); +} + +// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v3i16(<3 x i16> %0, <3 x i16> %1) +// NATIVE_HALF: ret i16 %dx.dot +uint16_t test_dot_ushort3 ( uint16_t3 p0, uint16_t3 p1 ) { + return dot ( p0, p1 ); +} + +// NATIVE_HALF: %dx.dot = call i16 @llvm.dx.dot.v4i16(<4 x i16> %0, <4 x i16> %1) +// NATIVE_HALF: ret i16 %dx.dot +uint16_t test_dot_ushort4 ( uint16_t4 p0, uint16_t4 p1 ) { + return dot ( p0, p1 ); +} +#endif + +// CHECK: %dx.dot = mul i32 %0, %1 +// CHECK: ret i32 %dx.dot +int test_dot_int ( int p0, int p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = call i32 @llvm.dx.dot.v2i32(<2 x i32> %0, <2 x i32> %1) +// CHECK: ret i32 %dx.dot +int test_dot_int2 ( int2 p0, int2 p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = call i32 @llvm.dx.dot.v3i32(<3 x i32> %0, <3 x i32> %1) +// CHECK: ret i32 %dx.dot +int test_dot_int3 ( int3 p0, int3 p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = call i32 @llvm.dx.dot.v4i32(<4 x i32> %0, <4 x i32> %1) +// CHECK: ret i32 %dx.dot +int test_dot_int4 ( int4 p0, int4 p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = mul i32 %0, %1 +// CHECK: ret i32 %dx.dot +uint test_dot_uint ( uint p0, uint p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = call i32 @llvm.dx.dot.v2i32(<2 x i32> %0, <2 x i32> %1) +// CHECK: ret i32 %dx.dot +uint test_dot_uint2 ( uint2 p0, uint2 p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = call i32 @llvm.dx.dot.v3i32(<3 x i32> %0, <3 x i32> %1) +// CHECK: ret i32 %dx.dot +uint test_dot_uint3 ( uint3 p0, uint3 p1 ) { + return dot ( p0, p1 ); 
+} + +// CHECK: %dx.dot = call i32 @llvm.dx.dot.v4i32(<4 x i32> %0, <4 x i32> %1) +// CHECK: ret i32 %dx.dot +uint test_dot_uint4 ( uint4 p0, uint4 p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = mul i64 %0, %1 +// CHECK: ret i64 %dx.dot +int64_t test_dot_long ( int64_t p0, int64_t p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = call i64 @llvm.dx.dot.v2i64(<2 x i64> %0, <2 x i64> %1) +// CHECK: ret i64 %dx.dot +int64_t test_dot_long2 ( int64_t2 p0, int64_t2 p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = call i64 @llvm.dx.dot.v3i64(<3 x i64> %0, <3 x i64> %1) +// CHECK: ret i64 %dx.dot +int64_t test_dot_long3 ( int64_t3 p0, int64_t3 p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = call i64 @llvm.dx.dot.v4i64(<4 x i64> %0, <4 x i64> %1) +// CHECK: ret i64 %dx.dot +int64_t test_dot_long4 ( int64_t4 p0, int64_t4 p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = mul i64 %0, %1 +// CHECK: ret i64 %dx.dot +uint64_t test_dot_ulong ( uint64_t p0, uint64_t p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = call i64 @llvm.dx.dot.v2i64(<2 x i64> %0, <2 x i64> %1) +// CHECK: ret i64 %dx.dot +uint64_t test_dot_ulong2 ( uint64_t2 p0, uint64_t2 p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = call i64 @llvm.dx.dot.v3i64(<3 x i64> %0, <3 x i64> %1) +// CHECK: ret i64 %dx.dot +uint64_t test_dot_ulong3 ( uint64_t3 p0, uint64_t3 p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = call i64 @llvm.dx.dot.v4i64(<4 x i64> %0, <4 x i64> %1) +// CHECK: ret i64 %dx.dot +uint64_t test_dot_ulong4 ( uint64_t4 p0, uint64_t4 p1 ) { + return dot ( p0, p1 ); +} + +// NATIVE_HALF: %dx.dot = fmul half %0, %1 +// NATIVE_HALF: ret half %dx.dot +// NO_HALF: %dx.dot = fmul float %0, %1 +// NO_HALF: ret float %dx.dot +half test_dot_half ( half p0, half p1 ) { + return dot ( p0, p1 ); +} + +// NATIVE_HALF: %dx.dot = call half @llvm.dx.dot.v2f16(<2 x half> %0, <2 x half> %1) +// NATIVE_HALF: ret half %dx.dot +// NO_HALF: %dx.dot = call 
float @llvm.dx.dot.v2f32(<2 x float> %0, <2 x float> %1) +// NO_HALF: ret float %dx.dot +half test_dot_half2 ( half2 p0, half2 p1 ) { + return dot ( p0, p1 ); +} + +// NATIVE_HALF: %dx.dot = call half @llvm.dx.dot.v3f16(<3 x half> %0, <3 x half> %1) +// NATIVE_HALF: ret half %dx.dot +// NO_HALF: %dx.dot = call float @llvm.dx.dot.v3f32(<3 x float> %0, <3 x float> %1) +// NO_HALF: ret float %dx.dot +half test_dot_half3 ( half3 p0, half3 p1 ) { + return dot ( p0, p1 ); +} + +// NATIVE_HALF: %dx.dot = call half @llvm.dx.dot.v4f16(<4 x half> %0, <4 x half> %1) +// NATIVE_HALF: ret half %dx.dot +// NO_HALF: %dx.dot = call float @llvm.dx.dot.v4f32(<4 x float> %0, <4 x float> %1) +// NO_HALF: ret float %dx.dot +half test_dot_half4 ( half4 p0, half4 p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = fmul float %0, %1 +// CHECK: ret float %dx.dot +float test_dot_float ( float p0, float p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = call float @llvm.dx.dot.v2f32(<2 x float> %0, <2 x float> %1) +// CHECK: ret float %dx.dot +float test_dot_float2 ( float2 p0, float2 p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = call float @llvm.dx.dot.v3f32(<3 x float> %0, <3 x float> %1) +// CHECK: ret float %dx.dot +float test_dot_float3 ( float3 p0, float3 p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = call float @llvm.dx.dot.v4f32(<4 x float> %0, <4 x float> %1) +// CHECK: ret float %dx.dot +float test_dot_float4 ( float4 p0, float4 p1) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = call float @llvm.dx.dot.v2f32(<2 x float> %splat.splat, <2 x float> %1) +// CHECK: ret float %dx.dot +float test_dot_float2_splat ( float p0, float2 p1 ) { + return dot( p0, p1 ); +} + +// CHECK: %dx.dot = call float @llvm.dx.dot.v3f32(<3 x float> %splat.splat, <3 x float> %1) +// CHECK: ret float %dx.dot +float test_dot_float3_splat ( float p0, float3 p1 ) { + return dot( p0, p1 ); +} + +// CHECK: %dx.dot = call float @llvm.dx.dot.v4f32(<4 x float> %splat.splat, <4 
x float> %1) +// CHECK: ret float %dx.dot +float test_dot_float4_splat ( float p0, float4 p1 ) { + return dot( p0, p1 ); +} + +// CHECK: %conv = sitofp i32 %1 to float +// CHECK: %splat.splatinsert = insertelement <2 x float> poison, float %conv, i64 0 +// CHECK: %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> poison, <2 x i32> zeroinitializer +// CHECK: %dx.dot = call float @llvm.dx.dot.v2f32(<2 x float> %0, <2 x float> %splat.splat) +// CHECK: ret float %dx.dot +float test_builtin_dot_float2_int_splat ( float2 p0, int p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %conv = sitofp i32 %1 to float +// CHECK: %splat.splatinsert = insertelement <3 x float> poison, float %conv, i64 0 +// CHECK: %splat.splat = shufflevector <3 x float> %splat.splatinsert, <3 x float> poison, <3 x i32> zeroinitializer +// CHECK: %dx.dot = call float @llvm.dx.dot.v3f32(<3 x float> %0, <3 x float> %splat.splat) +// CHECK: ret float %dx.dot +float test_builtin_dot_float3_int_splat ( float3 p0, int p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %dx.dot = fmul double %0, %1 +// CHECK: ret double %dx.dot +double test_dot_double ( double p0, double p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %conv = zext i1 %tobool to i32 +// CHECK: %dx.dot = mul i32 %conv, %1 +// CHECK: ret i32 %dx.dot +int test_dot_bool_scalar_arg0_type_promotion ( bool p0, int p1 ) { + return dot ( p0, p1 ); +} + +// CHECK: %conv = zext i1 %tobool to i32 +// CHECK: %dx.dot = mul i32 %0, %conv +// CHECK: ret i32 %dx.dot +int test_dot_bool_scalar_arg1_type_promotion ( int p0, bool p1 ) { + return dot ( p0, p1 ); +} diff --git a/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl new file mode 100644 index 0000000000000..54d093aa7ce3a --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/dot-errors.hlsl @@ -0,0 +1,109 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes 
-verify -verify-ignore-unexpected + +float test_no_second_arg ( float2 p0) { + return __builtin_hlsl_dot ( p0 ); + // expected-error@-1 {{too few arguments to function call, expected 2, have 1}} +} + +float test_too_many_arg ( float2 p0) { + return __builtin_hlsl_dot ( p0, p0, p0 ); + // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} +} + +float test_dot_no_second_arg ( float2 p0) { + return dot ( p0 ); + // expected-error@-1 {{no matching function for call to 'dot'}} +} + +float test_dot_vector_size_mismatch ( float3 p0, float2 p1 ) { + return dot ( p0, p1 ); + // expected-warning@-1 {{implicit conversion truncates vector: 'float3' (aka 'vector') to 'float __attribute__((ext_vector_type(2)))' (vector of 2 'float' values)}} +} + +float test_dot_builtin_vector_size_mismatch ( float3 p0, float2 p1 ) { + return __builtin_hlsl_dot ( p0, p1 ); + // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must have the same type}} +} + +float test_dot_scalar_mismatch ( float p0, int p1 ) { + return dot ( p0, p1 ); + // expected-error@-1 {{call to 'dot' is ambiguous}} +} + +float test_dot_element_type_mismatch ( int2 p0, float2 p1 ) { + return dot ( p0, p1 ); + // expected-error@-1 {{call to 'dot' is ambiguous}} +} + +//NOTE: for all the *_promotion we are intentionally not handling type promotion in builtins +float test_builtin_dot_vec_int_to_float_promotion ( int2 p0, float2 p1 ) { + return __builtin_hlsl_dot ( p0, p1 ); + // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must have the same type}} +} + +int64_t test_builtin_dot_vec_int_to_int64_promotion( int64_t2 p0, int2 p1 ) { + return __builtin_hlsl_dot( p0, p1 ); + // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must have the same type}} +} + +float test_builtin_dot_vec_half_to_float_promotion( float2 p0, half2 p1 ) { + return __builtin_hlsl_dot( p0, p1 ); + // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must have the same 
type}} +} + +#ifdef __HLSL_ENABLE_16_BIT +float test_builtin_dot_vec_int16_to_float_promotion( float2 p0, int16_t2 p1 ) { + return __builtin_hlsl_dot( p0, p1 ); + // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must have the same type}} +} + +half test_builtin_dot_vec_int16_to_half_promotion( half2 p0, int16_t2 p1 ) { + return __builtin_hlsl_dot( p0, p1 ); + // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must have the same type}} +} + +int test_builtin_dot_vec_int16_to_int_promotion( int2 p0, int16_t2 p1 ) { + return __builtin_hlsl_dot( p0, p1 ); + // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must have the same type}} +} + +int64_t test_builtin_dot_vec_int16_to_int64_promotion( int64_t2 p0, int16_t2 p1 ) { + return __builtin_hlsl_dot( p0, p1 ); + // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must have the same type}} +} +#endif + +float test_builtin_dot_float2_splat ( float p0, float2 p1 ) { + return __builtin_hlsl_dot( p0, p1 ); + // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must be vectors}} +} + +float test_builtin_dot_float3_splat ( float p0, float3 p1 ) { + return __builtin_hlsl_dot( p0, p1 ); + // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must be vectors}} +} + +float test_builtin_dot_float4_splat ( float p0, float4 p1 ) { + return __builtin_hlsl_dot( p0, p1 ); + // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must be vectors}} +} + +float test_dot_float2_int_splat ( float2 p0, int p1 ) { + return __builtin_hlsl_dot ( p0, p1 ); + // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must be vectors}} +} + +float test_dot_float3_int_splat ( float3 p0, int p1 ) { + return __builtin_hlsl_dot ( p0, p1 ); + // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must be vectors}} +} + +float test_builtin_dot_int_vect_to_float_vec_promotion ( int2 p0, float p1 ) { + return __builtin_hlsl_dot ( p0, 
p1 ); + // expected-error@-1 {{first two arguments to '__builtin_hlsl_dot' must be vectors}} +} + +int test_builtin_dot_bool_type_promotion ( bool p0, bool p1 ) { + return __builtin_hlsl_dot ( p0, p1 ); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type (was 'bool')}} +} diff --git a/clang/test/SemaHLSL/OverloadResolutionBugs.hlsl b/clang/test/SemaHLSL/OverloadResolutionBugs.hlsl index 135d6cf335c13..8464f1c1a7c2c 100644 --- a/clang/test/SemaHLSL/OverloadResolutionBugs.hlsl +++ b/clang/test/SemaHLSL/OverloadResolutionBugs.hlsl @@ -24,6 +24,46 @@ void Call4(int16_t H) { Fn4(H); } +int test_builtin_dot_bool_type_promotion ( bool p0, bool p1 ) { + return dot ( p0, p1 ); +} + +float test_dot_scalar_mismatch ( float p0, int p1 ) { + return dot ( p0, p1 ); +} + +float test_dot_element_type_mismatch ( int2 p0, float2 p1 ) { + return dot ( p0, p1 ); +} + +float test_builtin_dot_vec_int_to_float_promotion ( int2 p0, float2 p1 ) { + return dot ( p0, p1 ); +} + +int64_t test_builtin_dot_vec_int_to_int64_promotion( int64_t2 p0, int2 p1 ) { + return dot ( p0, p1 ); +} + +float test_builtin_dot_vec_half_to_float_promotion( float2 p0, half2 p1 ) { + return dot( p0, p1 ); +} + +float test_builtin_dot_vec_int16_to_float_promotion( float2 p0, int16_t2 p1 ) { + return dot( p0, p1 ); +} + +half test_builtin_dot_vec_int16_to_half_promotion( half2 p0, int16_t2 p1 ) { + return dot( p0, p1 ); +} + +int test_builtin_dot_vec_int16_to_int_promotion( int2 p0, int16_t2 p1 ) { + return dot( p0, p1 ); +} + +int64_t test_builtin_dot_vec_int16_to_int64_promotion( int64_t2 p0, int16_t2 p1 ) { + return dot( p0, p1 ); +} + // https://github.com/llvm/llvm-project/issues/81049 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 2fe4fdfd5953b..c192d4b84417c 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ 
b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -19,4 +19,9 @@ def int_dx_flattened_thread_id_in_group : Intrinsic<[llvm_i32_ty], [], [IntrNoMe def int_dx_create_handle : ClangBuiltin<"__builtin_hlsl_create_handle">, Intrinsic<[ llvm_ptr_ty ], [llvm_i8_ty], [IntrWillReturn]>; + +def int_dx_dot : + Intrinsic<[LLVMVectorElementType<0>], + [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], + [IntrNoMem, IntrWillReturn, Commutative] >; } From 45732b64542e37f4908ced2477a25b7a0d703893 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Mon, 26 Feb 2024 17:40:56 +0100 Subject: [PATCH 337/546] [mlir][Transforms] Fix compile time regression in dialect conversion (#83023) The dialect conversion does not directly erase ops that are replaced/erased with a rewriter. Instead, the op stays in place and is erased at the end if the dialect conversion succeeds. However, ops that were replaced/erased are ignored from that point on. #81757 introduced a compile time regression that made the check whether an op is ignored or not more expensive. Whether an op is ignored or not is queried many times throughout a dialect conversion, so the check must be fast. After this change, replaced ops are stored in the `ignoredOps` set. This also simplifies the dialect conversion a bit. --- mlir/lib/Transforms/Utils/DialectConversion.cpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 857b601acbc35..4165e0a52428f 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -1229,9 +1229,8 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( } bool ConversionPatternRewriterImpl::isOpIgnored(Operation *op) const { - // Check to see if this operation was replaced or its parent ignored. 
- return ignoredOps.count(op->getParentOp()) || - hasRewrite(rewrites, op); + // Check to see if this operation or the parent operation is ignored. + return ignoredOps.count(op->getParentOp()) || ignoredOps.count(op); } void ConversionPatternRewriterImpl::markNestedOpsIgnored(Operation *op) { @@ -1480,12 +1479,7 @@ void ConversionPatternRewriterImpl::notifyOperationInserted( void ConversionPatternRewriterImpl::notifyOpReplaced(Operation *op, ValueRange newValues) { assert(newValues.size() == op->getNumResults()); -#ifndef NDEBUG - for (auto &rewrite : rewrites) - if (auto *opReplacement = dyn_cast(rewrite.get())) - assert(opReplacement->getOperation() != op && - "operation was already replaced"); -#endif // NDEBUG + assert(!ignoredOps.contains(op) && "operation was already replaced"); // Track if any of the results changed, e.g. erased and replaced with null. bool resultChanged = false; @@ -1506,6 +1500,7 @@ void ConversionPatternRewriterImpl::notifyOpReplaced(Operation *op, // Mark this operation as recursively ignored so that we don't need to // convert any nested operations. + ignoredOps.insert(op); markNestedOpsIgnored(op); } From 1d2eced0067bea989e98efc9265725b3aeca0af9 Mon Sep 17 00:00:00 2001 From: Tom Honermann Date: Mon, 26 Feb 2024 08:46:03 -0800 Subject: [PATCH 338/546] [llvm][docs] Update the Python version requirement to 3.8.0 for lit testing on Windows with substitute (virtual) drives. (#81663) Following the changes made for: - https://reviews.llvm.org/D154130: [lit][clang] Avoid realpath on Windows due to MAX_PATH limitations in commit: - 05d613ea931b6de1b46dfe04b8e55285359047f4 Python 3.8.0 or newer is now required by at least the following tests when they are run on Windows from a substitute (virtual) drive. A substitute drive is often used as a workaround for `MAX_PATH` limitations on Windows. These tests are impacted because they use the lit `%{?:real}` path expansion syntax to expand symbolic links and substitute drives. 
This path expansion is implemented with Python's `os.path.realpath()` function which changed behavior in Python 3.8.0 with regard to expansion of substitute drives. The changes mentioned above rely on the newer Python behavior. - `clang/test/Lexer/case-insensitive-include-absolute.c` - `clang/test/Lexer/case-insensitive-include-win.c` This change updates the LLVM Getting Started guide to note this newer Python version dependency for this relatively niche case. Python 3.6.0 remains the minimum required Python version otherwise. --- llvm/docs/GettingStarted.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst index 687d1f29b5a1f..7634199babbad 100644 --- a/llvm/docs/GettingStarted.rst +++ b/llvm/docs/GettingStarted.rst @@ -295,8 +295,9 @@ Package Version Notes .. note:: - #. Only needed if you want to run the automated test suite in the - ``llvm/test`` directory. + #. Only needed if you want to run the automated test suite. Python 3.8.0 + or later is needed on Windows if a substitute (virtual) drive is used + to access LLVM source code due to ``MAX_PATH`` limitations. #. Optional, adds compression / uncompression capabilities to selected LLVM tools. #. Optional, you can use any other build tool supported by CMake. 
From 4bc3b3501ff994fb3504ed2b973342821a9c8cea Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Mon, 26 Feb 2024 10:11:57 -0700 Subject: [PATCH 339/546] [lld/ELF] Add documentation on large sections (#82560) Fixes #82438 --- lld/docs/ELF/large_section_layout_nopic.png | Bin 0 -> 3751 bytes lld/docs/ELF/large_section_layout_pic.png | Bin 0 -> 3748 bytes lld/docs/ELF/large_sections.rst | 52 ++++++++++++++++++++ lld/docs/ELF/section_layout.png | Bin 0 -> 1872 bytes lld/docs/index.rst | 1 + 5 files changed, 53 insertions(+) create mode 100644 lld/docs/ELF/large_section_layout_nopic.png create mode 100644 lld/docs/ELF/large_section_layout_pic.png create mode 100644 lld/docs/ELF/large_sections.rst create mode 100644 lld/docs/ELF/section_layout.png diff --git a/lld/docs/ELF/large_section_layout_nopic.png b/lld/docs/ELF/large_section_layout_nopic.png new file mode 100644 index 0000000000000000000000000000000000000000..c1c60e37f615210e0806899fddf0403dd401f383 GIT binary patch literal 3751 zcmcIndsGtG*2mFtdQnrBHlSHHX-%kEf>M}|d}NqrPv4r9f+&syeu@e?SvHE~_1e&g zMyY{WDk+)aqwJwriX)VfY#16x~r?9Pf1n)z($~x*tsO|bgtA78ZQ9pZ7 zstP_0Z3kW~c6>kqedCHBFKqm|EpProOo(;S(Fnus{Nl|2Egr6f_r#yaSn5K9u!vn5 zVtK%zq^!>N%|V7y55)+~hBCgL$F+XNaA!17Dz8sT{@;>(A+h z*%xkGMP1QA&Lgh85xUy;s+L0QYIVBb;ByoPw4-*d5kZ$;+a!M&P-rxl5=3sEW}6DD#i$6A+Z%5bZNIu{UJuBYS^p03HUgl+06R;@?w!)^jm?K z6|DYRrwjh3#!%!3j?y+9;Wlwt3N-T5c!A}Mgd&lERIdspL%fL7laYhm*Q)kfi*7{1 zM;7hqKEm3{(?LDm9FAfW&7v@3tf28XuRv=z@yJK)hq#1|n9{U9&8DYnrltlqRdpjcP#(*fZPT-@0njYj95sRey4;IyiqM?^fSiLjT!O| zH>!3aT;n}lkM<;W*dF+kxWn+Y{hIXu``n+@DP4K{7{*lwigvV3L)^YlYL*N3CN8dH zR*$bk55s_*hXWaSJ8{z+E+hH}ee!z(D*Uz_{@UVaLgAu1 z$%8AxqhkU-P0-gB@y`rxS;N`3x(Tb#qFGQ``ot?S0VwZfsh=JU5cWtQhhS2ye%V9a z%e`O8@^zr^>h_?vZ}48KMen6OKZKqZpcvbxsz(pH^X5CksaD^RVBwnt`)Mq@+t%eD zFCveXl|e+BGIq8v-Z7T{qJjy`rDe(8c=G9@7Dr zF|5DsU^6n)aO{q|n!eCd>}G5R#1F5wSk6#^O5js`oFVJf+ z2y4yMxKI8RQ-W-Z$Z&chM?X7v=w<2R#(QxK96OtMi)q9s$@j+D7r_qgGIi^fJBlR1 z3)dYJ9$)fb3O|I%Rq#f1XCEkpi#}mpO~4=Niuxh0VRVY50F*g3Gvsr-pmY26qq&?E 
z1qKWUx!_3z(#l!^5!MB$zSTCYjo=ZN2PFrM^-BB0Frrf`qk)~sb{WfOUMId&m6}w7 zB#E};--qM-#^)XaQ+=RnqmYNpxOm|6sL?8WKk;O#9af*G4o^C4+;7Sr3%uebXrqzW z-?x91V=87I#b&_&vJ5X(Esz#x7EtCtQO#cFT_p8FRkU;_6BD7jZI~-f8yxMGITL*tsPi1uf??m=tJK@_q;v+EL$Q z00Gu?s`?Yozhk$X+U~MH+Nip|=#x^poM$|H0|{#wu8O zo1bJeEqU`5k0x)>jee*$tnL*)?mc2xBi9^tDYi=e|Hl4zFrC34U)s6%PsI2yKvw`& zI0)2v@+eRJieFO8xw$uwBQJduPFq|jhj zBgcKM?I%umfL;&5!n<_$zR(`M7Tq@n{b3|tpiMJ%F6pUl`Z&mai+`a=4y{9aV>c*Y zz?+{KR9plPR1eL$z-*WIfEVG$9BW5UYZqM2qEGH67SpmTP+8OX8+8>`MT%{LEe-U( z^4KY?>-&sFpWsWCTGq0055ac~x2OZ(Ug%#>zC2cw%;G`YWqpIhSHgO8V5@NCZx-3r zqTjjI?bADBOlj$0PH(#I(6|xV2+6UZsZewF06XZRDaw^pvL{=tJ3cJC{nx=s`4B!I zJzpeCdJy|TBdh~_TmYDiJ#74KQy?_lU2s<51(A+HX#}^f%^w@QKePvpI$VG@CXH6Z z=`9Be>t+x0n{{Wy6Z5qL#oJZ)l=2Ndfa<6i7Srp70faxB)qpTaoKPW#BsT13GL0?t!3bW%W#3QIe)fI8C>y_U>lVerk&KNYqs>1;XgIp_}D8Mg@_hp#%eF6HI$dX zkEL2|s>ZdV!2>xWI{IDD`3BW`M2$ZvC7x6s6!aD5fGCERiXM{&sFF{tzL$QCcF4(F zvvo{Ie^jv&gxW!n<`E@;3yDuH8NrybGfvsNaOR?B;E(yt$-V1+=q6S+V25G7s-{fM zo*dwTpF1L2-5KT-#d?3xiOO6i_P^|KslL>wT7-jrkr>?jFj`O{4-|c{S-3MGkcTxr z#nH0&30n4G4zZ7E)DR+7V}^OTZ)S6uQrJ2M};{PSR~Y4xt4Dp-Wco3mi}TF)wP z25xM+8JMo=HtF1Mo<(o(15_6;??dJ~%;>&|z-U88iLW>}!tuyL4?=q_diEqa|#(LQ>rofar3Z~ z^mZ2g1F=bH=49|c&Oq0gRzFaEX|Q@Hn*1xw(Ce*8JtJ#=zEz|)+9TaFAy9uTe8)^A zVrZMj`Jt6xHSVV8z;rCRuukjd6YJpP`jd-qb?w>QyTB#=;$A@Y#NG3VXf{TBOd$b` z*1`|x8-pugQdI>r-`8eD!!!@<52lHiLowRuCK`FqL#_jTrSK(}zESMcCj8c%bT#je z*!qkbgUlYran$=S@Yjqx{ehS0ikGK)Qa!ueUC!^nX+BQOWTyQ%4-Z_R^fhSg8{S{D zF~Y7S5Mq$Kx8{!F3tspKr$c{1vc=fy|HjL|VEjLcb8*Fv_k&pA|LEtosvKfCXPmiN VaP5P0g5--WcNb6RGN+KM{{W3!MY{k1 literal 0 HcmV?d00001 diff --git a/lld/docs/ELF/large_section_layout_pic.png b/lld/docs/ELF/large_section_layout_pic.png new file mode 100644 index 0000000000000000000000000000000000000000..15ca500f4ff33808b1cf1052fc87bd8ec356121e GIT binary patch literal 3748 zcmbtXX;f3!77j&&tp=*Kh=~=aqSk;#5-^Z}lu;vy%wdQi)j%*(y+KLBpi)s8wNl^# z1`Gxi84UpmV=E$4xDX(LFeyp`hCxUG2?7sofAlS1hqZeCoV)hgd*8dxH=KQMIN003 
zK7)S-fk0q~Z7m&F_F4!;W!oC)%G1zfs6GUu9(dT&+&P3e-hc7t_by*Po7zdZV)=E& z&f}|hMZU5+bl@s0Q zrf=@G$saN@7L`3u#*o0EiIaTMd9i6?;WSj1Ufof6ji+X?^3s2`YU>U(%e5U*oOQkr z%=g5=)vST0toqCC>C^Ym`Ji&J2DH~ z?YS!PbA6o=qroOk@?pu)npuioNoGFWPp_mu-EN#~R2$7?BLep^G=uUQ4`BGPZ)->m z90yANbfM(C();X5nuT8&w#3^n*1VU$xkM#t8ap}9mYRZAw6IUXab`1aj!~T`8eRjw zWS$7wS~>vjt`mreBQ$}0sj@mv`U$viKH|$oU^t7(z?scrUl8^cCM1^z#_Ic5V0Kb2 zCF2k%VU6!-(qYY6>^R{mCtbQ4Oq=FTx(BI_`HmV~J|ihdn-#31s~v2kPa0pN+2?yB zG$(I%8jWWz z9lqxb=*^6IFsd_2x$It8f0k_O^}N^?acHn`Q_)1@TqzYS%rGcq4|Xni@+bn4GvfIM ze3`vRM!;x>OAg~*6;|dk8Oc4HZ#vW?0Io?FXknF!x%&0y4J-s4`~$e(mbSlQ$sfp*Ex9>u?(m1eKOor`T#T{2A`H;Kf5YSDVPBRZ$=;|| znG}z?5Wo_KIF96EjHM6dzg!C`C>xlvy))M}KY-AU+ueR9GdkP+T@z3sEfe-a;J43b zvn}M$=$Z{OHAcboyDq`{7K3S?KQ&qc@gAMGq*VP_K#s+V+`x?bsk%Dj=M@xHvfZWcllr#!pt2OHol z^w%c_4NvU5`w;G@PVL%K6%x_!MSYSKo)uHS@wHW934!QB$W8C%+WpBa-<>5r#7Ir}g`+$>_s%jOKneFXu zs9@zPM3}1o>5<5!fiOkdiJe~EklW4(eouqEwSI*x&&e|FGc$Rlt)L-ukS1wfpCup!(JPH$gw+Ld0%VTQTi?p*Gg~5@4 ztojr@1!dFGr$}QUU5kFnYC`)2wsiwA(8Zbsu2<$cl)bvd-5|cNFWK;syZi}F$m)Bw zS6ThNIsKtrvl!z^Hx+uNCtqiKNakN&T74#`5~x z#8;BoQ>y8wM!WqiZv|pUtpej)-@Nu&c#@=J(P>m`&a61EaM$FZz8<21S@XqFzBgKz zndDJf5Eq4Et!*dAruy?SIGfp`0cgrld4piH59Oy9m9Co=>X{aEC-@0yz2W^;`26S( zpvTv172MJ`3c75l*y0!PeKfHJAu520A*@d~oZz4i3=h|HOD!MjlUZnKIG8(sZbgre zozmMwa|aCrqkM!%H?lt>#SL#uvA%wKin=p+hU5}M6ZOhG2o z0*B}lTB(h2msIea%RJ5BaP?Qk$}rtPl(baLU5o3$sky`1ELFv$IJQA;>gRW-_L?=y zyraEd9^QW*C{c;e$BEn%FTQ%l!A&+N1iua2VI*RWg85}1lZ1R+w=di z`Pephprkv%4$O?NyImPA$zRVNvYs5f-jW<|ojUUPrEL@rZ?s9RyB7!riY;!DW20=} z<8wdFV|{91mQ3iaspE|3bg((KeZRGpAg_SBvQVLB!2@Le-ECu-4RGd z_~7vtMJ1oAD=JS0sJ6VM^$E(K^L1e@Vq^rE&AhG4hoq86k;Ur>N@hN`g0XaFyx-m` zS0BCuol~yTIun{Z8`@2ugosxmJn6RrHy&A(DbC&w~0jxx1WbzKrR?JXfO_YjyE!BUD@o572^S#N0_kWE4YNZ@nS{tylb;773O!&UhVJmyfg718; F{1;G p_filesz``. + +Note that the above applies to PIC code. For less common non-PIC code with +absolute relocations instead of relative relocations, 32-bit relocations +typically assume that symbols are in the lower 2GB of the address space. 
So for +non-PIC code, large sections should be placed after all small sections to avoid +``.lrodata`` pushing small symbols out of the lower 2GB of the address space. +``-z lrodata-after-bss`` changes the layout to be: + +.. image:: large_section_layout_nopic.png diff --git a/lld/docs/ELF/section_layout.png b/lld/docs/ELF/section_layout.png new file mode 100644 index 0000000000000000000000000000000000000000..e9a7e76a3005a17f24cef322442dd6dbedbe7d72 GIT binary patch literal 1872 zcma)73pCSv93Nt0B9AGI>k$)qB$LrrSd5}FHWVU{yrNlTl0-r;m5{5DMr)S!BFpPi zu3Jmg-?NpdmbDtORr9(V_uO;Ot$R=B{GR9h&gb|2em>vN@0YX9(^W}9O92D|DPi4Q zc8KS_k}fYN9_=%fkRT9P6YH`GmmK)9bZIr+Tm4w!LG#Jvgk>uqsf6jCJ@~jN^TQ>` za(x3E6XKrLOEtgzQp`E=+T}c?m}45Mo(13Izc2dmlP2c;n+Kw%+4(xb^Adj0tLY#? z+^hKM!aEq3If&0QJ*UrPKY8TQCzlX%pPmjpd)NX)k_T7RJ4bkF%Ot($?F}7XR=OdU z7m6@bl#yOorNI@?)mB8g`_beV_Wc;77b$UZ7n9%CE}gnlP!QFD3-S=0=VE{wYL@pQ zU4TW!``BGeb0@{ub7g@?)NUGzE^-m1AyI3AiHKc>LVmL69S>@flo&H6Uk%5n`g*hI z7xA_wTo_>7Nj%2oCPL6q@UR2AY7~P@s($CoGKJ2F#&q73R>f8!ZDE!-L`a7|M56FE z1EYe3>PM^sq6au(I6cZJ6E_sj`8>oy3&b!YHr{C|yB6X)D- zK&X6{J9vg$!<*b6Qnxyu8I{f3dCRShYUlGIcNey<6K&aZoa@QpOAEiflZ2T#k_bG+ z)btrhIpPm^b`%;NIqLHzkKH>SF?*J)eSFs;#cJQ}hbjk6OJzE)(YJO?RvjKuXflBD zDWa{s3q!KvNV{{u58^ECr>PXEtre}oHHKkrLv> zZmko~6PGpa|9Wu74;Ttz83mIV;RUPO{eRQ-+3LX=pA9^eWV?VPR<5+N0S*hbMi=DF zfs^&N!EQ950Id=CsGPvOspAB^3SIs^8MrubaGmE!?p`QeA9`(7ts7YV&NMDNm!xLv zMAmF65Ht`JKOei+n~p(Js)0;##&;ou4Hjm3A+yOg?a&Z7AjCCR_`A}AbUp?~PD)31 zI_ws zxwK}O0r|S{5o$DRG?nqPIXOWd{Na0yQ*M3+1I*5}Q5X0%Fr`aA;~AwxexK2YKp}=Jc|McMB#@Mr{;JXf}RtOXdk# zZI6zCYJ^uuJu?37n=<8n6BP%}^sh&x%Q7t-@QMwFo`XPt*pbQdtM<_ z&bMq2*+8_%=dh}VHuIY27&x?FOzNO-NLT6pGsA5mMBaE~Igad4%I%M|bu6)}tzEEq z7(zoEQFPR9{e90XeC<)kl4g_LtBtf^mg;1w_1nYK-afp=(P5kd3cO=Y8CeeAck&6~ znAo3?YtaHh!i=m2?K1_h(6j0pS7Y;B(>@*mzTK)>*{0K*?zm`-VPX=5!!{0(de5wf2sy+ zT_HD_Q;X3_F%T_4uQc!J9ZQyFe8jf)jMWdoSK!rCCO}Qkcw=?c$M-og=mi&^n*nHe9kj08ja&2ar`*~PGWE{v_fD>nqRo*}rJZFDw*E(s`>e%#r1 z!gp%CoGfqHw{T&Dw-llWtmIGJcvrCMH 
zv(7sjgCx%iEJN4+e*W9w)B9gE)w+F?YX)_dEoTtx|Imup*ln{LV#>>wn^hGD@!co$ qLkkr)2A6-Ds$8;3<*(LC`#nhAHPJMox=tYeDM46_rwiRFIO9)W%#!E; literal 0 HcmV?d00001 diff --git a/lld/docs/index.rst b/lld/docs/index.rst index 6281b6893cc19..637e74c70c985 100644 --- a/lld/docs/index.rst +++ b/lld/docs/index.rst @@ -166,6 +166,7 @@ document soon. error_handling_script Partitions ReleaseNotes + ELF/large_sections ELF/linker_script ELF/start-stop-gc ELF/warn_backrefs From f70d5c0bc822f8932e8aadfbed7fe9c6cd8c3758 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 26 Feb 2024 09:23:15 -0800 Subject: [PATCH 340/546] [libc][stdbit] implement stdc_count_zeros (C23) (#82437) --- libc/config/linux/x86_64/entrypoints.txt | 5 +++++ libc/docs/stdbit.rst | 12 +++++------ libc/include/llvm-libc-macros/stdbit-macros.h | 20 ++++++++++++++++++ libc/spec/stdc.td | 10 +++++++-- libc/src/stdbit/CMakeLists.txt | 1 + libc/src/stdbit/stdc_count_zeros_uc.cpp | 20 ++++++++++++++++++ libc/src/stdbit/stdc_count_zeros_uc.h | 18 ++++++++++++++++ libc/src/stdbit/stdc_count_zeros_ui.cpp | 20 ++++++++++++++++++ libc/src/stdbit/stdc_count_zeros_ui.h | 18 ++++++++++++++++ libc/src/stdbit/stdc_count_zeros_ul.cpp | 20 ++++++++++++++++++ libc/src/stdbit/stdc_count_zeros_ul.h | 18 ++++++++++++++++ libc/src/stdbit/stdc_count_zeros_ull.cpp | 20 ++++++++++++++++++ libc/src/stdbit/stdc_count_zeros_ull.h | 18 ++++++++++++++++ libc/src/stdbit/stdc_count_zeros_us.cpp | 20 ++++++++++++++++++ libc/src/stdbit/stdc_count_zeros_us.h | 18 ++++++++++++++++ libc/test/include/stdbit_test.cpp | 13 ++++++++++++ libc/test/src/stdbit/CMakeLists.txt | 1 + .../src/stdbit/stdc_count_zeros_uc_test.cpp | 21 +++++++++++++++++++ .../src/stdbit/stdc_count_zeros_ui_test.cpp | 21 +++++++++++++++++++ .../src/stdbit/stdc_count_zeros_ul_test.cpp | 21 +++++++++++++++++++ .../src/stdbit/stdc_count_zeros_ull_test.cpp | 21 +++++++++++++++++++ .../src/stdbit/stdc_count_zeros_us_test.cpp | 21 +++++++++++++++++++ 22 files changed, 349 insertions(+), 8 
deletions(-) create mode 100644 libc/src/stdbit/stdc_count_zeros_uc.cpp create mode 100644 libc/src/stdbit/stdc_count_zeros_uc.h create mode 100644 libc/src/stdbit/stdc_count_zeros_ui.cpp create mode 100644 libc/src/stdbit/stdc_count_zeros_ui.h create mode 100644 libc/src/stdbit/stdc_count_zeros_ul.cpp create mode 100644 libc/src/stdbit/stdc_count_zeros_ul.h create mode 100644 libc/src/stdbit/stdc_count_zeros_ull.cpp create mode 100644 libc/src/stdbit/stdc_count_zeros_ull.h create mode 100644 libc/src/stdbit/stdc_count_zeros_us.cpp create mode 100644 libc/src/stdbit/stdc_count_zeros_us.h create mode 100644 libc/test/src/stdbit/stdc_count_zeros_uc_test.cpp create mode 100644 libc/test/src/stdbit/stdc_count_zeros_ui_test.cpp create mode 100644 libc/test/src/stdbit/stdc_count_zeros_ul_test.cpp create mode 100644 libc/test/src/stdbit/stdc_count_zeros_ull_test.cpp create mode 100644 libc/test/src/stdbit/stdc_count_zeros_us_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index beb7a5ed448de..91e1bd73ec73f 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -132,6 +132,11 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdbit.stdc_first_trailing_one_ui libc.src.stdbit.stdc_first_trailing_one_ul libc.src.stdbit.stdc_first_trailing_one_ull + libc.src.stdbit.stdc_count_zeros_uc + libc.src.stdbit.stdc_count_zeros_us + libc.src.stdbit.stdc_count_zeros_ui + libc.src.stdbit.stdc_count_zeros_ul + libc.src.stdbit.stdc_count_zeros_ull # stdlib.h entrypoints libc.src.stdlib.abs diff --git a/libc/docs/stdbit.rst b/libc/docs/stdbit.rst index 3bd83ff70c892..3597cdc3432fd 100644 --- a/libc/docs/stdbit.rst +++ b/libc/docs/stdbit.rst @@ -71,11 +71,11 @@ stdc_first_trailing_one_us |check| stdc_first_trailing_one_ui |check| stdc_first_trailing_one_ul |check| stdc_first_trailing_one_ull |check| -stdc_count_zeros_uc -stdc_count_zeros_us -stdc_count_zeros_ui -stdc_count_zeros_ul 
-stdc_count_zeros_ull +stdc_count_zeros_uc |check| +stdc_count_zeros_us |check| +stdc_count_zeros_ui |check| +stdc_count_zeros_ul |check| +stdc_count_zeros_ull |check| stdc_count_ones_uc stdc_count_ones_us stdc_count_ones_ui @@ -122,7 +122,7 @@ stdc_first_leading_zero |check| stdc_first_leading_one |check| stdc_first_trailing_zero |check| stdc_first_trailing_one |check| -stdc_count_zeros +stdc_count_zeros |check| stdc_count_ones stdc_has_single_bit stdc_bit_width diff --git a/libc/include/llvm-libc-macros/stdbit-macros.h b/libc/include/llvm-libc-macros/stdbit-macros.h index 0c97da96ebba2..3b2aa9b932b83 100644 --- a/libc/include/llvm-libc-macros/stdbit-macros.h +++ b/libc/include/llvm-libc-macros/stdbit-macros.h @@ -131,6 +131,19 @@ inline unsigned stdc_first_trailing_one(unsigned long x) { inline unsigned stdc_first_trailing_one(unsigned long long x) { return stdc_first_trailing_one_ull(x); } +inline unsigned stdc_count_zeros(unsigned char x) { + return stdc_count_zeros_uc(x); +} +inline unsigned stdc_count_zeros(unsigned short x) { + return stdc_count_zeros_us(x); +} +inline unsigned stdc_count_zeros(unsigned x) { return stdc_count_zeros_ui(x); } +inline unsigned stdc_count_zeros(unsigned long x) { + return stdc_count_zeros_ul(x); +} +inline unsigned stdc_count_zeros(unsigned long long x) { + return stdc_count_zeros_ull(x); +} #else #define stdc_leading_zeros(x) \ _Generic((x), \ @@ -188,6 +201,13 @@ inline unsigned stdc_first_trailing_one(unsigned long long x) { unsigned: stdc_first_trailing_one_ui, \ unsigned long: stdc_first_trailing_one_ul, \ unsigned long long: stdc_first_trailing_one_ull)(x) +#define stdc_count_zeros(x) \ + _Generic((x), \ + unsigned char: stdc_count_zeros_uc, \ + unsigned short: stdc_count_zeros_us, \ + unsigned: stdc_count_zeros_ui, \ + unsigned long: stdc_count_zeros_ul, \ + unsigned long long: stdc_count_zeros_ull)(x) #endif // __cplusplus #endif // __LLVM_LIBC_MACROS_STDBIT_MACROS_H diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td 
index 3fe19fae4c2e5..523a08c6a8c3c 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -790,7 +790,8 @@ def StdC : StandardSpec<"stdc"> { Macro<"stdc_first_leading_zero">, Macro<"stdc_first_leading_one">, Macro<"stdc_first_trailing_zero">, - Macro<"stdc_first_trailing_one"> + Macro<"stdc_first_trailing_one">, + Macro<"stdc_count_zeros"> ], // Macros [], // Types [], // Enumerations @@ -829,7 +830,12 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"stdc_first_trailing_one_us", RetValSpec, [ArgSpec]>, FunctionSpec<"stdc_first_trailing_one_ui", RetValSpec, [ArgSpec]>, FunctionSpec<"stdc_first_trailing_one_ul", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_first_trailing_one_ull", RetValSpec, [ArgSpec]> + FunctionSpec<"stdc_first_trailing_one_ull", RetValSpec, [ArgSpec]>, + FunctionSpec<"stdc_count_zeros_uc", RetValSpec, [ArgSpec]>, + FunctionSpec<"stdc_count_zeros_us", RetValSpec, [ArgSpec]>, + FunctionSpec<"stdc_count_zeros_ui", RetValSpec, [ArgSpec]>, + FunctionSpec<"stdc_count_zeros_ul", RetValSpec, [ArgSpec]>, + FunctionSpec<"stdc_count_zeros_ull", RetValSpec, [ArgSpec]> ] // Functions >; diff --git a/libc/src/stdbit/CMakeLists.txt b/libc/src/stdbit/CMakeLists.txt index 14cc26e206e0d..a51ab5c21585f 100644 --- a/libc/src/stdbit/CMakeLists.txt +++ b/libc/src/stdbit/CMakeLists.txt @@ -7,6 +7,7 @@ set(prefixes first_leading_one first_trailing_zero first_trailing_one + count_zeros ) set(suffixes c s i l ll) foreach(prefix IN LISTS prefixes) diff --git a/libc/src/stdbit/stdc_count_zeros_uc.cpp b/libc/src/stdbit/stdc_count_zeros_uc.cpp new file mode 100644 index 0000000000000..22c57bd60c384 --- /dev/null +++ b/libc/src/stdbit/stdc_count_zeros_uc.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_count_zeros_uc -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_count_zeros_uc.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(unsigned, stdc_count_zeros_uc, (unsigned char value)) { + return static_cast(cpp::count_zeros(value)); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_count_zeros_uc.h b/libc/src/stdbit/stdc_count_zeros_uc.h new file mode 100644 index 0000000000000..34b4636ee3f9e --- /dev/null +++ b/libc/src/stdbit/stdc_count_zeros_uc.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_count_zeros_uc -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UC_H +#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UC_H + +namespace LIBC_NAMESPACE { + +unsigned stdc_count_zeros_uc(unsigned char value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UC_H diff --git a/libc/src/stdbit/stdc_count_zeros_ui.cpp b/libc/src/stdbit/stdc_count_zeros_ui.cpp new file mode 100644 index 0000000000000..6a1defd9d555b --- /dev/null +++ b/libc/src/stdbit/stdc_count_zeros_ui.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_count_zeros_ui -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_count_zeros_ui.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(unsigned, stdc_count_zeros_ui, (unsigned value)) { + return static_cast(cpp::count_zeros(value)); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_count_zeros_ui.h b/libc/src/stdbit/stdc_count_zeros_ui.h new file mode 100644 index 0000000000000..48e8630f6f097 --- /dev/null +++ b/libc/src/stdbit/stdc_count_zeros_ui.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_count_zeros_ui -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UI_H +#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UI_H + +namespace LIBC_NAMESPACE { + +unsigned stdc_count_zeros_ui(unsigned value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UI_H diff --git a/libc/src/stdbit/stdc_count_zeros_ul.cpp b/libc/src/stdbit/stdc_count_zeros_ul.cpp new file mode 100644 index 0000000000000..ceab32ef9ac36 --- /dev/null +++ b/libc/src/stdbit/stdc_count_zeros_ul.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_count_zeros_ul -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_count_zeros_ul.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(unsigned, stdc_count_zeros_ul, (unsigned long value)) { + return static_cast(cpp::count_zeros(value)); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_count_zeros_ul.h b/libc/src/stdbit/stdc_count_zeros_ul.h new file mode 100644 index 0000000000000..b88387741adef --- /dev/null +++ b/libc/src/stdbit/stdc_count_zeros_ul.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_count_zeros_ul -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UL_H +#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UL_H + +namespace LIBC_NAMESPACE { + +unsigned stdc_count_zeros_ul(unsigned long value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_UL_H diff --git a/libc/src/stdbit/stdc_count_zeros_ull.cpp b/libc/src/stdbit/stdc_count_zeros_ull.cpp new file mode 100644 index 0000000000000..2f57f727a6919 --- /dev/null +++ b/libc/src/stdbit/stdc_count_zeros_ull.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_count_zeros_ull ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_count_zeros_ull.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(unsigned, stdc_count_zeros_ull, (unsigned long long value)) { + return static_cast(cpp::count_zeros(value)); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_count_zeros_ull.h b/libc/src/stdbit/stdc_count_zeros_ull.h new file mode 100644 index 0000000000000..e15b33011ab7f --- /dev/null +++ b/libc/src/stdbit/stdc_count_zeros_ull.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_count_zeros_ull ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_ULL_H +#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_ULL_H + +namespace LIBC_NAMESPACE { + +unsigned stdc_count_zeros_ull(unsigned long long value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_ULL_H diff --git a/libc/src/stdbit/stdc_count_zeros_us.cpp b/libc/src/stdbit/stdc_count_zeros_us.cpp new file mode 100644 index 0000000000000..fc06836ee292a --- /dev/null +++ b/libc/src/stdbit/stdc_count_zeros_us.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_count_zeros_us -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_count_zeros_us.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(unsigned, stdc_count_zeros_us, (unsigned short value)) { + return static_cast(cpp::count_zeros(value)); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_count_zeros_us.h b/libc/src/stdbit/stdc_count_zeros_us.h new file mode 100644 index 0000000000000..d422377f076bc --- /dev/null +++ b/libc/src/stdbit/stdc_count_zeros_us.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_count_zeros_us -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_US_H +#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_US_H + +namespace LIBC_NAMESPACE { + +unsigned stdc_count_zeros_us(unsigned short value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ZEROS_US_H diff --git a/libc/test/include/stdbit_test.cpp b/libc/test/include/stdbit_test.cpp index 22d5533df1e85..44e4577d65e50 100644 --- a/libc/test/include/stdbit_test.cpp +++ b/libc/test/include/stdbit_test.cpp @@ -71,6 +71,11 @@ unsigned stdc_first_trailing_one_ul(unsigned long) noexcept { return 0x1DU; } unsigned stdc_first_trailing_one_ull(unsigned long long) noexcept { return 0x1FU; } +unsigned stdc_count_zeros_uc(unsigned char) noexcept { return 0x2AU; } +unsigned stdc_count_zeros_us(unsigned short) noexcept { return 0x2BU; } +unsigned stdc_count_zeros_ui(unsigned) noexcept { return 0x2CU; } +unsigned stdc_count_zeros_ul(unsigned long) noexcept { return 
0x2DU; } +unsigned stdc_count_zeros_ull(unsigned long long) noexcept { return 0x2FU; } } #include "include/llvm-libc-macros/stdbit-macros.h" @@ -138,3 +143,11 @@ TEST(LlvmLibcStdbitTest, TypeGenericMacroFirstTrailingOne) { EXPECT_EQ(stdc_first_trailing_one(0UL), 0x1DU); EXPECT_EQ(stdc_first_trailing_one(0ULL), 0x1FU); } + +TEST(LlvmLibcStdbitTest, TypeGenericMacroCountZeros) { + EXPECT_EQ(stdc_count_zeros(static_cast(0U)), 0x2AU); + EXPECT_EQ(stdc_count_zeros(static_cast(0U)), 0x2BU); + EXPECT_EQ(stdc_count_zeros(0U), 0x2CU); + EXPECT_EQ(stdc_count_zeros(0UL), 0x2DU); + EXPECT_EQ(stdc_count_zeros(0ULL), 0x2FU); +} diff --git a/libc/test/src/stdbit/CMakeLists.txt b/libc/test/src/stdbit/CMakeLists.txt index 203f48bda99a4..fafc26cae7344 100644 --- a/libc/test/src/stdbit/CMakeLists.txt +++ b/libc/test/src/stdbit/CMakeLists.txt @@ -9,6 +9,7 @@ set(prefixes first_leading_one first_trailing_zero first_trailing_one + count_zeros ) set(suffixes c s i l ll) foreach(prefix IN LISTS prefixes) diff --git a/libc/test/src/stdbit/stdc_count_zeros_uc_test.cpp b/libc/test/src/stdbit/stdc_count_zeros_uc_test.cpp new file mode 100644 index 0000000000000..3acf61aed637f --- /dev/null +++ b/libc/test/src/stdbit/stdc_count_zeros_uc_test.cpp @@ -0,0 +1,21 @@ +//===-- Unittests for stdc_count_zeros_uc ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/limits.h" +#include "src/stdbit/stdc_count_zeros_uc.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcStdcCountZerosUcTest, Zero) { + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_uc(0U), + static_cast(UCHAR_WIDTH)); +} + +TEST(LlvmLibcStdcCountZerosUcTest, Ones) { + for (unsigned i = 0U; i != UCHAR_WIDTH; ++i) + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_uc(UCHAR_MAX >> i), i); +} diff --git a/libc/test/src/stdbit/stdc_count_zeros_ui_test.cpp b/libc/test/src/stdbit/stdc_count_zeros_ui_test.cpp new file mode 100644 index 0000000000000..53ce1c86b40f6 --- /dev/null +++ b/libc/test/src/stdbit/stdc_count_zeros_ui_test.cpp @@ -0,0 +1,21 @@ +//===-- Unittests for stdc_count_zeros_ui ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/limits.h" +#include "src/stdbit/stdc_count_zeros_ui.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcStdcCountZerosUiTest, Zero) { + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_ui(0U), + static_cast(UINT_WIDTH)); +} + +TEST(LlvmLibcStdcCountZerosUiTest, Ones) { + for (unsigned i = 0U; i != UINT_WIDTH; ++i) + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_ui(UINT_MAX >> i), i); +} diff --git a/libc/test/src/stdbit/stdc_count_zeros_ul_test.cpp b/libc/test/src/stdbit/stdc_count_zeros_ul_test.cpp new file mode 100644 index 0000000000000..60f01f3166573 --- /dev/null +++ b/libc/test/src/stdbit/stdc_count_zeros_ul_test.cpp @@ -0,0 +1,21 @@ +//===-- Unittests for stdc_count_zeros_ul ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/limits.h" +#include "src/stdbit/stdc_count_zeros_ul.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcStdcCountZerosUlTest, Zero) { + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_ul(0U), + static_cast(ULONG_WIDTH)); +} + +TEST(LlvmLibcStdcCountZerosUlTest, Ones) { + for (unsigned i = 0U; i != ULONG_WIDTH; ++i) + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_ul(ULONG_MAX >> i), i); +} diff --git a/libc/test/src/stdbit/stdc_count_zeros_ull_test.cpp b/libc/test/src/stdbit/stdc_count_zeros_ull_test.cpp new file mode 100644 index 0000000000000..7da2493c0e8f7 --- /dev/null +++ b/libc/test/src/stdbit/stdc_count_zeros_ull_test.cpp @@ -0,0 +1,21 @@ +//===-- Unittests for stdc_count_zeros_ull --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/limits.h" +#include "src/stdbit/stdc_count_zeros_ull.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcStdcCountZerosUllTest, Zero) { + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_ull(0U), + static_cast(ULLONG_WIDTH)); +} + +TEST(LlvmLibcStdcCountZerosUllTest, Ones) { + for (unsigned i = 0U; i != ULLONG_WIDTH; ++i) + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_ull(ULLONG_MAX >> i), i); +} diff --git a/libc/test/src/stdbit/stdc_count_zeros_us_test.cpp b/libc/test/src/stdbit/stdc_count_zeros_us_test.cpp new file mode 100644 index 0000000000000..e8690cb989e3b --- /dev/null +++ b/libc/test/src/stdbit/stdc_count_zeros_us_test.cpp @@ -0,0 +1,21 @@ +//===-- Unittests for stdc_count_zeros_us ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/limits.h" +#include "src/stdbit/stdc_count_zeros_us.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcStdcCountZerosUsTest, Zero) { + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_us(0U), + static_cast(USHRT_WIDTH)); +} + +TEST(LlvmLibcStdcCountZerosUsTest, Ones) { + for (unsigned i = 0U; i != USHRT_WIDTH; ++i) + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_zeros_us(USHRT_MAX >> i), i); +} From 78275ef0a33a808d30285603585300ce57d7ef26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 26 Feb 2024 18:22:48 +0100 Subject: [PATCH 341/546] [clang][Interp] Pick the right APInt constructor on Windows The second parameter needs to be a uint64_t and nothing else. 
This broke windows builders, see https://github.com/llvm/llvm-project/commit/264d828ea6399c31c210b67a050fbf084634da6a --- clang/lib/AST/Interp/InterpBuiltin.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp index 8f45c789296b6..760219e0ffa9f 100644 --- a/clang/lib/AST/Interp/InterpBuiltin.cpp +++ b/clang/lib/AST/Interp/InterpBuiltin.cpp @@ -85,7 +85,8 @@ static void pushInteger(InterpState &S, T Val, QualType QT) { pushInteger(S, APSInt(Val, !std::is_signed_v), QT); else pushInteger(S, - APSInt(APInt(sizeof(T) * 8, Val, std::is_signed_v), + APSInt(APInt(sizeof(T) * 8, static_cast(Val), + std::is_signed_v), !std::is_signed_v), QT); } @@ -464,7 +465,7 @@ static bool interp__builtin_popcount(InterpState &S, CodePtr OpPC, PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); APSInt Val = peekToAPSInt(S.Stk, ArgT); - pushInteger(S, APSInt(APInt(32, Val.popcount())), Call->getType()); + pushInteger(S, Val.popcount(), Call->getType()); return true; } @@ -805,7 +806,7 @@ static bool interp__builtin_clz(InterpState &S, CodePtr OpPC, if (ZeroIsUndefined && Val == 0) return false; - pushInteger(S, APSInt(APInt(32, Val.countl_zero())), Call->getType()); + pushInteger(S, Val.countl_zero(), Call->getType()); return true; } From be024307075c37f057fd51d4de0e9c2443b51686 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Mon, 26 Feb 2024 18:24:53 +0100 Subject: [PATCH 342/546] Thread safety analysis: provide printSCFG definition. (#80277) I called this function when investigating the issue (https://github.com/llvm/llvm-project/issues/78131), and I was surprised to see the definition is commented out. I think it makes sense to provide the definition even though the implementation is not stable. 
--- clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h | 2 ++ clang/lib/Analysis/ThreadSafetyCommon.cpp | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h b/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h index 13e37ac2b56b6..7bdb9052e57e7 100644 --- a/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h +++ b/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h @@ -527,8 +527,10 @@ class SExprBuilder { BlockInfo *CurrentBlockInfo = nullptr; }; +#ifndef NDEBUG // Dump an SCFG to llvm::errs(). void printSCFG(CFGWalker &Walker); +#endif // NDEBUG } // namespace threadSafety } // namespace clang diff --git a/clang/lib/Analysis/ThreadSafetyCommon.cpp b/clang/lib/Analysis/ThreadSafetyCommon.cpp index 2fe0f85897c3b..33f1f466df244 100644 --- a/clang/lib/Analysis/ThreadSafetyCommon.cpp +++ b/clang/lib/Analysis/ThreadSafetyCommon.cpp @@ -995,7 +995,7 @@ void SExprBuilder::exitCFG(const CFGBlock *Last) { IncompleteArgs.clear(); } -/* +#ifndef NDEBUG namespace { class TILPrinter : @@ -1016,4 +1016,4 @@ void printSCFG(CFGWalker &Walker) { } // namespace threadSafety } // namespace clang -*/ +#endif // NDEBUG From d8406d48a8c94f88613e5613a7e0d6526a520428 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 26 Feb 2024 09:25:24 -0800 Subject: [PATCH 343/546] [libc][stdbit] implement stdc_count_ones (C23) (#82444) --- libc/config/linux/x86_64/entrypoints.txt | 5 +++++ libc/docs/stdbit.rst | 12 +++++------ libc/include/llvm-libc-macros/stdbit-macros.h | 20 ++++++++++++++++++ libc/spec/stdc.td | 10 +++++++-- libc/src/stdbit/CMakeLists.txt | 1 + libc/src/stdbit/stdc_count_ones_uc.cpp | 20 ++++++++++++++++++ libc/src/stdbit/stdc_count_ones_uc.h | 18 ++++++++++++++++ libc/src/stdbit/stdc_count_ones_ui.cpp | 20 ++++++++++++++++++ libc/src/stdbit/stdc_count_ones_ui.h | 18 ++++++++++++++++ libc/src/stdbit/stdc_count_ones_ul.cpp | 20 ++++++++++++++++++ 
libc/src/stdbit/stdc_count_ones_ul.h | 18 ++++++++++++++++ libc/src/stdbit/stdc_count_ones_ull.cpp | 20 ++++++++++++++++++ libc/src/stdbit/stdc_count_ones_ull.h | 18 ++++++++++++++++ libc/src/stdbit/stdc_count_ones_us.cpp | 20 ++++++++++++++++++ libc/src/stdbit/stdc_count_ones_us.h | 18 ++++++++++++++++ libc/test/include/stdbit_test.cpp | 13 ++++++++++++ libc/test/src/stdbit/CMakeLists.txt | 1 + .../src/stdbit/stdc_count_ones_uc_test.cpp | 21 +++++++++++++++++++ .../src/stdbit/stdc_count_ones_ui_test.cpp | 21 +++++++++++++++++++ .../src/stdbit/stdc_count_ones_ul_test.cpp | 21 +++++++++++++++++++ .../src/stdbit/stdc_count_ones_ull_test.cpp | 21 +++++++++++++++++++ .../src/stdbit/stdc_count_ones_us_test.cpp | 21 +++++++++++++++++++ 22 files changed, 349 insertions(+), 8 deletions(-) create mode 100644 libc/src/stdbit/stdc_count_ones_uc.cpp create mode 100644 libc/src/stdbit/stdc_count_ones_uc.h create mode 100644 libc/src/stdbit/stdc_count_ones_ui.cpp create mode 100644 libc/src/stdbit/stdc_count_ones_ui.h create mode 100644 libc/src/stdbit/stdc_count_ones_ul.cpp create mode 100644 libc/src/stdbit/stdc_count_ones_ul.h create mode 100644 libc/src/stdbit/stdc_count_ones_ull.cpp create mode 100644 libc/src/stdbit/stdc_count_ones_ull.h create mode 100644 libc/src/stdbit/stdc_count_ones_us.cpp create mode 100644 libc/src/stdbit/stdc_count_ones_us.h create mode 100644 libc/test/src/stdbit/stdc_count_ones_uc_test.cpp create mode 100644 libc/test/src/stdbit/stdc_count_ones_ui_test.cpp create mode 100644 libc/test/src/stdbit/stdc_count_ones_ul_test.cpp create mode 100644 libc/test/src/stdbit/stdc_count_ones_ull_test.cpp create mode 100644 libc/test/src/stdbit/stdc_count_ones_us_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 91e1bd73ec73f..f2a224d45bbae 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -137,6 +137,11 @@ set(TARGET_LIBC_ENTRYPOINTS 
libc.src.stdbit.stdc_count_zeros_ui libc.src.stdbit.stdc_count_zeros_ul libc.src.stdbit.stdc_count_zeros_ull + libc.src.stdbit.stdc_count_ones_uc + libc.src.stdbit.stdc_count_ones_us + libc.src.stdbit.stdc_count_ones_ui + libc.src.stdbit.stdc_count_ones_ul + libc.src.stdbit.stdc_count_ones_ull # stdlib.h entrypoints libc.src.stdlib.abs diff --git a/libc/docs/stdbit.rst b/libc/docs/stdbit.rst index 3597cdc3432fd..0308caeb92932 100644 --- a/libc/docs/stdbit.rst +++ b/libc/docs/stdbit.rst @@ -76,11 +76,11 @@ stdc_count_zeros_us |check| stdc_count_zeros_ui |check| stdc_count_zeros_ul |check| stdc_count_zeros_ull |check| -stdc_count_ones_uc -stdc_count_ones_us -stdc_count_ones_ui -stdc_count_ones_ul -stdc_count_ones_ull +stdc_count_ones_uc |check| +stdc_count_ones_us |check| +stdc_count_ones_ui |check| +stdc_count_ones_ul |check| +stdc_count_ones_ull |check| stdc_has_single_bit_uc stdc_has_single_bit_us stdc_has_single_bit_ui @@ -123,7 +123,7 @@ stdc_first_leading_one |check| stdc_first_trailing_zero |check| stdc_first_trailing_one |check| stdc_count_zeros |check| -stdc_count_ones +stdc_count_ones |check| stdc_has_single_bit stdc_bit_width stdc_bit_floor diff --git a/libc/include/llvm-libc-macros/stdbit-macros.h b/libc/include/llvm-libc-macros/stdbit-macros.h index 3b2aa9b932b83..5ee152e105f77 100644 --- a/libc/include/llvm-libc-macros/stdbit-macros.h +++ b/libc/include/llvm-libc-macros/stdbit-macros.h @@ -144,6 +144,19 @@ inline unsigned stdc_count_zeros(unsigned long x) { inline unsigned stdc_count_zeros(unsigned long long x) { return stdc_count_zeros_ull(x); } +inline unsigned stdc_count_ones(unsigned char x) { + return stdc_count_ones_uc(x); +} +inline unsigned stdc_count_ones(unsigned short x) { + return stdc_count_ones_us(x); +} +inline unsigned stdc_count_ones(unsigned x) { return stdc_count_ones_ui(x); } +inline unsigned stdc_count_ones(unsigned long x) { + return stdc_count_ones_ul(x); +} +inline unsigned stdc_count_ones(unsigned long long x) { + return 
stdc_count_ones_ull(x); +} #else #define stdc_leading_zeros(x) \ _Generic((x), \ @@ -208,6 +221,13 @@ inline unsigned stdc_count_zeros(unsigned long long x) { unsigned: stdc_count_zeros_ui, \ unsigned long: stdc_count_zeros_ul, \ unsigned long long: stdc_count_zeros_ull)(x) +#define stdc_count_ones(x) \ + _Generic((x), \ + unsigned char: stdc_count_ones_uc, \ + unsigned short: stdc_count_ones_us, \ + unsigned: stdc_count_ones_ui, \ + unsigned long: stdc_count_ones_ul, \ + unsigned long long: stdc_count_ones_ull)(x) #endif // __cplusplus #endif // __LLVM_LIBC_MACROS_STDBIT_MACROS_H diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 523a08c6a8c3c..6b292588b6c7a 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -791,7 +791,8 @@ def StdC : StandardSpec<"stdc"> { Macro<"stdc_first_leading_one">, Macro<"stdc_first_trailing_zero">, Macro<"stdc_first_trailing_one">, - Macro<"stdc_count_zeros"> + Macro<"stdc_count_zeros">, + Macro<"stdc_count_ones"> ], // Macros [], // Types [], // Enumerations @@ -835,7 +836,12 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"stdc_count_zeros_us", RetValSpec, [ArgSpec]>, FunctionSpec<"stdc_count_zeros_ui", RetValSpec, [ArgSpec]>, FunctionSpec<"stdc_count_zeros_ul", RetValSpec, [ArgSpec]>, - FunctionSpec<"stdc_count_zeros_ull", RetValSpec, [ArgSpec]> + FunctionSpec<"stdc_count_zeros_ull", RetValSpec, [ArgSpec]>, + FunctionSpec<"stdc_count_ones_uc", RetValSpec, [ArgSpec]>, + FunctionSpec<"stdc_count_ones_us", RetValSpec, [ArgSpec]>, + FunctionSpec<"stdc_count_ones_ui", RetValSpec, [ArgSpec]>, + FunctionSpec<"stdc_count_ones_ul", RetValSpec, [ArgSpec]>, + FunctionSpec<"stdc_count_ones_ull", RetValSpec, [ArgSpec]> ] // Functions >; diff --git a/libc/src/stdbit/CMakeLists.txt b/libc/src/stdbit/CMakeLists.txt index a51ab5c21585f..5fb77d21e57a1 100644 --- a/libc/src/stdbit/CMakeLists.txt +++ b/libc/src/stdbit/CMakeLists.txt @@ -8,6 +8,7 @@ set(prefixes first_trailing_zero first_trailing_one count_zeros + count_ones ) set(suffixes 
c s i l ll) foreach(prefix IN LISTS prefixes) diff --git a/libc/src/stdbit/stdc_count_ones_uc.cpp b/libc/src/stdbit/stdc_count_ones_uc.cpp new file mode 100644 index 0000000000000..5a7314caa3baa --- /dev/null +++ b/libc/src/stdbit/stdc_count_ones_uc.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_count_ones_uc ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_count_ones_uc.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(unsigned, stdc_count_ones_uc, (unsigned char value)) { + return static_cast(cpp::count_ones(value)); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_count_ones_uc.h b/libc/src/stdbit/stdc_count_ones_uc.h new file mode 100644 index 0000000000000..eed3ee5f181b6 --- /dev/null +++ b/libc/src/stdbit/stdc_count_ones_uc.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_count_ones_uc -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UC_H +#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UC_H + +namespace LIBC_NAMESPACE { + +unsigned stdc_count_ones_uc(unsigned char value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UC_H diff --git a/libc/src/stdbit/stdc_count_ones_ui.cpp b/libc/src/stdbit/stdc_count_ones_ui.cpp new file mode 100644 index 0000000000000..289f4bac31f7b --- /dev/null +++ b/libc/src/stdbit/stdc_count_ones_ui.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_count_ones_ui ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_count_ones_ui.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(unsigned, stdc_count_ones_ui, (unsigned value)) { + return static_cast(cpp::count_ones(value)); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_count_ones_ui.h b/libc/src/stdbit/stdc_count_ones_ui.h new file mode 100644 index 0000000000000..1f7ccb9c502f6 --- /dev/null +++ b/libc/src/stdbit/stdc_count_ones_ui.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_count_ones_ui -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UI_H +#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UI_H + +namespace LIBC_NAMESPACE { + +unsigned stdc_count_ones_ui(unsigned value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UI_H diff --git a/libc/src/stdbit/stdc_count_ones_ul.cpp b/libc/src/stdbit/stdc_count_ones_ul.cpp new file mode 100644 index 0000000000000..83f3279d79193 --- /dev/null +++ b/libc/src/stdbit/stdc_count_ones_ul.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_count_ones_ul ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_count_ones_ul.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(unsigned, stdc_count_ones_ul, (unsigned long value)) { + return static_cast(cpp::count_ones(value)); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_count_ones_ul.h b/libc/src/stdbit/stdc_count_ones_ul.h new file mode 100644 index 0000000000000..bde349a2fb94f --- /dev/null +++ b/libc/src/stdbit/stdc_count_ones_ul.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_count_ones_ul -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UL_H +#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UL_H + +namespace LIBC_NAMESPACE { + +unsigned stdc_count_ones_ul(unsigned long value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_UL_H diff --git a/libc/src/stdbit/stdc_count_ones_ull.cpp b/libc/src/stdbit/stdc_count_ones_ull.cpp new file mode 100644 index 0000000000000..104788aaf2126 --- /dev/null +++ b/libc/src/stdbit/stdc_count_ones_ull.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_count_ones_ull -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_count_ones_ull.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(unsigned, stdc_count_ones_ull, (unsigned long long value)) { + return static_cast(cpp::count_ones(value)); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_count_ones_ull.h b/libc/src/stdbit/stdc_count_ones_ull.h new file mode 100644 index 0000000000000..830239f8874b0 --- /dev/null +++ b/libc/src/stdbit/stdc_count_ones_ull.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_count_ones_ull -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_ULL_H +#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_ULL_H + +namespace LIBC_NAMESPACE { + +unsigned stdc_count_ones_ull(unsigned long long value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_ULL_H diff --git a/libc/src/stdbit/stdc_count_ones_us.cpp b/libc/src/stdbit/stdc_count_ones_us.cpp new file mode 100644 index 0000000000000..4b6ff0b94b626 --- /dev/null +++ b/libc/src/stdbit/stdc_count_ones_us.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_count_ones_us ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_count_ones_us.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(unsigned, stdc_count_ones_us, (unsigned short value)) { + return static_cast(cpp::count_ones(value)); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_count_ones_us.h b/libc/src/stdbit/stdc_count_ones_us.h new file mode 100644 index 0000000000000..08fd4e76eaae6 --- /dev/null +++ b/libc/src/stdbit/stdc_count_ones_us.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_count_ones_us -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_US_H +#define LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_US_H + +namespace LIBC_NAMESPACE { + +unsigned stdc_count_ones_us(unsigned short value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_COUNT_ONES_US_H diff --git a/libc/test/include/stdbit_test.cpp b/libc/test/include/stdbit_test.cpp index 44e4577d65e50..46019075a7c10 100644 --- a/libc/test/include/stdbit_test.cpp +++ b/libc/test/include/stdbit_test.cpp @@ -76,6 +76,11 @@ unsigned stdc_count_zeros_us(unsigned short) noexcept { return 0x2BU; } unsigned stdc_count_zeros_ui(unsigned) noexcept { return 0x2CU; } unsigned stdc_count_zeros_ul(unsigned long) noexcept { return 0x2DU; } unsigned stdc_count_zeros_ull(unsigned long long) noexcept { return 0x2FU; } +unsigned stdc_count_ones_uc(unsigned char) noexcept { return 0x3AU; } +unsigned stdc_count_ones_us(unsigned short) noexcept { return 0x3BU; } +unsigned stdc_count_ones_ui(unsigned) noexcept { return 0x3CU; } +unsigned stdc_count_ones_ul(unsigned long) noexcept { return 0x3DU; } +unsigned stdc_count_ones_ull(unsigned long long) noexcept { return 0x3FU; } } #include "include/llvm-libc-macros/stdbit-macros.h" @@ -151,3 +156,11 @@ TEST(LlvmLibcStdbitTest, TypeGenericMacroCountZeros) { EXPECT_EQ(stdc_count_zeros(0UL), 0x2DU); EXPECT_EQ(stdc_count_zeros(0ULL), 0x2FU); } + +TEST(LlvmLibcStdbitTest, TypeGenericMacroCountOnes) { + EXPECT_EQ(stdc_count_ones(static_cast(0U)), 0x3AU); + EXPECT_EQ(stdc_count_ones(static_cast(0U)), 0x3BU); + EXPECT_EQ(stdc_count_ones(0U), 0x3CU); + EXPECT_EQ(stdc_count_ones(0UL), 0x3DU); + EXPECT_EQ(stdc_count_ones(0ULL), 0x3FU); +} diff --git a/libc/test/src/stdbit/CMakeLists.txt b/libc/test/src/stdbit/CMakeLists.txt index fafc26cae7344..659e575fedea2 100644 --- a/libc/test/src/stdbit/CMakeLists.txt +++ 
b/libc/test/src/stdbit/CMakeLists.txt @@ -10,6 +10,7 @@ set(prefixes first_trailing_zero first_trailing_one count_zeros + count_ones ) set(suffixes c s i l ll) foreach(prefix IN LISTS prefixes) diff --git a/libc/test/src/stdbit/stdc_count_ones_uc_test.cpp b/libc/test/src/stdbit/stdc_count_ones_uc_test.cpp new file mode 100644 index 0000000000000..791288154bae7 --- /dev/null +++ b/libc/test/src/stdbit/stdc_count_ones_uc_test.cpp @@ -0,0 +1,21 @@ +//===-- Unittests for stdc_count_ones_uc ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/limits.h" +#include "src/stdbit/stdc_count_ones_uc.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcStdcCountOnesUcTest, Zero) { + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_uc(0U), 0U); +} + +TEST(LlvmLibcStdcCountOnesUcTest, Ones) { + for (unsigned i = 0U; i != UCHAR_WIDTH; ++i) + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_uc(UCHAR_MAX >> i), + UCHAR_WIDTH - i); +} diff --git a/libc/test/src/stdbit/stdc_count_ones_ui_test.cpp b/libc/test/src/stdbit/stdc_count_ones_ui_test.cpp new file mode 100644 index 0000000000000..198e366584217 --- /dev/null +++ b/libc/test/src/stdbit/stdc_count_ones_ui_test.cpp @@ -0,0 +1,21 @@ +//===-- Unittests for stdc_count_ones_ui ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/limits.h" +#include "src/stdbit/stdc_count_ones_ui.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcStdcCountOnesUiTest, Zero) { + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_ui(0), 0U); +} + +TEST(LlvmLibcStdcCountOnesUiTest, Ones) { + for (unsigned i = 0U; i != UINT_WIDTH; ++i) + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_ui(UINT_MAX >> i), + UINT_WIDTH - i); +} diff --git a/libc/test/src/stdbit/stdc_count_ones_ul_test.cpp b/libc/test/src/stdbit/stdc_count_ones_ul_test.cpp new file mode 100644 index 0000000000000..ce9d6eb081d41 --- /dev/null +++ b/libc/test/src/stdbit/stdc_count_ones_ul_test.cpp @@ -0,0 +1,21 @@ +//===-- Unittests for stdc_count_ones_ul ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/limits.h" +#include "src/stdbit/stdc_count_ones_ul.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcStdcCountOnesUlTest, Zero) { + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_ul(0UL), 0U); +} + +TEST(LlvmLibcStdcCountOnesUlTest, Ones) { + for (unsigned i = 0U; i != ULONG_WIDTH; ++i) + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_ul(ULONG_MAX >> i), + ULONG_WIDTH - i); +} diff --git a/libc/test/src/stdbit/stdc_count_ones_ull_test.cpp b/libc/test/src/stdbit/stdc_count_ones_ull_test.cpp new file mode 100644 index 0000000000000..a0e69459c5a88 --- /dev/null +++ b/libc/test/src/stdbit/stdc_count_ones_ull_test.cpp @@ -0,0 +1,21 @@ +//===-- Unittests for stdc_count_ones_ull ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/limits.h" +#include "src/stdbit/stdc_count_ones_ull.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcStdcCountOnesUllTest, Zero) { + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_ull(0ULL), 0U); +} + +TEST(LlvmLibcStdcCountOnesUllTest, Ones) { + for (unsigned i = 0U; i != ULLONG_WIDTH; ++i) + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_ull(ULLONG_MAX >> i), + ULLONG_WIDTH - i); +} diff --git a/libc/test/src/stdbit/stdc_count_ones_us_test.cpp b/libc/test/src/stdbit/stdc_count_ones_us_test.cpp new file mode 100644 index 0000000000000..19d3426062851 --- /dev/null +++ b/libc/test/src/stdbit/stdc_count_ones_us_test.cpp @@ -0,0 +1,21 @@ +//===-- Unittests for stdc_count_ones_us ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/limits.h" +#include "src/stdbit/stdc_count_ones_us.h" +#include "test/UnitTest/Test.h" + +TEST(LlvmLibcStdcCountOnesUiTest, Zero) { + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_us(0), 0U); +} + +TEST(LlvmLibcStdcCountOnesUsTest, Ones) { + for (unsigned i = 0U; i != USHRT_WIDTH; ++i) + EXPECT_EQ(LIBC_NAMESPACE::stdc_count_ones_us(USHRT_MAX >> i), + USHRT_WIDTH - i); +} From e2d80a3d025875766ac879d3d0c480cd03994d35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Mon, 26 Feb 2024 09:29:46 -0800 Subject: [PATCH 344/546] [flang][cuda] Make sure CUDA attribute are imported when using module variable (#82844) CUDA attribute are correctly propagated to the module file but were not imported currently so they did not appear on the hlfir.declare and fir.global operations for module variables. 
--- flang/lib/Lower/ConvertVariable.cpp | 7 +++++-- flang/test/Lower/CUDA/cuda-mod.cuf | 9 +++++++++ flang/test/Lower/CUDA/cuda-module-use.cuf | 15 +++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 flang/test/Lower/CUDA/cuda-mod.cuf create mode 100644 flang/test/Lower/CUDA/cuda-module-use.cuf diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index b2279a319fe92..a673a18cd20d9 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -172,9 +172,12 @@ static fir::GlobalOp declareGlobal(Fortran::lower::AbstractConverter &converter, !Fortran::semantics::IsProcedurePointer(ultimate)) mlir::emitError(loc, "processing global declaration: symbol '") << toStringRef(sym.name()) << "' has unexpected details\n"; + fir::CUDADataAttributeAttr cudaAttr = + Fortran::lower::translateSymbolCUDADataAttribute( + converter.getFirOpBuilder().getContext(), sym); return builder.createGlobal(loc, converter.genType(var), globalName, linkage, mlir::Attribute{}, isConstant(ultimate), - var.isTarget()); + var.isTarget(), cudaAttr); } /// Temporary helper to catch todos in initial data target lowering. @@ -1586,7 +1589,7 @@ fir::FortranVariableFlagsAttr Fortran::lower::translateSymbolAttributes( fir::CUDADataAttributeAttr Fortran::lower::translateSymbolCUDADataAttribute( mlir::MLIRContext *mlirContext, const Fortran::semantics::Symbol &sym) { std::optional cudaAttr = - Fortran::semantics::GetCUDADataAttr(&sym); + Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate()); return fir::getCUDADataAttribute(mlirContext, cudaAttr); } diff --git a/flang/test/Lower/CUDA/cuda-mod.cuf b/flang/test/Lower/CUDA/cuda-mod.cuf new file mode 100644 index 0000000000000..2cc6439789a27 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-mod.cuf @@ -0,0 +1,9 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Simple module to test module use in + +module cuf_mod + real, device :: md +end module + +! 
CHECK: fir.global @_QMcuf_modEmd {cuda_attr = #fir.cuda} : f32 diff --git a/flang/test/Lower/CUDA/cuda-module-use.cuf b/flang/test/Lower/CUDA/cuda-module-use.cuf new file mode 100644 index 0000000000000..43ab0f6ff8d68 --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-module-use.cuf @@ -0,0 +1,15 @@ +! RUN: bbc -emit-hlfir -fcuda %S/cuda-mod.cuf +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test importing module with variable with CUDA attributes. + +subroutine sub1() + use cuf_mod + md = 1.0 +end + +! CHECK-LABEL: func.func @_QPsub1() +! CHECK: %[[ADDR:.*]] = fir.address_of(@_QMcuf_modEmd) : !fir.ref +! CHECK: %{{.*}}:2 = hlfir.declare %[[ADDR]] {cuda_attr = #fir.cuda, uniq_name = "_QMcuf_modEmd"} : (!fir.ref) -> (!fir.ref, !fir.ref) + +! CHECK: fir.global @_QMcuf_modEmd {cuda_attr = #fir.cuda} : f32 From 695b630ae16a1b243d9c72cc275b00cf0c8af2df Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Mon, 26 Feb 2024 09:44:01 -0800 Subject: [PATCH 345/546] [ThinLTO] NFC: Merge duplicated functions together (#82421) --- llvm/include/llvm/LTO/LTO.h | 2 ++ llvm/lib/LTO/LTO.cpp | 15 +++++++++++++++ llvm/lib/LTO/LTOCodeGenerator.cpp | 14 ++------------ llvm/lib/LTO/ThinLTOCodeGenerator.cpp | 13 ++----------- 4 files changed, 21 insertions(+), 23 deletions(-) diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index 1050f24161fb9..94996ae89e35d 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -75,6 +75,8 @@ void computeLTOCacheKey( namespace lto { +StringLiteral getThinLTODefaultCPU(const Triple &TheTriple); + /// Given the original \p Path to an output file, replace any path /// prefix matching \p OldPrefix with \p NewPrefix. Also, create the /// resulting directory if it does not yet exist. 
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index b5062580169da..34a49c8588b2f 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1557,6 +1557,21 @@ ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism, }; } +StringLiteral lto::getThinLTODefaultCPU(const Triple &TheTriple) { + if (!TheTriple.isOSDarwin()) + return ""; + if (TheTriple.getArch() == Triple::x86_64) + return "core2"; + if (TheTriple.getArch() == Triple::x86) + return "yonah"; + if (TheTriple.isArm64e()) + return "apple-a12"; + if (TheTriple.getArch() == Triple::aarch64 || + TheTriple.getArch() == Triple::aarch64_32) + return "cyclone"; + return ""; +} + // Given the original \p Path to an output file, replace any path // prefix matching \p OldPrefix with \p NewPrefix. Also, create the // resulting directory if it does not yet exist. diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp index 52d8fff14be9c..19b6f7e787923 100644 --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -409,18 +409,8 @@ bool LTOCodeGenerator::determineTarget() { SubtargetFeatures Features(join(Config.MAttrs, "")); Features.getDefaultSubtargetFeatures(Triple); FeatureStr = Features.getString(); - // Set a default CPU for Darwin triples. - if (Config.CPU.empty() && Triple.isOSDarwin()) { - if (Triple.getArch() == llvm::Triple::x86_64) - Config.CPU = "core2"; - else if (Triple.getArch() == llvm::Triple::x86) - Config.CPU = "yonah"; - else if (Triple.isArm64e()) - Config.CPU = "apple-a12"; - else if (Triple.getArch() == llvm::Triple::aarch64 || - Triple.getArch() == llvm::Triple::aarch64_32) - Config.CPU = "cyclone"; - } + if (Config.CPU.empty()) + Config.CPU = lto::getThinLTODefaultCPU(Triple); // If data-sections is not explicitly set or unset, set data-sections by // default to match the behaviour of lld and gold plugin. 
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index 535faf5f78047..8fd181846f0c4 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -539,17 +539,8 @@ static void resolvePrevailingInIndex( // Initialize the TargetMachine builder for a given Triple static void initTMBuilder(TargetMachineBuilder &TMBuilder, const Triple &TheTriple) { - // Set a default CPU for Darwin triples (copied from LTOCodeGenerator). - // FIXME this looks pretty terrible... - if (TMBuilder.MCpu.empty() && TheTriple.isOSDarwin()) { - if (TheTriple.getArch() == llvm::Triple::x86_64) - TMBuilder.MCpu = "core2"; - else if (TheTriple.getArch() == llvm::Triple::x86) - TMBuilder.MCpu = "yonah"; - else if (TheTriple.getArch() == llvm::Triple::aarch64 || - TheTriple.getArch() == llvm::Triple::aarch64_32) - TMBuilder.MCpu = "cyclone"; - } + if (TMBuilder.MCpu.empty()) + TMBuilder.MCpu = lto::getThinLTODefaultCPU(TheTriple); TMBuilder.TheTriple = std::move(TheTriple); } From b876596a76cdc183439b36455d26883b67f8ee51 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 26 Feb 2024 09:55:02 -0800 Subject: [PATCH 346/546] [Driver] Improve error when a compiler-rt library is not found (#81037) BSD, Linux, and z/OS enable `LLVM_ENABLE_PER_TARGET_RUNTIME_DIR` by default. 
When a compiler-rt library is not found, we currently report an incorrect filename `libclang_rt.XXX-$arch.a` ``` % /tmp/Debug/bin/clang++ a.cc -fsanitize=address -o a ld.lld: error: cannot open /tmp/Debug/lib/clang/19/lib/linux/libclang_rt.asan-x86_64.a: No such file or directory clang++: error: linker command failed with exit code 1 (use -v to see invocation) ``` With this change, we will correctly report: ``` % /tmp/Debug/bin/clang++ a.cc -fsanitize=address -o a ld.lld: error: cannot open /tmp/Debug/lib/clang/19/lib/x86_64-unknown-linux-gnu/libclang_rt.asan.a: No such file or directory clang++: error: linker command failed with exit code 1 (use -v to see invocation) ``` Link: https://discourse.llvm.org/t/runtime-directory-fallback/76860 --- clang/lib/Driver/ToolChain.cpp | 18 ++- .../libclang_rt.hwasan.a} | 0 .../libclang_rt.hwasan.a.syms} | 0 .../libclang_rt.asan.a} | 0 .../libclang_rt.asan.a.syms} | 0 .../clang_rt.crtbegin.o} | 0 .../clang_rt.crtend.o} | 0 .../clang_rt.crtbegin.o} | 0 .../clang_rt.crtend.o} | 0 .../libclang_rt.asan.a} | 0 .../libclang_rt.asan.a.syms} | 0 .../libclang_rt.hwasan.a} | 0 .../libclang_rt.hwasan.a.syms} | 0 .../x86_64-unknown-linux/libclang_rt.msan.a | 0 .../libclang_rt.msan.a.syms | 0 .../libclang_rt.msan_cxx.a | 0 .../libclang_rt.msan_cxx.a.syms | 0 .../x86_64-unknown-linux/libclang_rt.tsan.a | 0 .../libclang_rt.tsan.a.syms | 0 .../libclang_rt.tsan_cxx.a | 0 .../libclang_rt.tsan_cxx.a.syms | 0 clang/test/Driver/arch-specific-libdir.c | 2 +- clang/test/Driver/arm-compiler-rt.c | 2 +- clang/test/Driver/baremetal-sysroot.cpp | 2 +- clang/test/Driver/baremetal.cpp | 38 +++--- clang/test/Driver/compiler-rt-unwind.c | 8 +- clang/test/Driver/coverage-ld.c | 4 +- clang/test/Driver/fuchsia.c | 6 +- clang/test/Driver/instrprof-ld.c | 10 +- clang/test/Driver/linux-ld.c | 16 +-- .../Driver/print-libgcc-file-name-clangrt.c | 2 +- clang/test/Driver/sanitizer-ld.c | 110 +++++++++--------- 32 files changed, 114 insertions(+), 104 deletions(-) 
rename clang/test/Driver/Inputs/resource_dir/lib/{linux/libclang_rt.asan-i386.a.syms => aarch64-unknown-linux/libclang_rt.hwasan.a} (100%) rename clang/test/Driver/Inputs/resource_dir/lib/{linux/libclang_rt.asan-x86_64.a.syms => aarch64-unknown-linux/libclang_rt.hwasan.a.syms} (100%) rename clang/test/Driver/Inputs/resource_dir/lib/{linux/libclang_rt.hwasan-aarch64.a.syms => i386-unknown-linux/libclang_rt.asan.a} (100%) rename clang/test/Driver/Inputs/resource_dir/lib/{linux/libclang_rt.hwasan-x86_64.a.syms => i386-unknown-linux/libclang_rt.asan.a.syms} (100%) rename clang/test/Driver/Inputs/resource_dir/lib/{linux/libclang_rt.msan-x86_64.a.syms => i686-unknown-linux/clang_rt.crtbegin.o} (100%) rename clang/test/Driver/Inputs/resource_dir/lib/{linux/libclang_rt.msan_cxx-x86_64.a.syms => i686-unknown-linux/clang_rt.crtend.o} (100%) rename clang/test/Driver/Inputs/resource_dir/lib/{linux/libclang_rt.tsan-x86_64.a.syms => x86_64-unknown-linux/clang_rt.crtbegin.o} (100%) rename clang/test/Driver/Inputs/resource_dir/lib/{linux/libclang_rt.tsan_cxx-x86_64.a.syms => x86_64-unknown-linux/clang_rt.crtend.o} (100%) rename clang/test/Driver/Inputs/resource_dir/lib/{linux/libclang_rt.ubsan-i386.a.syms => x86_64-unknown-linux/libclang_rt.asan.a} (100%) rename clang/test/Driver/Inputs/resource_dir/lib/{linux/libclang_rt.ubsan-x86_64.a.syms => x86_64-unknown-linux/libclang_rt.asan.a.syms} (100%) rename clang/test/Driver/Inputs/resource_dir/lib/{linux/libclang_rt.ubsan_cxx-i386.a.syms => x86_64-unknown-linux/libclang_rt.hwasan.a} (100%) rename clang/test/Driver/Inputs/resource_dir/lib/{linux/libclang_rt.ubsan_cxx-x86_64.a.syms => x86_64-unknown-linux/libclang_rt.hwasan.a.syms} (100%) create mode 100644 clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan.a create mode 100644 clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan.a.syms create mode 100644 
clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan_cxx.a create mode 100644 clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan_cxx.a.syms create mode 100644 clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan.a create mode 100644 clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan.a.syms create mode 100644 clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan_cxx.a create mode 100644 clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan_cxx.a.syms diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index bd854aae35d4c..08b1fd01b3c0a 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -681,19 +681,29 @@ std::string ToolChain::getCompilerRT(const ArgList &Args, StringRef Component, // Check for runtime files in the new layout without the architecture first. std::string CRTBasename = buildCompilerRTBasename(Args, Component, Type, /*AddArch=*/false); + SmallString<128> Path; for (const auto &LibPath : getLibraryPaths()) { SmallString<128> P(LibPath); llvm::sys::path::append(P, CRTBasename); if (getVFS().exists(P)) return std::string(P); + if (Path.empty()) + Path = P; } + if (getTriple().isOSAIX()) + Path.clear(); - // Fall back to the old expected compiler-rt name if the new one does not - // exist. + // Check the filename for the old layout if the new one does not exist. 
CRTBasename = buildCompilerRTBasename(Args, Component, Type, /*AddArch=*/true); - SmallString<128> Path(getCompilerRTPath()); - llvm::sys::path::append(Path, CRTBasename); + SmallString<128> OldPath(getCompilerRTPath()); + llvm::sys::path::append(OldPath, CRTBasename); + if (Path.empty() || getVFS().exists(OldPath)) + return std::string(OldPath); + + // If none is found, use a file name from the new layout, which may get + // printed in an error message, aiding users in knowing what Clang is + // looking for. return std::string(Path); } diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.asan-i386.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/aarch64-unknown-linux/libclang_rt.hwasan.a similarity index 100% rename from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.asan-i386.a.syms rename to clang/test/Driver/Inputs/resource_dir/lib/aarch64-unknown-linux/libclang_rt.hwasan.a diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.asan-x86_64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/aarch64-unknown-linux/libclang_rt.hwasan.a.syms similarity index 100% rename from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.asan-x86_64.a.syms rename to clang/test/Driver/Inputs/resource_dir/lib/aarch64-unknown-linux/libclang_rt.hwasan.a.syms diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.hwasan-aarch64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/i386-unknown-linux/libclang_rt.asan.a similarity index 100% rename from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.hwasan-aarch64.a.syms rename to clang/test/Driver/Inputs/resource_dir/lib/i386-unknown-linux/libclang_rt.asan.a diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.hwasan-x86_64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/i386-unknown-linux/libclang_rt.asan.a.syms similarity index 100% rename from 
clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.hwasan-x86_64.a.syms rename to clang/test/Driver/Inputs/resource_dir/lib/i386-unknown-linux/libclang_rt.asan.a.syms diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.msan-x86_64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/i686-unknown-linux/clang_rt.crtbegin.o similarity index 100% rename from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.msan-x86_64.a.syms rename to clang/test/Driver/Inputs/resource_dir/lib/i686-unknown-linux/clang_rt.crtbegin.o diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.msan_cxx-x86_64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/i686-unknown-linux/clang_rt.crtend.o similarity index 100% rename from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.msan_cxx-x86_64.a.syms rename to clang/test/Driver/Inputs/resource_dir/lib/i686-unknown-linux/clang_rt.crtend.o diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.tsan-x86_64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/clang_rt.crtbegin.o similarity index 100% rename from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.tsan-x86_64.a.syms rename to clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/clang_rt.crtbegin.o diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.tsan_cxx-x86_64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/clang_rt.crtend.o similarity index 100% rename from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.tsan_cxx-x86_64.a.syms rename to clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/clang_rt.crtend.o diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan-i386.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.asan.a similarity index 100% rename from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan-i386.a.syms 
rename to clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.asan.a diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan-x86_64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.asan.a.syms similarity index 100% rename from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan-x86_64.a.syms rename to clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.asan.a.syms diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan_cxx-i386.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.hwasan.a similarity index 100% rename from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan_cxx-i386.a.syms rename to clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.hwasan.a diff --git a/clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan_cxx-x86_64.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.hwasan.a.syms similarity index 100% rename from clang/test/Driver/Inputs/resource_dir/lib/linux/libclang_rt.ubsan_cxx-x86_64.a.syms rename to clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.hwasan.a.syms diff --git a/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan.a b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan.a new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan.a.syms new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan_cxx.a b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan_cxx.a new file mode 100644 index 
0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan_cxx.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.msan_cxx.a.syms new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan.a b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan.a new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan.a.syms new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan_cxx.a b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan_cxx.a new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan_cxx.a.syms b/clang/test/Driver/Inputs/resource_dir/lib/x86_64-unknown-linux/libclang_rt.tsan_cxx.a.syms new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/arch-specific-libdir.c b/clang/test/Driver/arch-specific-libdir.c index be643d2c8b8ac..162f9e4241260 100644 --- a/clang/test/Driver/arch-specific-libdir.c +++ b/clang/test/Driver/arch-specific-libdir.c @@ -49,4 +49,4 @@ // // Have a stricter check for no-archdir - that the driver doesn't add any // subdirectory from the provided resource directory. 
-// NO-ARCHDIR-NOT: -L[[FILE_PATH]]/Inputs/resource_dir +// NO-ARCHDIR-NOT: -L[[FILE_PATH]]/Inputs/resource_dir" diff --git a/clang/test/Driver/arm-compiler-rt.c b/clang/test/Driver/arm-compiler-rt.c index adecacbcaabf9..5e9e528400d08 100644 --- a/clang/test/Driver/arm-compiler-rt.c +++ b/clang/test/Driver/arm-compiler-rt.c @@ -3,7 +3,7 @@ // RUN: -resource-dir=%S/Inputs/resource_dir_with_arch_subdir \ // RUN: -rtlib=compiler-rt -### %s 2>&1 \ // RUN: | FileCheck %s -check-prefix ARM-EABI -// ARM-EABI: "{{[^"]*}}libclang_rt.builtins-arm.a" +// ARM-EABI: "{{[^"]*}}libclang_rt.builtins.a" // RUN: %clang -target arm-linux-gnueabi \ // RUN: --sysroot=%S/Inputs/resource_dir_with_arch_subdir \ diff --git a/clang/test/Driver/baremetal-sysroot.cpp b/clang/test/Driver/baremetal-sysroot.cpp index 46338185ffd9d..bbc608809d0e4 100644 --- a/clang/test/Driver/baremetal-sysroot.cpp +++ b/clang/test/Driver/baremetal-sysroot.cpp @@ -18,5 +18,5 @@ // CHECK-V6M-C-SAME: "-x" "c++" "{{.*}}baremetal-sysroot.cpp" // CHECK-V6M-C-NEXT: "{{[^"]*}}ld{{(\.(lld|bfd|gold))?}}{{(\.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-V6M-C-SAME: "-L{{.*}}/baremetal_default_sysroot{{[/\\]+}}bin{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+}}armv6m-none-eabi{{[/\\]+}}lib" -// CHECK-V6M-C-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" +// CHECK-V6M-C-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" // CHECK-V6M-C-SAME: "-o" "{{.*}}.o" diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp index 8baf388894eb2..657611bb3f38d 100644 --- a/clang/test/Driver/baremetal.cpp +++ b/clang/test/Driver/baremetal.cpp @@ -18,7 +18,7 @@ // CHECK-V6M-C-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL" // CHECK-V6M-C-SAME: "-T" "semihosted.lds" "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for" // CHECK-V6M-C-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib" -// CHECK-V6M-C-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" "--target2=rel" "-o" "{{.*}}.tmp.out" 
+// CHECK-V6M-C-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "--target2=rel" "-o" "{{.*}}.tmp.out" // RUN: %clang %s -### --target=armv6m-none-eabi -nostdlibinc -nobuiltininc 2>&1 \ // RUN: --sysroot=%S/Inputs/baremetal_arm | FileCheck --check-prefix=CHECK-V6M-LIBINC %s @@ -44,7 +44,7 @@ // CHECK-V6M-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL" // CHECK-V6M-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib" // CHECK-V6M-DEFAULTCXX-SAME: "-lc++" "-lc++abi" "-lunwind" -// CHECK-V6M-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" "--target2=rel" "-o" "a.out" +// CHECK-V6M-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "--target2=rel" "-o" "a.out" // RUN: %clangxx %s -### --target=armv6m-none-eabi -stdlib=libc++ 2>&1 \ // RUN: --sysroot=%S/Inputs/baremetal_arm | FileCheck --check-prefix=CHECK-V6M-LIBCXX %s @@ -54,7 +54,7 @@ // CHECK-V6M-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL" // CHECK-V6M-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib" // CHECK-V6M-LIBCXX-SAME: "-lc++" "-lc++abi" "-lunwind" -// CHECK-V6M-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" "--target2=rel" "-o" "a.out" +// CHECK-V6M-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "--target2=rel" "-o" "a.out" // RUN: %clangxx %s -### --target=armv6m-none-eabi 2>&1 \ // RUN: --sysroot=%S/Inputs/baremetal_arm \ @@ -66,7 +66,7 @@ // CHECK-V6M-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL" // CHECK-V6M-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib" // CHECK-V6M-LIBSTDCXX-SAME: "-lstdc++" "-lsupc++" "-lunwind" -// CHECK-V6M-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" "--target2=rel" "-o" "a.out" +// CHECK-V6M-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "--target2=rel" "-o" "a.out" // RUN: %clangxx %s -### --target=armv6m-none-eabi 2>&1 \ // RUN: 
--sysroot=%S/Inputs/baremetal_arm \ @@ -89,7 +89,7 @@ // CHECK-V6M-LIBCXX-USR: "{{[^"]*}}-Bstatic" // CHECK-V6M-LIBCXX-USR-SAME: "-L{{[^"]*}}{{[/\\]+}}baremetal_cxx_sysroot{{[/\\]+}}lib" // CHECK-V6M-LIBCXX-USR-SAME: "-lc++" "-lc++abi" "-lunwind" -// CHECK-V6M-LIBCXX-USR-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-armv6m.a" +// CHECK-V6M-LIBCXX-USR-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" // RUN: %clangxx --target=arm-none-eabi -v 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-THREAD-MODEL @@ -172,7 +172,7 @@ // CHECK-RV64-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV64-SAME: "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for" // CHECK-RV64-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib" -// CHECK-RV64-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv64.a" "-X" "-o" "{{.*}}.tmp.out" +// CHECK-RV64-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-X" "-o" "{{.*}}.tmp.out" // RUN: %clangxx %s -### --target=riscv64-unknown-elf 2>&1 \ // RUN: --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \ @@ -181,7 +181,7 @@ // CHECK-RV64-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV64-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib" // CHECK-RV64-DEFAULTCXX-SAME: "-lc++" "-lc++abi" "-lunwind" -// CHECK-RV64-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv64.a" "-X" "-o" "a.out" +// CHECK-RV64-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-X" "-o" "a.out" // RUN: %clangxx %s -### --target=riscv64-unknown-elf 2>&1 \ // RUN: --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \ @@ -193,7 +193,7 @@ // CHECK-RV64-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV64-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib" // CHECK-RV64-LIBCXX-SAME: "-lc++" "-lc++abi" "-lunwind" -// CHECK-RV64-LIBCXX-SAME: "-lc" "-lm" 
"{{[^"]*}}libclang_rt.builtins-riscv64.a" "-X" "-o" "a.out" +// CHECK-RV64-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-X" "-o" "a.out" // RUN: %clangxx %s -### 2>&1 --target=riscv64-unknown-elf \ // RUN: --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \ @@ -205,7 +205,7 @@ // CHECK-RV64-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV64-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib" // CHECK-RV64-LIBSTDCXX-SAME: "-lstdc++" "-lsupc++" "-lunwind" -// CHECK-RV64-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv64.a" "-X" "-o" "a.out" +// CHECK-RV64-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-X" "-o" "a.out" // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \ // RUN: -L some/directory/user/asked/for \ @@ -220,7 +220,7 @@ // CHECK-RV32-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV32-SAME: "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for" // CHECK-RV32-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib" -// CHECK-RV32-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv32.a" "-X" "-o" "a.out" +// CHECK-RV32-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-X" "-o" "a.out" // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \ // RUN: --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \ @@ -229,7 +229,7 @@ // CHECK-RV32-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV32-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib" // CHECK-RV32-DEFAULTCXX-SAME: "-lc++" "-lc++abi" "-lunwind" -// CHECK-RV32-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv32.a" "-X" "-o" "a.out" +// CHECK-RV32-DEFAULTCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-X" "-o" "a.out" // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \ // RUN: --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \ @@ 
-241,7 +241,7 @@ // CHECK-RV32-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV32-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib" // CHECK-RV32-LIBCXX-SAME: "-lc++" "-lc++abi" "-lunwind" -// CHECK-RV32-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv32.a" "-X" "-o" "a.out" +// CHECK-RV32-LIBCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-X" "-o" "a.out" // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \ // RUN: --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \ @@ -253,7 +253,7 @@ // CHECK-RV32-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-RV32-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib" // CHECK-RV32-LIBSTDCXX-SAME: "-lstdc++" "-lsupc++" "-lunwind" -// CHECK-RV32-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-riscv32.a" "-X" "-o" "a.out" +// CHECK-RV32-LIBSTDCXX-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-X" "-o" "a.out" // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \ // RUN: -nostdlibinc -nobuiltininc \ @@ -375,7 +375,7 @@ // CHECK-PPCEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include" // CHECK-PPCEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-PPCEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib" -// CHECK-PPCEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-powerpc.a" "-o" "a.out" +// CHECK-PPCEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-o" "a.out" // RUN: %clang -no-canonical-prefixes %s -### --target=powerpc64-unknown-eabi 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-PPC64EABI %s @@ -387,7 +387,7 @@ // CHECK-PPC64EABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include" // CHECK-PPC64EABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" 
"-Bstatic" // CHECK-PPC64EABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib" -// CHECK-PPC64EABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-powerpc64.a" "-o" "a.out" +// CHECK-PPC64EABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-o" "a.out" // RUN: %clang -no-canonical-prefixes %s -### --target=powerpcle-unknown-eabi 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-PPCLEEABI %s @@ -399,7 +399,7 @@ // CHECK-PPCLEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include" // CHECK-PPCLEEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-PPCLEEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib" -// CHECK-PPCLEEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-powerpcle.a" "-o" "a.out" +// CHECK-PPCLEEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-o" "a.out" // RUN: %clang -no-canonical-prefixes %s -### --target=powerpc64le-unknown-eabi 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-PPC64LEEABI %s @@ -411,7 +411,7 @@ // CHECK-PPC64LEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include" // CHECK-PPC64LEEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" // CHECK-PPC64LEEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib" -// CHECK-PPC64LEEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins-powerpc64le.a" "-o" "a.out" +// CHECK-PPC64LEEABI-SAME: "-lc" "-lm" "{{[^"]*}}libclang_rt.builtins.a" "-o" "a.out" // Check that compiler-rt library without the arch filename suffix will // be used if present. 
@@ -423,7 +423,7 @@ // RUN: --sysroot=%T/baremetal_clang_rt_noarch \ // RUN: | FileCheck --check-prefix=CHECK-CLANGRT-NOARCH %s // CHECK-CLANGRT-NOARCH: "{{[^"]*}}libclang_rt.builtins.a" -// CHECK-CLANGRT-NOARCH-NOT: "{{[^"]*}}libclang_rt.builtins-armv6m.a" +// CHECK-CLANGRT-NOARCH-NOT: "{{[^"]*}}libclang_rt.builtins.a" // Check that compiler-rt library with the arch filename suffix will be // used if present. @@ -434,7 +434,7 @@ // RUN: --target=armv6m-none-eabi \ // RUN: --sysroot=%T/baremetal_clang_rt_arch \ // RUN: | FileCheck --check-prefix=CHECK-CLANGRT-ARCH %s -// CHECK-CLANGRT-ARCH: "{{[^"]*}}libclang_rt.builtins-armv6m.a" +// CHECK-CLANGRT-ARCH: "{{[^"]*}}libclang_rt.builtins.a" // CHECK-CLANGRT-ARCH-NOT: "{{[^"]*}}libclang_rt.builtins.a" // Check that "--no-relax" is forwarded to the linker for RISC-V. diff --git a/clang/test/Driver/compiler-rt-unwind.c b/clang/test/Driver/compiler-rt-unwind.c index adc09792a27fc..4260dab9302c7 100644 --- a/clang/test/Driver/compiler-rt-unwind.c +++ b/clang/test/Driver/compiler-rt-unwind.c @@ -52,7 +52,7 @@ // RUN: --target=x86_64-unknown-linux -rtlib=compiler-rt \ // RUN: --gcc-toolchain="" -resource-dir=%S/Inputs/resource_dir \ // RUN: | FileCheck --check-prefix=RTLIB-COMPILER-RT %s -// RTLIB-COMPILER-RT: "{{.*}}libclang_rt.builtins-x86_64.a" +// RTLIB-COMPILER-RT: "{{.*}}libclang_rt.builtins.a" // // RUN: %clang -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -rtlib=compiler-rt --unwindlib=libunwind \ @@ -62,7 +62,7 @@ // RUN: --target=x86_64-unknown-linux -rtlib=compiler-rt --unwindlib=libunwind \ // RUN: --gcc-toolchain="" -resource-dir=%S/Inputs/resource_dir \ // RUN: | FileCheck --check-prefix=RTLIB-COMPILER-RT-UNWINDLIB-COMPILER-RT %s -// RTLIB-COMPILER-RT-UNWINDLIB-COMPILER-RT: "{{.*}}libclang_rt.builtins-x86_64.a" +// RTLIB-COMPILER-RT-UNWINDLIB-COMPILER-RT: "{{.*}}libclang_rt.builtins.a" // RTLIB-COMPILER-RT-UNWINDLIB-COMPILER-RT-SAME: "--as-needed" // RTLIB-COMPILER-RT-UNWINDLIB-COMPILER-RT-SAME: 
"-lunwind" // RTLIB-COMPILER-RT-UNWINDLIB-COMPILER-RT-SAME: "--no-as-needed" @@ -71,14 +71,14 @@ // RUN: --target=x86_64-unknown-linux -rtlib=compiler-rt --unwindlib=libgcc \ // RUN: --gcc-toolchain="" -resource-dir=%S/Inputs/resource_dir \ // RUN: | FileCheck --check-prefix=RTLIB-COMPILER-RT-UNWINDLIB-GCC %s -// RTLIB-COMPILER-RT-UNWINDLIB-GCC: "{{.*}}libclang_rt.builtins-x86_64.a" +// RTLIB-COMPILER-RT-UNWINDLIB-GCC: "{{.*}}libclang_rt.builtins.a" // RTLIB-COMPILER-RT-UNWINDLIB-GCC-SAME: "-lgcc_s" // // RUN: %clang -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -rtlib=compiler-rt --unwindlib=libgcc \ // RUN: -static --gcc-toolchain="" -resource-dir=%S/Inputs/resource_dir \ // RUN: | FileCheck --check-prefix=RTLIB-COMPILER-RT-UNWINDLIB-GCC-STATIC %s -// RTLIB-COMPILER-RT-UNWINDLIB-GCC-STATIC: "{{.*}}libclang_rt.builtins-x86_64.a" +// RTLIB-COMPILER-RT-UNWINDLIB-GCC-STATIC: "{{.*}}libclang_rt.builtins.a" // RTLIB-COMPILER-RT-UNWINDLIB-GCC-STATIC-SAME: "-lgcc_eh" // // RUN: not %clang %s 2> %t.err \ diff --git a/clang/test/Driver/coverage-ld.c b/clang/test/Driver/coverage-ld.c index e72bbf86bf4ef..acb08eb5db59a 100644 --- a/clang/test/Driver/coverage-ld.c +++ b/clang/test/Driver/coverage-ld.c @@ -13,7 +13,7 @@ // RUN: | FileCheck --check-prefix=CHECK-LINUX-I386 %s // // CHECK-LINUX-I386-NOT: "-u__llvm_profile_runtime" -// CHECK-LINUX-I386: /Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}libclang_rt.profile-i386.a" +// CHECK-LINUX-I386: /Inputs/resource_dir{{/|\\\\}}lib{{.*}}i386-unknown-linux{{.*}}libclang_rt.profile.a" // CHECK-LINUX-I386-NOT: "-u__llvm_profile_runtime" // CHECK-LINUX-I386: "-lc" // @@ -24,7 +24,7 @@ // RUN: | FileCheck --check-prefix=CHECK-LINUX-X86-64 %s // // CHECK-LINUX-X86-64: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" -// CHECK-LINUX-X86-64: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}libclang_rt.profile-x86_64.a" {{.*}} "-lc" +// CHECK-LINUX-X86-64: 
"{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{.*}}linux{{.*}}libclang_rt.profile.a" {{.*}} "-lc" // // RUN: %clang -### %s 2>&1 \ // RUN: --target=x86_64-unknown-freebsd --coverage -fuse-ld=ld \ diff --git a/clang/test/Driver/fuchsia.c b/clang/test/Driver/fuchsia.c index cfcb8ad0d3ab4..ca53f0d107a00 100644 --- a/clang/test/Driver/fuchsia.c +++ b/clang/test/Driver/fuchsia.c @@ -187,7 +187,7 @@ // CHECK-SCUDO-X86: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]" // CHECK-SCUDO-X86: "-fsanitize=safe-stack,scudo" // CHECK-SCUDO-X86: "-pie" -// CHECK-SCUDO-X86: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}fuchsia{{/|\\\\}}libclang_rt.scudo_standalone-x86_64.so" +// CHECK-SCUDO-X86: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{.*}}fuchsia{{.*}}libclang_rt.scudo_standalone.so" // RUN: %clang -### %s --target=aarch64-unknown-fuchsia \ // RUN: -fsanitize=scudo 2>&1 \ @@ -197,7 +197,7 @@ // CHECK-SCUDO-AARCH64: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]" // CHECK-SCUDO-AARCH64: "-fsanitize=shadow-call-stack,scudo" // CHECK-SCUDO-AARCH64: "-pie" -// CHECK-SCUDO-AARCH64: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}fuchsia{{/|\\\\}}libclang_rt.scudo_standalone-aarch64.so" +// CHECK-SCUDO-AARCH64: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{.*}}fuchsia{{.*}}libclang_rt.scudo_standalone.so" // RUN: %clang -### %s --target=x86_64-unknown-fuchsia \ // RUN: -fsanitize=scudo -fPIC -shared 2>&1 \ @@ -206,7 +206,7 @@ // RUN: | FileCheck %s -check-prefix=CHECK-SCUDO-SHARED // CHECK-SCUDO-SHARED: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]" // CHECK-SCUDO-SHARED: "-fsanitize=safe-stack,scudo" -// CHECK-SCUDO-SHARED: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}fuchsia{{/|\\\\}}libclang_rt.scudo_standalone-x86_64.so" +// CHECK-SCUDO-SHARED: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{.*}}fuchsia{{.*}}libclang_rt.scudo_standalone.so" // RUN: %clang -### %s --target=aarch64-unknown-fuchsia \ // RUN: -fsanitize=leak 2>&1 \ diff --git a/clang/test/Driver/instrprof-ld.c b/clang/test/Driver/instrprof-ld.c index 9a58cd3a0be75..674580b349d42 100644 --- 
a/clang/test/Driver/instrprof-ld.c +++ b/clang/test/Driver/instrprof-ld.c @@ -7,7 +7,7 @@ // RUN: | FileCheck --check-prefix=CHECK-LINUX-I386 %s // // CHECK-LINUX-I386: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" -// CHECK-LINUX-I386: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}libclang_rt.profile-i386.a" {{.*}} "-lc" +// CHECK-LINUX-I386: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}i386-unknown-linux{{/|\\\\}}libclang_rt.profile.a" {{.*}} "-lc" // // RUN: %clang -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fprofile-instr-generate -fuse-ld=ld \ @@ -16,7 +16,7 @@ // RUN: | FileCheck --check-prefix=CHECK-LINUX-X86-64 %s // // CHECK-LINUX-X86-64: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" -// CHECK-LINUX-X86-64: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}libclang_rt.profile-x86_64.a" {{.*}} "-lc" +// CHECK-LINUX-X86-64: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{.*}}linux{{.*}}libclang_rt.profile.a" {{.*}} "-lc" // // RUN: %clang -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fprofile-instr-generate -nostdlib -fuse-ld=ld \ @@ -25,7 +25,7 @@ // RUN: | FileCheck --check-prefix=CHECK-LINUX-NOSTDLIB-X86-64 %s // // CHECK-LINUX-NOSTDLIB-X86-64: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" -// CHECK-LINUX-NOSTDLIB-X86-64: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}libclang_rt.profile-x86_64.a" +// CHECK-LINUX-NOSTDLIB-X86-64: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{.*}}linux{{.*}}libclang_rt.profile.a" // // RUN: %clang -### %s 2>&1 \ // RUN: --target=x86_64-unknown-freebsd -fprofile-instr-generate -fuse-ld=ld \ @@ -62,7 +62,7 @@ // RUN: | FileCheck --check-prefix=CHECK-LINUX-I386-SHARED %s // // CHECK-LINUX-I386-SHARED: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" -// CHECK-LINUX-I386-SHARED: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}libclang_rt.profile-i386.a" {{.*}} "-lc" +// CHECK-LINUX-I386-SHARED: 
"{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{.*}}i386-unknown-linux{{.*}}libclang_rt.profile.a" {{.*}} "-lc" // // RUN: %clang -### %s 2>&1 \ // RUN: -shared \ @@ -72,7 +72,7 @@ // RUN: | FileCheck --check-prefix=CHECK-LINUX-X86-64-SHARED %s // // CHECK-LINUX-X86-64-SHARED: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" -// CHECK-LINUX-X86-64-SHARED: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}libclang_rt.profile-x86_64.a" {{.*}} "-lc" +// CHECK-LINUX-X86-64-SHARED: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{.*}}linux{{.*}}libclang_rt.profile.a" {{.*}} "-lc" // // RUN: %clang -### %s 2>&1 \ // RUN: -shared \ diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c index b3ce5ca307a6c..d77367b4ece7b 100644 --- a/clang/test/Driver/linux-ld.c +++ b/clang/test/Driver/linux-ld.c @@ -61,15 +61,15 @@ // CHECK-LD-RT: "--eh-frame-hdr" // CHECK-LD-RT: "-m" "elf_x86_64" // CHECK-LD-RT: "-dynamic-linker" -// CHECK-LD-RT: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}clang_rt.crtbegin-x86_64.o" +// CHECK-LD-RT: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}x86_64-unknown-linux{{/|\\\\}}clang_rt.crtbegin.o" // CHECK-LD-RT: "-L[[SYSROOT]]/usr/lib/gcc/x86_64-unknown-linux/10.2.0" // CHECK-LD-RT: "-L[[SYSROOT]]/usr/lib/gcc/x86_64-unknown-linux/10.2.0/../../../../x86_64-unknown-linux/lib" // CHECK-LD-RT: "-L[[SYSROOT]]/lib" // CHECK-LD-RT: "-L[[SYSROOT]]/usr/lib" -// CHECK-LD-RT: libclang_rt.builtins-x86_64.a" +// CHECK-LD-RT: libclang_rt.builtins.a" // CHECK-LD-RT: "-lc" -// CHECK-LD-RT: libclang_rt.builtins-x86_64.a" -// CHECK-LD-RT: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}clang_rt.crtend-x86_64.o" +// CHECK-LD-RT: libclang_rt.builtins.a" +// CHECK-LD-RT: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}x86_64-unknown-linux{{/|\\\\}}clang_rt.crtend.o" // // RUN: %clang -### %s -no-pie 2>&1 \ // RUN: --target=i686-unknown-linux \ @@ -84,15 +84,15 @@ // CHECK-LD-RT-I686: "--eh-frame-hdr" // CHECK-LD-RT-I686: "-m" "elf_i386" // CHECK-LD-RT-I686: "-dynamic-linker" -// 
CHECK-LD-RT-I686: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}clang_rt.crtbegin-i386.o" +// CHECK-LD-RT-I686: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}i686-unknown-linux{{/|\\\\}}clang_rt.crtbegin.o" // CHECK-LD-RT-I686: "-L[[SYSROOT]]/usr/lib/gcc/i686-unknown-linux/10.2.0" // CHECK-LD-RT-I686: "-L[[SYSROOT]]/usr/lib/gcc/i686-unknown-linux/10.2.0/../../../../i686-unknown-linux/lib" // CHECK-LD-RT-I686: "-L[[SYSROOT]]/lib" // CHECK-LD-RT-I686: "-L[[SYSROOT]]/usr/lib" -// CHECK-LD-RT-I686: libclang_rt.builtins-i386.a" +// CHECK-LD-RT-I686: libclang_rt.builtins.a" // CHECK-LD-RT-I686: "-lc" -// CHECK-LD-RT-I686: libclang_rt.builtins-i386.a" -// CHECK-LD-RT-I686: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}clang_rt.crtend-i386.o" +// CHECK-LD-RT-I686: libclang_rt.builtins.a" +// CHECK-LD-RT-I686: "[[RESDIR]]{{/|\\\\}}lib{{/|\\\\}}i686-unknown-linux{{/|\\\\}}clang_rt.crtend.o" // // RUN: %clang -### %s -no-pie 2>&1 \ // RUN: --target=arm-linux-androideabi \ diff --git a/clang/test/Driver/print-libgcc-file-name-clangrt.c b/clang/test/Driver/print-libgcc-file-name-clangrt.c index 19f9a3c28c31b..ed740e0d2917d 100644 --- a/clang/test/Driver/print-libgcc-file-name-clangrt.c +++ b/clang/test/Driver/print-libgcc-file-name-clangrt.c @@ -55,7 +55,7 @@ // RUN: --sysroot=%S/Inputs/resource_dir_with_arch_subdir \ // RUN: -resource-dir=%S/Inputs/resource_dir_with_arch_subdir 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-CLANGRT-ARM-BAREMETAL %s -// CHECK-CLANGRT-ARM-BAREMETAL: libclang_rt.builtins-armv7m.a +// CHECK-CLANGRT-ARM-BAREMETAL: libclang_rt.builtins.a // RUN: %clang -rtlib=compiler-rt -print-libgcc-file-name \ // RUN: --target=armv7m-vendor-none-eabi \ diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c index 79ce70e19ff4f..53e536d772924 100644 --- a/clang/test/Driver/sanitizer-ld.c +++ b/clang/test/Driver/sanitizer-ld.c @@ -8,9 +8,9 @@ // // CHECK-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-LINUX-NOT: "-lc" -// 
CHECK-ASAN-LINUX: libclang_rt.asan-i386.a" +// CHECK-ASAN-LINUX: libclang_rt.asan.a" // CHECK-ASAN-LINUX-NOT: "--export-dynamic" -// CHECK-ASAN-LINUX: "--dynamic-list={{.*}}libclang_rt.asan-i386.a.syms" +// CHECK-ASAN-LINUX: "--dynamic-list={{.*}}libclang_rt.asan.a.syms" // CHECK-ASAN-LINUX-NOT: "--export-dynamic" // CHECK-ASAN-LINUX: "-lpthread" // CHECK-ASAN-LINUX: "-lrt" @@ -41,8 +41,8 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-ASAN-EXECUTABLE-LINUX %s // -// CHECK-ASAN-EXECUTABLE-LINUX: libclang_rt.asan_static-x86_64 -// CHECK-ASAN-EXECUTABLE-LINUX: libclang_rt.asan-x86_64 +// CHECK-ASAN-EXECUTABLE-LINUX: libclang_rt.asan_static +// CHECK-ASAN-EXECUTABLE-LINUX: libclang_rt.asan // RUN: %clang -fsanitize=address -shared -### %s 2>&1 \ // RUN: --target=x86_64-unknown-linux -fuse-ld=ld \ @@ -50,8 +50,8 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-ASAN-SHARED-LINUX %s // -// CHECK-ASAN-SHARED-LINUX: libclang_rt.asan_static-x86_64 -// CHECK-ASAN-SHARED-LINUX-NOT: libclang_rt.asan-x86_64 +// CHECK-ASAN-SHARED-LINUX: libclang_rt.asan_static +// CHECK-ASAN-SHARED-LINUX-NOT: libclang_rt.asan // RUN: %clang -### %s 2>&1 \ // RUN: --target=i386-unknown-linux -fuse-ld=ld -fsanitize=address -shared-libsan \ @@ -74,9 +74,9 @@ // // CHECK-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-SHARED-ASAN-LINUX-NOT: "-lc" -// CHECK-SHARED-ASAN-LINUX-NOT: libclang_rt.asan-i386.a" -// CHECK-SHARED-ASAN-LINUX: libclang_rt.asan-i386.so" -// CHECK-SHARED-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan-preinit-i386.a" "--no-whole-archive" +// CHECK-SHARED-ASAN-LINUX-NOT: libclang_rt.asan.a" +// CHECK-SHARED-ASAN-LINUX: libclang_rt.asan.so" +// CHECK-SHARED-ASAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan-preinit.a" "--no-whole-archive" // CHECK-SHARED-ASAN-LINUX-NOT: "-lpthread" // CHECK-SHARED-ASAN-LINUX-NOT: "-lrt" // CHECK-SHARED-ASAN-LINUX-NOT: "-ldl" @@ -92,9 
+92,9 @@ // // CHECK-DSO-SHARED-ASAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lc" -// CHECK-DSO-SHARED-ASAN-LINUX-NOT: libclang_rt.asan-i386.a" -// CHECK-DSO-SHARED-ASAN-LINUX-NOT: "libclang_rt.asan-preinit-i386.a" -// CHECK-DSO-SHARED-ASAN-LINUX: libclang_rt.asan-i386.so" +// CHECK-DSO-SHARED-ASAN-LINUX-NOT: libclang_rt.asan.a" +// CHECK-DSO-SHARED-ASAN-LINUX-NOT: "libclang_rt.asan-preinit.a" +// CHECK-DSO-SHARED-ASAN-LINUX: libclang_rt.asan.so" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lpthread" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-lrt" // CHECK-DSO-SHARED-ASAN-LINUX-NOT: "-ldl" @@ -153,7 +153,7 @@ // // CHECK-ASAN-LINUX-CXX-STATIC: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-ASAN-LINUX-CXX-STATIC-NOT: stdc++ -// CHECK-ASAN-LINUX-CXX-STATIC: "--whole-archive" "{{.*}}libclang_rt.asan-i386.a" "--no-whole-archive" +// CHECK-ASAN-LINUX-CXX-STATIC: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-LINUX-CXX-STATIC: stdc++ // RUN: %clang -### %s 2>&1 \ @@ -270,10 +270,10 @@ // // CHECK-TSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-TSAN-LINUX-CXX-NOT: stdc++ -// CHECK-TSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan-x86_64.a" "--no-whole-archive" -// CHECK-TSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan-x86_64.a.syms" -// CHECK-TSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan_cxx-x86_64.a" "--no-whole-archive" -// CHECK-TSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan_cxx-x86_64.a.syms" +// CHECK-TSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan.a" "--no-whole-archive" +// CHECK-TSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan.a.syms" +// CHECK-TSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan_cxx.a" "--no-whole-archive" +// CHECK-TSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.tsan_cxx.a.syms" // CHECK-TSAN-LINUX-CXX-NOT: "--export-dynamic" // CHECK-TSAN-LINUX-CXX: stdc++ // CHECK-TSAN-LINUX-CXX: "-lpthread" @@ -306,10 
+306,10 @@ // // CHECK-MSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-MSAN-LINUX-CXX-NOT: stdc++ -// CHECK-MSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan-x86_64.a" "--no-whole-archive" -// CHECK-MSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan-x86_64.a.syms" -// CHECK-MSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan_cxx-x86_64.a" "--no-whole-archive" -// CHECK-MSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan_cxx-x86_64.a.syms" +// CHECK-MSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" +// CHECK-MSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan.a.syms" +// CHECK-MSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan_cxx.a" "--no-whole-archive" +// CHECK-MSAN-LINUX-CXX: "--dynamic-list={{.*}}libclang_rt.msan_cxx.a.syms" // CHECK-MSAN-LINUX-CXX-NOT: "--export-dynamic" // CHECK-MSAN-LINUX-CXX: stdc++ // CHECK-MSAN-LINUX-CXX: "-lpthread" @@ -400,7 +400,7 @@ // RUN: | FileCheck --check-prefix=CHECK-UBSAN-LINUX-SHAREDLIBASAN %s // CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}ld{{(.exe)?}}" -// CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}libclang_rt.ubsan_standalone-i386.so{{.*}}" +// CHECK-UBSAN-LINUX-SHAREDLIBASAN: "{{.*}}libclang_rt.ubsan_standalone.so{{.*}}" // RUN: %clang -fsanitize=undefined -fsanitize-link-c++-runtime -### %s 2>&1 \ // RUN: --target=i386-unknown-linux \ @@ -408,7 +408,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-UBSAN-LINUX-LINK-CXX %s // CHECK-UBSAN-LINUX-LINK-CXX-NOT: "-lstdc++" -// CHECK-UBSAN-LINUX-LINK-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx-i386.a" "--no-whole-archive" +// CHECK-UBSAN-LINUX-LINK-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx.a" "--no-whole-archive" // CHECK-UBSAN-LINUX-LINK-CXX-NOT: "-lstdc++" // RUN: %clangxx -fsanitize=undefined -### %s 2>&1 \ @@ -418,9 +418,9 @@ // RUN: | FileCheck --check-prefix=CHECK-UBSAN-LINUX-CXX %s // CHECK-UBSAN-LINUX-CXX: 
"{{.*}}ld{{(.exe)?}}" // CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan -// CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone-i386.a" "--no-whole-archive" +// CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan -// CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx-i386.a" "--no-whole-archive" +// CHECK-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone_cxx.a" "--no-whole-archive" // CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan // CHECK-UBSAN-LINUX-CXX: "-lstdc++" // CHECK-UBSAN-LINUX-CXX-NOT: libclang_rt.asan @@ -433,7 +433,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-UBSAN-MINIMAL-LINUX %s // CHECK-UBSAN-MINIMAL-LINUX: "{{.*}}ld{{(.exe)?}}" -// CHECK-UBSAN-MINIMAL-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_minimal-i386.a" "--no-whole-archive" +// CHECK-UBSAN-MINIMAL-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_minimal.a" "--no-whole-archive" // CHECK-UBSAN-MINIMAL-LINUX: "-lpthread" // CHECK-UBSAN-MINIMAL-LINUX: "-lresolv" @@ -468,7 +468,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-ASAN-UBSAN-LINUX %s // CHECK-ASAN-UBSAN-LINUX: "{{.*}}ld{{(.exe)?}}" -// CHECK-ASAN-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan-i386.a" "--no-whole-archive" +// CHECK-ASAN-UBSAN-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX-NOT: libclang_rt.ubsan // CHECK-ASAN-UBSAN-LINUX-NOT: "-lstdc++" // CHECK-ASAN-UBSAN-LINUX: "-lpthread" @@ -480,8 +480,8 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-ASAN-UBSAN-LINUX-CXX %s // CHECK-ASAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" -// CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan-i386.a" "--no-whole-archive" -// CHECK-ASAN-UBSAN-LINUX-CXX: 
"--whole-archive" "{{.*}}libclang_rt.asan_cxx-i386.a" "--no-whole-archive" +// CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" +// CHECK-ASAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.asan_cxx.a" "--no-whole-archive" // CHECK-ASAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan // CHECK-ASAN-UBSAN-LINUX-CXX: "-lstdc++" // CHECK-ASAN-UBSAN-LINUX-CXX: "-lpthread" @@ -493,7 +493,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-MSAN-UBSAN-LINUX-CXX %s // CHECK-MSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" -// CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan-x86_64.a" "--no-whole-archive" +// CHECK-MSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" // CHECK-MSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan // RUN: %clangxx -fsanitize=thread,undefined -### %s 2>&1 \ @@ -502,7 +502,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-TSAN-UBSAN-LINUX-CXX %s // CHECK-TSAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}" -// CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan-x86_64.a" "--no-whole-archive" +// CHECK-TSAN-UBSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tsan.a" "--no-whole-archive" // CHECK-TSAN-UBSAN-LINUX-CXX-NOT: libclang_rt.ubsan // RUN: %clang -fsanitize=undefined -### %s 2>&1 \ @@ -525,7 +525,7 @@ // CHECK-LSAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-LSAN-LINUX-NOT: "-lc" // CHECK-LSAN-LINUX-NOT: libclang_rt.ubsan -// CHECK-LSAN-LINUX: libclang_rt.lsan-x86_64.a" +// CHECK-LSAN-LINUX: libclang_rt.lsan.a" // CHECK-LSAN-LINUX: "-lpthread" // CHECK-LSAN-LINUX: "-ldl" // CHECK-LSAN-LINUX: "-lresolv" @@ -560,7 +560,7 @@ // RUN: | FileCheck --check-prefix=CHECK-LSAN-ASAN-LINUX %s // CHECK-LSAN-ASAN-LINUX: "{{.*}}ld{{(.exe)?}}" // CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan -// CHECK-LSAN-ASAN-LINUX: libclang_rt.asan-x86_64 +// CHECK-LSAN-ASAN-LINUX: 
libclang_rt.asan // CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan // RUN: %clang -fsanitize=address -fsanitize-coverage=func -### %s 2>&1 \ @@ -569,7 +569,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-ASAN-COV-LINUX %s // CHECK-ASAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" -// CHECK-ASAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan-x86_64.a" "--no-whole-archive" +// CHECK-ASAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.asan.a" "--no-whole-archive" // CHECK-ASAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-ASAN-COV-LINUX-NOT: "-lstdc++" // CHECK-ASAN-COV-LINUX: "-lpthread" @@ -581,7 +581,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-MSAN-COV-LINUX %s // CHECK-MSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" -// CHECK-MSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.msan-x86_64.a" "--no-whole-archive" +// CHECK-MSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.msan.a" "--no-whole-archive" // CHECK-MSAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-MSAN-COV-LINUX-NOT: "-lstdc++" // CHECK-MSAN-COV-LINUX: "-lpthread" @@ -593,7 +593,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-DFSAN-COV-LINUX %s // CHECK-DFSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" -// CHECK-DFSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.dfsan-x86_64.a" "--no-whole-archive" +// CHECK-DFSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.dfsan.a" "--no-whole-archive" // CHECK-DFSAN-COV-LINUX-NOT: libclang_rt.ubsan // CHECK-DFSAN-COV-LINUX-NOT: "-lstdc++" // CHECK-DFSAN-COV-LINUX: "-lpthread" @@ -605,7 +605,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-UBSAN-COV-LINUX %s // CHECK-UBSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}" -// CHECK-UBSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone-x86_64.a" "--no-whole-archive" +// CHECK-UBSAN-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" 
"--no-whole-archive" // CHECK-UBSAN-COV-LINUX-NOT: "-lstdc++" // CHECK-UBSAN-COV-LINUX: "-lpthread" // CHECK-UBSAN-COV-LINUX: "-lresolv" @@ -616,7 +616,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-COV-LINUX %s // CHECK-COV-LINUX: "{{.*}}ld{{(.exe)?}}" -// CHECK-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone-x86_64.a" "--no-whole-archive" +// CHECK-COV-LINUX: "--whole-archive" "{{.*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // CHECK-COV-LINUX-NOT: "-lstdc++" // CHECK-COV-LINUX: "-lpthread" // CHECK-COV-LINUX: "-lresolv" @@ -638,7 +638,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-CFI-DIAG-LINUX %s // CHECK-CFI-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}" -// CHECK-CFI-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.ubsan_standalone-x86_64.a" "--no-whole-archive" +// CHECK-CFI-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.ubsan_standalone.a" "--no-whole-archive" // Cross-DSO CFI links the CFI runtime. // RUN: not %clang -fsanitize=cfi -fsanitize-cfi-cross-dso -### %s 2>&1 \ @@ -647,7 +647,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-CFI-CROSS-DSO-LINUX %s // CHECK-CFI-CROSS-DSO-LINUX: "{{.*}}ld{{(.exe)?}}" -// CHECK-CFI-CROSS-DSO-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi-x86_64.a" "--no-whole-archive" +// CHECK-CFI-CROSS-DSO-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi.a" "--no-whole-archive" // CHECK-CFI-CROSS-DSO-LINUX: -export-dynamic // Cross-DSO CFI with diagnostics links just the CFI runtime. 
@@ -658,7 +658,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-CFI-CROSS-DSO-DIAG-LINUX %s // CHECK-CFI-CROSS-DSO-DIAG-LINUX: "{{.*}}ld{{(.exe)?}}" -// CHECK-CFI-CROSS-DSO-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi_diag-x86_64.a" "--no-whole-archive" +// CHECK-CFI-CROSS-DSO-DIAG-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.cfi_diag.a" "--no-whole-archive" // CHECK-CFI-CROSS-DSO-DIAG-LINUX: -export-dynamic // Cross-DSO CFI on Android does not link runtime libraries. @@ -710,7 +710,7 @@ // CHECK-SAFESTACK-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-SAFESTACK-LINUX-NOT: "-lc" // CHECK-SAFESTACK-LINUX-NOT: whole-archive -// CHECK-SAFESTACK-LINUX: libclang_rt.safestack-x86_64.a" +// CHECK-SAFESTACK-LINUX: libclang_rt.safestack.a" // CHECK-SAFESTACK-LINUX: "-u" "__safestack_init" // CHECK-SAFESTACK-LINUX: "-lpthread" // CHECK-SAFESTACK-LINUX: "-ldl" @@ -772,9 +772,9 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-CFI-STATS-LINUX %s // CHECK-CFI-STATS-LINUX: "{{.*}}ld{{(.exe)?}}" -// CHECK-CFI-STATS-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.stats_client-x86_64.a" "--no-whole-archive" +// CHECK-CFI-STATS-LINUX: "--whole-archive" "{{[^"]*}}libclang_rt.stats_client.a" "--no-whole-archive" // CHECK-CFI-STATS-LINUX-NOT: "--whole-archive" -// CHECK-CFI-STATS-LINUX: "{{[^"]*}}libclang_rt.stats-x86_64.a" +// CHECK-CFI-STATS-LINUX: "{{[^"]*}}libclang_rt.stats.a" // RUN: not %clang -fsanitize=cfi -fsanitize-stats -### %s 2>&1 \ // RUN: --target=x86_64-apple-darwin -fuse-ld=ld \ @@ -896,7 +896,7 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-SCUDO-LINUX %s // CHECK-SCUDO-LINUX: "{{.*}}ld{{(.exe)?}}" -// CHECK-SCUDO-LINUX: "--whole-archive" "{{.*}}libclang_rt.scudo_standalone-i386.a" "--no-whole-archive" +// CHECK-SCUDO-LINUX: "--whole-archive" "{{.*}}libclang_rt.scudo_standalone.a" "--no-whole-archive" // 
CHECK-SCUDO-LINUX-NOT: "-lstdc++" // CHECK-SCUDO-LINUX: "-lpthread" // CHECK-SCUDO-LINUX: "-ldl" @@ -910,8 +910,8 @@ // // CHECK-SCUDO-SHARED-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-SCUDO-SHARED-LINUX-NOT: "-lc" -// CHECK-SCUDO-SHARED-LINUX-NOT: libclang_rt.scudo_standalone-i386.a" -// CHECK-SCUDO-SHARED-LINUX: libclang_rt.scudo_standalone-i386.so" +// CHECK-SCUDO-SHARED-LINUX-NOT: libclang_rt.scudo_standalone.a" +// CHECK-SCUDO-SHARED-LINUX: libclang_rt.scudo_standalone.so" // CHECK-SCUDO-SHARED-LINUX-NOT: "-lpthread" // CHECK-SCUDO-SHARED-LINUX-NOT: "-lrt" // CHECK-SCUDO-SHARED-LINUX-NOT: "-ldl" @@ -954,9 +954,9 @@ // // CHECK-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-HWASAN-X86-64-LINUX-NOT: "-lc" -// CHECK-HWASAN-X86-64-LINUX: libclang_rt.hwasan-x86_64.a" +// CHECK-HWASAN-X86-64-LINUX: libclang_rt.hwasan.a" // CHECK-HWASAN-X86-64-LINUX-NOT: "--export-dynamic" -// CHECK-HWASAN-X86-64-LINUX: "--dynamic-list={{.*}}libclang_rt.hwasan-x86_64.a.syms" +// CHECK-HWASAN-X86-64-LINUX: "--dynamic-list={{.*}}libclang_rt.hwasan.a.syms" // CHECK-HWASAN-X86-64-LINUX-NOT: "--export-dynamic" // CHECK-HWASAN-X86-64-LINUX: "-lpthread" // CHECK-HWASAN-X86-64-LINUX: "-lrt" @@ -971,7 +971,7 @@ // // CHECK-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc" -// CHECK-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan-x86_64.so" +// CHECK-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan.so" // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lpthread" // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-lrt" // CHECK-SHARED-HWASAN-X86-64-LINUX-NOT: "-ldl" @@ -987,7 +987,7 @@ // // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-lc" -// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan-x86_64.so" +// CHECK-DSO-SHARED-HWASAN-X86-64-LINUX: libclang_rt.hwasan.so" // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-lpthread" // 
CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-lrt" // CHECK-DSO-SHARED-HWASAN-X86-64-LINUX-NOT: "-ldl" @@ -1003,9 +1003,9 @@ // // CHECK-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-HWASAN-AARCH64-LINUX-NOT: "-lc" -// CHECK-HWASAN-AARCH64-LINUX: libclang_rt.hwasan-aarch64.a" +// CHECK-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.a" // CHECK-HWASAN-AARCH64-LINUX-NOT: "--export-dynamic" -// CHECK-HWASAN-AARCH64-LINUX: "--dynamic-list={{.*}}libclang_rt.hwasan-aarch64.a.syms" +// CHECK-HWASAN-AARCH64-LINUX: "--dynamic-list={{.*}}libclang_rt.hwasan.a.syms" // CHECK-HWASAN-AARCH64-LINUX-NOT: "--export-dynamic" // CHECK-HWASAN-AARCH64-LINUX: "-lpthread" // CHECK-HWASAN-AARCH64-LINUX: "-lrt" @@ -1021,7 +1021,7 @@ // // CHECK-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc" -// CHECK-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan-aarch64.so" +// CHECK-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.so" // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lpthread" // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lrt" // CHECK-SHARED-HWASAN-AARCH64-LINUX-NOT: "-ldl" @@ -1037,7 +1037,7 @@ // // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}" // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lc" -// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan-aarch64.so" +// CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX: libclang_rt.hwasan.so" // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lpthread" // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-lrt" // CHECK-DSO-SHARED-HWASAN-AARCH64-LINUX-NOT: "-ldl" From f9f331652d4f0aff9ece3570abe8c686cdfefff4 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Fri, 23 Feb 2024 09:58:17 -0800 Subject: [PATCH 347/546] Replace ArchSpec::PiecewiseCompare() with Triple::operator==() Looking ast the definition of both functions this is *almost* an NFC change, except that Triple also looks at the SubArch (important) and ObjectFormat (less so). 
This fixes a bug that only manifests with how Xcode uses the SBAPI to attach to a process by name: it guesses the architecture based on the system. If the system is arm64 and the Process is arm64e Target fails to update the triple because it deemed the two to be equivalent. rdar://123338218 --- lldb/include/lldb/Utility/ArchSpec.h | 5 ---- lldb/source/Target/Target.cpp | 8 +----- lldb/source/Utility/ArchSpec.cpp | 17 ----------- lldb/test/API/macosx/arm64e-attach/Makefile | 2 ++ .../macosx/arm64e-attach/TestArm64eAttach.py | 28 +++++++++++++++++++ lldb/test/API/macosx/arm64e-attach/main.c | 2 ++ 6 files changed, 33 insertions(+), 29 deletions(-) create mode 100644 lldb/test/API/macosx/arm64e-attach/Makefile create mode 100644 lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py create mode 100644 lldb/test/API/macosx/arm64e-attach/main.c diff --git a/lldb/include/lldb/Utility/ArchSpec.h b/lldb/include/lldb/Utility/ArchSpec.h index a226a3a5a9b71..50830b889b911 100644 --- a/lldb/include/lldb/Utility/ArchSpec.h +++ b/lldb/include/lldb/Utility/ArchSpec.h @@ -505,11 +505,6 @@ class ArchSpec { bool IsFullySpecifiedTriple() const; - void PiecewiseTripleCompare(const ArchSpec &other, bool &arch_different, - bool &vendor_different, bool &os_different, - bool &os_version_different, - bool &env_different) const; - /// Detect whether this architecture uses thumb code exclusively /// /// Some embedded ARM chips (e.g. 
the ARM Cortex M0-7 line) can only execute diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index e17bfcb5d5e2a..e982a30a3ae4f 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -1568,14 +1568,8 @@ bool Target::SetArchitecture(const ArchSpec &arch_spec, bool set_platform, if (m_arch.GetSpec().IsCompatibleMatch(other)) { compatible_local_arch = true; - bool arch_changed, vendor_changed, os_changed, os_ver_changed, - env_changed; - m_arch.GetSpec().PiecewiseTripleCompare(other, arch_changed, - vendor_changed, os_changed, - os_ver_changed, env_changed); - - if (!arch_changed && !vendor_changed && !os_changed && !env_changed) + if (m_arch.GetSpec().GetTriple() == other.GetTriple()) replace_local_arch = false; } } diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index fb0e985a0d565..07ef435ef451d 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -1421,23 +1421,6 @@ bool ArchSpec::IsFullySpecifiedTriple() const { return true; } -void ArchSpec::PiecewiseTripleCompare( - const ArchSpec &other, bool &arch_different, bool &vendor_different, - bool &os_different, bool &os_version_different, bool &env_different) const { - const llvm::Triple &me(GetTriple()); - const llvm::Triple &them(other.GetTriple()); - - arch_different = (me.getArch() != them.getArch()); - - vendor_different = (me.getVendor() != them.getVendor()); - - os_different = (me.getOS() != them.getOS()); - - os_version_different = (me.getOSMajorVersion() != them.getOSMajorVersion()); - - env_different = (me.getEnvironment() != them.getEnvironment()); -} - bool ArchSpec::IsAlwaysThumbInstructions() const { std::string Status; if (GetTriple().getArch() == llvm::Triple::arm || diff --git a/lldb/test/API/macosx/arm64e-attach/Makefile b/lldb/test/API/macosx/arm64e-attach/Makefile new file mode 100644 index 0000000000000..c9319d6e6888a --- /dev/null +++ 
b/lldb/test/API/macosx/arm64e-attach/Makefile @@ -0,0 +1,2 @@ +C_SOURCES := main.c +include Makefile.rules diff --git a/lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py b/lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py new file mode 100644 index 0000000000000..0dc8700ed02dd --- /dev/null +++ b/lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py @@ -0,0 +1,28 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestArm64eAttach(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + # On Darwin systems, arch arm64e means ARMv8.3 with ptrauth ABI used. + @skipIf(archs=no_match(["arm64e"])) + def test(self): + # Skip this test if not running on AArch64 target that supports PAC + if not self.isAArch64PAuth(): + self.skipTest("Target must support pointer authentication.") + self.build() + popen = self.spawnSubprocess(self.getBuildArtifact(), []) + error = lldb.SBError() + # This simulates how Xcode attaches to a process by pid/name. + target = self.dbg.CreateTarget("", "arm64", "", True, error) + listener = lldb.SBListener("my.attach.listener") + process = target.AttachToProcessWithID(listener, popen.pid, error) + self.assertSuccess(error) + self.assertTrue(process, PROCESS_IS_VALID) + self.assertEqual(target.GetTriple().split('-')[0], "arm64e", + "target triple is updated correctly") + error = process.Kill() + self.assertSuccess(error) diff --git a/lldb/test/API/macosx/arm64e-attach/main.c b/lldb/test/API/macosx/arm64e-attach/main.c new file mode 100644 index 0000000000000..7baf2ffd8f289 --- /dev/null +++ b/lldb/test/API/macosx/arm64e-attach/main.c @@ -0,0 +1,2 @@ +int getchar(); +int main() { return getchar(); } From 01450dd1c69d1edb0d01159352a56c99988839f4 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Sun, 25 Feb 2024 15:08:21 -0800 Subject: [PATCH 348/546] Change debugserver to report the cpu(sub)type of process, not the host. 
This way debugserver can correctly report qProcessInfo for arm64 processes on arm64e-capable hosts. Patch implemented with help from Jason Molenda! --- lldb/source/Target/Target.cpp | 8 ++ lldb/test/API/macosx/arm64e-attach/Makefile | 5 + .../macosx/arm64e-attach/TestArm64eAttach.py | 9 +- lldb/tools/debugserver/source/DNB.cpp | 8 ++ lldb/tools/debugserver/source/DNB.h | 2 + .../debugserver/source/MacOSX/MachProcess.h | 2 + .../debugserver/source/MacOSX/MachProcess.mm | 17 +++ lldb/tools/debugserver/source/RNBRemote.cpp | 106 ++++++++++-------- 8 files changed, 108 insertions(+), 49 deletions(-) diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index e982a30a3ae4f..089915cab4915 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -33,6 +33,7 @@ #include "lldb/Expression/UtilityFunction.h" #include "lldb/Host/Host.h" #include "lldb/Host/PosixApi.h" +#include "lldb/Host/SafeMachO.h" #include "lldb/Host/StreamFile.h" #include "lldb/Interpreter/CommandInterpreter.h" #include "lldb/Interpreter/CommandReturnObject.h" @@ -1571,6 +1572,13 @@ bool Target::SetArchitecture(const ArchSpec &arch_spec, bool set_platform, if (m_arch.GetSpec().GetTriple() == other.GetTriple()) replace_local_arch = false; + // Workaround for for pre-2024 debugserver, which always + // returns arm64e on arm64e-capable hardware regardless of + // what the process is. This can be deleted at some point in + // the future. + if (!m_arch.GetSpec().GetMachOCPUSubType() && + other.GetMachOCPUSubType() == llvm::MachO::CPU_SUBTYPE_ARM64E) + replace_local_arch = true; } } } diff --git a/lldb/test/API/macosx/arm64e-attach/Makefile b/lldb/test/API/macosx/arm64e-attach/Makefile index c9319d6e6888a..b24dccd7c02ad 100644 --- a/lldb/test/API/macosx/arm64e-attach/Makefile +++ b/lldb/test/API/macosx/arm64e-attach/Makefile @@ -1,2 +1,7 @@ C_SOURCES := main.c + +# Uncomment this for local debugging. 
+#all: +# xcrun clang -g $(SRCDIR)/main.c -o a.out -target arm64e-apple-macosx + include Makefile.rules diff --git a/lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py b/lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py index 0dc8700ed02dd..3ec977b507b89 100644 --- a/lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py +++ b/lldb/test/API/macosx/arm64e-attach/TestArm64eAttach.py @@ -13,10 +13,12 @@ def test(self): # Skip this test if not running on AArch64 target that supports PAC if not self.isAArch64PAuth(): self.skipTest("Target must support pointer authentication.") + self.build() popen = self.spawnSubprocess(self.getBuildArtifact(), []) - error = lldb.SBError() + # This simulates how Xcode attaches to a process by pid/name. + error = lldb.SBError() target = self.dbg.CreateTarget("", "arm64", "", True, error) listener = lldb.SBListener("my.attach.listener") process = target.AttachToProcessWithID(listener, popen.pid, error) @@ -24,5 +26,10 @@ def test(self): self.assertTrue(process, PROCESS_IS_VALID) self.assertEqual(target.GetTriple().split('-')[0], "arm64e", "target triple is updated correctly") + + self.expect('process plugin packet send qProcessInfo', + "debugserver returns correct triple", + substrs=['cputype:100000c', 'cpusubtype:2', 'ptrsize:8']) + error = process.Kill() self.assertSuccess(error) diff --git a/lldb/tools/debugserver/source/DNB.cpp b/lldb/tools/debugserver/source/DNB.cpp index 0ec50df42d1fe..1ac9a8040c616 100644 --- a/lldb/tools/debugserver/source/DNB.cpp +++ b/lldb/tools/debugserver/source/DNB.cpp @@ -1062,6 +1062,14 @@ DNBGetTSDAddressForThread(nub_process_t pid, nub_thread_t tid, return INVALID_NUB_ADDRESS; } +std::optional> +DNBGetMainBinaryCPUTypes(nub_process_t pid) { + MachProcessSP procSP; + if (GetProcessSP(pid, procSP)) + return procSP->GetMainBinaryCPUTypes(pid); + return {}; +} + JSONGenerator::ObjectSP DNBGetAllLoadedLibrariesInfos(nub_process_t pid, bool report_load_commands) { MachProcessSP procSP; diff --git 
a/lldb/tools/debugserver/source/DNB.h b/lldb/tools/debugserver/source/DNB.h index 97de83ef9ff80..10d1f68794355 100644 --- a/lldb/tools/debugserver/source/DNB.h +++ b/lldb/tools/debugserver/source/DNB.h @@ -210,6 +210,8 @@ DNBGetTSDAddressForThread(nub_process_t pid, nub_thread_t tid, uint64_t plo_pthread_tsd_base_address_offset, uint64_t plo_pthread_tsd_base_offset, uint64_t plo_pthread_tsd_entry_size); +std::optional> +DNBGetMainBinaryCPUTypes(nub_process_t pid); JSONGenerator::ObjectSP DNBGetAllLoadedLibrariesInfos(nub_process_t pid, bool report_load_commands); JSONGenerator::ObjectSP diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.h b/lldb/tools/debugserver/source/MacOSX/MachProcess.h index 8432bdb36c0cf..db673693a1b21 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachProcess.h +++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.h @@ -103,6 +103,8 @@ class MachProcess { const char *stdin_path, const char *stdout_path, const char *stderr_path, bool no_stdio, MachProcess *process, int disable_aslr, DNBError &err); nub_addr_t GetDYLDAllImageInfosAddress(); + std::optional> + GetMainBinaryCPUTypes(nub_process_t pid); static const void *PrepareForAttach(const char *path, nub_launch_flavor_t launch_flavor, bool waitfor, DNBError &err_str); diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm index 3a4dfb9ea6ea2..87bdbf835bfd1 100644 --- a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm +++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm @@ -1111,6 +1111,23 @@ static bool mach_header_validity_test(uint32_t magic, uint32_t cputype) { return FormatDynamicLibrariesIntoJSON(image_infos, report_load_commands); } +std::optional> +MachProcess::GetMainBinaryCPUTypes(nub_process_t pid) { + int pointer_size = GetInferiorAddrSize(pid); + std::vector image_infos; + GetAllLoadedBinariesViaDYLDSPI(image_infos); + uint32_t platform = GetPlatform(); + for (auto &image_info : image_infos) + 
if (GetMachOInformationFromMemory(platform, image_info.load_address, + pointer_size, image_info.macho_info)) + if (image_info.macho_info.mach_header.filetype == MH_EXECUTE) + return { + {static_cast(image_info.macho_info.mach_header.cputype), + static_cast( + image_info.macho_info.mach_header.cpusubtype)}}; + return {}; +} + // Fetch information about the shared libraries at the given load addresses // using the // dyld SPIs that exist in macOS 10.12, iOS 10, tvOS 10, watchOS 3 and newer. diff --git a/lldb/tools/debugserver/source/RNBRemote.cpp b/lldb/tools/debugserver/source/RNBRemote.cpp index feea4c914ec53..03d427d3fc595 100644 --- a/lldb/tools/debugserver/source/RNBRemote.cpp +++ b/lldb/tools/debugserver/source/RNBRemote.cpp @@ -6195,59 +6195,14 @@ rnb_err_t RNBRemote::HandlePacket_qSymbol(const char *command) { } } -// Note that all numeric values returned by qProcessInfo are hex encoded, -// including the pid and the cpu type. - -rnb_err_t RNBRemote::HandlePacket_qProcessInfo(const char *p) { - nub_process_t pid; - std::ostringstream rep; - - // If we haven't run the process yet, return an error. 
- if (!m_ctx.HasValidProcessID()) - return SendPacket("E68"); - - pid = m_ctx.ProcessID(); - - rep << "pid:" << std::hex << pid << ';'; - - int procpid_mib[4]; - procpid_mib[0] = CTL_KERN; - procpid_mib[1] = KERN_PROC; - procpid_mib[2] = KERN_PROC_PID; - procpid_mib[3] = pid; - struct kinfo_proc proc_kinfo; - size_t proc_kinfo_size = sizeof(struct kinfo_proc); - - if (::sysctl(procpid_mib, 4, &proc_kinfo, &proc_kinfo_size, NULL, 0) == 0) { - if (proc_kinfo_size > 0) { - rep << "parent-pid:" << std::hex << proc_kinfo.kp_eproc.e_ppid << ';'; - rep << "real-uid:" << std::hex << proc_kinfo.kp_eproc.e_pcred.p_ruid - << ';'; - rep << "real-gid:" << std::hex << proc_kinfo.kp_eproc.e_pcred.p_rgid - << ';'; - rep << "effective-uid:" << std::hex << proc_kinfo.kp_eproc.e_ucred.cr_uid - << ';'; - if (proc_kinfo.kp_eproc.e_ucred.cr_ngroups > 0) - rep << "effective-gid:" << std::hex - << proc_kinfo.kp_eproc.e_ucred.cr_groups[0] << ';'; - } - } - +static std::pair +GetCPUTypesFromHost(nub_process_t pid) { cpu_type_t cputype = DNBProcessGetCPUType(pid); if (cputype == 0) { DNBLog("Unable to get the process cpu_type, making a best guess."); cputype = best_guess_cpu_type(); } - uint32_t addr_size = 0; - if (cputype != 0) { - rep << "cputype:" << std::hex << cputype << ";"; - if (cputype & CPU_ARCH_ABI64) - addr_size = 8; - else - addr_size = 4; - } - bool host_cpu_is_64bit = false; uint32_t is64bit_capable; size_t is64bit_capable_len = sizeof(is64bit_capable); @@ -6288,14 +6243,69 @@ rnb_err_t RNBRemote::HandlePacket_qProcessInfo(const char *p) { if (cputype == CPU_TYPE_ARM64_32 && cpusubtype == 2) cpusubtype = CPU_SUBTYPE_ARM64_32_V8; #endif + } + + return {cputype, cpusubtype}; +} + +// Note that all numeric values returned by qProcessInfo are hex encoded, +// including the pid and the cpu type. +rnb_err_t RNBRemote::HandlePacket_qProcessInfo(const char *p) { + nub_process_t pid; + std::ostringstream rep; + + // If we haven't run the process yet, return an error. 
+ if (!m_ctx.HasValidProcessID()) + return SendPacket("E68"); + + pid = m_ctx.ProcessID(); + + rep << "pid:" << std::hex << pid << ';'; + + int procpid_mib[4]; + procpid_mib[0] = CTL_KERN; + procpid_mib[1] = KERN_PROC; + procpid_mib[2] = KERN_PROC_PID; + procpid_mib[3] = pid; + struct kinfo_proc proc_kinfo; + size_t proc_kinfo_size = sizeof(struct kinfo_proc); + + if (::sysctl(procpid_mib, 4, &proc_kinfo, &proc_kinfo_size, NULL, 0) == 0) { + if (proc_kinfo_size > 0) { + rep << "parent-pid:" << std::hex << proc_kinfo.kp_eproc.e_ppid << ';'; + rep << "real-uid:" << std::hex << proc_kinfo.kp_eproc.e_pcred.p_ruid + << ';'; + rep << "real-gid:" << std::hex << proc_kinfo.kp_eproc.e_pcred.p_rgid + << ';'; + rep << "effective-uid:" << std::hex << proc_kinfo.kp_eproc.e_ucred.cr_uid + << ';'; + if (proc_kinfo.kp_eproc.e_ucred.cr_ngroups > 0) + rep << "effective-gid:" << std::hex + << proc_kinfo.kp_eproc.e_ucred.cr_groups[0] << ';'; + } + } + + cpu_type_t cputype; + cpu_subtype_t cpusubtype; + if (auto cputypes = DNBGetMainBinaryCPUTypes(pid)) + std::tie(cputype, cpusubtype) = *cputypes; + else + std::tie(cputype, cpusubtype) = GetCPUTypesFromHost(pid); + + uint32_t addr_size = 0; + if (cputype != 0) { + rep << "cputype:" << std::hex << cputype << ";"; rep << "cpusubtype:" << std::hex << cpusubtype << ';'; + if (cputype & CPU_ARCH_ABI64) + addr_size = 8; + else + addr_size = 4; } bool os_handled = false; if (addr_size > 0) { rep << "ptrsize:" << std::dec << addr_size << ';'; - #if defined(TARGET_OS_OSX) && TARGET_OS_OSX == 1 // Try and get the OS type by looking at the load commands in the main // executable and looking for a LC_VERSION_MIN load command. 
This is the From a4dcfbcb786f69ab8bab35f41e725e592fbac3fb Mon Sep 17 00:00:00 2001 From: Xing Xue Date: Mon, 26 Feb 2024 13:13:05 -0500 Subject: [PATCH 349/546] [OpenMP][AIX] XFAIL capacity tests on AIX in 32-bit (#83014) This patch XFAILs two capacity tests on AIX in 32-bit because running out resource with `4 x omp_get_max_threads()` in 32-bit mode. --- .../test/tasking/hidden_helper_task/capacity_mix_threads.cpp | 3 +++ .../test/tasking/hidden_helper_task/capacity_nthreads.cpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp b/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp index 776aee9d8e2ca..3f2ceef0c4add 100644 --- a/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp +++ b/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp @@ -1,4 +1,7 @@ // RUN: %libomp-cxx-compile-and-run +// +// AIX runs out of resource in 32-bit with 4*omp_get_max_threads() threads. +// XFAIL: aix && ppc #include diff --git a/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp b/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp index b551318a72f3f..f7405d00255cb 100644 --- a/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp +++ b/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp @@ -1,4 +1,7 @@ // RUN: %libomp-cxx-compile-and-run +// +// AIX runs out of resource in 32-bit with 4*omp_get_max_threads() threads. +// XFAIL: aix && ppc #include From 9de78c4e243e9b1dffb289173a94d6a50421c463 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Mon, 26 Feb 2024 10:13:40 -0800 Subject: [PATCH 350/546] AMDGPU: Simplify FP8 conversion definitions. NFC. (#83043) Reals should inherit predicates from the corresponding Pseudo. 
--- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 6 ++---- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 2 -- llvm/lib/Target/AMDGPU/VOPInstructions.td | 1 + 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index f5424cf48d7a5..dbb1977183d19 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -636,8 +636,8 @@ def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, untyped, untyped]> let Src1VOP3DPP = Src1RC64; } -let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 0, - SchedRW = [WriteFloatCvt] in { +let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts], + mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in { defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>; defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>; defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>; @@ -1422,12 +1422,10 @@ defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; let AssemblerPredicate = isGFX940Plus in defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>; -let OtherPredicates = [HasFP8ConversionInsts] in { defm V_CVT_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x54>; defm V_CVT_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>; defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>; defm V_CVT_PK_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>; -} //===----------------------------------------------------------------------===// // GFX10 diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 7198a4022dae8..334cfad478f15 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1647,9 +1647,7 @@ defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>; defm V_LSHL_ADD_U64 : 
VOP3_Real_vi <0x208>; -let OtherPredicates = [HasFP8ConversionInsts] in { defm V_CVT_PK_FP8_F32 : VOP3OpSel_Real_gfx9 <0x2a2>; defm V_CVT_PK_BF8_F32 : VOP3OpSel_Real_gfx9 <0x2a3>; defm V_CVT_SR_FP8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a4>; defm V_CVT_SR_BF8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a5>; -} diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 80d7d96a5e3cc..918bdb9506b04 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -670,6 +670,7 @@ class Base_VOP_SDWA9_Real : let SubtargetPredicate = HasSDWA9; let AssemblerPredicate = HasSDWA9; + let OtherPredicates = ps.OtherPredicates; let AsmVariantName = !if(ps.Pfl.HasExtSDWA9, AMDGPUAsmVariants.SDWA9, AMDGPUAsmVariants.Disable); let DecoderNamespace = "GFX9"; From 3aec947b671bf3a2c00ef7d29557db687af8ff7c Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Mon, 26 Feb 2024 13:21:27 -0500 Subject: [PATCH 351/546] [libc][__support/memory_size] fix missing branch and add related tests (#83016) fix #82644. 
--- libc/src/__support/memory_size.h | 6 ++++-- libc/test/src/__support/memory_size_test.cpp | 7 +++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/libc/src/__support/memory_size.h b/libc/src/__support/memory_size.h index 4c7d2079553e8..94aee2520afaa 100644 --- a/libc/src/__support/memory_size.h +++ b/libc/src/__support/memory_size.h @@ -52,9 +52,11 @@ class SafeMemSize { LIBC_INLINE SafeMemSize operator+(const SafeMemSize &other) { type result; - if (LIBC_UNLIKELY((value | other.value) < 0)) + if (LIBC_UNLIKELY((value | other.value) < 0)) { result = -1; - result = value + other.value; + } else { + result = value + other.value; + } return SafeMemSize{result}; } diff --git a/libc/test/src/__support/memory_size_test.cpp b/libc/test/src/__support/memory_size_test.cpp index 93ef3711d40e0..1c8f1ce87415b 100644 --- a/libc/test/src/__support/memory_size_test.cpp +++ b/libc/test/src/__support/memory_size_test.cpp @@ -49,6 +49,13 @@ TEST(LlvmLibcMemSizeTest, Addition) { ASSERT_FALSE((max + SafeMemSize{static_cast(1)}).valid()); ASSERT_FALSE((third + third + third + third).valid()); ASSERT_FALSE((half + half + half).valid()); + + ASSERT_FALSE((SafeMemSize{static_cast(-1)} + + SafeMemSize{static_cast(2)}) + .valid()); + ASSERT_FALSE((SafeMemSize{static_cast(2)} + + SafeMemSize{static_cast(-1)}) + .valid()); } TEST(LlvmLibcMemSizeTest, Multiplication) { From 0b398256b3f72204ad1f7c625efe4990204e898a Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Mon, 26 Feb 2024 18:22:05 +0000 Subject: [PATCH 352/546] [RemoveDIs] Print non-intrinsic debug info in textual IR output (#79281) This patch adds support for printing the proposed non-instruction debug info ("RemoveDIs") out to textual IR. This patch does not add any bitcode support, parsing support, or documentation. Printing of the new format is controlled by a flag added in this patch, `--write-experimental-debuginfo`, which defaults to false. 
The new format will be printed *iff* this flag is true, so whether we use the IR format is completely independent of whether we use non-instruction debug info during LLVM passes (which is controlled by the `--try-experimental-debuginfo-iterators` flag). Even with the flag disabled, some existing tests need to be updated, as this patch causes debug intrinsic declarations to be changed in a round trip, such that they always appear at the end of a module and have no attributes (this has no functional change on the module). The design of this new IR format was proposed previously on Discourse, and any further discussion about the design can still be contributed there: https://discourse.llvm.org/t/rfc-debuginfo-proposed-changes-to-the-textual-ir-representation-for-debug-values/73491 --- llvm/include/llvm/IR/Module.h | 14 +++ llvm/include/llvm/IR/PrintPasses.h | 19 ++++ llvm/lib/IR/AsmWriter.cpp | 75 +++++++--------- llvm/lib/IR/BasicBlock.cpp | 4 - llvm/lib/IR/IRPrintingPasses.cpp | 24 ++---- llvm/lib/IR/Module.cpp | 22 +++++ llvm/lib/IRPrinter/IRPrintingPasses.cpp | 27 +++--- .../Generic/inline-alloca-ordering.ll | 7 +- .../DebugInfo/Generic/inline-dbg-values.ll | 8 +- llvm/test/DebugInfo/dpvalue-print-nocrash.ll | 3 +- .../print-non-instruction-debug-info.ll | 86 +++++++++++++++++++ .../pr33641_remove_arg_dbgvalue.ll | 18 ++-- .../callsite-split-preserve-debug.ll | 4 +- .../debug-info-on-skipped-selects.ll | 2 +- .../DeadArgElim/2010-04-30-DbgInfo.ll | 35 ++++++-- .../IROutliner/outlining-debug-statements.ll | 7 -- llvm/test/Transforms/LoopRotate/dbgvalue.ll | 4 +- .../LoopRotate/delete-dbg-values.ll | 4 +- ...e-that-exception-unwind-path-is-visited.ll | 5 +- llvm/test/Transforms/SROA/dbg-inline.ll | 50 +++++------ 20 files changed, 272 insertions(+), 146 deletions(-) create mode 100644 llvm/test/DebugInfo/print-non-instruction-debug-info.ll diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h index 68a89dc45c283..f5f3c66f0ae1e 100644 --- 
a/llvm/include/llvm/IR/Module.h +++ b/llvm/include/llvm/IR/Module.h @@ -218,11 +218,18 @@ class LLVM_EXTERNAL_VISIBILITY Module { /// \ref BasicBlock. bool IsNewDbgInfoFormat; + /// Used when converting this module to the new debug info format; removes all + /// declarations of debug intrinsics that are replaced by non-intrinsic + /// records in the new format. + void removeDebugIntrinsicDeclarations(); + /// \see BasicBlock::convertToNewDbgValues. void convertToNewDbgValues() { for (auto &F : *this) { F.convertToNewDbgValues(); } + // Remove the declarations of the old debug intrinsics, if any exist. + removeDebugIntrinsicDeclarations(); IsNewDbgInfoFormat = true; } @@ -234,6 +241,13 @@ class LLVM_EXTERNAL_VISIBILITY Module { IsNewDbgInfoFormat = false; } + void setIsNewDbgInfoFormat(bool UseNewFormat) { + if (UseNewFormat && !IsNewDbgInfoFormat) + convertToNewDbgValues(); + else if (!UseNewFormat && IsNewDbgInfoFormat) + convertFromNewDbgValues(); + } + /// The Module constructor. Note that there is no default constructor. You /// must provide a name for the module upon construction. explicit Module(StringRef ModuleID, LLVMContext& C); diff --git a/llvm/include/llvm/IR/PrintPasses.h b/llvm/include/llvm/IR/PrintPasses.h index 95b97e76c867c..3803bd05cbe57 100644 --- a/llvm/include/llvm/IR/PrintPasses.h +++ b/llvm/include/llvm/IR/PrintPasses.h @@ -78,6 +78,25 @@ std::string doSystemDiff(StringRef Before, StringRef After, StringRef OldLineFormat, StringRef NewLineFormat, StringRef UnchangedLineFormat); +/// Used to temporarily set the debug info format of a function, module, or +/// basic block for the duration of this object's lifetime, after which the +/// prior state will be restored. 
+template class ScopedDbgInfoFormatSetter { + T &Obj; + bool OldState; + +public: + ScopedDbgInfoFormatSetter(T &Obj, bool NewState) + : Obj(Obj), OldState(Obj.IsNewDbgInfoFormat) { + Obj.setIsNewDbgInfoFormat(NewState); + } + ~ScopedDbgInfoFormatSetter() { Obj.setIsNewDbgInfoFormat(OldState); } +}; + +template +ScopedDbgInfoFormatSetter(T &Obj, bool NewState) + -> ScopedDbgInfoFormatSetter; + } // namespace llvm #endif // LLVM_IR_PRINTPASSES_H diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index fba404c9b027c..a3da6ca811489 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -861,7 +861,7 @@ class SlotTracker : public AbstractSlotTrackerStorage { /// Add all of the metadata from an instruction. void processInstructionMetadata(const Instruction &I); - /// Add all of the metadata from an instruction. + /// Add all of the metadata from a DbgRecord. void processDbgRecordMetadata(const DbgRecord &DPV); }; @@ -1140,6 +1140,9 @@ void SlotTracker::processFunctionMetadata(const Function &F) { void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) { if (const DPValue *DPV = dyn_cast(&DR)) { + // Process metadata used by DbgRecords; we only specifically care about the + // DILocalVariable, DILocation, and DIAssignID fields, as the Value and + // Expression fields should only be printed inline and so do not use a slot. CreateMetadataSlot(DPV->getVariable()); if (DPV->isDbgAssign()) CreateMetadataSlot(DPV->getAssignID()); @@ -2703,6 +2706,7 @@ class AssemblyWriter { void printDPValue(const DPValue &DPI); void printDPLabel(const DPLabel &DPL); void printDbgRecord(const DbgRecord &DPI); + void printDbgRecordLine(const DbgRecord &DPI); void printUseListOrder(const Value *V, const std::vector &Shuffle); void printUseLists(const Function *F); @@ -3885,9 +3889,6 @@ void AssemblyWriter::printTypeIdentities() { /// printFunction - Print all aspects of a function. 
void AssemblyWriter::printFunction(const Function *F) { - bool ConvertBack = F->IsNewDbgInfoFormat; - if (ConvertBack) - const_cast(F)->convertFromNewDbgValues(); if (AnnotationWriter) AnnotationWriter->emitFunctionAnnot(F, Out); if (F->isMaterializable()) @@ -4030,8 +4031,6 @@ void AssemblyWriter::printFunction(const Function *F) { Out << "}\n"; } - if (ConvertBack) - const_cast(F)->convertToNewDbgValues(); Machine.purgeFunction(); } @@ -4098,6 +4097,8 @@ void AssemblyWriter::printBasicBlock(const BasicBlock *BB) { // Output all of the instructions in the basic block... for (const Instruction &I : *BB) { + for (const DbgRecord &DR : I.getDbgValueRange()) + printDbgRecordLine(DR); printInstructionLine(I); } @@ -4611,12 +4612,10 @@ void AssemblyWriter::printDbgRecord(const DbgRecord &DR) { llvm_unreachable("Unexpected DbgRecord kind"); } -void AssemblyWriter::printDPValue(const DPValue &Value) { - // There's no formal representation of a DPValue -- print purely as a - // debugging aid. - Out << " DPValue "; - - switch (Value.getType()) { +void AssemblyWriter::printDPValue(const DPValue &DPV) { + auto WriterCtx = getContext(); + Out << "#dbg_"; + switch (DPV.getType()) { case DPValue::LocationType::Value: Out << "value"; break; @@ -4629,35 +4628,39 @@ void AssemblyWriter::printDPValue(const DPValue &Value) { default: llvm_unreachable("Tried to print a DPValue with an invalid LocationType!"); } - Out << " { "; - auto WriterCtx = getContext(); - WriteAsOperandInternal(Out, Value.getRawLocation(), WriterCtx, true); + Out << "("; + WriteAsOperandInternal(Out, DPV.getRawLocation(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, Value.getVariable(), WriterCtx, true); + WriteAsOperandInternal(Out, DPV.getVariable(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, Value.getExpression(), WriterCtx, true); + WriteAsOperandInternal(Out, DPV.getExpression(), WriterCtx, true); Out << ", "; - if (Value.isDbgAssign()) { - WriteAsOperandInternal(Out, 
Value.getAssignID(), WriterCtx, true); + if (DPV.isDbgAssign()) { + WriteAsOperandInternal(Out, DPV.getAssignID(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, Value.getRawAddress(), WriterCtx, true); + WriteAsOperandInternal(Out, DPV.getRawAddress(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, Value.getAddressExpression(), WriterCtx, true); + WriteAsOperandInternal(Out, DPV.getAddressExpression(), WriterCtx, true); Out << ", "; } - WriteAsOperandInternal(Out, Value.getDebugLoc().get(), WriterCtx, true); - Out << " marker @" << Value.getMarker(); - Out << " }"; + WriteAsOperandInternal(Out, DPV.getDebugLoc().getAsMDNode(), WriterCtx, true); + Out << ")"; +} + +/// printDbgRecordLine - Print a DbgRecord with indentation and a newline +/// character. +void AssemblyWriter::printDbgRecordLine(const DbgRecord &DR) { + // Print lengthier indentation to bring out-of-line with instructions. + Out << " "; + printDbgRecord(DR); + Out << '\n'; } void AssemblyWriter::printDPLabel(const DPLabel &Label) { - // There's no formal representation of a DPLabel -- print purely as - // a debugging aid. - Out << " DPLabel { "; auto WriterCtx = getContext(); + Out << "#dbg_label("; WriteAsOperandInternal(Out, Label.getLabel(), WriterCtx, true); - Out << " marker @" << Label.getMarker(); - Out << " }"; + Out << ")"; } void AssemblyWriter::printMetadataAttachments( @@ -4805,19 +4808,11 @@ void BasicBlock::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, void Module::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, bool ShouldPreserveUseListOrder, bool IsForDebug) const { - // RemoveDIs: always print with debug-info in intrinsic format. 
- bool ConvertAfter = IsNewDbgInfoFormat; - if (IsNewDbgInfoFormat) - const_cast(this)->convertFromNewDbgValues(); - SlotTracker SlotTable(this); formatted_raw_ostream OS(ROS); AssemblyWriter W(OS, SlotTable, this, AAW, IsForDebug, ShouldPreserveUseListOrder); W.printModule(this); - - if (ConvertAfter) - const_cast(this)->convertToNewDbgValues(); } void NamedMDNode::print(raw_ostream &ROS, bool IsForDebug) const { @@ -4908,8 +4903,6 @@ void DPValue::print(raw_ostream &ROS, bool IsForDebug) const { void DPMarker::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { - // There's no formal representation of a DPMarker -- print purely as a - // debugging aid. formatted_raw_ostream OS(ROS); SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = @@ -4931,8 +4924,6 @@ void DPLabel::print(raw_ostream &ROS, bool IsForDebug) const { void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { - // There's no formal representation of a DPValue -- print purely as a - // debugging aid. formatted_raw_ostream OS(ROS); SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = @@ -4950,8 +4941,6 @@ void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST, void DPLabel::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { - // There's no formal representation of a DbgLabelRecord -- print purely as - // a debugging aid. formatted_raw_ostream OS(ROS); SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 6ea876fde5ec6..c110c4c1437c3 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -61,10 +61,6 @@ DPMarker *BasicBlock::createMarker(InstListType::iterator It) { } void BasicBlock::convertToNewDbgValues() { - // Is the command line option set? 
- if (!UseNewDbgInfoFormat) - return; - IsNewDbgInfoFormat = true; // Iterate over all instructions in the instruction list, collecting dbg.value diff --git a/llvm/lib/IR/IRPrintingPasses.cpp b/llvm/lib/IR/IRPrintingPasses.cpp index b19210e776ed5..7b4f66a908c59 100644 --- a/llvm/lib/IR/IRPrintingPasses.cpp +++ b/llvm/lib/IR/IRPrintingPasses.cpp @@ -23,6 +23,8 @@ using namespace llvm; +extern cl::opt WriteNewDbgInfoFormat; + namespace { class PrintModulePassWrapper : public ModulePass { @@ -39,11 +41,9 @@ class PrintModulePassWrapper : public ModulePass { ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {} bool runOnModule(Module &M) override { - // RemoveDIs: there's no textual representation of the DPValue debug-info, - // convert to dbg.values before writing out. - bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat; - if (IsNewDbgInfoFormat) - M.convertFromNewDbgValues(); + // RemoveDIs: Regardless of the format we've processed this module in, use + // `WriteNewDbgInfoFormat` to determine which format we use to write it. + ScopedDbgInfoFormatSetter FormatSetter(M, WriteNewDbgInfoFormat); if (llvm::isFunctionInPrintList("*")) { if (!Banner.empty()) @@ -62,9 +62,6 @@ class PrintModulePassWrapper : public ModulePass { } } - if (IsNewDbgInfoFormat) - M.convertToNewDbgValues(); - return false; } @@ -87,11 +84,9 @@ class PrintFunctionPassWrapper : public FunctionPass { // This pass just prints a banner followed by the function as it's processed. bool runOnFunction(Function &F) override { - // RemoveDIs: there's no textual representation of the DPValue debug-info, - // convert to dbg.values before writing out. - bool IsNewDbgInfoFormat = F.IsNewDbgInfoFormat; - if (IsNewDbgInfoFormat) - F.convertFromNewDbgValues(); + // RemoveDIs: Regardless of the format we've processed this function in, use + // `WriteNewDbgInfoFormat` to determine which format we use to write it. 
+ ScopedDbgInfoFormatSetter FormatSetter(F, WriteNewDbgInfoFormat); if (isFunctionInPrintList(F.getName())) { if (forcePrintModuleIR()) @@ -101,9 +96,6 @@ class PrintFunctionPassWrapper : public FunctionPass { OS << Banner << '\n' << static_cast(F); } - if (IsNewDbgInfoFormat) - F.convertToNewDbgValues(); - return false; } diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 1946db2ee0be7..a8696ed9e3ce5 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -85,6 +85,28 @@ Module::~Module() { IFuncList.clear(); } +void Module::removeDebugIntrinsicDeclarations() { + auto *DeclareIntrinsicFn = + Intrinsic::getDeclaration(this, Intrinsic::dbg_declare); + assert((!isMaterialized() || DeclareIntrinsicFn->hasZeroLiveUses()) && + "Debug declare intrinsic should have had uses removed."); + DeclareIntrinsicFn->eraseFromParent(); + auto *ValueIntrinsicFn = + Intrinsic::getDeclaration(this, Intrinsic::dbg_value); + assert((!isMaterialized() || ValueIntrinsicFn->hasZeroLiveUses()) && + "Debug value intrinsic should have had uses removed."); + ValueIntrinsicFn->eraseFromParent(); + auto *AssignIntrinsicFn = + Intrinsic::getDeclaration(this, Intrinsic::dbg_assign); + assert((!isMaterialized() || AssignIntrinsicFn->hasZeroLiveUses()) && + "Debug assign intrinsic should have had uses removed."); + AssignIntrinsicFn->eraseFromParent(); + auto *LabelntrinsicFn = Intrinsic::getDeclaration(this, Intrinsic::dbg_label); + assert((!isMaterialized() || LabelntrinsicFn->hasZeroLiveUses()) && + "Debug label intrinsic should have had uses removed."); + LabelntrinsicFn->eraseFromParent(); +} + std::unique_ptr Module::createRNG(const StringRef Name) const { SmallString<32> Salt(Name); diff --git a/llvm/lib/IRPrinter/IRPrintingPasses.cpp b/llvm/lib/IRPrinter/IRPrintingPasses.cpp index 52b242b4dcd52..340f4fa6bfc84 100644 --- a/llvm/lib/IRPrinter/IRPrintingPasses.cpp +++ b/llvm/lib/IRPrinter/IRPrintingPasses.cpp @@ -22,6 +22,11 @@ using namespace llvm; +cl::opt 
WriteNewDbgInfoFormat( + "write-experimental-debuginfo", + cl::desc("Write debug info in the new non-intrinsic format"), + cl::init(false)); + PrintModulePass::PrintModulePass() : OS(dbgs()) {} PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, bool ShouldPreserveUseListOrder, @@ -31,11 +36,9 @@ PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, EmitSummaryIndex(EmitSummaryIndex) {} PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &AM) { - // RemoveDIs: there's no textual representation of the DPValue debug-info, - // convert to dbg.values before writing out. - bool ShouldConvert = M.IsNewDbgInfoFormat; - if (ShouldConvert) - M.convertFromNewDbgValues(); + // RemoveDIs: Regardless of the format we've processed this module in, use + // `WriteNewDbgInfoFormat` to determine which format we use to write it. + ScopedDbgInfoFormatSetter FormatSetter(M, WriteNewDbgInfoFormat); if (llvm::isFunctionInPrintList("*")) { if (!Banner.empty()) @@ -63,9 +66,6 @@ PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &AM) { Index->print(OS); } - if (ShouldConvert) - M.convertToNewDbgValues(); - return PreservedAnalyses::all(); } @@ -75,11 +75,9 @@ PrintFunctionPass::PrintFunctionPass(raw_ostream &OS, const std::string &Banner) PreservedAnalyses PrintFunctionPass::run(Function &F, FunctionAnalysisManager &) { - // RemoveDIs: there's no textual representation of the DPValue debug-info, - // convert to dbg.values before writing out. - bool ShouldConvert = F.IsNewDbgInfoFormat; - if (ShouldConvert) - F.convertFromNewDbgValues(); + // RemoveDIs: Regardless of the format we've processed this function in, use + // `WriteNewDbgInfoFormat` to determine which format we use to write it. 
+ ScopedDbgInfoFormatSetter FormatSetter(F, WriteNewDbgInfoFormat); if (isFunctionInPrintList(F.getName())) { if (forcePrintModuleIR()) @@ -88,8 +86,5 @@ PreservedAnalyses PrintFunctionPass::run(Function &F, OS << Banner << '\n' << static_cast(F); } - if (ShouldConvert) - F.convertToNewDbgValues(); - return PreservedAnalyses::all(); } diff --git a/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll b/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll index 9ff3d80af80d1..9f401ceb5b6f4 100644 --- a/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll +++ b/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll @@ -15,8 +15,6 @@ ;; doesn't transfer the dbg.value to the entry block. This needs Special ;; Handling once we get rid of debug-intrinsics. -; CHECK: declare void @llvm.dbg.value(metadata, - ; CHECK: define i32 @bar() ; CHECK-NEXT: %1 = alloca [65 x i32], align 16 ; CHECK-NEXT: call void @ext() @@ -24,9 +22,10 @@ ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 0, metadata !10, metadata !DIExpression()), !dbg !12 ; CHECK-NEXT: call void @init(ptr %1) +; CHECK: declare void @llvm.dbg.value(metadata, + declare void @ext() declare void @init(ptr) -declare void @llvm.dbg.value(metadata, metadata, metadata) define internal i32 @foo() !dbg !4 { %1 = alloca [65 x i32], align 16 @@ -42,6 +41,8 @@ define i32 @bar() !dbg !16 { ret i32 %1 } +declare void @llvm.dbg.value(metadata, metadata, metadata) + !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!8, !9} !llvm.ident = !{!10} diff --git a/llvm/test/DebugInfo/Generic/inline-dbg-values.ll b/llvm/test/DebugInfo/Generic/inline-dbg-values.ll index 7d580d6fba37a..b0390b9f78f41 100644 --- a/llvm/test/DebugInfo/Generic/inline-dbg-values.ll +++ b/llvm/test/DebugInfo/Generic/inline-dbg-values.ll @@ -7,8 +7,6 @@ ;; ;; test should be inlined into test2 -; CHECK: declare void @llvm.dbg.value(metadata, - ; CHECK: define i32 @test2 ; CHECK-NEXT: entry: ; CHECK: %k.addr.i = alloca i32, align 4 @@ -47,6 +45,8 @@ ; CHECK-NEXT: call 
void @llvm.dbg.value(metadata i32 1, metadata ![[KVAR]], metadata !DIExpression()), !dbg ![[KLINE]] ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 1, metadata ![[K2VAR]], metadata !DIExpression()), !dbg ![[GLINE]] ; +; CHECK: declare void @llvm.dbg.value(metadata, +; ;; Test that the metadata maps onto the correct things, and that the DILocations ;; attached to the intrinsics have been inlined. ; @@ -100,8 +100,6 @@ return: ; preds = %if.end, %if.then ret i32 %3, !dbg !20 } -declare void @llvm.dbg.value(metadata, metadata, metadata) #1 - declare i32 @_Z8test_exti(i32) define i32 @test2(i32 %foo, i32 %bar) !dbg !10 { @@ -118,6 +116,8 @@ try.cont: ; preds = %catch, %invoke.cont ret i32 0, !dbg !30 } +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!31} diff --git a/llvm/test/DebugInfo/dpvalue-print-nocrash.ll b/llvm/test/DebugInfo/dpvalue-print-nocrash.ll index 9f120af13ac9c..0a618c6780d1d 100755 --- a/llvm/test/DebugInfo/dpvalue-print-nocrash.ll +++ b/llvm/test/DebugInfo/dpvalue-print-nocrash.ll @@ -2,8 +2,7 @@ ; RUN: opt -passes="instcombine" -debug %s -o /dev/null 2>&1 | FileCheck %s ; REQUIRES: asserts -; CHECK: CLONE: DPValue value { -; CHECK-SAME: marker @0x0 +; CHECK: CLONE: #dbg_value( define ptr @func_10(i32 %p_11) { entry: diff --git a/llvm/test/DebugInfo/print-non-instruction-debug-info.ll b/llvm/test/DebugInfo/print-non-instruction-debug-info.ll new file mode 100644 index 0000000000000..f8271df146fe9 --- /dev/null +++ b/llvm/test/DebugInfo/print-non-instruction-debug-info.ll @@ -0,0 +1,86 @@ +;; Test that we can write in the new debug info format. 
+; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=false --write-experimental-debuginfo=false < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,OLDDBG --implicit-check-not=llvm.dbg +; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=false --write-experimental-debuginfo=true < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,NEWDBG --implicit-check-not=llvm.dbg + +;; Test also that the new flag is independent of the flag that enables use of +;; these non-instruction debug info during LLVM passes. +; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=true --write-experimental-debuginfo=false < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,OLDDBG --implicit-check-not=llvm.dbg +; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=true --write-experimental-debuginfo=true < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,NEWDBG --implicit-check-not=llvm.dbg + +; CHECK: @f(i32 %[[VAL_A:[0-9a-zA-Z]+]]) +; CHECK-NEXT: entry: +; OLDDBG-NEXT: call void @llvm.dbg.value(metadata i32 %[[VAL_A]], metadata ![[VAR_A:[0-9]+]], metadata !DIExpression()), !dbg ![[LOC_1:[0-9]+]] +; NEWDBG-NEXT: {{^}} #dbg_value(i32 %[[VAL_A]], ![[VAR_A:[0-9]+]], !DIExpression(), ![[LOC_1:[0-9]+]]) +; CHECK-NEXT: {{^}} %[[VAL_B:[0-9a-zA-Z]+]] = alloca +; OLDDBG-NEXT: call void @llvm.dbg.declare(metadata ptr %[[VAL_B]], metadata ![[VAR_B:[0-9]+]], metadata !DIExpression()), !dbg ![[LOC_2:[0-9]+]] +; NEWDBG-NEXT: {{^}} #dbg_declare(ptr %[[VAL_B]], ![[VAR_B:[0-9]+]], !DIExpression(), ![[LOC_2:[0-9]+]]) +; CHECK-NEXT: {{^}} %[[VAL_ADD:[0-9a-zA-Z]+]] = add i32 %[[VAL_A]], 5 +; OLDDBG-NEXT: call void @llvm.dbg.value(metadata !DIArgList(i32 %[[VAL_A]], i32 %[[VAL_ADD]]), metadata ![[VAR_A]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus)), !dbg ![[LOC_3:[0-9]+]] +; NEWDBG-NEXT: {{^}} #dbg_value(!DIArgList(i32 %[[VAL_A]], i32 %[[VAL_ADD]]), ![[VAR_A]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), 
![[LOC_3:[0-9]+]]) +; OLDDBG-NEXT: call void @llvm.dbg.label(metadata ![[LABEL_ID:[0-9]+]]) +; NEWDBG-NEXT: {{^}} #dbg_label(![[LABEL_ID:[0-9]+]]) +; CHECK-NEXT: {{^}} store i32 %[[VAL_ADD]]{{.+}}, !DIAssignID ![[ASSIGNID:[0-9]+]] +; OLDDBG-NEXT: call void @llvm.dbg.assign(metadata i32 %[[VAL_ADD]], metadata ![[VAR_B]], metadata !DIExpression(), metadata ![[ASSIGNID]], metadata ptr %[[VAL_B]], metadata !DIExpression()), !dbg ![[LOC_4:[0-9]+]] +; NEWDBG-NEXT: {{^}} #dbg_assign(i32 %[[VAL_ADD]], ![[VAR_B]], !DIExpression(), ![[ASSIGNID]], ptr %[[VAL_B]], !DIExpression(), ![[LOC_4:[0-9]+]]) +; CHECK-NEXT: {{^}} ret i32 + +; OLDDBG-DAG: declare void @llvm.dbg.value +; OLDDBG-DAG: declare void @llvm.dbg.declare +; OLDDBG-DAG: declare void @llvm.dbg.assign + +; CHECK-DAG: llvm.dbg.cu +; CHECK-DAG: ![[VAR_A]] = !DILocalVariable(name: "a" +; CHECK-DAG: ![[VAR_B]] = !DILocalVariable(name: "b" +; CHECK-DAG: ![[LOC_1]] = !DILocation(line: 3, column: 15 +; CHECK-DAG: ![[LOC_2]] = !DILocation(line: 3, column: 20 +; CHECK-DAG: ![[LOC_3]] = !DILocation(line: 3, column: 25 +; CHECK-DAG: ![[LOC_4]] = !DILocation(line: 3, column: 30 +; CHECK-DAG: ![[LABEL_ID]] = !DILabel( + +define dso_local i32 @f(i32 %a) !dbg !7 { +entry: + call void @llvm.dbg.value(metadata i32 %a, metadata !20, metadata !DIExpression()), !dbg !30 + %b = alloca i32, !dbg !30, !DIAssignID !40 + call void @llvm.dbg.declare(metadata ptr %b, metadata !21, metadata !DIExpression()), !dbg !31 + %add = add i32 %a, 5, !dbg !31 + call void @llvm.dbg.value(metadata !DIArgList(i32 %a, i32 %add), metadata !20, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus)), !dbg !32 + call void @llvm.dbg.label(metadata !50), !dbg !32 + store i32 %add, ptr %b, !dbg !32, !DIAssignID !40 + call void @llvm.dbg.assign(metadata i32 %add, metadata !21, metadata !DIExpression(), metadata !40, metadata ptr %b, metadata !DIExpression()), !dbg !33 + ret i32 %add, !dbg !33 + +} + +declare void @llvm.dbg.value(metadata, 
metadata, metadata) +declare void @llvm.dbg.declare(metadata, metadata, metadata) +declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) +declare void @llvm.dbg.label(metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 18.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "print.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 5} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 18.0.0"} +!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !13) +!8 = !DISubroutineType(types: !9) +!9 = !{!12, !12} +!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!13 = !{!20, !21} +!20 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 3, type: !12) +!21 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 3, type: !12) +!30 = !DILocation(line: 3, column: 15, scope: !7) +!31 = !DILocation(line: 3, column: 20, scope: !7) +!32 = !DILocation(line: 3, column: 25, scope: !7) +!33 = !DILocation(line: 3, column: 30, scope: !7) +!40 = distinct !DIAssignID() +!50 = !DILabel(scope: !7, name: "label", file: !1, line: 3) diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll index 83226e56b5acd..e0777f9ecee8b 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll @@ -30,8 +30,7 @@ define internal void @bar(%p_t %p) { ; CGSCC: Function Attrs: mustprogress nofree 
norecurse nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@bar ; CGSCC-SAME: (ptr nocapture nofree readnone [[P:%.*]]) #[[ATTR0]] { -; CGSCC-NEXT: call void @llvm.dbg.value(metadata ptr [[P]], metadata [[META3:![0-9]+]], metadata !DIExpression()) -; CGSCC-SAME: !dbg [[DBG5:![0-9]+]] +; CGSCC-NEXT: tail call void @llvm.dbg.value(metadata ptr [[P]], metadata [[META3:![0-9]+]], metadata !DIExpression()), !dbg [[DBG5:![0-9]+]] ; CGSCC-NEXT: ret void ; call void @llvm.dbg.value(metadata %p_t %p, metadata !4, metadata !5), !dbg !6 @@ -52,21 +51,20 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !6 = !DILocation(line: 1, column: 1, scope: !3) ;. ; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } -; TUNIT: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. ; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; CGSCC: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. -; TUNIT: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) -; TUNIT: [[META1:![0-9]+]] = !DIFile(filename: "test.c", directory: "") +; TUNIT: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +; TUNIT: [[META1]] = !DIFile(filename: "test.c", directory: "") ; TUNIT: [[META2:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} ;. 
-; CGSCC: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) -; CGSCC: [[META1:![0-9]+]] = !DIFile(filename: "test.c", directory: "") +; CGSCC: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +; CGSCC: [[META1]] = !DIFile(filename: "test.c", directory: "") ; CGSCC: [[META2:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; CGSCC: [[META3]] = !DILocalVariable(name: "p", scope: !4) -; CGSCC: [[META4:![0-9]+]] = distinct !DISubprogram(name: "bar", scope: null, spFlags: DISPFlagDefinition, unit: !0) -; CGSCC: [[DBG5]] = !DILocation(line: 1, column: 1, scope: !4) +; CGSCC: [[META3]] = !DILocalVariable(name: "p", scope: [[META4:![0-9]+]]) +; CGSCC: [[META4]] = distinct !DISubprogram(name: "bar", scope: null, spFlags: DISPFlagDefinition, unit: [[META0]]) +; CGSCC: [[DBG5]] = !DILocation(line: 1, column: 1, scope: [[META4]]) ;. ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; TUNIT: {{.*}} diff --git a/llvm/test/Transforms/CallSiteSplitting/callsite-split-preserve-debug.ll b/llvm/test/Transforms/CallSiteSplitting/callsite-split-preserve-debug.ll index e185286304a68..3fdf62bf57502 100644 --- a/llvm/test/Transforms/CallSiteSplitting/callsite-split-preserve-debug.ll +++ b/llvm/test/Transforms/CallSiteSplitting/callsite-split-preserve-debug.ll @@ -3,8 +3,6 @@ ;; Test that DebugLocs are preserved, and that dbg.values are duplicated. 
-; CHECK: declare void @llvm.dbg.value(metadata, - ; CHECK-LABEL: @test1 ; CHECK: call void @llvm.dbg.value(metadata i32 0, ; CHECK-NEXT: [[R1:%.+]] = call i32 @callee(i32 0, i32 %dd), !dbg [[DBG1:!.*]] @@ -43,6 +41,8 @@ CallSite: ; preds = %land.rhs, %entry ; CHECK-NEXT: phi i32 [ [[R1]], %Header.split ], [ [[R2]], %TBB.split ], !dbg [[DBG_CALL]] ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 1, +; CHECK: declare void @llvm.dbg.value(metadata, + define void @test2(ptr %ptr, i32 %i) !dbg !19 { Header: %tobool = icmp ne i32 %i, 10 diff --git a/llvm/test/Transforms/CodeGenPrepare/debug-info-on-skipped-selects.ll b/llvm/test/Transforms/CodeGenPrepare/debug-info-on-skipped-selects.ll index ffd4a0170ac9d..ce34b16a4d7a3 100644 --- a/llvm/test/Transforms/CodeGenPrepare/debug-info-on-skipped-selects.ll +++ b/llvm/test/Transforms/CodeGenPrepare/debug-info-on-skipped-selects.ll @@ -5,8 +5,8 @@ ; Test that when we skip over multiple selects in CGP, that the debug-info ; attached to those selects is still fixed up. 
-; CHECK: declare void @llvm.dbg.value(metadata, ; CHECK: call void @llvm.dbg.value(metadata ptr %sunkaddr, +; CHECK: declare void @llvm.dbg.value(metadata, source_filename = "reduced.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll b/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll index 0dee72f4f6b6f..d84dd10e7d28b 100644 --- a/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll +++ b/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll @@ -8,12 +8,21 @@ define ptr @vfs_addname(ptr %name, i32 %len, i32 %hash, i32 %flags) nounwind ssp !dbg !1 { ; +; CHECK-LABEL: define {{[^@]+}}@vfs_addname +; CHECK-SAME: (ptr [[NAME:%.*]], i32 [[LEN:%.*]], i32 [[HASH:%.*]], i32 [[FLAGS:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG4:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata ptr [[NAME]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12:![0-9]+]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[LEN]], metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[HASH]], metadata [[META14:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[FLAGS]], metadata [[META15:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]] +; CHECK-NEXT: [[TMP0:%.*]] = call fastcc ptr @add_name_internal(ptr [[NAME]], i32 [[HASH]]) #[[ATTR3:[0-9]+]], !dbg [[DBG16:![0-9]+]] +; CHECK-NEXT: ret ptr [[TMP0]], !dbg [[DBG16]] +; entry: call void @llvm.dbg.value(metadata ptr %name, metadata !0, metadata !DIExpression()), !dbg !DILocation(scope: !1) call void @llvm.dbg.value(metadata i32 %len, metadata !10, metadata !DIExpression()), !dbg !DILocation(scope: !1) call void @llvm.dbg.value(metadata i32 %hash, metadata !11, metadata !DIExpression()), !dbg !DILocation(scope: !1) call 
void @llvm.dbg.value(metadata i32 %flags, metadata !12, metadata !DIExpression()), !dbg !DILocation(scope: !1) -; CHECK: call fastcc ptr @add_name_internal(ptr %name, i32 %hash) [[NUW:#[0-9]+]], !dbg !{{[0-9]+}} %0 = call fastcc ptr @add_name_internal(ptr %name, i32 %len, i32 %hash, i8 zeroext 0, i32 %flags) nounwind, !dbg !13 ; [#uses=1] ret ptr %0, !dbg !13 } @@ -22,6 +31,24 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone define internal fastcc ptr @add_name_internal(ptr %name, i32 %len, i32 %hash, i8 zeroext %extra, i32 %flags) noinline nounwind ssp !dbg !16 { ; +; CHECK-LABEL: define {{[^@]+}}@add_name_internal +; CHECK-SAME: (ptr [[NAME:%.*]], i32 [[HASH:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG18:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata ptr [[NAME]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23:![0-9]+]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 poison, metadata [[META24:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[HASH]], metadata [[META25:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i8 poison, metadata [[META26:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 poison, metadata [[META27:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[HASH]], 0, !dbg [[DBG28:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP0]], label [[BB:%.*]], label [[BB1:%.*]], !dbg [[DBG28]] +; CHECK: bb: +; CHECK-NEXT: br label [[BB2:%.*]], !dbg [[DBG30:![0-9]+]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB2]], !dbg [[DBG31:![0-9]+]] +; CHECK: bb2: +; CHECK-NEXT: [[DOT0:%.*]] = phi ptr [ @.str, [[BB]] ], [ [[NAME]], [[BB1]] ] +; CHECK-NEXT: ret ptr [[DOT0]], !dbg [[DBG31]] +; entry: call void @llvm.dbg.value(metadata ptr %name, metadata !15, 
metadata !DIExpression()), !dbg !DILocation(scope: !16) call void @llvm.dbg.value(metadata i32 %len, metadata !20, metadata !DIExpression()), !dbg !DILocation(scope: !16) @@ -45,9 +72,7 @@ bb2: ; preds = %bb1, %bb declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone ; CHECK: attributes #0 = { nounwind ssp } -; CHECK: attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: attributes #2 = { noinline nounwind ssp } -; CHECK: attributes [[NUW]] = { nounwind } +; CHECK: attributes #1 = { noinline nounwind ssp } !llvm.dbg.cu = !{!3} !llvm.module.flags = !{!30} @@ -68,9 +93,7 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone !14 = distinct !DILexicalBlock(line: 12, column: 0, file: !28, scope: !1) !15 = !DILocalVariable(name: "name", line: 17, arg: 1, scope: !16, file: !2, type: !6) ; CHECK: !DISubprogram(name: "add_name_internal" -; CHECK-SAME: type: ![[MD:[0-9]+]] !16 = distinct !DISubprogram(name: "add_name_internal", linkageName: "add_name_internal", line: 22, isLocal: true, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, file: !28, scope: !2, type: !17) -; CHECK: ![[MD]] = !DISubroutineType(cc: DW_CC_nocall, types: !{{[0-9]+}}) !17 = !DISubroutineType(types: !18) !18 = !{!6, !6, !9, !9, !19, !9} !19 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned char", size: 8, align: 8, encoding: DW_ATE_unsigned_char) diff --git a/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll b/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll index f932788c73e2a..bf846c310a525 100644 --- a/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll +++ b/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll @@ -51,14 +51,7 @@ entry: ret void } -; CHECK: define internal void @outlined_ir_func_0(ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]]) #1 { ; CHECK: entry_to_outline: -; CHECK-NEXT: store i32 2, ptr [[ARG0]], align 4 -; 
CHECK-NEXT: store i32 3, ptr [[ARG1]], align 4 -; CHECK-NEXT: store i32 4, ptr [[ARG2]], align 4 -; CHECK-NEXT: [[AL:%.*]] = load i32, ptr [[ARG0]], align 4 -; CHECK-NEXT: [[BL:%.*]] = load i32, ptr [[ARG1]], align 4 -; CHECK-NEXT: [[CL:%.*]] = load i32, ptr [[ARG2]], align 4 !0 = !DIFile(filename: "foo.c", directory: "/tmp") !1 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) diff --git a/llvm/test/Transforms/LoopRotate/dbgvalue.ll b/llvm/test/Transforms/LoopRotate/dbgvalue.ll index 92cc886bc81c1..331ca9efdd4a0 100644 --- a/llvm/test/Transforms/LoopRotate/dbgvalue.ll +++ b/llvm/test/Transforms/LoopRotate/dbgvalue.ll @@ -4,8 +4,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone -; CHECK: declare void @llvm.dbg.value(metadata, - ; This function rotates the exit conditon into the entry block, moving the ; dbg.values with it. Check that they resolve through the PHIs to the arguments ; only in the entry block. 
In the loop block, the dbg.values shift down below @@ -265,6 +263,8 @@ L0.latch: br label %L0, !dbg !77 } +; CHECK: declare void @llvm.dbg.value(metadata, + !llvm.module.flags = !{!20} !llvm.dbg.cu = !{!2} diff --git a/llvm/test/Transforms/LoopRotate/delete-dbg-values.ll b/llvm/test/Transforms/LoopRotate/delete-dbg-values.ll index bce5ed02b43bf..8c23c0276024c 100644 --- a/llvm/test/Transforms/LoopRotate/delete-dbg-values.ll +++ b/llvm/test/Transforms/LoopRotate/delete-dbg-values.ll @@ -10,8 +10,6 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -; CHECK: declare void @llvm.dbg.value(metadata, - ; CHECK-LABEL: define void @_ZNK4llvm5APInt4sextEj(ptr ; CHECK-LABEL: entry: ; CHECK: call void @llvm.dbg.value(metadata i32 0, metadata ![[SRC:[0-9]+]], @@ -22,6 +20,8 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: call void @llvm.dbg.value(metadata i32 0, metadata ![[SINK]], ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 0, metadata ![[SRC]], +; CHECK: declare void @llvm.dbg.value(metadata, + declare void @llvm.dbg.value(metadata, metadata, metadata) define void @_ZNK4llvm5APInt4sextEj(ptr %agg.result) !dbg !5 { diff --git a/llvm/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll b/llvm/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll index e4c123e02540a..22d14f52e9664 100644 --- a/llvm/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll +++ b/llvm/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll @@ -103,10 +103,9 @@ declare void @NSLog(ptr, ...) 
declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone ; CHECK: attributes #0 = { ssp uwtable } -; CHECK: attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: attributes #2 = { nonlazybind } +; CHECK: attributes #1 = { nonlazybind } ; CHECK: attributes [[NUW]] = { nounwind } -; CHECK: attributes #4 = { noinline ssp uwtable } +; CHECK: attributes #3 = { noinline ssp uwtable } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!33, !34, !35, !36, !61} diff --git a/llvm/test/Transforms/SROA/dbg-inline.ll b/llvm/test/Transforms/SROA/dbg-inline.ll index 454ca13230bfa..3e7683ad2db77 100644 --- a/llvm/test/Transforms/SROA/dbg-inline.ll +++ b/llvm/test/Transforms/SROA/dbg-inline.ll @@ -19,10 +19,10 @@ target triple = "x86_64-apple-macosx10.15.0" define i64 @_Z1g4pair(i64 %p.coerce0, i64 %p.coerce1) #0 !dbg !8 { ; CHECK-LABEL: @_Z1g4pair( ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE0:%.*]], metadata [[META16:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG17:![0-9]+]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE0]], metadata [[META18:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG20:![0-9]+]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE1:%.*]], metadata [[META16]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG17]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE1]], metadata [[META18]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG20]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 [[P_COERCE0:%.*]], metadata [[META16:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG17:![0-9]+]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 [[P_COERCE0]], metadata [[META18:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG20:![0-9]+]] +; CHECK-NEXT: tail call void 
@llvm.dbg.value(metadata i64 [[P_COERCE1:%.*]], metadata [[META16]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG17]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 [[P_COERCE1]], metadata [[META18]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG20]] ; CHECK-NEXT: ret i64 [[P_COERCE0]], !dbg [[DBG22:![0-9]+]] ; entry: @@ -77,32 +77,32 @@ attributes #2 = { argmemonly nounwind willreturn } !26 = !DILocation(line: 10, column: 3, scope: !8) ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { noinline ssp uwtable } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. 
-; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 12.0.0 (git@github.com:llvm/llvm-project 5110fd0343c2d06c8ae538741fbef13ece5e68de)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None, sysroot: "/") -; CHECK: [[META1:![0-9]+]] = !DIFile(filename: "/tmp/inlinesplit.cpp", directory: "/Volumes/Data/llvm-project") -; CHECK: [[META2:![0-9]+]] = !{} +; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], nameTableKind: None, sysroot: "/") +; CHECK: [[META1]] = !DIFile(filename: "/tmp/inlinesplit.cpp", directory: {{.*}}) +; CHECK: [[META2]] = !{} ; CHECK: [[META3:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; CHECK: [[META4:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} ; CHECK: [[META5:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} ; CHECK: [[META6:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK: [[META7:![0-9]+]] = distinct !DISubprogram(name: "g", linkageName: "_Z1g4pair", scope: !8, file: !8, line: 9, type: !9, scopeLine: 9, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) -; CHECK: [[META8:![0-9]+]] = !DIFile(filename: "/tmp/inlinesplit.cpp", directory: "") -; CHECK: [[META9:![0-9]+]] = !DISubroutineType(types: !10) -; CHECK: [[META10:![0-9]+]] = !{!11, !12} -; CHECK: [[META11:![0-9]+]] = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned) -; CHECK: [[META12:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "pair", file: !8, line: 1, size: 128, flags: DIFlagTypePassByValue, elements: !13, identifier: "_ZTS4pair") -; CHECK: [[META13:![0-9]+]] = !{!14, !15} -; CHECK: [[META14:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !12, file: !8, line: 1, baseType: !11, 
size: 64) -; CHECK: [[META15:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !12, file: !8, line: 1, baseType: !11, size: 64, offset: 64) -; CHECK: [[META16]] = !DILocalVariable(name: "p", arg: 1, scope: !7, file: !8, line: 9, type: !12) -; CHECK: [[DBG17]] = !DILocation(line: 0, scope: !7) -; CHECK: [[META18]] = !DILocalVariable(name: "p", arg: 1, scope: !19, file: !8, line: 5, type: !12) -; CHECK: [[META19:![0-9]+]] = distinct !DISubprogram(name: "f", linkageName: "_ZL1f4pair", scope: !8, file: !8, line: 5, type: !9, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !2) -; CHECK: [[DBG20]] = !DILocation(line: 0, scope: !19, inlinedAt: !21) -; CHECK: [[META21:![0-9]+]] = distinct !DILocation(line: 10, column: 10, scope: !7) -; CHECK: [[DBG22]] = !DILocation(line: 10, column: 3, scope: !7) +; CHECK: [[META7:![0-9]+]] = distinct !DISubprogram(name: "g", linkageName: "_Z1g4pair", scope: [[META8:![0-9]+]], file: [[META8]], line: 9, type: [[META9:![0-9]+]], scopeLine: 9, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[META8]] = !DIFile(filename: "/tmp/inlinesplit.cpp", directory: "") +; CHECK: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; CHECK: [[META10]] = !{[[META11:![0-9]+]], [[META12:![0-9]+]]} +; CHECK: [[META11]] = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned) +; CHECK: [[META12]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "pair", file: [[META8]], line: 1, size: 128, flags: DIFlagTypePassByValue, elements: [[META13:![0-9]+]], identifier: "_ZTS4pair") +; CHECK: [[META13]] = !{[[META14:![0-9]+]], [[META15:![0-9]+]]} +; CHECK: [[META14]] = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: [[META12]], file: [[META8]], line: 1, baseType: [[META11]], size: 64) +; CHECK: [[META15]] = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: [[META12]], file: 
[[META8]], line: 1, baseType: [[META11]], size: 64, offset: 64) +; CHECK: [[META16]] = !DILocalVariable(name: "p", arg: 1, scope: [[META7]], file: [[META8]], line: 9, type: [[META12]]) +; CHECK: [[DBG17]] = !DILocation(line: 0, scope: [[META7]]) +; CHECK: [[META18]] = !DILocalVariable(name: "p", arg: 1, scope: [[META19:![0-9]+]], file: [[META8]], line: 5, type: [[META12]]) +; CHECK: [[META19]] = distinct !DISubprogram(name: "f", linkageName: "_ZL1f4pair", scope: [[META8]], file: [[META8]], line: 5, type: [[META9]], scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[DBG20]] = !DILocation(line: 0, scope: [[META19]], inlinedAt: [[META21:![0-9]+]]) +; CHECK: [[META21]] = distinct !DILocation(line: 10, column: 10, scope: [[META7]]) +; CHECK: [[DBG22]] = !DILocation(line: 10, column: 3, scope: [[META7]]) ;. ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK-MODIFY-CFG: {{.*}} From 257cbea20d3ce6c6a3848ebd0ea0662d3cdb46b5 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 26 Feb 2024 18:22:35 +0000 Subject: [PATCH 353/546] [DAG] Format DAGCombiner::mayAlias. NFC --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6a28bc8da223b..c4d49adc21c4b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -27843,11 +27843,10 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { if (const auto *LSN = dyn_cast(N)) { int64_t Offset = 0; if (auto *C = dyn_cast(LSN->getOffset())) - Offset = (LSN->getAddressingMode() == ISD::PRE_INC) - ? C->getSExtValue() - : (LSN->getAddressingMode() == ISD::PRE_DEC) - ? -1 * C->getSExtValue() - : 0; + Offset = (LSN->getAddressingMode() == ISD::PRE_INC) ? 
C->getSExtValue() + : (LSN->getAddressingMode() == ISD::PRE_DEC) + ? -1 * C->getSExtValue() + : 0; uint64_t Size = MemoryLocation::getSizeOrUnknown(LSN->getMemoryVT().getStoreSize()); return {LSN->isVolatile(), From f1bb88bee248fb30e3145a2a19373233b7a59882 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 26 Feb 2024 10:31:58 -0800 Subject: [PATCH 354/546] [RISCV] Use PromoteSetCCOperands to promote operands for UMAX/UMIN during type legalization. (#82716) For RISC-V, we were always choosing to sign extend when promoting i32->i64. If the promoted inputs happen to be zero extended already, we should use zero extend instead. This is what we do for SETCC. --- .../SelectionDAG/LegalizeIntegerTypes.cpp | 54 +++++++----- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 15 +--- llvm/test/CodeGen/RISCV/fpclamptosat.ll | 1 - .../CodeGen/RISCV/rvv/fpclamptosat_vec.ll | 82 ++++++++++--------- .../CodeGen/RISCV/rvv/get_vector_length.ll | 52 ++++-------- 5 files changed, 91 insertions(+), 113 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index df17d6530b0de..6e55acd22bb37 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1354,10 +1354,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) { } SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) { + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + // It doesn't matter if we sign extend or zero extend in the inputs. So do - // whatever is best for the target. - SDValue LHS = SExtOrZExtPromotedInteger(N->getOperand(0)); - SDValue RHS = SExtOrZExtPromotedInteger(N->getOperand(1)); + // whatever is best for the target and the promoted operands. 
+ SExtOrZExtPromotedOperands(LHS, RHS); + return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS); } @@ -1922,25 +1925,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { return false; } -/// PromoteSetCCOperands - Promote the operands of a comparison. This code is -/// shared among BR_CC, SELECT_CC, and SETCC handlers. -void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &LHS, SDValue &RHS, - ISD::CondCode CCCode) { - // We have to insert explicit sign or zero extends. Note that we could - // insert sign extends for ALL conditions. For those operations where either - // zero or sign extension would be valid, we ask the target which extension - // it would prefer. - - // Signed comparisons always require sign extension. - if (ISD::isSignedIntSetCC(CCCode)) { - LHS = SExtPromotedInteger(LHS); - RHS = SExtPromotedInteger(RHS); - return; - } - - assert((ISD::isUnsignedIntSetCC(CCCode) || ISD::isIntEqualitySetCC(CCCode)) && - "Unknown integer comparison!"); - +// These operands can be either sign extended or zero extended as long as we +// treat them the same. If an extension is free, choose that. Otherwise, follow +// target preference. +void DAGTypeLegalizer::SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS) { SDValue OpL = GetPromotedInteger(LHS); SDValue OpR = GetPromotedInteger(RHS); @@ -1984,6 +1972,28 @@ void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &LHS, SDValue &RHS, RHS = ZExtPromotedInteger(RHS); } +/// PromoteSetCCOperands - Promote the operands of a comparison. This code is +/// shared among BR_CC, SELECT_CC, and SETCC handlers. +void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &LHS, SDValue &RHS, + ISD::CondCode CCCode) { + // We have to insert explicit sign or zero extends. Note that we could + // insert sign extends for ALL conditions. For those operations where either + // zero or sign extension would be valid, we ask the target which extension + // it would prefer. 
+ + // Signed comparisons always require sign extension. + if (ISD::isSignedIntSetCC(CCCode)) { + LHS = SExtPromotedInteger(LHS); + RHS = SExtPromotedInteger(RHS); + return; + } + + assert((ISD::isUnsignedIntSetCC(CCCode) || ISD::isIntEqualitySetCC(CCCode)) && + "Unknown integer comparison!"); + + SExtOrZExtPromotedOperands(LHS, RHS); +} + SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), Op); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 3c84f67653eca..e08acd36b41d4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -275,20 +275,6 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { return DAG.getZeroExtendInReg(Op, dl, OldVT); } - // Get a promoted operand and sign or zero extend it to the final size - // (depending on TargetLoweringInfo::isSExtCheaperThanZExt). For a given - // subtarget and type, the choice of sign or zero-extension will be - // consistent. - SDValue SExtOrZExtPromotedInteger(SDValue Op) { - EVT OldVT = Op.getValueType(); - SDLoc DL(Op); - Op = GetPromotedInteger(Op); - if (TLI.isSExtCheaperThanZExt(OldVT, Op.getValueType())) - return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Op.getValueType(), Op, - DAG.getValueType(OldVT)); - return DAG.getZeroExtendInReg(Op, DL, OldVT); - } - // Promote the given operand V (vector or scalar) according to N's specific // reduction kind. N must be an integer VECREDUCE_* or VP_REDUCE_*. 
Returns // the nominal extension opcode (ISD::(ANY|ZERO|SIGN)_EXTEND) and the @@ -415,6 +401,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo); + void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); //===--------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll index 3880ac9f4ec62..9e93ad0043a7e 100644 --- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll +++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll @@ -2812,7 +2812,6 @@ define i16 @utesth_f16i16_mm(half %x) { ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: call __extendhfsf2 ; RV64-NEXT: fcvt.lu.s a0, fa0, rtz -; RV64-NEXT: sext.w a0, a0 ; RV64-NEXT: lui a1, 16 ; RV64-NEXT: addiw a1, a1, -1 ; RV64-NEXT: bltu a0, a1, .LBB43_2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll index 208c881294eed..48ce7d623475c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll @@ -4827,10 +4827,10 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: mv s0, a0 ; CHECK-NOV-NEXT: fmv.w.x fa0, a1 ; CHECK-NOV-NEXT: call __extendhfsf2 -; CHECK-NOV-NEXT: fmv.s fs5, fa0 +; CHECK-NOV-NEXT: fmv.s fs6, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s7 ; CHECK-NOV-NEXT: call __extendhfsf2 -; CHECK-NOV-NEXT: fmv.s fs6, fa0 +; CHECK-NOV-NEXT: fmv.s fs5, fa0 ; CHECK-NOV-NEXT: fmv.w.x fa0, s6 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs4, fa0 @@ -4846,54 +4846,36 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: fmv.w.x fa0, s2 ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fmv.s fs0, fa0 -; CHECK-NOV-NEXT: fcvt.lu.s s2, fs6, rtz -; CHECK-NOV-NEXT: fcvt.lu.s a0, fs5, rtz 
; CHECK-NOV-NEXT: fmv.w.x fa0, s1 -; CHECK-NOV-NEXT: sext.w s1, a0 +; CHECK-NOV-NEXT: fcvt.lu.s s1, fs6, rtz ; CHECK-NOV-NEXT: call __extendhfsf2 ; CHECK-NOV-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-NOV-NEXT: sext.w a0, a0 ; CHECK-NOV-NEXT: lui a1, 16 ; CHECK-NOV-NEXT: addiw a1, a1, -1 -; CHECK-NOV-NEXT: bltu a0, a1, .LBB43_2 +; CHECK-NOV-NEXT: bgeu a0, a1, .LBB43_10 ; CHECK-NOV-NEXT: # %bb.1: # %entry -; CHECK-NOV-NEXT: mv a0, a1 +; CHECK-NOV-NEXT: fcvt.lu.s a2, fs5, rtz +; CHECK-NOV-NEXT: bgeu s1, a1, .LBB43_11 ; CHECK-NOV-NEXT: .LBB43_2: # %entry ; CHECK-NOV-NEXT: fcvt.lu.s a3, fs4, rtz -; CHECK-NOV-NEXT: sext.w a2, s2 -; CHECK-NOV-NEXT: bltu s1, a1, .LBB43_4 -; CHECK-NOV-NEXT: # %bb.3: # %entry -; CHECK-NOV-NEXT: mv s1, a1 -; CHECK-NOV-NEXT: .LBB43_4: # %entry +; CHECK-NOV-NEXT: bgeu a2, a1, .LBB43_12 +; CHECK-NOV-NEXT: .LBB43_3: # %entry ; CHECK-NOV-NEXT: fcvt.lu.s a4, fs3, rtz -; CHECK-NOV-NEXT: sext.w a3, a3 -; CHECK-NOV-NEXT: bltu a2, a1, .LBB43_6 -; CHECK-NOV-NEXT: # %bb.5: # %entry -; CHECK-NOV-NEXT: mv a2, a1 -; CHECK-NOV-NEXT: .LBB43_6: # %entry +; CHECK-NOV-NEXT: bgeu a3, a1, .LBB43_13 +; CHECK-NOV-NEXT: .LBB43_4: # %entry ; CHECK-NOV-NEXT: fcvt.lu.s a5, fs2, rtz -; CHECK-NOV-NEXT: sext.w a4, a4 -; CHECK-NOV-NEXT: bltu a3, a1, .LBB43_8 -; CHECK-NOV-NEXT: # %bb.7: # %entry -; CHECK-NOV-NEXT: mv a3, a1 -; CHECK-NOV-NEXT: .LBB43_8: # %entry +; CHECK-NOV-NEXT: bgeu a4, a1, .LBB43_14 +; CHECK-NOV-NEXT: .LBB43_5: # %entry ; CHECK-NOV-NEXT: fcvt.lu.s a6, fs1, rtz -; CHECK-NOV-NEXT: sext.w a5, a5 -; CHECK-NOV-NEXT: bltu a4, a1, .LBB43_10 -; CHECK-NOV-NEXT: # %bb.9: # %entry -; CHECK-NOV-NEXT: mv a4, a1 -; CHECK-NOV-NEXT: .LBB43_10: # %entry -; CHECK-NOV-NEXT: fcvt.lu.s a7, fs0, rtz -; CHECK-NOV-NEXT: sext.w a6, a6 ; CHECK-NOV-NEXT: bgeu a5, a1, .LBB43_15 -; CHECK-NOV-NEXT: # %bb.11: # %entry -; CHECK-NOV-NEXT: sext.w a7, a7 +; CHECK-NOV-NEXT: .LBB43_6: # %entry +; CHECK-NOV-NEXT: fcvt.lu.s a7, fs0, rtz ; CHECK-NOV-NEXT: bgeu a6, a1, .LBB43_16 -; CHECK-NOV-NEXT: 
.LBB43_12: # %entry -; CHECK-NOV-NEXT: bltu a7, a1, .LBB43_14 -; CHECK-NOV-NEXT: .LBB43_13: # %entry +; CHECK-NOV-NEXT: .LBB43_7: # %entry +; CHECK-NOV-NEXT: bltu a7, a1, .LBB43_9 +; CHECK-NOV-NEXT: .LBB43_8: # %entry ; CHECK-NOV-NEXT: mv a7, a1 -; CHECK-NOV-NEXT: .LBB43_14: # %entry +; CHECK-NOV-NEXT: .LBB43_9: # %entry ; CHECK-NOV-NEXT: sh a7, 14(s0) ; CHECK-NOV-NEXT: sh a6, 12(s0) ; CHECK-NOV-NEXT: sh a5, 10(s0) @@ -4920,14 +4902,34 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-NOV-NEXT: fld fs6, 0(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: addi sp, sp, 128 ; CHECK-NOV-NEXT: ret +; CHECK-NOV-NEXT: .LBB43_10: # %entry +; CHECK-NOV-NEXT: mv a0, a1 +; CHECK-NOV-NEXT: fcvt.lu.s a2, fs5, rtz +; CHECK-NOV-NEXT: bltu s1, a1, .LBB43_2 +; CHECK-NOV-NEXT: .LBB43_11: # %entry +; CHECK-NOV-NEXT: mv s1, a1 +; CHECK-NOV-NEXT: fcvt.lu.s a3, fs4, rtz +; CHECK-NOV-NEXT: bltu a2, a1, .LBB43_3 +; CHECK-NOV-NEXT: .LBB43_12: # %entry +; CHECK-NOV-NEXT: mv a2, a1 +; CHECK-NOV-NEXT: fcvt.lu.s a4, fs3, rtz +; CHECK-NOV-NEXT: bltu a3, a1, .LBB43_4 +; CHECK-NOV-NEXT: .LBB43_13: # %entry +; CHECK-NOV-NEXT: mv a3, a1 +; CHECK-NOV-NEXT: fcvt.lu.s a5, fs2, rtz +; CHECK-NOV-NEXT: bltu a4, a1, .LBB43_5 +; CHECK-NOV-NEXT: .LBB43_14: # %entry +; CHECK-NOV-NEXT: mv a4, a1 +; CHECK-NOV-NEXT: fcvt.lu.s a6, fs1, rtz +; CHECK-NOV-NEXT: bltu a5, a1, .LBB43_6 ; CHECK-NOV-NEXT: .LBB43_15: # %entry ; CHECK-NOV-NEXT: mv a5, a1 -; CHECK-NOV-NEXT: sext.w a7, a7 -; CHECK-NOV-NEXT: bltu a6, a1, .LBB43_12 +; CHECK-NOV-NEXT: fcvt.lu.s a7, fs0, rtz +; CHECK-NOV-NEXT: bltu a6, a1, .LBB43_7 ; CHECK-NOV-NEXT: .LBB43_16: # %entry ; CHECK-NOV-NEXT: mv a6, a1 -; CHECK-NOV-NEXT: bgeu a7, a1, .LBB43_13 -; CHECK-NOV-NEXT: j .LBB43_14 +; CHECK-NOV-NEXT: bgeu a7, a1, .LBB43_8 +; CHECK-NOV-NEXT: j .LBB43_9 ; ; CHECK-V-LABEL: utesth_f16i16_mm: ; CHECK-V: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll index 
1d42b6e3937c7..bd0fecd285515 100644 --- a/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll +++ b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll @@ -52,47 +52,27 @@ define i32 @vector_length_i16_fixed(i16 zeroext %tc) { } define i32 @vector_length_i32_fixed(i32 zeroext %tc) { -; RV32-LABEL: vector_length_i32_fixed: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 2 -; RV32-NEXT: bltu a0, a1, .LBB4_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a0, 2 -; RV32-NEXT: .LBB4_2: -; RV32-NEXT: ret -; -; RV64-LABEL: vector_length_i32_fixed: -; RV64: # %bb.0: -; RV64-NEXT: sext.w a0, a0 -; RV64-NEXT: li a1, 2 -; RV64-NEXT: bltu a0, a1, .LBB4_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a0, 2 -; RV64-NEXT: .LBB4_2: -; RV64-NEXT: ret +; CHECK-LABEL: vector_length_i32_fixed: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 2 +; CHECK-NEXT: bltu a0, a1, .LBB4_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: ret %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 2, i1 false) ret i32 %a } define i32 @vector_length_XLen_fixed(iXLen zeroext %tc) { -; RV32-LABEL: vector_length_XLen_fixed: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 2 -; RV32-NEXT: bltu a0, a1, .LBB5_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a0, 2 -; RV32-NEXT: .LBB5_2: -; RV32-NEXT: ret -; -; RV64-LABEL: vector_length_XLen_fixed: -; RV64: # %bb.0: -; RV64-NEXT: sext.w a0, a0 -; RV64-NEXT: li a1, 2 -; RV64-NEXT: bltu a0, a1, .LBB5_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a0, 2 -; RV64-NEXT: .LBB5_2: -; RV64-NEXT: ret +; CHECK-LABEL: vector_length_XLen_fixed: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 2 +; CHECK-NEXT: bltu a0, a1, .LBB5_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: .LBB5_2: +; CHECK-NEXT: ret %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 2, i1 false) ret i32 %a } From dc06d75ab27b4dcae2940fc386fadd06f70faffe Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Mon, 26 Feb 2024 18:49:18 +0000 Subject: [PATCH 355/546] Revert 
"[RemoveDIs] Print non-intrinsic debug info in textual IR output (#79281)" Reverted due to failures on buildbots, where a new cl flag was placed in the wrong file, resulting in link errors. https://lab.llvm.org/buildbot/#/builders/198/builds/8548 This reverts commit 0b398256b3f72204ad1f7c625efe4990204e898a. --- llvm/include/llvm/IR/Module.h | 14 --- llvm/include/llvm/IR/PrintPasses.h | 19 ---- llvm/lib/IR/AsmWriter.cpp | 75 +++++++++------- llvm/lib/IR/BasicBlock.cpp | 4 + llvm/lib/IR/IRPrintingPasses.cpp | 24 ++++-- llvm/lib/IR/Module.cpp | 22 ----- llvm/lib/IRPrinter/IRPrintingPasses.cpp | 27 +++--- .../Generic/inline-alloca-ordering.ll | 7 +- .../DebugInfo/Generic/inline-dbg-values.ll | 8 +- llvm/test/DebugInfo/dpvalue-print-nocrash.ll | 3 +- .../print-non-instruction-debug-info.ll | 86 ------------------- .../pr33641_remove_arg_dbgvalue.ll | 18 ++-- .../callsite-split-preserve-debug.ll | 4 +- .../debug-info-on-skipped-selects.ll | 2 +- .../DeadArgElim/2010-04-30-DbgInfo.ll | 35 ++------ .../IROutliner/outlining-debug-statements.ll | 7 ++ llvm/test/Transforms/LoopRotate/dbgvalue.ll | 4 +- .../LoopRotate/delete-dbg-values.ll | 4 +- ...e-that-exception-unwind-path-is-visited.ll | 5 +- llvm/test/Transforms/SROA/dbg-inline.ll | 50 +++++------ 20 files changed, 146 insertions(+), 272 deletions(-) delete mode 100644 llvm/test/DebugInfo/print-non-instruction-debug-info.ll diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h index f5f3c66f0ae1e..68a89dc45c283 100644 --- a/llvm/include/llvm/IR/Module.h +++ b/llvm/include/llvm/IR/Module.h @@ -218,18 +218,11 @@ class LLVM_EXTERNAL_VISIBILITY Module { /// \ref BasicBlock. bool IsNewDbgInfoFormat; - /// Used when converting this module to the new debug info format; removes all - /// declarations of debug intrinsics that are replaced by non-intrinsic - /// records in the new format. - void removeDebugIntrinsicDeclarations(); - /// \see BasicBlock::convertToNewDbgValues. 
void convertToNewDbgValues() { for (auto &F : *this) { F.convertToNewDbgValues(); } - // Remove the declarations of the old debug intrinsics, if any exist. - removeDebugIntrinsicDeclarations(); IsNewDbgInfoFormat = true; } @@ -241,13 +234,6 @@ class LLVM_EXTERNAL_VISIBILITY Module { IsNewDbgInfoFormat = false; } - void setIsNewDbgInfoFormat(bool UseNewFormat) { - if (UseNewFormat && !IsNewDbgInfoFormat) - convertToNewDbgValues(); - else if (!UseNewFormat && IsNewDbgInfoFormat) - convertFromNewDbgValues(); - } - /// The Module constructor. Note that there is no default constructor. You /// must provide a name for the module upon construction. explicit Module(StringRef ModuleID, LLVMContext& C); diff --git a/llvm/include/llvm/IR/PrintPasses.h b/llvm/include/llvm/IR/PrintPasses.h index 3803bd05cbe57..95b97e76c867c 100644 --- a/llvm/include/llvm/IR/PrintPasses.h +++ b/llvm/include/llvm/IR/PrintPasses.h @@ -78,25 +78,6 @@ std::string doSystemDiff(StringRef Before, StringRef After, StringRef OldLineFormat, StringRef NewLineFormat, StringRef UnchangedLineFormat); -/// Used to temporarily set the debug info format of a function, module, or -/// basic block for the duration of this object's lifetime, after which the -/// prior state will be restored. 
-template class ScopedDbgInfoFormatSetter { - T &Obj; - bool OldState; - -public: - ScopedDbgInfoFormatSetter(T &Obj, bool NewState) - : Obj(Obj), OldState(Obj.IsNewDbgInfoFormat) { - Obj.setIsNewDbgInfoFormat(NewState); - } - ~ScopedDbgInfoFormatSetter() { Obj.setIsNewDbgInfoFormat(OldState); } -}; - -template -ScopedDbgInfoFormatSetter(T &Obj, bool NewState) - -> ScopedDbgInfoFormatSetter; - } // namespace llvm #endif // LLVM_IR_PRINTPASSES_H diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index a3da6ca811489..fba404c9b027c 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -861,7 +861,7 @@ class SlotTracker : public AbstractSlotTrackerStorage { /// Add all of the metadata from an instruction. void processInstructionMetadata(const Instruction &I); - /// Add all of the metadata from a DbgRecord. + /// Add all of the metadata from an instruction. void processDbgRecordMetadata(const DbgRecord &DPV); }; @@ -1140,9 +1140,6 @@ void SlotTracker::processFunctionMetadata(const Function &F) { void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) { if (const DPValue *DPV = dyn_cast(&DR)) { - // Process metadata used by DbgRecords; we only specifically care about the - // DILocalVariable, DILocation, and DIAssignID fields, as the Value and - // Expression fields should only be printed inline and so do not use a slot. CreateMetadataSlot(DPV->getVariable()); if (DPV->isDbgAssign()) CreateMetadataSlot(DPV->getAssignID()); @@ -2706,7 +2703,6 @@ class AssemblyWriter { void printDPValue(const DPValue &DPI); void printDPLabel(const DPLabel &DPL); void printDbgRecord(const DbgRecord &DPI); - void printDbgRecordLine(const DbgRecord &DPI); void printUseListOrder(const Value *V, const std::vector &Shuffle); void printUseLists(const Function *F); @@ -3889,6 +3885,9 @@ void AssemblyWriter::printTypeIdentities() { /// printFunction - Print all aspects of a function. 
void AssemblyWriter::printFunction(const Function *F) { + bool ConvertBack = F->IsNewDbgInfoFormat; + if (ConvertBack) + const_cast(F)->convertFromNewDbgValues(); if (AnnotationWriter) AnnotationWriter->emitFunctionAnnot(F, Out); if (F->isMaterializable()) @@ -4031,6 +4030,8 @@ void AssemblyWriter::printFunction(const Function *F) { Out << "}\n"; } + if (ConvertBack) + const_cast(F)->convertToNewDbgValues(); Machine.purgeFunction(); } @@ -4097,8 +4098,6 @@ void AssemblyWriter::printBasicBlock(const BasicBlock *BB) { // Output all of the instructions in the basic block... for (const Instruction &I : *BB) { - for (const DbgRecord &DR : I.getDbgValueRange()) - printDbgRecordLine(DR); printInstructionLine(I); } @@ -4612,10 +4611,12 @@ void AssemblyWriter::printDbgRecord(const DbgRecord &DR) { llvm_unreachable("Unexpected DbgRecord kind"); } -void AssemblyWriter::printDPValue(const DPValue &DPV) { - auto WriterCtx = getContext(); - Out << "#dbg_"; - switch (DPV.getType()) { +void AssemblyWriter::printDPValue(const DPValue &Value) { + // There's no formal representation of a DPValue -- print purely as a + // debugging aid. 
+ Out << " DPValue "; + + switch (Value.getType()) { case DPValue::LocationType::Value: Out << "value"; break; @@ -4628,39 +4629,35 @@ void AssemblyWriter::printDPValue(const DPValue &DPV) { default: llvm_unreachable("Tried to print a DPValue with an invalid LocationType!"); } - Out << "("; - WriteAsOperandInternal(Out, DPV.getRawLocation(), WriterCtx, true); + Out << " { "; + auto WriterCtx = getContext(); + WriteAsOperandInternal(Out, Value.getRawLocation(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, DPV.getVariable(), WriterCtx, true); + WriteAsOperandInternal(Out, Value.getVariable(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, DPV.getExpression(), WriterCtx, true); + WriteAsOperandInternal(Out, Value.getExpression(), WriterCtx, true); Out << ", "; - if (DPV.isDbgAssign()) { - WriteAsOperandInternal(Out, DPV.getAssignID(), WriterCtx, true); + if (Value.isDbgAssign()) { + WriteAsOperandInternal(Out, Value.getAssignID(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, DPV.getRawAddress(), WriterCtx, true); + WriteAsOperandInternal(Out, Value.getRawAddress(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, DPV.getAddressExpression(), WriterCtx, true); + WriteAsOperandInternal(Out, Value.getAddressExpression(), WriterCtx, true); Out << ", "; } - WriteAsOperandInternal(Out, DPV.getDebugLoc().getAsMDNode(), WriterCtx, true); - Out << ")"; -} - -/// printDbgRecordLine - Print a DbgRecord with indentation and a newline -/// character. -void AssemblyWriter::printDbgRecordLine(const DbgRecord &DR) { - // Print lengthier indentation to bring out-of-line with instructions. - Out << " "; - printDbgRecord(DR); - Out << '\n'; + WriteAsOperandInternal(Out, Value.getDebugLoc().get(), WriterCtx, true); + Out << " marker @" << Value.getMarker(); + Out << " }"; } void AssemblyWriter::printDPLabel(const DPLabel &Label) { + // There's no formal representation of a DPLabel -- print purely as + // a debugging aid. 
+ Out << " DPLabel { "; auto WriterCtx = getContext(); - Out << "#dbg_label("; WriteAsOperandInternal(Out, Label.getLabel(), WriterCtx, true); - Out << ")"; + Out << " marker @" << Label.getMarker(); + Out << " }"; } void AssemblyWriter::printMetadataAttachments( @@ -4808,11 +4805,19 @@ void BasicBlock::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, void Module::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, bool ShouldPreserveUseListOrder, bool IsForDebug) const { + // RemoveDIs: always print with debug-info in intrinsic format. + bool ConvertAfter = IsNewDbgInfoFormat; + if (IsNewDbgInfoFormat) + const_cast(this)->convertFromNewDbgValues(); + SlotTracker SlotTable(this); formatted_raw_ostream OS(ROS); AssemblyWriter W(OS, SlotTable, this, AAW, IsForDebug, ShouldPreserveUseListOrder); W.printModule(this); + + if (ConvertAfter) + const_cast(this)->convertToNewDbgValues(); } void NamedMDNode::print(raw_ostream &ROS, bool IsForDebug) const { @@ -4903,6 +4908,8 @@ void DPValue::print(raw_ostream &ROS, bool IsForDebug) const { void DPMarker::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { + // There's no formal representation of a DPMarker -- print purely as a + // debugging aid. formatted_raw_ostream OS(ROS); SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = @@ -4924,6 +4931,8 @@ void DPLabel::print(raw_ostream &ROS, bool IsForDebug) const { void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { + // There's no formal representation of a DPValue -- print purely as a + // debugging aid. formatted_raw_ostream OS(ROS); SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = @@ -4941,6 +4950,8 @@ void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST, void DPLabel::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { + // There's no formal representation of a DbgLabelRecord -- print purely as + // a debugging aid. 
formatted_raw_ostream OS(ROS); SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index c110c4c1437c3..6ea876fde5ec6 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -61,6 +61,10 @@ DPMarker *BasicBlock::createMarker(InstListType::iterator It) { } void BasicBlock::convertToNewDbgValues() { + // Is the command line option set? + if (!UseNewDbgInfoFormat) + return; + IsNewDbgInfoFormat = true; // Iterate over all instructions in the instruction list, collecting dbg.value diff --git a/llvm/lib/IR/IRPrintingPasses.cpp b/llvm/lib/IR/IRPrintingPasses.cpp index 7b4f66a908c59..b19210e776ed5 100644 --- a/llvm/lib/IR/IRPrintingPasses.cpp +++ b/llvm/lib/IR/IRPrintingPasses.cpp @@ -23,8 +23,6 @@ using namespace llvm; -extern cl::opt WriteNewDbgInfoFormat; - namespace { class PrintModulePassWrapper : public ModulePass { @@ -41,9 +39,11 @@ class PrintModulePassWrapper : public ModulePass { ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {} bool runOnModule(Module &M) override { - // RemoveDIs: Regardless of the format we've processed this module in, use - // `WriteNewDbgInfoFormat` to determine which format we use to write it. - ScopedDbgInfoFormatSetter FormatSetter(M, WriteNewDbgInfoFormat); + // RemoveDIs: there's no textual representation of the DPValue debug-info, + // convert to dbg.values before writing out. + bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat; + if (IsNewDbgInfoFormat) + M.convertFromNewDbgValues(); if (llvm::isFunctionInPrintList("*")) { if (!Banner.empty()) @@ -62,6 +62,9 @@ class PrintModulePassWrapper : public ModulePass { } } + if (IsNewDbgInfoFormat) + M.convertToNewDbgValues(); + return false; } @@ -84,9 +87,11 @@ class PrintFunctionPassWrapper : public FunctionPass { // This pass just prints a banner followed by the function as it's processed. 
bool runOnFunction(Function &F) override { - // RemoveDIs: Regardless of the format we've processed this function in, use - // `WriteNewDbgInfoFormat` to determine which format we use to write it. - ScopedDbgInfoFormatSetter FormatSetter(F, WriteNewDbgInfoFormat); + // RemoveDIs: there's no textual representation of the DPValue debug-info, + // convert to dbg.values before writing out. + bool IsNewDbgInfoFormat = F.IsNewDbgInfoFormat; + if (IsNewDbgInfoFormat) + F.convertFromNewDbgValues(); if (isFunctionInPrintList(F.getName())) { if (forcePrintModuleIR()) @@ -96,6 +101,9 @@ class PrintFunctionPassWrapper : public FunctionPass { OS << Banner << '\n' << static_cast(F); } + if (IsNewDbgInfoFormat) + F.convertToNewDbgValues(); + return false; } diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index a8696ed9e3ce5..1946db2ee0be7 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -85,28 +85,6 @@ Module::~Module() { IFuncList.clear(); } -void Module::removeDebugIntrinsicDeclarations() { - auto *DeclareIntrinsicFn = - Intrinsic::getDeclaration(this, Intrinsic::dbg_declare); - assert((!isMaterialized() || DeclareIntrinsicFn->hasZeroLiveUses()) && - "Debug declare intrinsic should have had uses removed."); - DeclareIntrinsicFn->eraseFromParent(); - auto *ValueIntrinsicFn = - Intrinsic::getDeclaration(this, Intrinsic::dbg_value); - assert((!isMaterialized() || ValueIntrinsicFn->hasZeroLiveUses()) && - "Debug value intrinsic should have had uses removed."); - ValueIntrinsicFn->eraseFromParent(); - auto *AssignIntrinsicFn = - Intrinsic::getDeclaration(this, Intrinsic::dbg_assign); - assert((!isMaterialized() || AssignIntrinsicFn->hasZeroLiveUses()) && - "Debug assign intrinsic should have had uses removed."); - AssignIntrinsicFn->eraseFromParent(); - auto *LabelntrinsicFn = Intrinsic::getDeclaration(this, Intrinsic::dbg_label); - assert((!isMaterialized() || LabelntrinsicFn->hasZeroLiveUses()) && - "Debug label intrinsic should have had uses 
removed."); - LabelntrinsicFn->eraseFromParent(); -} - std::unique_ptr Module::createRNG(const StringRef Name) const { SmallString<32> Salt(Name); diff --git a/llvm/lib/IRPrinter/IRPrintingPasses.cpp b/llvm/lib/IRPrinter/IRPrintingPasses.cpp index 340f4fa6bfc84..52b242b4dcd52 100644 --- a/llvm/lib/IRPrinter/IRPrintingPasses.cpp +++ b/llvm/lib/IRPrinter/IRPrintingPasses.cpp @@ -22,11 +22,6 @@ using namespace llvm; -cl::opt WriteNewDbgInfoFormat( - "write-experimental-debuginfo", - cl::desc("Write debug info in the new non-intrinsic format"), - cl::init(false)); - PrintModulePass::PrintModulePass() : OS(dbgs()) {} PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, bool ShouldPreserveUseListOrder, @@ -36,9 +31,11 @@ PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, EmitSummaryIndex(EmitSummaryIndex) {} PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &AM) { - // RemoveDIs: Regardless of the format we've processed this module in, use - // `WriteNewDbgInfoFormat` to determine which format we use to write it. - ScopedDbgInfoFormatSetter FormatSetter(M, WriteNewDbgInfoFormat); + // RemoveDIs: there's no textual representation of the DPValue debug-info, + // convert to dbg.values before writing out. + bool ShouldConvert = M.IsNewDbgInfoFormat; + if (ShouldConvert) + M.convertFromNewDbgValues(); if (llvm::isFunctionInPrintList("*")) { if (!Banner.empty()) @@ -66,6 +63,9 @@ PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &AM) { Index->print(OS); } + if (ShouldConvert) + M.convertToNewDbgValues(); + return PreservedAnalyses::all(); } @@ -75,9 +75,11 @@ PrintFunctionPass::PrintFunctionPass(raw_ostream &OS, const std::string &Banner) PreservedAnalyses PrintFunctionPass::run(Function &F, FunctionAnalysisManager &) { - // RemoveDIs: Regardless of the format we've processed this function in, use - // `WriteNewDbgInfoFormat` to determine which format we use to write it. 
- ScopedDbgInfoFormatSetter FormatSetter(F, WriteNewDbgInfoFormat); + // RemoveDIs: there's no textual representation of the DPValue debug-info, + // convert to dbg.values before writing out. + bool ShouldConvert = F.IsNewDbgInfoFormat; + if (ShouldConvert) + F.convertFromNewDbgValues(); if (isFunctionInPrintList(F.getName())) { if (forcePrintModuleIR()) @@ -86,5 +88,8 @@ PreservedAnalyses PrintFunctionPass::run(Function &F, OS << Banner << '\n' << static_cast(F); } + if (ShouldConvert) + F.convertToNewDbgValues(); + return PreservedAnalyses::all(); } diff --git a/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll b/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll index 9f401ceb5b6f4..9ff3d80af80d1 100644 --- a/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll +++ b/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll @@ -15,6 +15,8 @@ ;; doesn't transfer the dbg.value to the entry block. This needs Special ;; Handling once we get rid of debug-intrinsics. +; CHECK: declare void @llvm.dbg.value(metadata, + ; CHECK: define i32 @bar() ; CHECK-NEXT: %1 = alloca [65 x i32], align 16 ; CHECK-NEXT: call void @ext() @@ -22,10 +24,9 @@ ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 0, metadata !10, metadata !DIExpression()), !dbg !12 ; CHECK-NEXT: call void @init(ptr %1) -; CHECK: declare void @llvm.dbg.value(metadata, - declare void @ext() declare void @init(ptr) +declare void @llvm.dbg.value(metadata, metadata, metadata) define internal i32 @foo() !dbg !4 { %1 = alloca [65 x i32], align 16 @@ -41,8 +42,6 @@ define i32 @bar() !dbg !16 { ret i32 %1 } -declare void @llvm.dbg.value(metadata, metadata, metadata) - !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!8, !9} !llvm.ident = !{!10} diff --git a/llvm/test/DebugInfo/Generic/inline-dbg-values.ll b/llvm/test/DebugInfo/Generic/inline-dbg-values.ll index b0390b9f78f41..7d580d6fba37a 100644 --- a/llvm/test/DebugInfo/Generic/inline-dbg-values.ll +++ b/llvm/test/DebugInfo/Generic/inline-dbg-values.ll @@ -7,6 +7,8 
@@ ;; ;; test should be inlined into test2 +; CHECK: declare void @llvm.dbg.value(metadata, + ; CHECK: define i32 @test2 ; CHECK-NEXT: entry: ; CHECK: %k.addr.i = alloca i32, align 4 @@ -45,8 +47,6 @@ ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 1, metadata ![[KVAR]], metadata !DIExpression()), !dbg ![[KLINE]] ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 1, metadata ![[K2VAR]], metadata !DIExpression()), !dbg ![[GLINE]] ; -; CHECK: declare void @llvm.dbg.value(metadata, -; ;; Test that the metadata maps onto the correct things, and that the DILocations ;; attached to the intrinsics have been inlined. ; @@ -100,6 +100,8 @@ return: ; preds = %if.end, %if.then ret i32 %3, !dbg !20 } +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + declare i32 @_Z8test_exti(i32) define i32 @test2(i32 %foo, i32 %bar) !dbg !10 { @@ -116,8 +118,6 @@ try.cont: ; preds = %catch, %invoke.cont ret i32 0, !dbg !30 } -declare void @llvm.dbg.value(metadata, metadata, metadata) #1 - !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!31} diff --git a/llvm/test/DebugInfo/dpvalue-print-nocrash.ll b/llvm/test/DebugInfo/dpvalue-print-nocrash.ll index 0a618c6780d1d..9f120af13ac9c 100755 --- a/llvm/test/DebugInfo/dpvalue-print-nocrash.ll +++ b/llvm/test/DebugInfo/dpvalue-print-nocrash.ll @@ -2,7 +2,8 @@ ; RUN: opt -passes="instcombine" -debug %s -o /dev/null 2>&1 | FileCheck %s ; REQUIRES: asserts -; CHECK: CLONE: #dbg_value( +; CHECK: CLONE: DPValue value { +; CHECK-SAME: marker @0x0 define ptr @func_10(i32 %p_11) { entry: diff --git a/llvm/test/DebugInfo/print-non-instruction-debug-info.ll b/llvm/test/DebugInfo/print-non-instruction-debug-info.ll deleted file mode 100644 index f8271df146fe9..0000000000000 --- a/llvm/test/DebugInfo/print-non-instruction-debug-info.ll +++ /dev/null @@ -1,86 +0,0 @@ -;; Test that we can write in the new debug info format. 
-; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=false --write-experimental-debuginfo=false < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,OLDDBG --implicit-check-not=llvm.dbg -; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=false --write-experimental-debuginfo=true < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,NEWDBG --implicit-check-not=llvm.dbg - -;; Test also that the new flag is independent of the flag that enables use of -;; these non-instruction debug info during LLVM passes. -; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=true --write-experimental-debuginfo=false < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,OLDDBG --implicit-check-not=llvm.dbg -; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=true --write-experimental-debuginfo=true < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,NEWDBG --implicit-check-not=llvm.dbg - -; CHECK: @f(i32 %[[VAL_A:[0-9a-zA-Z]+]]) -; CHECK-NEXT: entry: -; OLDDBG-NEXT: call void @llvm.dbg.value(metadata i32 %[[VAL_A]], metadata ![[VAR_A:[0-9]+]], metadata !DIExpression()), !dbg ![[LOC_1:[0-9]+]] -; NEWDBG-NEXT: {{^}} #dbg_value(i32 %[[VAL_A]], ![[VAR_A:[0-9]+]], !DIExpression(), ![[LOC_1:[0-9]+]]) -; CHECK-NEXT: {{^}} %[[VAL_B:[0-9a-zA-Z]+]] = alloca -; OLDDBG-NEXT: call void @llvm.dbg.declare(metadata ptr %[[VAL_B]], metadata ![[VAR_B:[0-9]+]], metadata !DIExpression()), !dbg ![[LOC_2:[0-9]+]] -; NEWDBG-NEXT: {{^}} #dbg_declare(ptr %[[VAL_B]], ![[VAR_B:[0-9]+]], !DIExpression(), ![[LOC_2:[0-9]+]]) -; CHECK-NEXT: {{^}} %[[VAL_ADD:[0-9a-zA-Z]+]] = add i32 %[[VAL_A]], 5 -; OLDDBG-NEXT: call void @llvm.dbg.value(metadata !DIArgList(i32 %[[VAL_A]], i32 %[[VAL_ADD]]), metadata ![[VAR_A]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus)), !dbg ![[LOC_3:[0-9]+]] -; NEWDBG-NEXT: {{^}} #dbg_value(!DIArgList(i32 %[[VAL_A]], i32 %[[VAL_ADD]]), ![[VAR_A]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), 
![[LOC_3:[0-9]+]]) -; OLDDBG-NEXT: call void @llvm.dbg.label(metadata ![[LABEL_ID:[0-9]+]]) -; NEWDBG-NEXT: {{^}} #dbg_label(![[LABEL_ID:[0-9]+]]) -; CHECK-NEXT: {{^}} store i32 %[[VAL_ADD]]{{.+}}, !DIAssignID ![[ASSIGNID:[0-9]+]] -; OLDDBG-NEXT: call void @llvm.dbg.assign(metadata i32 %[[VAL_ADD]], metadata ![[VAR_B]], metadata !DIExpression(), metadata ![[ASSIGNID]], metadata ptr %[[VAL_B]], metadata !DIExpression()), !dbg ![[LOC_4:[0-9]+]] -; NEWDBG-NEXT: {{^}} #dbg_assign(i32 %[[VAL_ADD]], ![[VAR_B]], !DIExpression(), ![[ASSIGNID]], ptr %[[VAL_B]], !DIExpression(), ![[LOC_4:[0-9]+]]) -; CHECK-NEXT: {{^}} ret i32 - -; OLDDBG-DAG: declare void @llvm.dbg.value -; OLDDBG-DAG: declare void @llvm.dbg.declare -; OLDDBG-DAG: declare void @llvm.dbg.assign - -; CHECK-DAG: llvm.dbg.cu -; CHECK-DAG: ![[VAR_A]] = !DILocalVariable(name: "a" -; CHECK-DAG: ![[VAR_B]] = !DILocalVariable(name: "b" -; CHECK-DAG: ![[LOC_1]] = !DILocation(line: 3, column: 15 -; CHECK-DAG: ![[LOC_2]] = !DILocation(line: 3, column: 20 -; CHECK-DAG: ![[LOC_3]] = !DILocation(line: 3, column: 25 -; CHECK-DAG: ![[LOC_4]] = !DILocation(line: 3, column: 30 -; CHECK-DAG: ![[LABEL_ID]] = !DILabel( - -define dso_local i32 @f(i32 %a) !dbg !7 { -entry: - call void @llvm.dbg.value(metadata i32 %a, metadata !20, metadata !DIExpression()), !dbg !30 - %b = alloca i32, !dbg !30, !DIAssignID !40 - call void @llvm.dbg.declare(metadata ptr %b, metadata !21, metadata !DIExpression()), !dbg !31 - %add = add i32 %a, 5, !dbg !31 - call void @llvm.dbg.value(metadata !DIArgList(i32 %a, i32 %add), metadata !20, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus)), !dbg !32 - call void @llvm.dbg.label(metadata !50), !dbg !32 - store i32 %add, ptr %b, !dbg !32, !DIAssignID !40 - call void @llvm.dbg.assign(metadata i32 %add, metadata !21, metadata !DIExpression(), metadata !40, metadata ptr %b, metadata !DIExpression()), !dbg !33 - ret i32 %add, !dbg !33 - -} - -declare void @llvm.dbg.value(metadata, 
metadata, metadata) -declare void @llvm.dbg.declare(metadata, metadata, metadata) -declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) -declare void @llvm.dbg.label(metadata) - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 18.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) -!1 = !DIFile(filename: "print.c", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 5} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 18.0.0"} -!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !13) -!8 = !DISubroutineType(types: !9) -!9 = !{!12, !12} -!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!13 = !{!20, !21} -!20 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 3, type: !12) -!21 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 3, type: !12) -!30 = !DILocation(line: 3, column: 15, scope: !7) -!31 = !DILocation(line: 3, column: 20, scope: !7) -!32 = !DILocation(line: 3, column: 25, scope: !7) -!33 = !DILocation(line: 3, column: 30, scope: !7) -!40 = distinct !DIAssignID() -!50 = !DILabel(scope: !7, name: "label", file: !1, line: 3) diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll index e0777f9ecee8b..83226e56b5acd 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll @@ -30,7 +30,8 @@ define internal void @bar(%p_t %p) { ; CGSCC: Function Attrs: mustprogress nofree 
norecurse nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@bar ; CGSCC-SAME: (ptr nocapture nofree readnone [[P:%.*]]) #[[ATTR0]] { -; CGSCC-NEXT: tail call void @llvm.dbg.value(metadata ptr [[P]], metadata [[META3:![0-9]+]], metadata !DIExpression()), !dbg [[DBG5:![0-9]+]] +; CGSCC-NEXT: call void @llvm.dbg.value(metadata ptr [[P]], metadata [[META3:![0-9]+]], metadata !DIExpression()) +; CGSCC-SAME: !dbg [[DBG5:![0-9]+]] ; CGSCC-NEXT: ret void ; call void @llvm.dbg.value(metadata %p_t %p, metadata !4, metadata !5), !dbg !6 @@ -51,20 +52,21 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !6 = !DILocation(line: 1, column: 1, scope: !3) ;. ; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } +; TUNIT: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. ; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; CGSCC: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. -; TUNIT: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) -; TUNIT: [[META1]] = !DIFile(filename: "test.c", directory: "") +; TUNIT: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +; TUNIT: [[META1:![0-9]+]] = !DIFile(filename: "test.c", directory: "") ; TUNIT: [[META2:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} ;. 
-; CGSCC: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) -; CGSCC: [[META1]] = !DIFile(filename: "test.c", directory: "") +; CGSCC: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +; CGSCC: [[META1:![0-9]+]] = !DIFile(filename: "test.c", directory: "") ; CGSCC: [[META2:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; CGSCC: [[META3]] = !DILocalVariable(name: "p", scope: [[META4:![0-9]+]]) -; CGSCC: [[META4]] = distinct !DISubprogram(name: "bar", scope: null, spFlags: DISPFlagDefinition, unit: [[META0]]) -; CGSCC: [[DBG5]] = !DILocation(line: 1, column: 1, scope: [[META4]]) +; CGSCC: [[META3]] = !DILocalVariable(name: "p", scope: !4) +; CGSCC: [[META4:![0-9]+]] = distinct !DISubprogram(name: "bar", scope: null, spFlags: DISPFlagDefinition, unit: !0) +; CGSCC: [[DBG5]] = !DILocation(line: 1, column: 1, scope: !4) ;. ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; TUNIT: {{.*}} diff --git a/llvm/test/Transforms/CallSiteSplitting/callsite-split-preserve-debug.ll b/llvm/test/Transforms/CallSiteSplitting/callsite-split-preserve-debug.ll index 3fdf62bf57502..e185286304a68 100644 --- a/llvm/test/Transforms/CallSiteSplitting/callsite-split-preserve-debug.ll +++ b/llvm/test/Transforms/CallSiteSplitting/callsite-split-preserve-debug.ll @@ -3,6 +3,8 @@ ;; Test that DebugLocs are preserved, and that dbg.values are duplicated. 
+; CHECK: declare void @llvm.dbg.value(metadata, + ; CHECK-LABEL: @test1 ; CHECK: call void @llvm.dbg.value(metadata i32 0, ; CHECK-NEXT: [[R1:%.+]] = call i32 @callee(i32 0, i32 %dd), !dbg [[DBG1:!.*]] @@ -41,8 +43,6 @@ CallSite: ; preds = %land.rhs, %entry ; CHECK-NEXT: phi i32 [ [[R1]], %Header.split ], [ [[R2]], %TBB.split ], !dbg [[DBG_CALL]] ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 1, -; CHECK: declare void @llvm.dbg.value(metadata, - define void @test2(ptr %ptr, i32 %i) !dbg !19 { Header: %tobool = icmp ne i32 %i, 10 diff --git a/llvm/test/Transforms/CodeGenPrepare/debug-info-on-skipped-selects.ll b/llvm/test/Transforms/CodeGenPrepare/debug-info-on-skipped-selects.ll index ce34b16a4d7a3..ffd4a0170ac9d 100644 --- a/llvm/test/Transforms/CodeGenPrepare/debug-info-on-skipped-selects.ll +++ b/llvm/test/Transforms/CodeGenPrepare/debug-info-on-skipped-selects.ll @@ -5,8 +5,8 @@ ; Test that when we skip over multiple selects in CGP, that the debug-info ; attached to those selects is still fixed up. 
-; CHECK: call void @llvm.dbg.value(metadata ptr %sunkaddr, ; CHECK: declare void @llvm.dbg.value(metadata, +; CHECK: call void @llvm.dbg.value(metadata ptr %sunkaddr, source_filename = "reduced.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll b/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll index d84dd10e7d28b..0dee72f4f6b6f 100644 --- a/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll +++ b/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll @@ -8,21 +8,12 @@ define ptr @vfs_addname(ptr %name, i32 %len, i32 %hash, i32 %flags) nounwind ssp !dbg !1 { ; -; CHECK-LABEL: define {{[^@]+}}@vfs_addname -; CHECK-SAME: (ptr [[NAME:%.*]], i32 [[LEN:%.*]], i32 [[HASH:%.*]], i32 [[FLAGS:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG4:![0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata ptr [[NAME]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12:![0-9]+]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[LEN]], metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[HASH]], metadata [[META14:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[FLAGS]], metadata [[META15:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]] -; CHECK-NEXT: [[TMP0:%.*]] = call fastcc ptr @add_name_internal(ptr [[NAME]], i32 [[HASH]]) #[[ATTR3:[0-9]+]], !dbg [[DBG16:![0-9]+]] -; CHECK-NEXT: ret ptr [[TMP0]], !dbg [[DBG16]] -; entry: call void @llvm.dbg.value(metadata ptr %name, metadata !0, metadata !DIExpression()), !dbg !DILocation(scope: !1) call void @llvm.dbg.value(metadata i32 %len, metadata !10, metadata !DIExpression()), !dbg !DILocation(scope: !1) call void @llvm.dbg.value(metadata i32 %hash, metadata !11, metadata !DIExpression()), !dbg !DILocation(scope: 
!1) call void @llvm.dbg.value(metadata i32 %flags, metadata !12, metadata !DIExpression()), !dbg !DILocation(scope: !1) +; CHECK: call fastcc ptr @add_name_internal(ptr %name, i32 %hash) [[NUW:#[0-9]+]], !dbg !{{[0-9]+}} %0 = call fastcc ptr @add_name_internal(ptr %name, i32 %len, i32 %hash, i8 zeroext 0, i32 %flags) nounwind, !dbg !13 ; [#uses=1] ret ptr %0, !dbg !13 } @@ -31,24 +22,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone define internal fastcc ptr @add_name_internal(ptr %name, i32 %len, i32 %hash, i8 zeroext %extra, i32 %flags) noinline nounwind ssp !dbg !16 { ; -; CHECK-LABEL: define {{[^@]+}}@add_name_internal -; CHECK-SAME: (ptr [[NAME:%.*]], i32 [[HASH:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG18:![0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata ptr [[NAME]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23:![0-9]+]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 poison, metadata [[META24:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[HASH]], metadata [[META25:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i8 poison, metadata [[META26:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 poison, metadata [[META27:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] -; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[HASH]], 0, !dbg [[DBG28:![0-9]+]] -; CHECK-NEXT: br i1 [[TMP0]], label [[BB:%.*]], label [[BB1:%.*]], !dbg [[DBG28]] -; CHECK: bb: -; CHECK-NEXT: br label [[BB2:%.*]], !dbg [[DBG30:![0-9]+]] -; CHECK: bb1: -; CHECK-NEXT: br label [[BB2]], !dbg [[DBG31:![0-9]+]] -; CHECK: bb2: -; CHECK-NEXT: [[DOT0:%.*]] = phi ptr [ @.str, [[BB]] ], [ [[NAME]], [[BB1]] ] -; CHECK-NEXT: ret ptr [[DOT0]], !dbg [[DBG31]] -; entry: call void @llvm.dbg.value(metadata ptr %name, metadata 
!15, metadata !DIExpression()), !dbg !DILocation(scope: !16) call void @llvm.dbg.value(metadata i32 %len, metadata !20, metadata !DIExpression()), !dbg !DILocation(scope: !16) @@ -72,7 +45,9 @@ bb2: ; preds = %bb1, %bb declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone ; CHECK: attributes #0 = { nounwind ssp } -; CHECK: attributes #1 = { noinline nounwind ssp } +; CHECK: attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #2 = { noinline nounwind ssp } +; CHECK: attributes [[NUW]] = { nounwind } !llvm.dbg.cu = !{!3} !llvm.module.flags = !{!30} @@ -93,7 +68,9 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone !14 = distinct !DILexicalBlock(line: 12, column: 0, file: !28, scope: !1) !15 = !DILocalVariable(name: "name", line: 17, arg: 1, scope: !16, file: !2, type: !6) ; CHECK: !DISubprogram(name: "add_name_internal" +; CHECK-SAME: type: ![[MD:[0-9]+]] !16 = distinct !DISubprogram(name: "add_name_internal", linkageName: "add_name_internal", line: 22, isLocal: true, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, file: !28, scope: !2, type: !17) +; CHECK: ![[MD]] = !DISubroutineType(cc: DW_CC_nocall, types: !{{[0-9]+}}) !17 = !DISubroutineType(types: !18) !18 = !{!6, !6, !9, !9, !19, !9} !19 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned char", size: 8, align: 8, encoding: DW_ATE_unsigned_char) diff --git a/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll b/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll index bf846c310a525..f932788c73e2a 100644 --- a/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll +++ b/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll @@ -51,7 +51,14 @@ entry: ret void } +; CHECK: define internal void @outlined_ir_func_0(ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]]) #1 { ; CHECK: entry_to_outline: +; CHECK-NEXT: store i32 2, ptr [[ARG0]], align 4 +; 
CHECK-NEXT: store i32 3, ptr [[ARG1]], align 4 +; CHECK-NEXT: store i32 4, ptr [[ARG2]], align 4 +; CHECK-NEXT: [[AL:%.*]] = load i32, ptr [[ARG0]], align 4 +; CHECK-NEXT: [[BL:%.*]] = load i32, ptr [[ARG1]], align 4 +; CHECK-NEXT: [[CL:%.*]] = load i32, ptr [[ARG2]], align 4 !0 = !DIFile(filename: "foo.c", directory: "/tmp") !1 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) diff --git a/llvm/test/Transforms/LoopRotate/dbgvalue.ll b/llvm/test/Transforms/LoopRotate/dbgvalue.ll index 331ca9efdd4a0..92cc886bc81c1 100644 --- a/llvm/test/Transforms/LoopRotate/dbgvalue.ll +++ b/llvm/test/Transforms/LoopRotate/dbgvalue.ll @@ -4,6 +4,8 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone +; CHECK: declare void @llvm.dbg.value(metadata, + ; This function rotates the exit conditon into the entry block, moving the ; dbg.values with it. Check that they resolve through the PHIs to the arguments ; only in the entry block. 
In the loop block, the dbg.values shift down below @@ -263,8 +265,6 @@ L0.latch: br label %L0, !dbg !77 } -; CHECK: declare void @llvm.dbg.value(metadata, - !llvm.module.flags = !{!20} !llvm.dbg.cu = !{!2} diff --git a/llvm/test/Transforms/LoopRotate/delete-dbg-values.ll b/llvm/test/Transforms/LoopRotate/delete-dbg-values.ll index 8c23c0276024c..bce5ed02b43bf 100644 --- a/llvm/test/Transforms/LoopRotate/delete-dbg-values.ll +++ b/llvm/test/Transforms/LoopRotate/delete-dbg-values.ll @@ -10,6 +10,8 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" +; CHECK: declare void @llvm.dbg.value(metadata, + ; CHECK-LABEL: define void @_ZNK4llvm5APInt4sextEj(ptr ; CHECK-LABEL: entry: ; CHECK: call void @llvm.dbg.value(metadata i32 0, metadata ![[SRC:[0-9]+]], @@ -20,8 +22,6 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: call void @llvm.dbg.value(metadata i32 0, metadata ![[SINK]], ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 0, metadata ![[SRC]], -; CHECK: declare void @llvm.dbg.value(metadata, - declare void @llvm.dbg.value(metadata, metadata, metadata) define void @_ZNK4llvm5APInt4sextEj(ptr %agg.result) !dbg !5 { diff --git a/llvm/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll b/llvm/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll index 22d14f52e9664..e4c123e02540a 100644 --- a/llvm/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll +++ b/llvm/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll @@ -103,9 +103,10 @@ declare void @NSLog(ptr, ...) 
declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone ; CHECK: attributes #0 = { ssp uwtable } -; CHECK: attributes #1 = { nonlazybind } +; CHECK: attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #2 = { nonlazybind } ; CHECK: attributes [[NUW]] = { nounwind } -; CHECK: attributes #3 = { noinline ssp uwtable } +; CHECK: attributes #4 = { noinline ssp uwtable } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!33, !34, !35, !36, !61} diff --git a/llvm/test/Transforms/SROA/dbg-inline.ll b/llvm/test/Transforms/SROA/dbg-inline.ll index 3e7683ad2db77..454ca13230bfa 100644 --- a/llvm/test/Transforms/SROA/dbg-inline.ll +++ b/llvm/test/Transforms/SROA/dbg-inline.ll @@ -19,10 +19,10 @@ target triple = "x86_64-apple-macosx10.15.0" define i64 @_Z1g4pair(i64 %p.coerce0, i64 %p.coerce1) #0 !dbg !8 { ; CHECK-LABEL: @_Z1g4pair( ; CHECK-NEXT: entry: -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 [[P_COERCE0:%.*]], metadata [[META16:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG17:![0-9]+]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 [[P_COERCE0]], metadata [[META18:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG20:![0-9]+]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 [[P_COERCE1:%.*]], metadata [[META16]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG17]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 [[P_COERCE1]], metadata [[META18]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG20]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE0:%.*]], metadata [[META16:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG17:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE0]], metadata [[META18:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG20:![0-9]+]] +; CHECK-NEXT: call void 
@llvm.dbg.value(metadata i64 [[P_COERCE1:%.*]], metadata [[META16]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG17]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE1]], metadata [[META18]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG20]] ; CHECK-NEXT: ret i64 [[P_COERCE0]], !dbg [[DBG22:![0-9]+]] ; entry: @@ -77,32 +77,32 @@ attributes #2 = { argmemonly nounwind willreturn } !26 = !DILocation(line: 10, column: 3, scope: !8) ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { noinline ssp uwtable } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ;. 
-; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], nameTableKind: None, sysroot: "/") -; CHECK: [[META1]] = !DIFile(filename: "/tmp/inlinesplit.cpp", directory: {{.*}}) -; CHECK: [[META2]] = !{} +; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 12.0.0 (git@github.com:llvm/llvm-project 5110fd0343c2d06c8ae538741fbef13ece5e68de)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None, sysroot: "/") +; CHECK: [[META1:![0-9]+]] = !DIFile(filename: "/tmp/inlinesplit.cpp", directory: "/Volumes/Data/llvm-project") +; CHECK: [[META2:![0-9]+]] = !{} ; CHECK: [[META3:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; CHECK: [[META4:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} ; CHECK: [[META5:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} ; CHECK: [[META6:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK: [[META7:![0-9]+]] = distinct !DISubprogram(name: "g", linkageName: "_Z1g4pair", scope: [[META8:![0-9]+]], file: [[META8]], line: 9, type: [[META9:![0-9]+]], scopeLine: 9, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) -; CHECK: [[META8]] = !DIFile(filename: "/tmp/inlinesplit.cpp", directory: "") -; CHECK: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) -; CHECK: [[META10]] = !{[[META11:![0-9]+]], [[META12:![0-9]+]]} -; CHECK: [[META11]] = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned) -; CHECK: [[META12]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "pair", file: [[META8]], line: 1, size: 128, flags: DIFlagTypePassByValue, elements: [[META13:![0-9]+]], identifier: "_ZTS4pair") -; CHECK: [[META13]] = !{[[META14:![0-9]+]], [[META15:![0-9]+]]} -; CHECK: [[META14]] = 
!DIDerivedType(tag: DW_TAG_member, name: "a", scope: [[META12]], file: [[META8]], line: 1, baseType: [[META11]], size: 64) -; CHECK: [[META15]] = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: [[META12]], file: [[META8]], line: 1, baseType: [[META11]], size: 64, offset: 64) -; CHECK: [[META16]] = !DILocalVariable(name: "p", arg: 1, scope: [[META7]], file: [[META8]], line: 9, type: [[META12]]) -; CHECK: [[DBG17]] = !DILocation(line: 0, scope: [[META7]]) -; CHECK: [[META18]] = !DILocalVariable(name: "p", arg: 1, scope: [[META19:![0-9]+]], file: [[META8]], line: 5, type: [[META12]]) -; CHECK: [[META19]] = distinct !DISubprogram(name: "f", linkageName: "_ZL1f4pair", scope: [[META8]], file: [[META8]], line: 5, type: [[META9]], scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) -; CHECK: [[DBG20]] = !DILocation(line: 0, scope: [[META19]], inlinedAt: [[META21:![0-9]+]]) -; CHECK: [[META21]] = distinct !DILocation(line: 10, column: 10, scope: [[META7]]) -; CHECK: [[DBG22]] = !DILocation(line: 10, column: 3, scope: [[META7]]) +; CHECK: [[META7:![0-9]+]] = distinct !DISubprogram(name: "g", linkageName: "_Z1g4pair", scope: !8, file: !8, line: 9, type: !9, scopeLine: 9, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +; CHECK: [[META8:![0-9]+]] = !DIFile(filename: "/tmp/inlinesplit.cpp", directory: "") +; CHECK: [[META9:![0-9]+]] = !DISubroutineType(types: !10) +; CHECK: [[META10:![0-9]+]] = !{!11, !12} +; CHECK: [[META11:![0-9]+]] = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned) +; CHECK: [[META12:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "pair", file: !8, line: 1, size: 128, flags: DIFlagTypePassByValue, elements: !13, identifier: "_ZTS4pair") +; CHECK: [[META13:![0-9]+]] = !{!14, !15} +; CHECK: [[META14:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !12, file: !8, line: 
1, baseType: !11, size: 64) +; CHECK: [[META15:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !12, file: !8, line: 1, baseType: !11, size: 64, offset: 64) +; CHECK: [[META16]] = !DILocalVariable(name: "p", arg: 1, scope: !7, file: !8, line: 9, type: !12) +; CHECK: [[DBG17]] = !DILocation(line: 0, scope: !7) +; CHECK: [[META18]] = !DILocalVariable(name: "p", arg: 1, scope: !19, file: !8, line: 5, type: !12) +; CHECK: [[META19:![0-9]+]] = distinct !DISubprogram(name: "f", linkageName: "_ZL1f4pair", scope: !8, file: !8, line: 5, type: !9, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !2) +; CHECK: [[DBG20]] = !DILocation(line: 0, scope: !19, inlinedAt: !21) +; CHECK: [[META21:![0-9]+]] = distinct !DILocation(line: 10, column: 10, scope: !7) +; CHECK: [[DBG22]] = !DILocation(line: 10, column: 3, scope: !7) ;. ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK-MODIFY-CFG: {{.*}} From 6398baa425349ce67f39aaa67af86ec2aec96c7c Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Mon, 26 Feb 2024 10:54:08 -0800 Subject: [PATCH 356/546] [flang] Fixes for LIT testing of FLANG_RUNTIME_F128_MATH_LIB build. (#82832) Follow-up for #81971 to fix the disabled LIT test and add LIT tests for lowering of the added math intrinsics. 
--- clang/lib/Driver/ToolChains/CommonArgs.cpp | 15 ++++---- flang/test/Driver/linker-flags.f90 | 34 +++++++++++++------ flang/test/Lower/Intrinsics/cabs_real16.f90 | 10 ++++++ .../Lower/Intrinsics/missing-math-runtime.f90 | 12 +++---- flang/test/Lower/Intrinsics/sin_real16.f90 | 9 +++++ flang/test/Lower/Intrinsics/sqrt_real16.f90 | 9 +++++ flang/test/lit.cfg.py | 14 ++++++++ flang/test/lit.site.cfg.py.in | 1 + 8 files changed, 80 insertions(+), 24 deletions(-) create mode 100644 flang/test/Lower/Intrinsics/cabs_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/sin_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/sqrt_real16.f90 diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 347b250260c4c..faceee85a2f8d 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1316,13 +1316,16 @@ void tools::addFortranRuntimeLibs(const ToolChain &TC, const ArgList &Args, // add the correct libraries to link against as dependents in the object // file. 
if (!TC.getTriple().isKnownWindowsMSVCEnvironment()) { - StringRef f128LibName = TC.getDriver().getFlangF128MathLibrary(); - f128LibName.consume_front_insensitive("lib"); - if (!f128LibName.empty()) { + StringRef F128LibName = TC.getDriver().getFlangF128MathLibrary(); + F128LibName.consume_front_insensitive("lib"); + if (!F128LibName.empty()) { + bool AsNeeded = !TC.getTriple().isOSAIX(); CmdArgs.push_back("-lFortranFloat128Math"); - addAsNeededOption(TC, Args, CmdArgs, /*as_needed=*/true); - CmdArgs.push_back(Args.MakeArgString("-l" + f128LibName)); - addAsNeededOption(TC, Args, CmdArgs, /*as_needed=*/false); + if (AsNeeded) + addAsNeededOption(TC, Args, CmdArgs, /*as_needed=*/true); + CmdArgs.push_back(Args.MakeArgString("-l" + F128LibName)); + if (AsNeeded) + addAsNeededOption(TC, Args, CmdArgs, /*as_needed=*/false); } CmdArgs.push_back("-lFortranRuntime"); CmdArgs.push_back("-lFortranDecimal"); diff --git a/flang/test/Driver/linker-flags.f90 b/flang/test/Driver/linker-flags.f90 index 5e00520fcc098..4d3d528b5e99e 100644 --- a/flang/test/Driver/linker-flags.f90 +++ b/flang/test/Driver/linker-flags.f90 @@ -2,15 +2,15 @@ ! invocation. These libraries are added on top of other standard runtime ! libraries that the Clang driver will include. -! RUN: %flang -### --target=ppc64le-linux-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX -! RUN: %flang -### --target=aarch64-apple-darwin %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,DARWIN -! RUN: %flang -### --target=sparc-sun-solaris2.11 %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX -! RUN: %flang -### --target=x86_64-unknown-freebsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX -! RUN: %flang -### --target=x86_64-unknown-netbsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX -! RUN: %flang -### --target=x86_64-unknown-openbsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX -! 
RUN: %flang -### --target=x86_64-unknown-dragonfly %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX -! RUN: %flang -### --target=x86_64-unknown-haiku %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,HAIKU -! RUN: %flang -### --target=x86_64-windows-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,MINGW +! RUN: %flang -### --target=ppc64le-linux-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib +! RUN: %flang -### --target=aarch64-apple-darwin %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,DARWIN,DARWIN-F128%f128-lib +! RUN: %flang -### --target=sparc-sun-solaris2.11 %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,SOLARIS-F128%f128-lib +! RUN: %flang -### --target=x86_64-unknown-freebsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib +! RUN: %flang -### --target=x86_64-unknown-netbsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib +! RUN: %flang -### --target=x86_64-unknown-openbsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib +! RUN: %flang -### --target=x86_64-unknown-dragonfly %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib +! RUN: %flang -### --target=x86_64-unknown-haiku %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,HAIKU,HAIKU-F128%f128-lib +! RUN: %flang -### --target=x86_64-windows-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,MINGW,MINGW-F128%f128-lib ! RUN: %flang -### --target=aarch64-unknown-linux-gnu %S/Inputs/hello.f90 -lFortran_main 2>&1 | FileCheck %s --check-prefixes=DEPRECATED ! NOTE: Clang's driver library, clangDriver, usually adds 'oldnames' on Windows, @@ -29,21 +29,33 @@ ! executable and may find the GNU linker from MinGW or Cygwin. ! UNIX-LABEL: "{{.*}}ld{{(\.exe)?}}" ! UNIX-SAME: "[[object_file]]" -! 
UNIX-SAME: "--whole-archive" "-lFortran_main" "--no-whole-archive" "-lFortranRuntime" "-lFortranDecimal" "-lm" +! UNIX-SAME: "--whole-archive" "-lFortran_main" "--no-whole-archive" +! UNIX-F128NONE-NOT: FortranFloat128Math +! SOLARIS-F128NONE-NOT: FortranFloat128Math +! UNIX-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed" +! SOLARIS-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "-z" "ignore" "-lquadmath" "-z" "record" +! UNIX-SAME: "-lFortranRuntime" "-lFortranDecimal" "-lm" ! DARWIN-LABEL: "{{.*}}ld{{(\.exe)?}}" ! DARWIN-SAME: "[[object_file]]" ! DARWIN-SAME: -lFortran_main +! DARWIN-F128NONE-NOT: FortranFloat128Math +! DARWIN-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed" ! DARWIN-SAME: -lFortranRuntime ! DARWIN-SAME: -lFortranDecimal ! HAIKU-LABEL: "{{.*}}ld{{(\.exe)?}}" ! HAIKU-SAME: "[[object_file]]" -! HAIKU-SAME: "--whole-archive" "-lFortran_main" "--no-whole-archive" "-lFortranRuntime" "-lFortranDecimal" +! HAIKU-SAME: "--whole-archive" "-lFortran_main" "--no-whole-archive" +! HAIKU-F128NONE-NOT: FortranFloat128Math +! HAIKU-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed" +! HAIKU-SAME: "-lFortranRuntime" "-lFortranDecimal" ! MINGW-LABEL: "{{.*}}ld{{(\.exe)?}}" ! MINGW-SAME: "[[object_file]]" ! MINGW-SAME: -lFortran_main +! MINGW-F128NONE-NOT: FortranFloat128Math +! MINGW-F128LIBQUADMATH-SAME: "-lFortranFloat128Math" "--as-needed" "-lquadmath" "--no-as-needed" ! MINGW-SAME: -lFortranRuntime ! MINGW-SAME: -lFortranDecimal diff --git a/flang/test/Lower/Intrinsics/cabs_real16.f90 b/flang/test/Lower/Intrinsics/cabs_real16.f90 new file mode 100644 index 0000000000000..363b154a8e7fe --- /dev/null +++ b/flang/test/Lower/Intrinsics/cabs_real16.f90 @@ -0,0 +1,10 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! 
RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranACAbsF128({{.*}}){{.*}}: (!fir.complex<16>) -> f128 + complex(16) :: a + real(16) :: b + b = abs(a) +end diff --git a/flang/test/Lower/Intrinsics/missing-math-runtime.f90 b/flang/test/Lower/Intrinsics/missing-math-runtime.f90 index ff767ba18faae..699678fcf2bce 100644 --- a/flang/test/Lower/Intrinsics/missing-math-runtime.f90 +++ b/flang/test/Lower/Intrinsics/missing-math-runtime.f90 @@ -1,10 +1,8 @@ -! There is no quad math runtime available in lowering -! for now. Test that the TODO are emitted correctly. -! FIXME: the lit config has to flip a feature flag so that -! the tests can use different checks depending on whether -! REAL(16) math support is enabled or not. -! XFAIL: * -! RUN: bbc -emit-fir %s -o /dev/null 2>&1 | FileCheck %s +! If the compiler is built without 128-bit float math +! support, an appropriate error message is emitted. +! UNSUPPORTED: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o /dev/null >%t 2>&1 || echo +! RUN: FileCheck %s --input-file=%t complex(16) :: a real(16) :: b diff --git a/flang/test/Lower/Intrinsics/sin_real16.f90 b/flang/test/Lower/Intrinsics/sin_real16.f90 new file mode 100644 index 0000000000000..d09c71fb03e26 --- /dev/null +++ b/flang/test/Lower/Intrinsics/sin_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranASinF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = sin(a) +end diff --git a/flang/test/Lower/Intrinsics/sqrt_real16.f90 b/flang/test/Lower/Intrinsics/sqrt_real16.f90 new file mode 100644 index 0000000000000..ac08c8c79e9a8 --- /dev/null +++ b/flang/test/Lower/Intrinsics/sqrt_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! 
RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranASqrtF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = sqrt(a) +end diff --git a/flang/test/lit.cfg.py b/flang/test/lit.cfg.py index 9b8bb83ca23c7..65467f2d23f4f 100644 --- a/flang/test/lit.cfg.py +++ b/flang/test/lit.cfg.py @@ -199,3 +199,17 @@ result = lit_config.params.get("LIBPGMATH") if result: config.environment["LIBPGMATH"] = True + +# Add features and substitutions to test F128 math support. +# %f128-lib substitution may be used to generate check prefixes +# for LIT tests checking for F128 library support. +if config.flang_runtime_f128_math_lib: + config.available_features.add("flang-supports-f128-math") + config.available_features.add( + "flang-f128-math-lib-" + config.flang_runtime_f128_math_lib + ) + config.substitutions.append( + ("%f128-lib", config.flang_runtime_f128_math_lib.upper()) + ) +else: + config.substitutions.append(("%f128-lib", "NONE")) diff --git a/flang/test/lit.site.cfg.py.in b/flang/test/lit.site.cfg.py.in index 16411d82908ca..e66f1af794526 100644 --- a/flang/test/lit.site.cfg.py.in +++ b/flang/test/lit.site.cfg.py.in @@ -25,6 +25,7 @@ config.cc = "@CMAKE_C_COMPILER@" config.osx_sysroot = path(r"@CMAKE_OSX_SYSROOT@") config.targets_to_build = "@TARGETS_TO_BUILD@" config.default_sysroot = "@DEFAULT_SYSROOT@" +config.flang_runtime_f128_math_lib = "@FLANG_RUNTIME_F128_MATH_LIB@" import lit.llvm lit.llvm.initialize(lit_config, config) From 5cdb8c0c8854d08ac7ca131ce3e8d78a32eb6716 Mon Sep 17 00:00:00 2001 From: Hsiangkai Wang Date: Mon, 26 Feb 2024 19:00:15 +0000 Subject: [PATCH 357/546] [mlir][vector] Add a pattern to fuse extract(constant_mask) (#81057) This pattern will rewrite ExtractOp(ConstantMaskOp) -> ConstantMaskOp or ExtractOp(ConstantMaskOp) -> Constant --- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 74 +++++++++++++++++++++- mlir/test/Dialect/Vector/canonicalize.mlir | 73 
+++++++++++++++++++++ 2 files changed, 146 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 5be6a628904cd..8c341a347ff66 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -2040,6 +2040,77 @@ class ExtractOpFromCreateMask final : public OpRewritePattern { } }; +// Patterns to rewrite ExtractOp(ConstantMaskOp) +// +// When the result of ExtractOp is a subvector of input, we can rewrite it as +// a ConstantMaskOp with subvector ranks. +// +// ExtractOp(ConstantMaskOp) -> ConstantMaskOp +// +// When the result of ExtractOp is a scalar, we can get the scalar value +// directly. +// +// ExtractOp(ConstantMaskOp) -> ConstantOp +class ExtractOpFromConstantMask final : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(ExtractOp extractOp, + PatternRewriter &rewriter) const override { + auto constantMaskOp = + extractOp.getVector().getDefiningOp(); + if (!constantMaskOp) + return failure(); + + // All indices must be static. + ArrayRef extractOpPos = extractOp.getStaticPosition(); + unsigned dynamicPosCount = + llvm::count_if(extractOpPos, ShapedType::isDynamic); + // If there is any dynamic position in ExtractOp, we cannot determine the + // scalar value. 
+ if (dynamicPosCount) + return failure(); + + ArrayRef maskDimSizes = + constantMaskOp.getMaskDimSizes().getValue(); + Type resultTy = extractOp.getResult().getType(); + if (resultTy.isa()) { + auto resultVectorTy = resultTy.cast(); + int64_t resultRank = resultVectorTy.getRank(); + int64_t n = maskDimSizes.size(); + std::vector indices; + for (auto i = n - resultRank; i < n; ++i) + indices.push_back(cast(maskDimSizes[i]).getInt()); + + rewriter.replaceOpWithNewOp( + extractOp, resultVectorTy, + vector::getVectorSubscriptAttr(rewriter, indices)); + + return success(); + } else if (resultTy.isa()) { + // ConstantMaskOp creates and returns a vector mask where elements of the + // result vector are set to ‘0’ or ‘1’, based on whether the element + // indices are contained within a hyper-rectangular region. + // We go through ExtractOp static positions to determine the position is + // within the hyper-rectangular region or not. + Type boolType = rewriter.getI1Type(); + IntegerAttr setAttr = IntegerAttr::get(boolType, 1); + for (size_t i = 0, end = extractOpPos.size(); i < end; ++i) { + if (cast(maskDimSizes[i]).getInt() <= extractOpPos[i]) { + setAttr = IntegerAttr::get(boolType, 0); + break; + } + } + + rewriter.replaceOpWithNewOp(extractOp, boolType, + setAttr); + return success(); + } + + return failure(); + } +}; + // Folds extract(shape_cast(..)) into shape_cast when the total element count // does not change. 
LogicalResult foldExtractFromShapeCastToShapeCast(ExtractOp extractOp, @@ -2066,7 +2137,8 @@ LogicalResult foldExtractFromShapeCastToShapeCast(ExtractOp extractOp, void ExtractOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { results.add(context); + ExtractOpFromBroadcast, ExtractOpFromCreateMask, + ExtractOpFromConstantMask>(context); results.add(foldExtractFromShapeCastToShapeCast); } diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index e6f045e12e519..dc486181ebe96 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -2567,3 +2567,76 @@ func.func @load_store_forwarding_rank_mismatch(%v0: vector<4x1x1xf32>, %arg0: te tensor<4x4x4xf32>, vector<1x100x4x5xf32> return %r : vector<1x100x4x5xf32> } + +// ----- + +// CHECK-LABEL: func.func @extract_true_from_constant_mask() -> i1 { +func.func @extract_true_from_constant_mask() -> i1 { +// CHECK: %[[TRUE:.*]] = arith.constant true +// CHECK-NEXT: return %[[TRUE]] : i1 + %mask = vector.constant_mask [2, 2, 3] : vector<4x4x4xi1> + %extract = vector.extract %mask[1, 1, 2] : i1 from vector<4x4x4xi1> + return %extract : i1 +} + +// ----- + +// CHECK-LABEL: func.func @extract_false_from_constant_mask() -> i1 { +func.func @extract_false_from_constant_mask() -> i1 { +// CHECK: %[[FALSE:.*]] = arith.constant false +// CHECK-NEXT: return %[[FALSE]] : i1 + %mask = vector.constant_mask [2, 2, 3] : vector<4x4x4xi1> + %extract = vector.extract %mask[1, 2, 2] : i1 from vector<4x4x4xi1> + return %extract : i1 +} + +// ----- + +// CHECK-LABEL: func.func @extract_from_create_mask() -> i1 { +func.func @extract_from_create_mask() -> i1 { +// CHECK: %[[TRUE:.*]] = arith.constant true +// CHECK-NEXT: return %[[TRUE]] : i1 + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %mask = vector.create_mask %c2, %c2, %c3 : vector<4x4x4xi1> + %extract = vector.extract %mask[1, 1, 2] : i1 from 
vector<4x4x4xi1> + return %extract : i1 +} + +// ----- + +// CHECK-LABEL: func.func @extract_subvector_from_constant_mask() -> +// CHECK-SAME: vector<6xi1> { +func.func @extract_subvector_from_constant_mask() -> vector<6xi1> { +// CHECK: %[[S0:.*]] = vector.constant_mask [4] : vector<6xi1> +// CHECK-NEXT: return %[[S0]] : vector<6xi1> + %mask = vector.constant_mask [2, 3, 4] : vector<4x5x6xi1> + %extract = vector.extract %mask[1, 2] : vector<6xi1> from vector<4x5x6xi1> + return %extract : vector<6xi1> +} + +// ----- + +// CHECK-LABEL: func.func @extract_scalar_with_dynamic_positions( +// CHECK-SAME: %[[INDEX:.*]]: index) -> i1 { +func.func @extract_scalar_with_dynamic_positions(%index: index) -> i1 { +// CHECK: %[[S0:.*]] = vector.constant_mask [2, 2, 3] : vector<4x4x4xi1> +// CHECK-NEXT: %[[S1:.*]] = vector.extract %[[S0]][1, 1, %[[INDEX]]] : i1 from vector<4x4x4xi1> +// CHECK-NEXT: return %[[S1]] : i1 + %mask = vector.constant_mask [2, 2, 3] : vector<4x4x4xi1> + %extract = vector.extract %mask[1, 1, %index] : i1 from vector<4x4x4xi1> + return %extract : i1 +} + +// ----- + +// CHECK-LABEL: func.func @extract_subvector_with_dynamic_positions +// CHECK-SAME: %[[INDEX:.*]]: index) -> vector<6xi1> { +func.func @extract_subvector_with_dynamic_positions(%index: index) -> vector<6xi1> { +// CHECK: %[[S0:.*]] = vector.constant_mask [2, 3, 4] : vector<4x5x6xi1> +// CHECK-NEXT: %[[S1:.*]] = vector.extract %[[S0]][1, %[[INDEX]]] : vector<6xi1> from vector<4x5x6xi1> +// CHECK-NEXT: return %[[S1]] : vector<6xi1> + %mask = vector.constant_mask [2, 3, 4] : vector<4x5x6xi1> + %extract = vector.extract %mask[1, %index] : vector<6xi1> from vector<4x5x6xi1> + return %extract : vector<6xi1> +} From ce4740d3e31e936aa93c115f63bf223c74c9dc20 Mon Sep 17 00:00:00 2001 From: Krystian Stasiowski Date: Mon, 26 Feb 2024 14:01:48 -0500 Subject: [PATCH 358/546] [Clang][Sema] Fix crash when MS dependent base class lookup occurs in an incomplete context (#83024) When compiling the following with 
`-fms-compatibility`: ``` template struct C; // Test lookup with incomplete lookup context template auto C::f() -> decltype(x) { } ``` An assert fails because `CXXRecordDecl::hasAnyDependentBases` is called on an incomplete class. This patch ensures we don't perform unqualified lookup into dependent base classes when the lookup context is incomplete. --- clang/docs/ReleaseNotes.rst | 2 ++ clang/lib/Sema/SemaExpr.cpp | 2 +- .../test/SemaTemplate/ms-lookup-template-base-classes.cpp | 7 +++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 9e67bbb789504..515dffa28df18 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -283,6 +283,8 @@ Bug Fixes to C++ Support (`#78524 `_) - Clang no longer instantiates the exception specification of discarded candidate function templates when determining the primary template of an explicit specialization. +- Fixed a crash in Microsoft compatibility mode where unqualified dependent base class + lookup searches the bases of an incomplete class. Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 816ee9e281359..403839f77a2b7 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -2653,7 +2653,7 @@ recoverFromMSUnqualifiedLookup(Sema &S, ASTContext &Context, RD = ThisType->getPointeeType()->getAsCXXRecordDecl(); else if (auto *MD = dyn_cast(S.CurContext)) RD = MD->getParent(); - if (!RD || !RD->hasAnyDependentBases()) + if (!RD || !RD->hasDefinition() || !RD->hasAnyDependentBases()) return nullptr; // Diagnose this as unqualified lookup into a dependent base class. 
If 'this' diff --git a/clang/test/SemaTemplate/ms-lookup-template-base-classes.cpp b/clang/test/SemaTemplate/ms-lookup-template-base-classes.cpp index 7856a0a16307b..534a5dc9ddc10 100644 --- a/clang/test/SemaTemplate/ms-lookup-template-base-classes.cpp +++ b/clang/test/SemaTemplate/ms-lookup-template-base-classes.cpp @@ -71,6 +71,13 @@ class B : public A { template class B; +template struct C; + +// Test lookup with incomplete lookup context +template +auto C::f() -> decltype(x) { } // expected-error {{use of undeclared identifier 'x'}} + // expected-error@-1 {{out-of-line definition of 'f' from class 'C' without definition}} + } From 1865c7ea8561407626fe5489ae07647035413d7c Mon Sep 17 00:00:00 2001 From: ChiaHungDuan Date: Mon, 26 Feb 2024 11:04:08 -0800 Subject: [PATCH 359/546] [scudo] Store more blocks in each TransferBatch (#70390) Instead of always storing the same number of blocks as cached, we prefer increasing the utilization by saving more blocks in a single TransferBatch. This may slightly impact the performance, but it will save a lot of memory used by BatchClassId (especially for larger blocks). 
--- .../lib/scudo/standalone/allocator_common.h | 7 + compiler-rt/lib/scudo/standalone/primary32.h | 106 ++++++------ compiler-rt/lib/scudo/standalone/primary64.h | 153 ++++++++++-------- .../scudo/standalone/tests/primary_test.cpp | 34 ++-- 4 files changed, 159 insertions(+), 141 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/allocator_common.h b/compiler-rt/lib/scudo/standalone/allocator_common.h index 95f4776ac596d..46dc7c0f3b914 100644 --- a/compiler-rt/lib/scudo/standalone/allocator_common.h +++ b/compiler-rt/lib/scudo/standalone/allocator_common.h @@ -40,6 +40,7 @@ template struct TransferBatch { B->Count = static_cast(B->Count - N); } void clear() { Count = 0; } + bool empty() { return Count == 0; } void add(CompactPtrT P) { DCHECK_LT(Count, MaxNumCached); Batch[Count++] = P; @@ -48,6 +49,12 @@ template struct TransferBatch { memcpy(Array, Batch, sizeof(Batch[0]) * Count); clear(); } + + void moveNToArray(CompactPtrT *Array, u16 N) { + DCHECK_LE(N, Count); + memcpy(Array, Batch + Count - N, sizeof(Batch[0]) * Count); + Count -= N; + } u16 getCount() const { return Count; } bool isEmpty() const { return Count == 0U; } CompactPtrT get(u16 I) const { diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h index 4d03b282d000d..c86e75b8fd66a 100644 --- a/compiler-rt/lib/scudo/standalone/primary32.h +++ b/compiler-rt/lib/scudo/standalone/primary32.h @@ -191,38 +191,21 @@ template class SizeClassAllocator32 { return BlockSize > PageSize; } - // Note that the `MaxBlockCount` will be used when we support arbitrary blocks - // count. Now it's the same as the number of blocks stored in the - // `TransferBatch`. 
u16 popBlocks(CacheT *C, uptr ClassId, CompactPtrT *ToArray, - UNUSED const u16 MaxBlockCount) { - TransferBatchT *B = popBatch(C, ClassId); - if (!B) - return 0; - - const u16 Count = B->getCount(); - DCHECK_GT(Count, 0U); - B->moveToArray(ToArray); - - if (ClassId != SizeClassMap::BatchClassId) - C->deallocate(SizeClassMap::BatchClassId, B); - - return Count; - } - - TransferBatchT *popBatch(CacheT *C, uptr ClassId) { + const u16 MaxBlockCount) { DCHECK_LT(ClassId, NumClasses); SizeClassInfo *Sci = getSizeClassInfo(ClassId); ScopedLock L(Sci->Mutex); - TransferBatchT *B = popBatchImpl(C, ClassId, Sci); - if (UNLIKELY(!B)) { + + u16 PopCount = popBlocksImpl(C, ClassId, Sci, ToArray, MaxBlockCount); + if (UNLIKELY(PopCount == 0)) { if (UNLIKELY(!populateFreeList(C, ClassId, Sci))) - return nullptr; - B = popBatchImpl(C, ClassId, Sci); - // if `populateFreeList` succeeded, we are supposed to get free blocks. - DCHECK_NE(B, nullptr); + return 0U; + PopCount = popBlocksImpl(C, ClassId, Sci, ToArray, MaxBlockCount); + DCHECK_NE(PopCount, 0U); } - return B; + + return PopCount; } // Push the array of free blocks to the designated batch group. @@ -510,7 +493,7 @@ template class SizeClassAllocator32 { // by TransferBatch is also free for use. We don't need to recycle the // TransferBatch. Note that the correctness is maintained by the invariant, // - // The unit of each popBatch() request is entire TransferBatch. Return + // Each popBlocks() request returns the entire TransferBatch. Returning // part of the blocks in a TransferBatch is invalid. 
// // This ensures that TransferBatch won't leak the address itself while it's @@ -634,7 +617,7 @@ template class SizeClassAllocator32 { BG->Batches.push_front(TB); BG->PushedBlocks = 0; BG->BytesInBGAtLastCheckpoint = 0; - BG->MaxCachedPerBatch = CacheT::getMaxCached(getSizeByClassId(ClassId)); + BG->MaxCachedPerBatch = TransferBatchT::MaxNumCached; return BG; }; @@ -726,14 +709,11 @@ template class SizeClassAllocator32 { InsertBlocks(Cur, Array + Size - Count, Count); } - // Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest - // group id will be considered first. - // - // The region mutex needs to be held while calling this method. - TransferBatchT *popBatchImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci) + u16 popBlocksImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci, + CompactPtrT *ToArray, const u16 MaxBlockCount) REQUIRES(Sci->Mutex) { if (Sci->FreeListInfo.BlockList.empty()) - return nullptr; + return 0U; SinglyLinkedList &Batches = Sci->FreeListInfo.BlockList.front()->Batches; @@ -746,33 +726,57 @@ template class SizeClassAllocator32 { // Block used by `BatchGroup` is from BatchClassId. Turn the block into // `TransferBatch` with single block. TransferBatchT *TB = reinterpret_cast(BG); - TB->clear(); - TB->add( - compactPtr(SizeClassMap::BatchClassId, reinterpret_cast(TB))); + ToArray[0] = + compactPtr(SizeClassMap::BatchClassId, reinterpret_cast(TB)); Sci->FreeListInfo.PoppedBlocks += 1; - return TB; + return 1U; } + // So far, instead of always filling the blocks to `MaxBlockCount`, we only + // examine single `TransferBatch` to minimize the time spent on the primary + // allocator. Besides, the sizes of `TransferBatch` and + // `CacheT::getMaxCached()` may also impact the time spent on accessing the + // primary allocator. + // TODO(chiahungduan): Evaluate if we want to always prepare `MaxBlockCount` + // blocks and/or adjust the size of `TransferBatch` according to + // `CacheT::getMaxCached()`. 
TransferBatchT *B = Batches.front(); - Batches.pop_front(); DCHECK_NE(B, nullptr); DCHECK_GT(B->getCount(), 0U); - if (Batches.empty()) { - BatchGroupT *BG = Sci->FreeListInfo.BlockList.front(); - Sci->FreeListInfo.BlockList.pop_front(); - - // We don't keep BatchGroup with zero blocks to avoid empty-checking while - // allocating. Note that block used by constructing BatchGroup is recorded - // as free blocks in the last element of BatchGroup::Batches. Which means, - // once we pop the last TransferBatch, the block is implicitly - // deallocated. + // BachClassId should always take all blocks in the TransferBatch. Read the + // comment in `pushBatchClassBlocks()` for more details. + const u16 PopCount = ClassId == SizeClassMap::BatchClassId + ? B->getCount() + : Min(MaxBlockCount, B->getCount()); + B->moveNToArray(ToArray, PopCount); + + // TODO(chiahungduan): The deallocation of unused BatchClassId blocks can be + // done without holding `Mutex`. + if (B->empty()) { + Batches.pop_front(); + // `TransferBatch` of BatchClassId is self-contained, no need to + // deallocate. Read the comment in `pushBatchClassBlocks()` for more + // details. if (ClassId != SizeClassMap::BatchClassId) - C->deallocate(SizeClassMap::BatchClassId, BG); + C->deallocate(SizeClassMap::BatchClassId, B); + + if (Batches.empty()) { + BatchGroupT *BG = Sci->FreeListInfo.BlockList.front(); + Sci->FreeListInfo.BlockList.pop_front(); + + // We don't keep BatchGroup with zero blocks to avoid empty-checking + // while allocating. Note that block used for constructing BatchGroup is + // recorded as free blocks in the last element of BatchGroup::Batches. + // Which means, once we pop the last TransferBatch, the block is + // implicitly deallocated. 
+ if (ClassId != SizeClassMap::BatchClassId) + C->deallocate(SizeClassMap::BatchClassId, BG); + } } - Sci->FreeListInfo.PoppedBlocks += B->getCount(); - return B; + Sci->FreeListInfo.PoppedBlocks += PopCount; + return PopCount; } NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, SizeClassInfo *Sci) diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 9a642d23620e1..971a65ee15f0f 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -12,6 +12,7 @@ #include "allocator_common.h" #include "bytemap.h" #include "common.h" +#include "condition_variable.h" #include "list.h" #include "local_cache.h" #include "mem_map.h" @@ -22,8 +23,6 @@ #include "string_utils.h" #include "thread_annotations.h" -#include "condition_variable.h" - namespace scudo { // SizeClassAllocator64 is an allocator tuned for 64-bit address space. @@ -221,41 +220,24 @@ template class SizeClassAllocator64 { DCHECK_EQ(BlocksInUse, BatchClassUsedInFreeLists); } - // Note that the `MaxBlockCount` will be used when we support arbitrary blocks - // count. Now it's the same as the number of blocks stored in the - // `TransferBatch`. 
u16 popBlocks(CacheT *C, uptr ClassId, CompactPtrT *ToArray, - UNUSED const u16 MaxBlockCount) { - TransferBatchT *B = popBatch(C, ClassId); - if (!B) - return 0; - - const u16 Count = B->getCount(); - DCHECK_GT(Count, 0U); - B->moveToArray(ToArray); - - if (ClassId != SizeClassMap::BatchClassId) - C->deallocate(SizeClassMap::BatchClassId, B); - - return Count; - } - - TransferBatchT *popBatch(CacheT *C, uptr ClassId) { + const u16 MaxBlockCount) { DCHECK_LT(ClassId, NumClasses); RegionInfo *Region = getRegionInfo(ClassId); + u16 PopCount = 0; { ScopedLock L(Region->FLLock); - TransferBatchT *B = popBatchImpl(C, ClassId, Region); - if (LIKELY(B)) - return B; + PopCount = popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount); + if (PopCount != 0U) + return PopCount; } bool ReportRegionExhausted = false; - TransferBatchT *B = nullptr; if (conditionVariableEnabled()) { - B = popBatchWithCV(C, ClassId, Region, ReportRegionExhausted); + PopCount = popBlocksWithCV(C, ClassId, Region, ToArray, MaxBlockCount, + ReportRegionExhausted); } else { while (true) { // When two threads compete for `Region->MMLock`, we only want one of @@ -264,13 +246,15 @@ template class SizeClassAllocator64 { ScopedLock ML(Region->MMLock); { ScopedLock FL(Region->FLLock); - if ((B = popBatchImpl(C, ClassId, Region))) - break; + PopCount = popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount); + if (PopCount != 0U) + return PopCount; } const bool RegionIsExhausted = Region->Exhausted; if (!RegionIsExhausted) - B = populateFreeListAndPopBatch(C, ClassId, Region); + PopCount = populateFreeListAndPopBlocks(C, ClassId, Region, ToArray, + MaxBlockCount); ReportRegionExhausted = !RegionIsExhausted && Region->Exhausted; break; } @@ -286,7 +270,7 @@ template class SizeClassAllocator64 { reportOutOfBatchClass(); } - return B; + return PopCount; } // Push the array of free blocks to the designated batch group. 
@@ -640,7 +624,7 @@ template class SizeClassAllocator64 { // by TransferBatch is also free for use. We don't need to recycle the // TransferBatch. Note that the correctness is maintained by the invariant, // - // The unit of each popBatch() request is entire TransferBatch. Return + // Each popBlocks() request returns the entire TransferBatch. Returning // part of the blocks in a TransferBatch is invalid. // // This ensures that TransferBatch won't leak the address itself while it's @@ -763,7 +747,7 @@ template class SizeClassAllocator64 { BG->Batches.push_front(TB); BG->PushedBlocks = 0; BG->BytesInBGAtLastCheckpoint = 0; - BG->MaxCachedPerBatch = CacheT::getMaxCached(getSizeByClassId(ClassId)); + BG->MaxCachedPerBatch = TransferBatchT::MaxNumCached; return BG; }; @@ -855,9 +839,10 @@ template class SizeClassAllocator64 { InsertBlocks(Cur, Array + Size - Count, Count); } - TransferBatchT *popBatchWithCV(CacheT *C, uptr ClassId, RegionInfo *Region, - bool &ReportRegionExhausted) { - TransferBatchT *B = nullptr; + u16 popBlocksWithCV(CacheT *C, uptr ClassId, RegionInfo *Region, + CompactPtrT *ToArray, const u16 MaxBlockCount, + bool &ReportRegionExhausted) { + u16 PopCount = 0; while (true) { // We only expect one thread doing the freelist refillment and other @@ -878,7 +863,8 @@ template class SizeClassAllocator64 { const bool RegionIsExhausted = Region->Exhausted; if (!RegionIsExhausted) - B = populateFreeListAndPopBatch(C, ClassId, Region); + PopCount = populateFreeListAndPopBlocks(C, ClassId, Region, ToArray, + MaxBlockCount); ReportRegionExhausted = !RegionIsExhausted && Region->Exhausted; { @@ -905,7 +891,8 @@ template class SizeClassAllocator64 { // blocks were used up right after the refillment. Therefore, we have to // check if someone is still populating the freelist. 
ScopedLock FL(Region->FLLock); - if (LIKELY(B = popBatchImpl(C, ClassId, Region))) + PopCount = popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount); + if (PopCount != 0U) break; if (!Region->isPopulatingFreeList) @@ -918,21 +905,19 @@ template class SizeClassAllocator64 { // `pushBatchClassBlocks()` and `mergeGroupsToReleaseBack()`. Region->FLLockCV.wait(Region->FLLock); - if (LIKELY(B = popBatchImpl(C, ClassId, Region))) + PopCount = popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount); + if (PopCount != 0U) break; } - return B; + return PopCount; } - // Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest - // group id will be considered first. - // - // The region mutex needs to be held while calling this method. - TransferBatchT *popBatchImpl(CacheT *C, uptr ClassId, RegionInfo *Region) + u16 popBlocksImpl(CacheT *C, uptr ClassId, RegionInfo *Region, + CompactPtrT *ToArray, const u16 MaxBlockCount) REQUIRES(Region->FLLock) { if (Region->FreeListInfo.BlockList.empty()) - return nullptr; + return 0U; SinglyLinkedList &Batches = Region->FreeListInfo.BlockList.front()->Batches; @@ -945,39 +930,64 @@ template class SizeClassAllocator64 { // Block used by `BatchGroup` is from BatchClassId. Turn the block into // `TransferBatch` with single block. TransferBatchT *TB = reinterpret_cast(BG); - TB->clear(); - TB->add( - compactPtr(SizeClassMap::BatchClassId, reinterpret_cast(TB))); + ToArray[0] = + compactPtr(SizeClassMap::BatchClassId, reinterpret_cast(TB)); Region->FreeListInfo.PoppedBlocks += 1; - return TB; + return 1U; } + // So far, instead of always filling blocks to `MaxBlockCount`, we only examine + // a single `TransferBatch` to minimize the time spent in the primary + // allocator. Besides, the sizes of `TransferBatch` and + // `CacheT::getMaxCached()` may also impact the time spent on accessing the + // primary allocator. 
+ // TODO(chiahungduan): Evaluate if we want to always prepare `MaxBlockCount` + // blocks and/or adjust the size of `TransferBatch` according to + // `CacheT::getMaxCached()`. TransferBatchT *B = Batches.front(); - Batches.pop_front(); DCHECK_NE(B, nullptr); DCHECK_GT(B->getCount(), 0U); - if (Batches.empty()) { - BatchGroupT *BG = Region->FreeListInfo.BlockList.front(); - Region->FreeListInfo.BlockList.pop_front(); - - // We don't keep BatchGroup with zero blocks to avoid empty-checking while - // allocating. Note that block used by constructing BatchGroup is recorded - // as free blocks in the last element of BatchGroup::Batches. Which means, - // once we pop the last TransferBatch, the block is implicitly - // deallocated. + // BatchClassId should always take all blocks in the TransferBatch. Read the + // comment in `pushBatchClassBlocks()` for more details. + const u16 PopCount = ClassId == SizeClassMap::BatchClassId + ? B->getCount() + : Min(MaxBlockCount, B->getCount()); + B->moveNToArray(ToArray, PopCount); + + // TODO(chiahungduan): The deallocation of unused BatchClassId blocks can be + // done without holding `FLLock`. + if (B->empty()) { + Batches.pop_front(); + // `TransferBatch` of BatchClassId is self-contained, no need to + // deallocate. Read the comment in `pushBatchClassBlocks()` for more + // details. if (ClassId != SizeClassMap::BatchClassId) - C->deallocate(SizeClassMap::BatchClassId, BG); + C->deallocate(SizeClassMap::BatchClassId, B); + + if (Batches.empty()) { + BatchGroupT *BG = Region->FreeListInfo.BlockList.front(); + Region->FreeListInfo.BlockList.pop_front(); + + // We don't keep BatchGroup with zero blocks to avoid empty-checking + // while allocating. Note that block used for constructing BatchGroup is + // recorded as free blocks in the last element of BatchGroup::Batches. + // Which means, once we pop the last TransferBatch, the block is + // implicitly deallocated. 
+ if (ClassId != SizeClassMap::BatchClassId) + C->deallocate(SizeClassMap::BatchClassId, BG); + } } - Region->FreeListInfo.PoppedBlocks += B->getCount(); + Region->FreeListInfo.PoppedBlocks += PopCount; - return B; + return PopCount; } - // Refill the freelist and return one batch. - NOINLINE TransferBatchT *populateFreeListAndPopBatch(CacheT *C, uptr ClassId, - RegionInfo *Region) + NOINLINE u16 populateFreeListAndPopBlocks(CacheT *C, uptr ClassId, + RegionInfo *Region, + CompactPtrT *ToArray, + const u16 MaxBlockCount) REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) { const uptr Size = getSizeByClassId(ClassId); const u16 MaxCount = CacheT::getMaxCached(Size); @@ -994,7 +1004,7 @@ template class SizeClassAllocator64 { const uptr RegionBase = RegionBeg - getRegionBaseByClassId(ClassId); if (UNLIKELY(RegionBase + MappedUser + MapSize > RegionSize)) { Region->Exhausted = true; - return nullptr; + return 0U; } if (UNLIKELY(!Region->MemMapInfo.MemMap.remap( @@ -1002,7 +1012,7 @@ template class SizeClassAllocator64 { MAP_ALLOWNOMEM | MAP_RESIZABLE | (useMemoryTagging(Options.load()) ? 
MAP_MEMTAG : 0)))) { - return nullptr; + return 0U; } Region->MemMapInfo.MappedUser += MapSize; C->getStats().add(StatMapped, MapSize); @@ -1049,8 +1059,9 @@ template class SizeClassAllocator64 { pushBatchClassBlocks(Region, ShuffleArray, NumberOfBlocks); } - TransferBatchT *B = popBatchImpl(C, ClassId, Region); - DCHECK_NE(B, nullptr); + const u16 PopCount = + popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount); + DCHECK_NE(PopCount, 0U); // Note that `PushedBlocks` and `PoppedBlocks` are supposed to only record // the requests from `PushBlocks` and `PopBatch` which are external @@ -1062,7 +1073,7 @@ template class SizeClassAllocator64 { C->getStats().add(StatFree, AllocatedUser); Region->MemMapInfo.AllocatedUser += AllocatedUser; - return B; + return PopCount; } void getStats(ScopedString *Str, uptr ClassId, RegionInfo *Region) @@ -1186,7 +1197,7 @@ template class SizeClassAllocator64 { } // Note that we have extracted the `GroupsToRelease` from region freelist. - // It's safe to let pushBlocks()/popBatches() access the remaining region + // It's safe to let pushBlocks()/popBlocks() access the remaining region // freelist. In the steps 3 and 4, we will temporarily release the FLLock // and lock it again before step 5. diff --git a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp index 18171511758a1..f64a5143b30d4 100644 --- a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp @@ -237,7 +237,6 @@ struct SmallRegionsConfig { // For the 32-bit one, it requires actually exhausting memory, so we skip it. 
TEST(ScudoPrimaryTest, Primary64OOM) { using Primary = scudo::SizeClassAllocator64; - using TransferBatch = Primary::TransferBatchT; Primary Allocator; Allocator.init(/*ReleaseToOsInterval=*/-1); typename Primary::CacheT Cache; @@ -245,29 +244,26 @@ TEST(ScudoPrimaryTest, Primary64OOM) { Stats.init(); Cache.init(&Stats, &Allocator); bool AllocationFailed = false; - std::vector Batches; + std::vector Blocks; const scudo::uptr ClassId = Primary::SizeClassMap::LargestClassId; const scudo::uptr Size = Primary::getSizeByClassId(ClassId); - typename Primary::CacheT::CompactPtrT Blocks[TransferBatch::MaxNumCached]; + const scudo::u16 MaxCachedBlockCount = Primary::CacheT::getMaxCached(Size); for (scudo::uptr I = 0; I < 10000U; I++) { - TransferBatch *B = Allocator.popBatch(&Cache, ClassId); - if (!B) { - AllocationFailed = true; - break; + for (scudo::uptr J = 0; J < MaxCachedBlockCount; ++J) { + void *Ptr = Cache.allocate(ClassId); + if (Ptr == nullptr) { + AllocationFailed = true; + break; + } + memset(Ptr, 'B', Size); + Blocks.push_back(Ptr); } - for (scudo::u16 J = 0; J < B->getCount(); J++) - memset(Allocator.decompactPtr(ClassId, B->get(J)), 'B', Size); - Batches.push_back(B); - } - while (!Batches.empty()) { - TransferBatch *B = Batches.back(); - Batches.pop_back(); - const scudo::u16 Count = B->getCount(); - B->moveToArray(Blocks); - Allocator.pushBlocks(&Cache, ClassId, Blocks, Count); - Cache.deallocate(Primary::SizeClassMap::BatchClassId, B); } + + for (auto *Ptr : Blocks) + Cache.deallocate(ClassId, Ptr); + Cache.destroy(nullptr); Allocator.releaseToOS(scudo::ReleaseToOS::Force); scudo::ScopedString Str; @@ -342,7 +338,7 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, PrimaryThreaded) { V.push_back(std::make_pair(ClassId, P)); } - // Try to interleave pushBlocks(), popBatch() and releaseToOS(). + // Try to interleave pushBlocks(), popBlocks() and releaseToOS(). 
Allocator->releaseToOS(scudo::ReleaseToOS::Force); while (!V.empty()) { From 911055e34f2b1530d9900e23873e300b77ea8d96 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 26 Feb 2024 19:06:43 +0000 Subject: [PATCH 360/546] [VPlan] Consistently use (Part, 0) for first lane scalar values (#80271) At the moment, some VPInstructions create only a single scalar value, but use VPTransformState's 'vector' storage for this value. Those values are effectively uniform-per-VF (or in some cases uniform-across-VF-and-UF). Using the vector/per-part storage doesn't interact well with other recipes that more accurately use (Part, Lane) to look up scalar values and prevents VPInstructions creating scalars from interacting with other recipes working with scalars. This PR tries to unify handling of scalars by using (Part, 0) for scalar values where only the first lane is demanded. This allows using VPInstructions with other recipes like VPScalarCastRecipe and is also needed when using VPInstructions in more cases outside the vector loop region to generate scalars. 
Depends on https://github.com/llvm/llvm-project/pull/80269 --- .../Transforms/Vectorize/LoopVectorize.cpp | 15 ++++---- llvm/lib/Transforms/Vectorize/VPlan.cpp | 35 ++++++++++-------- llvm/lib/Transforms/Vectorize/VPlan.h | 26 +++++++++++--- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 36 +++++++++++-------- llvm/lib/Transforms/Vectorize/VPlanValue.h | 12 +++---- .../AArch64/tail-folding-styles.ll | 4 +-- .../LoopVectorize/X86/small-size.ll | 2 +- 7 files changed, 79 insertions(+), 51 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 373ea751d568a..e5deac7975728 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9127,7 +9127,7 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) { "Unexpected type."); auto *IVR = getParent()->getPlan()->getCanonicalIV(); - PHINode *CanonicalIV = cast(State.get(IVR, 0)); + PHINode *CanonicalIV = cast(State.get(IVR, 0, /*IsScalar*/ true)); if (onlyScalarsGenerated(State.VF.isScalable())) { // This is the normalized GEP that starts counting at zero. @@ -9243,7 +9243,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { void VPReductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Reduction being replicated."); - Value *PrevInChain = State.get(getChainOp(), 0); + Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true); RecurKind Kind = RdxDesc.getRecurrenceKind(); bool IsOrdered = State.ILV->useOrderedReductions(RdxDesc); // Propagate the fast-math flags carried by the underlying instruction. @@ -9252,8 +9252,7 @@ void VPReductionRecipe::execute(VPTransformState &State) { for (unsigned Part = 0; Part < State.UF; ++Part) { Value *NewVecOp = State.get(getVecOp(), Part); if (VPValue *Cond = getCondOp()) { - Value *NewCond = State.VF.isVector() ? 
State.get(Cond, Part) - : State.get(Cond, {Part, 0}); + Value *NewCond = State.get(Cond, Part, State.VF.isScalar()); VectorType *VecTy = dyn_cast(NewVecOp->getType()); Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy, @@ -9278,7 +9277,7 @@ void VPReductionRecipe::execute(VPTransformState &State) { NewVecOp); PrevInChain = NewRed; } else { - PrevInChain = State.get(getChainOp(), Part); + PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true); NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp); } if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { @@ -9289,7 +9288,7 @@ void VPReductionRecipe::execute(VPTransformState &State) { else NextInChain = State.Builder.CreateBinOp( (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain); - State.set(this, NextInChain, Part); + State.set(this, NextInChain, Part, /*IsScalar*/ true); } } @@ -9404,7 +9403,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { // We don't want to update the value in the map as it might be used in // another expression. So don't call resetVectorValue(StoredVal). 
} - auto *VecPtr = State.get(getAddr(), Part); + auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true); if (isMaskRequired) NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, BlockInMaskParts[Part]); @@ -9428,7 +9427,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { nullptr, "wide.masked.gather"); State.addMetadata(NewLI, LI); } else { - auto *VecPtr = State.get(getAddr(), Part); + auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true); if (isMaskRequired) NewLI = Builder.CreateMaskedLoad( DataTy, VecPtr, Alignment, BlockInMaskParts[Part], diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 4ffbed4b705ca..4aeab6fc61998 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -242,7 +242,16 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) { return Extract; } -Value *VPTransformState::get(VPValue *Def, unsigned Part) { +Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) { + if (NeedsScalar) { + assert((VF.isScalar() || Def->isLiveIn() || + (hasScalarValue(Def, VPIteration(Part, 0)) && + Data.PerPartScalars[Def][Part].size() == 1)) && + "Trying to access a single scalar per part but has multiple scalars " + "per part."); + return get(Def, VPIteration(Part, 0)); + } + // If Values have been set for this Def return the one relevant for \p Part. if (hasVectorValue(Def, Part)) return Data.PerPartOutput[Def][Part]; @@ -789,21 +798,15 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV, auto *TCMO = Builder.CreateSub(TripCountV, ConstantInt::get(TripCountV->getType(), 1), "trip.count.minus.1"); - auto VF = State.VF; - Value *VTCMO = - VF.isScalar() ? 
TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast"); - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(BackedgeTakenCount, VTCMO, Part); + BackedgeTakenCount->setUnderlyingValue(TCMO); } - for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(&VectorTripCount, VectorTripCountV, Part); + VectorTripCount.setUnderlyingValue(VectorTripCountV); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); // FIXME: Model VF * UF computation completely in VPlan. - State.set(&VFxUF, - createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF), - 0); + VFxUF.setUnderlyingValue( + createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF)); // When vectorizing the epilogue loop, the canonical induction start value // needs to be changed from zero to the value after the main vector loop. @@ -884,12 +887,16 @@ void VPlan::execute(VPTransformState *State) { isa(PhiR) || (isa(PhiR) && cast(PhiR)->isOrdered()); + bool NeedsScalar = isa(PhiR) || + (isa(PhiR) && + cast(PhiR)->isInLoop()); unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *Phi = State->get(PhiR, Part); - Value *Val = State->get(PhiR->getBackedgeValue(), - SinglePartNeeded ? State->UF - 1 : Part); + Value *Phi = State->get(PhiR, Part, NeedsScalar); + Value *Val = + State->get(PhiR->getBackedgeValue(), + SinglePartNeeded ? State->UF - 1 : Part, NeedsScalar); cast(Phi)->addIncoming(Val, VectorLatchBB); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index aaddaaff2aba2..47cebecdb2708 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -259,9 +259,10 @@ struct VPTransformState { DenseMap PerPartScalars; } Data; - /// Get the generated Value for the given VPValue \p Def and the given \p Part. - /// \see set. 
- Value *get(VPValue *Def, unsigned Part); + /// Get the generated vector Value for a given VPValue \p Def and a given \p + /// Part if \p IsScalar is false, otherwise return the generated scalar + /// for \p Part. \See set. + Value *get(VPValue *Def, unsigned Part, bool IsScalar = false); /// Get the generated Value for a given VPValue and given Part and Lane. Value *get(VPValue *Def, const VPIteration &Instance); @@ -282,14 +283,22 @@ struct VPTransformState { I->second[Instance.Part][CacheIdx]; } - /// Set the generated Value for a given VPValue and a given Part. - void set(VPValue *Def, Value *V, unsigned Part) { + /// Set the generated vector Value for a given VPValue and a given Part, if \p + /// IsScalar is false. If \p IsScalar is true, set the scalar in (Part, 0). + void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar = false) { + if (IsScalar) { + set(Def, V, VPIteration(Part, 0)); + return; + } + assert((VF.isScalar() || V->getType()->isVectorTy()) && + "scalar values must be stored as (Part, 0)"); if (!Data.PerPartOutput.count(Def)) { DataState::PerPartValuesTy Entry(UF); Data.PerPartOutput[Def] = Entry; } Data.PerPartOutput[Def][Part] = V; } + /// Reset an existing vector value for \p Def and a given \p Part. void reset(VPValue *Def, Value *V, unsigned Part) { auto Iter = Data.PerPartOutput.find(Def); @@ -1376,6 +1385,13 @@ class VPScalarCastRecipe : public VPSingleDefRecipe { /// Returns the result type of the cast. Type *getResultType() const { return ResultTy; } + + bool onlyFirstLaneUsed(const VPValue *Op) const override { + // At the moment, only uniform codegen is implemented. + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } }; /// A recipe for widening Call instructions. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 2d2f6acf913f1..27b72575ddd51 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -279,11 +279,12 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, Builder.SetCurrentDebugLocation(getDebugLoc()); if (Instruction::isBinaryOp(getOpcode())) { + bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); if (Part != 0 && vputils::onlyFirstPartUsed(this)) - return State.get(this, 0); + return State.get(this, 0, OnlyFirstLaneUsed); - Value *A = State.get(getOperand(0), Part); - Value *B = State.get(getOperand(1), Part); + Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed); + Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed); auto *Res = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name); if (auto *I = dyn_cast(Res)) @@ -385,8 +386,8 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, if (Part != 0) return nullptr; // First create the compare. - Value *IV = State.get(getOperand(0), Part); - Value *TC = State.get(getOperand(1), Part); + Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true); + Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true); Value *Cond = Builder.CreateICmpEQ(IV, TC); // Now create the branch. @@ -407,7 +408,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, } case VPInstruction::ComputeReductionResult: { if (Part != 0) - return State.get(this, 0); + return State.get(this, 0, /*IsScalar*/ true); // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary // and will be removed by breaking up the recipe further. 
@@ -424,7 +425,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, Type *PhiTy = OrigPhi->getType(); VectorParts RdxParts(State.UF); for (unsigned Part = 0; Part < State.UF; ++Part) - RdxParts[Part] = State.get(LoopExitingDef, Part); + RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop()); // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the @@ -512,9 +513,15 @@ void VPInstruction::execute(VPTransformState &State) { if (!hasResult()) continue; assert(GeneratedValue && "generateInstruction must produce a value"); - State.set(this, GeneratedValue, Part); + + bool IsVector = GeneratedValue->getType()->isVectorTy(); + State.set(this, GeneratedValue, Part, !IsVector); + assert((IsVector || getOpcode() == VPInstruction::ComputeReductionResult || + State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) && + "scalar value but not only first lane used"); } } + bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); if (Instruction::isBinaryOp(getOpcode())) @@ -530,8 +537,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::BranchOnCount: - // TODO: Cover additional operands. 
- return getOperand(0) == Op; + return true; }; llvm_unreachable("switch should return"); } @@ -1344,7 +1350,7 @@ void VPVectorPointerRecipe ::execute(VPTransformState &State) { PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds); } - State.set(this, PartPtr, Part); + State.set(this, PartPtr, Part, /*IsScalar*/ true); } } @@ -1640,7 +1646,7 @@ void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { EntryPart->addIncoming(Start, VectorPH); EntryPart->setDebugLoc(getDebugLoc()); for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) - State.set(this, EntryPart, Part); + State.set(this, EntryPart, Part, /*IsScalar*/ true); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1711,7 +1717,7 @@ void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { - Value *CanonicalIV = State.get(getOperand(0), 0); + Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true); Type *STy = CanonicalIV->getType(); IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); ElementCount VF = State.VF; @@ -1801,7 +1807,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi"); EntryPart->insertBefore(HeaderBB->getFirstInsertionPt()); - State.set(this, EntryPart, Part); + State.set(this, EntryPart, Part, IsInLoop); } BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); @@ -1833,7 +1839,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) { } for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { - Value *EntryPart = State.get(this, Part); + Value *EntryPart = State.get(this, Part, IsInLoop); // Make sure to add the reduction start value only to the // first unroll part. Value *StartVal = (Part == 0) ? 
StartV : Iden; diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index b114716b7c13d..1d2c17e91b7ab 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -73,12 +73,6 @@ class VPValue { // for multiple underlying IRs (Polly?) by providing a new VPlan front-end, // back-end and analysis information for the new IR. - // Set \p Val as the underlying Value of this VPValue. - void setUnderlyingValue(Value *Val) { - assert(!UnderlyingVal && "Underlying Value is already set."); - UnderlyingVal = Val; - } - public: /// Return the underlying Value attached to this VPValue. Value *getUnderlyingValue() { return UnderlyingVal; } @@ -192,6 +186,12 @@ class VPValue { /// is a live-in value. /// TODO: Also handle recipes defined in pre-header blocks. bool isDefinedOutsideVectorRegions() const { return !hasDefiningRecipe(); } + + // Set \p Val as the underlying Value of this VPValue. + void setUnderlyingValue(Value *Val) { + assert(!UnderlyingVal && "Underlying Value is already set."); + UnderlyingVal = Val; + } }; typedef DenseMap Value2VPValueTy; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll index 94f24fea3609c..13fc0eaafb808 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll @@ -117,10 +117,10 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features ; DATA_NO_LANEMASK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; DATA_NO_LANEMASK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; DATA_NO_LANEMASK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1 -; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector 
[[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; DATA_NO_LANEMASK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; DATA_NO_LANEMASK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 +; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector [[BROADCAST_SPLATINSERT4]], poison, zeroinitializer ; DATA_NO_LANEMASK-NEXT: br label [[VECTOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll index be83329d30fef..51d2648205030 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -142,7 +142,7 @@ define void @example2(i32 %n, i32 %x) optsize { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_116]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT17]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY19:%.*]] -; CHECK: vector.body19: +; CHECK: vector.body17: ; CHECK-NEXT: [[INDEX20:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT31:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX20]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX20]], i64 0 From 38515580c4c5068e204ff69494ad11bbfacc89b4 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 26 Feb 2024 11:10:56 -0800 Subject: [PATCH 361/546] [lldb] Remove LLDB_DEBUGSERVER_CODESIGN_IDENTITY (NFC) This property was previously used by the test suite. 
--- lldb/tools/debugserver/source/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lldb/tools/debugserver/source/CMakeLists.txt b/lldb/tools/debugserver/source/CMakeLists.txt index f0b9756becab6..1a433898f6aa4 100644 --- a/lldb/tools/debugserver/source/CMakeLists.txt +++ b/lldb/tools/debugserver/source/CMakeLists.txt @@ -117,10 +117,6 @@ get_debugserver_codesign_identity(debugserver_codesign_identity) # Override locally, so the identity is used for targets created in this scope. set(LLVM_CODESIGNING_IDENTITY ${debugserver_codesign_identity}) -# Use the same identity later in the test suite. -set_property(GLOBAL PROPERTY - LLDB_DEBUGSERVER_CODESIGN_IDENTITY ${debugserver_codesign_identity}) - if(APPLE) set(LIBCOMPRESSION compression) if(APPLE_EMBEDDED) From 1f8b7e3c0b0743dbc899278f016b710f18240b9b Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 26 Feb 2024 14:33:58 -0500 Subject: [PATCH 362/546] [libc++] Fix non-uglified name in scoped_allocator_adaptor (#80706) As mentioned in #78754, the 'base' typedef in scoped_allocator_adaptor was not uglified properly. 
--- libcxx/include/scoped_allocator | 22 +++++++-------- .../base-is-uglified.compile.pass.cpp | 27 +++++++++++++++++++ 2 files changed, 38 insertions(+), 11 deletions(-) create mode 100644 libcxx/test/std/utilities/allocator.adaptor/base-is-uglified.compile.pass.cpp diff --git a/libcxx/include/scoped_allocator b/libcxx/include/scoped_allocator index eff6fbdf6edd8..fa6c6c5d20d86 100644 --- a/libcxx/include/scoped_allocator +++ b/libcxx/include/scoped_allocator @@ -334,12 +334,12 @@ struct __outermost<_Alloc, true> { template class _LIBCPP_TEMPLATE_VIS scoped_allocator_adaptor<_OuterAlloc, _InnerAllocs...> : public __scoped_allocator_storage<_OuterAlloc, _InnerAllocs...> { - typedef __scoped_allocator_storage<_OuterAlloc, _InnerAllocs...> base; + typedef __scoped_allocator_storage<_OuterAlloc, _InnerAllocs...> _Base; typedef allocator_traits<_OuterAlloc> _OuterTraits; public: typedef _OuterAlloc outer_allocator_type; - typedef typename base::inner_allocator_type inner_allocator_type; + typedef typename _Base::inner_allocator_type inner_allocator_type; typedef typename _OuterTraits::size_type size_type; typedef typename _OuterTraits::difference_type difference_type; typedef typename _OuterTraits::pointer pointer; @@ -365,29 +365,29 @@ public: template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI scoped_allocator_adaptor(_OuterA2&& __outer_alloc, const _InnerAllocs&... __inner_allocs) _NOEXCEPT - : base(std::forward<_OuterA2>(__outer_alloc), __inner_allocs...) {} + : _Base(std::forward<_OuterA2>(__outer_alloc), __inner_allocs...) 
{} // scoped_allocator_adaptor(const scoped_allocator_adaptor& __other) = default; template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI scoped_allocator_adaptor(const scoped_allocator_adaptor<_OuterA2, _InnerAllocs...>& __other) _NOEXCEPT - : base(__other) {} + : _Base(__other) {} template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI scoped_allocator_adaptor(scoped_allocator_adaptor<_OuterA2, _InnerAllocs...>&& __other) _NOEXCEPT - : base(std::move(__other)) {} + : _Base(std::move(__other)) {} // scoped_allocator_adaptor& operator=(const scoped_allocator_adaptor&) = default; // scoped_allocator_adaptor& operator=(scoped_allocator_adaptor&&) = default; // ~scoped_allocator_adaptor() = default; - _LIBCPP_HIDE_FROM_ABI inner_allocator_type& inner_allocator() _NOEXCEPT { return base::inner_allocator(); } + _LIBCPP_HIDE_FROM_ABI inner_allocator_type& inner_allocator() _NOEXCEPT { return _Base::inner_allocator(); } _LIBCPP_HIDE_FROM_ABI const inner_allocator_type& inner_allocator() const _NOEXCEPT { - return base::inner_allocator(); + return _Base::inner_allocator(); } - _LIBCPP_HIDE_FROM_ABI outer_allocator_type& outer_allocator() _NOEXCEPT { return base::outer_allocator(); } + _LIBCPP_HIDE_FROM_ABI outer_allocator_type& outer_allocator() _NOEXCEPT { return _Base::outer_allocator(); } _LIBCPP_HIDE_FROM_ABI const outer_allocator_type& outer_allocator() const _NOEXCEPT { - return base::outer_allocator(); + return _Base::outer_allocator(); } _LIBCPP_NODISCARD_AFTER_CXX17 _LIBCPP_HIDE_FROM_ABI pointer allocate(size_type __n) { @@ -472,12 +472,12 @@ public: } _LIBCPP_HIDE_FROM_ABI scoped_allocator_adaptor select_on_container_copy_construction() const _NOEXCEPT { - return base::select_on_container_copy_construction(); + return _Base::select_on_container_copy_construction(); } private: _LIBCPP_HIDE_FROM_ABI explicit scoped_allocator_adaptor( - outer_allocator_type&& __o, inner_allocator_type&& __i) _NOEXCEPT : base(std::move(__o), std::move(__i)) {} + outer_allocator_type&& __o, 
inner_allocator_type&& __i) _NOEXCEPT : _Base(std::move(__o), std::move(__i)) {} template _LIBCPP_HIDE_FROM_ABI void __construct(integral_constant, _Tp* __p, _Args&&... __args) { diff --git a/libcxx/test/std/utilities/allocator.adaptor/base-is-uglified.compile.pass.cpp b/libcxx/test/std/utilities/allocator.adaptor/base-is-uglified.compile.pass.cpp new file mode 100644 index 0000000000000..2581ac079dc5d --- /dev/null +++ b/libcxx/test/std/utilities/allocator.adaptor/base-is-uglified.compile.pass.cpp @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03 + +// + +// This test ensures that we don't use a non-uglified name 'base' in the +// implementation of scoped_allocator_adaptor. +// +// See https://github.com/llvm/llvm-project/issues/78754. + +#include +#include + +using ScopedAlloc = std::scoped_allocator_adaptor, std::allocator>; +struct MyBase { + using base = MyBase; +}; +struct MyDerived : ScopedAlloc, MyBase {}; + +using T = MyDerived::base; // Should be well-formed From deaf53e632f37be1e4f7a57959c35fcae161b141 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 26 Feb 2024 14:34:43 -0500 Subject: [PATCH 363/546] [libc++][NFC] Reorder availability macros in introduction order (#82531) Reorder the availability macros in __availability to respect the order in which they were introduced in the dylib. This simple refactor will greatly simplify an upcoming change I am working on. 
--- libcxx/include/__availability | 54 +++++++++++++++++------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/libcxx/include/__availability b/libcxx/include/__availability index c5069a027750e..78438c55a3b7b 100644 --- a/libcxx/include/__availability +++ b/libcxx/include/__availability @@ -101,12 +101,6 @@ # define _LIBCPP_AVAILABILITY_HAS_BAD_ANY_CAST 1 # define _LIBCPP_AVAILABILITY_BAD_ANY_CAST -// These macros controls the availability of __cxa_init_primary_exception -// in the built library, which std::make_exception_ptr might use -// (see libcxx/include/__exception/exception_ptr.h). -# define _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION 1 -# define _LIBCPP_AVAILABILITY_INIT_PRIMARY_EXCEPTION - // These macros control the availability of all parts of that // depend on something in the dylib. # define _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY 1 @@ -114,11 +108,6 @@ # define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY_PUSH # define _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY_POP -// This controls the availability of floating-point std::to_chars functions. -// These overloads were added later than the integer overloads. -# define _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT 1 -# define _LIBCPP_AVAILABILITY_TO_CHARS_FLOATING_POINT - // This controls the availability of the C++20 synchronization library, // which requires shared library support for various operations // (see libcxx/src/atomic.cpp). This includes , , @@ -126,6 +115,24 @@ # define _LIBCPP_AVAILABILITY_HAS_SYNC 1 # define _LIBCPP_AVAILABILITY_SYNC +// Enable additional explicit instantiations of iostreams components. This +// reduces the number of weak definitions generated in programs that use +// iostreams by providing a single strong definition in the shared library. +// +// TODO: Enable additional explicit instantiations on GCC once it supports exclude_from_explicit_instantiation, +// or once libc++ doesn't use the attribute anymore. 
+// TODO: Enable them on Windows once https://llvm.org/PR41018 has been fixed. +# if !defined(_LIBCPP_COMPILER_GCC) && !defined(_WIN32) +# define _LIBCPP_AVAILABILITY_HAS_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 1 +# else +# define _LIBCPP_AVAILABILITY_HAS_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 0 +# endif + +// This controls the availability of floating-point std::to_chars functions. +// These overloads were added later than the integer overloads. +# define _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT 1 +# define _LIBCPP_AVAILABILITY_TO_CHARS_FLOATING_POINT + // This controls whether the library claims to provide a default verbose // termination function, and consequently whether the headers will try // to use it when the mechanism isn't overriden at compile-time. @@ -137,10 +144,11 @@ # define _LIBCPP_AVAILABILITY_HAS_PMR 1 # define _LIBCPP_AVAILABILITY_PMR -// This controls the availability of the C++20 time zone database. -// The parser code is built in the library. -# define _LIBCPP_AVAILABILITY_HAS_TZDB 1 -# define _LIBCPP_AVAILABILITY_TZDB +// These macros controls the availability of __cxa_init_primary_exception +// in the built library, which std::make_exception_ptr might use +// (see libcxx/include/__exception/exception_ptr.h). +# define _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION 1 +# define _LIBCPP_AVAILABILITY_INIT_PRIMARY_EXCEPTION // This controls the availability of C++23 , which // has a dependency on the built library (it needs access to @@ -148,18 +156,10 @@ # define _LIBCPP_AVAILABILITY_HAS_PRINT 1 # define _LIBCPP_AVAILABILITY_PRINT -// Enable additional explicit instantiations of iostreams components. This -// reduces the number of weak definitions generated in programs that use -// iostreams by providing a single strong definition in the shared library. -// -// TODO: Enable additional explicit instantiations on GCC once it supports exclude_from_explicit_instantiation, -// or once libc++ doesn't use the attribute anymore. 
-// TODO: Enable them on Windows once https://llvm.org/PR41018 has been fixed. -# if !defined(_LIBCPP_COMPILER_GCC) && !defined(_WIN32) -# define _LIBCPP_AVAILABILITY_HAS_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 1 -# else -# define _LIBCPP_AVAILABILITY_HAS_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 0 -# endif +// This controls the availability of the C++20 time zone database. +// The parser code is built in the library. +# define _LIBCPP_AVAILABILITY_HAS_TZDB 1 +# define _LIBCPP_AVAILABILITY_TZDB #elif defined(__APPLE__) From 5e6f50eaa9510366352dde60b48bf7c3169762ea Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 26 Feb 2024 14:46:15 -0500 Subject: [PATCH 364/546] [libc++] Remove LIBCXX_EXECUTOR and LIBCXXABI_EXECUTOR (#79886) Those were deprecated in LLVM 18 and their removal was planned for LLVM 19. --- libcxx/cmake/caches/AndroidNDK.cmake | 4 ---- libcxx/docs/ReleaseNotes/19.rst | 5 ++--- libcxx/test/CMakeLists.txt | 5 ----- libcxx/utils/ci/run-buildbot | 2 +- libcxxabi/test/CMakeLists.txt | 5 ----- 5 files changed, 3 insertions(+), 18 deletions(-) diff --git a/libcxx/cmake/caches/AndroidNDK.cmake b/libcxx/cmake/caches/AndroidNDK.cmake index 23fbc3fa6bddd..298518781e9b7 100644 --- a/libcxx/cmake/caches/AndroidNDK.cmake +++ b/libcxx/cmake/caches/AndroidNDK.cmake @@ -35,7 +35,3 @@ set(CMAKE_CXX_COMPILER_WORKS ON CACHE BOOL "") # them. set(LIBCXX_TEST_CONFIG "llvm-libc++-android-ndk.cfg.in" CACHE STRING "") set(LIBCXXABI_TEST_CONFIG "llvm-libc++abi-android-ndk.cfg.in" CACHE STRING "") - -# CMAKE_SOURCE_DIR refers to the "/runtimes" directory. 
-set(LIBCXX_EXECUTOR "${CMAKE_SOURCE_DIR}/../libcxx/utils/adb_run.py" CACHE STRING "") -set(LIBCXXABI_EXECUTOR "${LIBCXX_EXECUTOR}" CACHE STRING "") diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index a7f108c44e717..6c8f8d17af9b1 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -51,8 +51,6 @@ Improvements and New Features Deprecations and Removals ------------------------- -- TODO: The ``LIBCXX_EXECUTOR`` CMake variables have been removed. - - TODO: The ``LIBCXX_ENABLE_ASSERTIONS`` CMake variable that was used to enable the safe mode has been deprecated and setting it triggers an error; use the ``LIBCXX_HARDENING_MODE`` CMake variable with the value ``extensive`` instead. Similarly, the ``_LIBCPP_ENABLE_ASSERTIONS`` macro has been deprecated (setting it to ``1`` still enables the extensive mode in @@ -98,4 +96,5 @@ TODO Build System Changes -------------------- -TODO +- The ``LIBCXX_EXECUTOR`` and ``LIBCXXABI_EXECUTOR`` CMake variables have been removed. Please + set ``LIBCXX_TEST_PARAMS`` to ``executor=<...>`` instead. diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt index 52620fc55feeb..e0d3a0dbc4003 100644 --- a/libcxx/test/CMakeLists.txt +++ b/libcxx/test/CMakeLists.txt @@ -10,11 +10,6 @@ endif() set(AUTO_GEN_COMMENT "## Autogenerated by libcxx configuration.\n# Do not edit!") set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n") -if (LIBCXX_EXECUTOR) - message(DEPRECATION "LIBCXX_EXECUTOR is deprecated, please add executor=... 
to LIBCXX_TEST_PARAMS") - serialize_lit_string_param(SERIALIZED_LIT_PARAMS executor "${LIBCXX_EXECUTOR}") -endif() - if (NOT LIBCXX_ENABLE_EXCEPTIONS) serialize_lit_param(SERIALIZED_LIT_PARAMS enable_exceptions False) endif() diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index ddfe53d2d8bd6..2905745355b68 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -758,7 +758,7 @@ android-ndk-*) # level. When tests are run against a device with a newer API level, test # programs can be built for any supported API level, but building for the # newest API (i.e. the system image's API) is probably the most interesting. - PARAMS="target_triple=$(triple_of_arch ${ARCH})$(api_of_emu_img ${ANDROID_EMU_IMG})" + PARAMS="executor=${MONOREPO_ROOT}/libcxx/utils/adb_run.py;target_triple=$(triple_of_arch ${ARCH})$(api_of_emu_img ${ANDROID_EMU_IMG})" generate-cmake-android -C "${MONOREPO_ROOT}/runtimes/cmake/android/Arch-${ARCH}.cmake" \ -C "${MONOREPO_ROOT}/libcxx/cmake/caches/AndroidNDK.cmake" \ -DCMAKE_SYSROOT=/opt/android/ndk/sysroot \ diff --git a/libcxxabi/test/CMakeLists.txt b/libcxxabi/test/CMakeLists.txt index 9f95c736d63f4..586927189cf1d 100644 --- a/libcxxabi/test/CMakeLists.txt +++ b/libcxxabi/test/CMakeLists.txt @@ -24,11 +24,6 @@ endif() set(AUTO_GEN_COMMENT "## Autogenerated by libcxxabi configuration.\n# Do not edit!") set(SERIALIZED_LIT_PARAMS "# Lit parameters serialized here for llvm-lit to pick them up\n") -if (LIBCXXABI_EXECUTOR) - message(DEPRECATION "LIBCXXABI_EXECUTOR is deprecated, please add executor=... 
to LIBCXXABI_TEST_PARAMS") - serialize_lit_string_param(SERIALIZED_LIT_PARAMS executor "${LIBCXXABI_EXECUTOR}") -endif() - if (NOT LIBCXXABI_ENABLE_EXCEPTIONS) serialize_lit_param(SERIALIZED_LIT_PARAMS enable_exceptions False) endif() From 7b66b5d6c2fadb1e9f8cfc8d1864d4109105001f Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Mon, 26 Feb 2024 20:49:52 +0100 Subject: [PATCH 365/546] [mlir][Transforms] Track erased ops separately (#83051) #83023 fixed a performance regression related to "ignored" ops. This broke some downstream projects that access ops after they were replaced (an API violation). This change restores the original behavior before #83023 (but without the performance regression), to give downstream users more time to fix their code. --- .../Transforms/Utils/DialectConversion.cpp | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 4165e0a52428f..f967e8352bf4c 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -153,9 +153,9 @@ namespace { /// This is useful when saving and undoing a set of rewrites. struct RewriterState { RewriterState(unsigned numRewrites, unsigned numIgnoredOperations, - unsigned numErased) + unsigned numErased, unsigned numReplacedOps) : numRewrites(numRewrites), numIgnoredOperations(numIgnoredOperations), - numErased(numErased) {} + numErased(numErased), numReplacedOps(numReplacedOps) {} /// The current number of rewrites performed. unsigned numRewrites; @@ -165,6 +165,9 @@ struct RewriterState { /// The current number of erased operations/blocks. unsigned numErased; + + /// The current number of replaced ops that are scheduled for erasure. 
+ unsigned numReplacedOps; }; //===----------------------------------------------------------------------===// @@ -954,6 +957,9 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// operation was ignored. SetVector ignoredOps; + // A set of operations that were erased. + SetVector replacedOps; + /// The current type converter, or nullptr if no type converter is currently /// active. const TypeConverter *currentTypeConverter = nullptr; @@ -1152,7 +1158,7 @@ void ConversionPatternRewriterImpl::applyRewrites() { RewriterState ConversionPatternRewriterImpl::getCurrentState() { return RewriterState(rewrites.size(), ignoredOps.size(), - eraseRewriter.erased.size()); + eraseRewriter.erased.size(), replacedOps.size()); } void ConversionPatternRewriterImpl::resetState(RewriterState state) { @@ -1165,6 +1171,9 @@ void ConversionPatternRewriterImpl::resetState(RewriterState state) { while (eraseRewriter.erased.size() != state.numErased) eraseRewriter.erased.pop_back(); + + while (replacedOps.size() != state.numReplacedOps) + replacedOps.pop_back(); } void ConversionPatternRewriterImpl::undoRewrites(unsigned numRewritesToKeep) { @@ -1228,9 +1237,11 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( return success(); } +// TODO: This function is a misnomer. It does not actually check if `op` is in +// `ignoredOps`. bool ConversionPatternRewriterImpl::isOpIgnored(Operation *op) const { // Check to see if this operation or the parent operation is ignored. 
- return ignoredOps.count(op->getParentOp()) || ignoredOps.count(op); + return ignoredOps.count(op->getParentOp()) || replacedOps.count(op); } void ConversionPatternRewriterImpl::markNestedOpsIgnored(Operation *op) { @@ -1479,7 +1490,7 @@ void ConversionPatternRewriterImpl::notifyOperationInserted( void ConversionPatternRewriterImpl::notifyOpReplaced(Operation *op, ValueRange newValues) { assert(newValues.size() == op->getNumResults()); - assert(!ignoredOps.contains(op) && "operation was already replaced"); + assert(!replacedOps.contains(op) && "operation was already replaced"); // Track if any of the results changed, e.g. erased and replaced with null. bool resultChanged = false; @@ -1500,7 +1511,7 @@ void ConversionPatternRewriterImpl::notifyOpReplaced(Operation *op, // Mark this operation as recursively ignored so that we don't need to // convert any nested operations. - ignoredOps.insert(op); + replacedOps.insert(op); markNestedOpsIgnored(op); } From 6a884a9aef397f76178b141d789efadf280f2c3f Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 26 Feb 2024 14:55:10 -0500 Subject: [PATCH 366/546] [libc++] Always keep libc++abi re-exports up-to-date (#79012) Previously, the list of libc++abi symbols that we re-export from libc++ would be partly encoded in libc++abi (and re-exported automatically via the cxxabi-reexports target), and partly hard-coded in libcxx/lib/libc++abi.exp. The duplication of information led to symbols not being exported from libc++ after being added to libc++abi when they should have been. This patch removes the duplication of information. After this patch, the full list of symbols to re-export from libc++abi is handled by the cxxabi-reexports target and is stored in libcxxabi. The symbols newly re-exported from libc++ are mainly new fundamental typeinfos and a bunch of functions and classes that are part of libc++abi but are most likely implementation details. 
In the future, it would be possible to try to trim down the set of what we export from libc++abi (and hence what we re-export from libc++) to remove some implementation detail symbols. Fixes #79008 --- libcxx/lib/abi/CHANGELOG.TXT | 65 ++++ ...bcxxabi.v1.stable.exceptions.nonew.abilist | 98 +++++ ...bcxxabi.v1.stable.exceptions.nonew.abilist | 98 +++++ libcxx/src/CMakeLists.txt | 9 +- libcxxabi/lib/cxxabiv1.exp | 38 ++ libcxxabi/lib/fundamental-types.exp | 153 ++++++++ libcxxabi/lib/itanium-base.exp | 325 ---------------- ...{exceptions.exp => itanium-exceptions.exp} | 1 + libcxxabi/lib/new-delete.exp | 1 + .../lib/std-exceptions.exp | 350 +++++------------- libcxxabi/lib/std-misc.exp | 9 + libcxxabi/src/CMakeLists.txt | 38 +- libcxxabi/test/uncaught_exception.pass.cpp | 7 + 13 files changed, 579 insertions(+), 613 deletions(-) create mode 100644 libcxxabi/lib/cxxabiv1.exp create mode 100644 libcxxabi/lib/fundamental-types.exp rename libcxxabi/lib/{exceptions.exp => itanium-exceptions.exp} (84%) rename libcxx/lib/libc++abi.exp => libcxxabi/lib/std-exceptions.exp (51%) create mode 100644 libcxxabi/lib/std-misc.exp diff --git a/libcxx/lib/abi/CHANGELOG.TXT b/libcxx/lib/abi/CHANGELOG.TXT index 7ff604959f4d5..7a8d5052a0830 100644 --- a/libcxx/lib/abi/CHANGELOG.TXT +++ b/libcxx/lib/abi/CHANGELOG.TXT @@ -12,6 +12,71 @@ To generate a summary, re-generate the new ABI list using the New entries should be added directly below the "Version" header. +------------ +Version 19.0 +------------ + +* [libc++] Always keep libc++abi re-exports up-to-date + + This patch makes sure that the set of libc++abi symbols re-exported from libc++ + is up-to-date with the symbols that libc++abi re-exports. As such, it adds several + symbols that were left out of the libc++ re-exports list. Exporting new symbols + is not an ABI break. 
+ + -apple-darwin + ------------------- + Symbol reexported: ___cxa_current_primary_exception + Symbol reexported: ___cxa_decrement_exception_refcount + Symbol reexported: ___cxa_increment_exception_refcount + Symbol reexported: ___cxa_new_handler + Symbol reexported: ___cxa_rethrow_primary_exception + Symbol reexported: ___cxa_terminate_handler + Symbol reexported: ___cxa_uncaught_exception + Symbol reexported: ___cxa_unexpected_handler + Symbol reexported: __ZTIDh + Symbol reexported: __ZTIDu + Symbol reexported: __ZTIg + Symbol reexported: __ZTIn + Symbol reexported: __ZTIN10__cxxabiv116__enum_type_infoE + Symbol reexported: __ZTIN10__cxxabiv116__shim_type_infoE + Symbol reexported: __ZTIN10__cxxabiv117__array_type_infoE + Symbol reexported: __ZTIN10__cxxabiv117__class_type_infoE + Symbol reexported: __ZTIN10__cxxabiv117__pbase_type_infoE + Symbol reexported: __ZTIN10__cxxabiv119__pointer_type_infoE + Symbol reexported: __ZTIN10__cxxabiv120__function_type_infoE + Symbol reexported: __ZTIN10__cxxabiv120__si_class_type_infoE + Symbol reexported: __ZTIN10__cxxabiv121__vmi_class_type_infoE + Symbol reexported: __ZTIN10__cxxabiv123__fundamental_type_infoE + Symbol reexported: __ZTIN10__cxxabiv129__pointer_to_member_type_infoE + Symbol reexported: __ZTIo + Symbol reexported: __ZTIPDh + Symbol reexported: __ZTIPDu + Symbol reexported: __ZTIPg + Symbol reexported: __ZTIPKDh + Symbol reexported: __ZTIPKDu + Symbol reexported: __ZTIPKg + Symbol reexported: __ZTIPKn + Symbol reexported: __ZTIPKo + Symbol reexported: __ZTIPn + Symbol reexported: __ZTIPo + Symbol reexported: __ZTSDh + Symbol reexported: __ZTSDu + Symbol reexported: __ZTSg + Symbol reexported: __ZTSn + Symbol reexported: __ZTSN10__cxxabiv116__shim_type_infoE + Symbol reexported: __ZTSo + Symbol reexported: __ZTSPDh + Symbol reexported: __ZTSPDu + Symbol reexported: __ZTSPg + Symbol reexported: __ZTSPKDh + Symbol reexported: __ZTSPKDu + Symbol reexported: __ZTSPKg + Symbol reexported: __ZTSPKn + Symbol 
reexported: __ZTSPKo + Symbol reexported: __ZTSPn + Symbol reexported: __ZTSPo + Symbol reexported: __ZTVN10__cxxabiv116__shim_type_infoE + ------------ Version 18.0 ------------ diff --git a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index 2064f45bf8c08..46353986f5d7d 100644 --- a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -70,26 +70,46 @@ {'is_defined': False, 'name': '__ZSt15get_new_handlerv', 'type': 'U'} {'is_defined': False, 'name': '__ZSt15set_new_handlerPFvvE', 'type': 'U'} {'is_defined': False, 'name': '__ZSt9terminatev', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIDh', 'type': 'U'} {'is_defined': False, 'name': '__ZTIDi', 'type': 'U'} {'is_defined': False, 'name': '__ZTIDn', 'type': 'U'} {'is_defined': False, 'name': '__ZTIDs', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIDu', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv116__enum_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv116__shim_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv117__array_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv117__class_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv117__pbase_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv119__pointer_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv120__function_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv120__si_class_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv121__vmi_class_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv123__fundamental_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': 
'__ZTIN10__cxxabiv129__pointer_to_member_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPDh', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPDi', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPDn', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPDs', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPDu', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPKDh', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKDi', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKDn', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKDs', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPKDu', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKa', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKb', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKc', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKd', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKe', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKf', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPKg', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKh', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKi', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKj', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKl', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKm', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPKn', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPKo', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKs', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKt', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKv', 'type': 'U'} @@ -102,11 +122,14 @@ {'is_defined': False, 'name': '__ZTIPd', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPe', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPf', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPg', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPh', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPi', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPj', 'type': 'U'} {'is_defined': False, 'name': 
'__ZTIPl', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPm', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPn', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPo', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPs', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPt', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPv', 'type': 'U'} @@ -135,21 +158,27 @@ {'is_defined': False, 'name': '__ZTId', 'type': 'U'} {'is_defined': False, 'name': '__ZTIe', 'type': 'U'} {'is_defined': False, 'name': '__ZTIf', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIg', 'type': 'U'} {'is_defined': False, 'name': '__ZTIh', 'type': 'U'} {'is_defined': False, 'name': '__ZTIi', 'type': 'U'} {'is_defined': False, 'name': '__ZTIj', 'type': 'U'} {'is_defined': False, 'name': '__ZTIl', 'type': 'U'} {'is_defined': False, 'name': '__ZTIm', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIn', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIo', 'type': 'U'} {'is_defined': False, 'name': '__ZTIs', 'type': 'U'} {'is_defined': False, 'name': '__ZTIt', 'type': 'U'} {'is_defined': False, 'name': '__ZTIv', 'type': 'U'} {'is_defined': False, 'name': '__ZTIw', 'type': 'U'} {'is_defined': False, 'name': '__ZTIx', 'type': 'U'} {'is_defined': False, 'name': '__ZTIy', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSDh', 'type': 'U'} {'is_defined': False, 'name': '__ZTSDi', 'type': 'U'} {'is_defined': False, 'name': '__ZTSDn', 'type': 'U'} {'is_defined': False, 'name': '__ZTSDs', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSDu', 'type': 'U'} {'is_defined': False, 'name': '__ZTSN10__cxxabiv116__enum_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSN10__cxxabiv116__shim_type_infoE', 'type': 'U'} {'is_defined': False, 'name': '__ZTSN10__cxxabiv117__array_type_infoE', 'type': 'U'} {'is_defined': False, 'name': '__ZTSN10__cxxabiv117__class_type_infoE', 'type': 'U'} {'is_defined': False, 'name': '__ZTSN10__cxxabiv117__pbase_type_infoE', 'type': 'U'} @@ -159,23 +188,30 @@ 
{'is_defined': False, 'name': '__ZTSN10__cxxabiv121__vmi_class_type_infoE', 'type': 'U'} {'is_defined': False, 'name': '__ZTSN10__cxxabiv123__fundamental_type_infoE', 'type': 'U'} {'is_defined': False, 'name': '__ZTSN10__cxxabiv129__pointer_to_member_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPDh', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPDi', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPDn', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPDs', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPDu', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPKDh', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKDi', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKDn', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKDs', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPKDu', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKa', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKb', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKc', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKd', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKe', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKf', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPKg', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKh', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKi', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKj', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKl', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKm', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPKn', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPKo', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKs', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKt', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKv', 'type': 'U'} @@ -188,11 +224,14 @@ {'is_defined': False, 'name': '__ZTSPd', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPe', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPf', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPg', 
'type': 'U'} {'is_defined': False, 'name': '__ZTSPh', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPi', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPj', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPl', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPm', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPn', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPo', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPs', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPt', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPv', 'type': 'U'} @@ -221,11 +260,14 @@ {'is_defined': False, 'name': '__ZTSd', 'type': 'U'} {'is_defined': False, 'name': '__ZTSe', 'type': 'U'} {'is_defined': False, 'name': '__ZTSf', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSg', 'type': 'U'} {'is_defined': False, 'name': '__ZTSh', 'type': 'U'} {'is_defined': False, 'name': '__ZTSi', 'type': 'U'} {'is_defined': False, 'name': '__ZTSj', 'type': 'U'} {'is_defined': False, 'name': '__ZTSl', 'type': 'U'} {'is_defined': False, 'name': '__ZTSm', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSn', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSo', 'type': 'U'} {'is_defined': False, 'name': '__ZTSs', 'type': 'U'} {'is_defined': False, 'name': '__ZTSt', 'type': 'U'} {'is_defined': False, 'name': '__ZTSv', 'type': 'U'} @@ -233,6 +275,7 @@ {'is_defined': False, 'name': '__ZTSx', 'type': 'U'} {'is_defined': False, 'name': '__ZTSy', 'type': 'U'} {'is_defined': False, 'name': '__ZTVN10__cxxabiv116__enum_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTVN10__cxxabiv116__shim_type_infoE', 'type': 'U'} {'is_defined': False, 'name': '__ZTVN10__cxxabiv117__array_type_infoE', 'type': 'U'} {'is_defined': False, 'name': '__ZTVN10__cxxabiv117__class_type_infoE', 'type': 'U'} {'is_defined': False, 'name': '__ZTVN10__cxxabiv117__pbase_type_infoE', 'type': 'U'} @@ -301,12 +344,16 @@ {'is_defined': False, 'name': '___cxa_guard_release', 'type': 'U'} {'is_defined': False, 'name': 
'___cxa_increment_exception_refcount', 'type': 'U'} {'is_defined': False, 'name': '___cxa_init_primary_exception', 'type': 'U'} +{'is_defined': False, 'name': '___cxa_new_handler', 'type': 'U'} {'is_defined': False, 'name': '___cxa_pure_virtual', 'type': 'U'} {'is_defined': False, 'name': '___cxa_rethrow', 'type': 'U'} {'is_defined': False, 'name': '___cxa_rethrow_primary_exception', 'type': 'U'} +{'is_defined': False, 'name': '___cxa_terminate_handler', 'type': 'U'} {'is_defined': False, 'name': '___cxa_throw', 'type': 'U'} {'is_defined': False, 'name': '___cxa_throw_bad_array_new_length', 'type': 'U'} +{'is_defined': False, 'name': '___cxa_uncaught_exception', 'type': 'U'} {'is_defined': False, 'name': '___cxa_uncaught_exceptions', 'type': 'U'} +{'is_defined': False, 'name': '___cxa_unexpected_handler', 'type': 'U'} {'is_defined': False, 'name': '___cxa_vec_cctor', 'type': 'U'} {'is_defined': False, 'name': '___cxa_vec_cleanup', 'type': 'U'} {'is_defined': False, 'name': '___cxa_vec_ctor', 'type': 'U'} @@ -1945,9 +1992,22 @@ {'is_defined': True, 'name': '__ZTCNSt3__19strstreamE0_NS_13basic_istreamIcNS_11char_traitsIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTCNSt3__19strstreamE0_NS_14basic_iostreamIcNS_11char_traitsIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTCNSt3__19strstreamE16_NS_13basic_ostreamIcNS_11char_traitsIcEEEE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTIDh', 'type': 'I'} {'is_defined': True, 'name': '__ZTIDi', 'type': 'I'} {'is_defined': True, 'name': '__ZTIDn', 'type': 'I'} {'is_defined': True, 'name': '__ZTIDs', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIDu', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv116__enum_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv116__shim_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv117__array_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': 
'__ZTIN10__cxxabiv117__class_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv117__pbase_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv119__pointer_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv120__function_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv120__si_class_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv121__vmi_class_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv123__fundamental_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv129__pointer_to_member_type_infoE', 'type': 'I'} {'is_defined': True, 'name': '__ZTINSt12experimental15fundamentals_v112bad_any_castE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt12experimental19bad_optional_accessE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__110istrstreamE', 'size': 0, 'type': 'OBJECT'} @@ -2055,23 +2115,30 @@ {'is_defined': True, 'name': '__ZTINSt3__19money_putIcNS_19ostreambuf_iteratorIcNS_11char_traitsIcEEEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__19money_putIwNS_19ostreambuf_iteratorIwNS_11char_traitsIwEEEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__19strstreamE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTIPDh', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPDi', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPDn', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPDs', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPDu', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPKDh', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKDi', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKDn', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKDs', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPKDu', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKa', 'type': 'I'} {'is_defined': True, 
'name': '__ZTIPKb', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKc', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKd', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKe', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKf', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPKg', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKh', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKi', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKj', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKl', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKm', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPKn', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPKo', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKs', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKt', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKv', 'type': 'I'} @@ -2084,11 +2151,14 @@ {'is_defined': True, 'name': '__ZTIPd', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPe', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPf', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPg', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPh', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPi', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPj', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPl', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPm', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPn', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPo', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPs', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPt', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPv', 'type': 'I'} @@ -2121,21 +2191,27 @@ {'is_defined': True, 'name': '__ZTId', 'type': 'I'} {'is_defined': True, 'name': '__ZTIe', 'type': 'I'} {'is_defined': True, 'name': '__ZTIf', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIg', 'type': 'I'} {'is_defined': True, 'name': '__ZTIh', 'type': 'I'} {'is_defined': True, 'name': '__ZTIi', 'type': 'I'} {'is_defined': True, 'name': '__ZTIj', 'type': 
'I'} {'is_defined': True, 'name': '__ZTIl', 'type': 'I'} {'is_defined': True, 'name': '__ZTIm', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIn', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIo', 'type': 'I'} {'is_defined': True, 'name': '__ZTIs', 'type': 'I'} {'is_defined': True, 'name': '__ZTIt', 'type': 'I'} {'is_defined': True, 'name': '__ZTIv', 'type': 'I'} {'is_defined': True, 'name': '__ZTIw', 'type': 'I'} {'is_defined': True, 'name': '__ZTIx', 'type': 'I'} {'is_defined': True, 'name': '__ZTIy', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSDh', 'type': 'I'} {'is_defined': True, 'name': '__ZTSDi', 'type': 'I'} {'is_defined': True, 'name': '__ZTSDn', 'type': 'I'} {'is_defined': True, 'name': '__ZTSDs', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSDu', 'type': 'I'} {'is_defined': True, 'name': '__ZTSN10__cxxabiv116__enum_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSN10__cxxabiv116__shim_type_infoE', 'type': 'I'} {'is_defined': True, 'name': '__ZTSN10__cxxabiv117__array_type_infoE', 'type': 'I'} {'is_defined': True, 'name': '__ZTSN10__cxxabiv117__class_type_infoE', 'type': 'I'} {'is_defined': True, 'name': '__ZTSN10__cxxabiv117__pbase_type_infoE', 'type': 'I'} @@ -2227,23 +2303,30 @@ {'is_defined': True, 'name': '__ZTSNSt3__19money_putIcNS_19ostreambuf_iteratorIcNS_11char_traitsIcEEEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__19money_putIwNS_19ostreambuf_iteratorIwNS_11char_traitsIwEEEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__19strstreamE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTSPDh', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPDi', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPDn', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPDs', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPDu', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPKDh', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKDi', 'type': 'I'} {'is_defined': True, 
'name': '__ZTSPKDn', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKDs', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPKDu', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKa', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKb', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKc', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKd', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKe', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKf', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPKg', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKh', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKi', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKj', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKl', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKm', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPKn', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPKo', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKs', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKt', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKv', 'type': 'I'} @@ -2256,11 +2339,14 @@ {'is_defined': True, 'name': '__ZTSPd', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPe', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPf', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPg', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPh', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPi', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPj', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPl', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPm', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPn', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPo', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPs', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPt', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPv', 'type': 'I'} @@ -2293,11 +2379,14 @@ {'is_defined': True, 'name': '__ZTSd', 'type': 'I'} {'is_defined': True, 'name': '__ZTSe', 'type': 'I'} {'is_defined': True, 'name': 
'__ZTSf', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSg', 'type': 'I'} {'is_defined': True, 'name': '__ZTSh', 'type': 'I'} {'is_defined': True, 'name': '__ZTSi', 'type': 'I'} {'is_defined': True, 'name': '__ZTSj', 'type': 'I'} {'is_defined': True, 'name': '__ZTSl', 'type': 'I'} {'is_defined': True, 'name': '__ZTSm', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSn', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSo', 'type': 'I'} {'is_defined': True, 'name': '__ZTSs', 'type': 'I'} {'is_defined': True, 'name': '__ZTSt', 'type': 'I'} {'is_defined': True, 'name': '__ZTSv', 'type': 'I'} @@ -2318,6 +2407,7 @@ {'is_defined': True, 'name': '__ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTTNSt3__19strstreamE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVN10__cxxabiv116__enum_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTVN10__cxxabiv116__shim_type_infoE', 'type': 'I'} {'is_defined': True, 'name': '__ZTVN10__cxxabiv117__array_type_infoE', 'type': 'I'} {'is_defined': True, 'name': '__ZTVN10__cxxabiv117__class_type_infoE', 'type': 'I'} {'is_defined': True, 'name': '__ZTVN10__cxxabiv117__pbase_type_infoE', 'type': 'I'} @@ -2501,6 +2591,8 @@ {'is_defined': True, 'name': '___cxa_begin_catch', 'type': 'I'} {'is_defined': True, 'name': '___cxa_call_unexpected', 'type': 'I'} {'is_defined': True, 'name': '___cxa_current_exception_type', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_current_primary_exception', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_decrement_exception_refcount', 'type': 'I'} {'is_defined': True, 'name': '___cxa_deleted_virtual', 'type': 'I'} {'is_defined': True, 'name': '___cxa_demangle', 'type': 'I'} {'is_defined': True, 'name': '___cxa_end_catch', 'type': 'I'} @@ -2512,12 +2604,18 @@ {'is_defined': True, 'name': '___cxa_guard_abort', 'type': 'I'} {'is_defined': True, 'name': '___cxa_guard_acquire', 'type': 'I'} 
{'is_defined': True, 'name': '___cxa_guard_release', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_increment_exception_refcount', 'type': 'I'} {'is_defined': True, 'name': '___cxa_init_primary_exception', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_new_handler', 'type': 'I'} {'is_defined': True, 'name': '___cxa_pure_virtual', 'type': 'I'} {'is_defined': True, 'name': '___cxa_rethrow', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_rethrow_primary_exception', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_terminate_handler', 'type': 'I'} {'is_defined': True, 'name': '___cxa_throw', 'type': 'I'} {'is_defined': True, 'name': '___cxa_throw_bad_array_new_length', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_uncaught_exception', 'type': 'I'} {'is_defined': True, 'name': '___cxa_uncaught_exceptions', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_unexpected_handler', 'type': 'I'} {'is_defined': True, 'name': '___cxa_vec_cctor', 'type': 'I'} {'is_defined': True, 'name': '___cxa_vec_cleanup', 'type': 'I'} {'is_defined': True, 'name': '___cxa_vec_ctor', 'type': 'I'} diff --git a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index bced6b2ea81ba..c169b4a992521 100644 --- a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -70,26 +70,46 @@ {'is_defined': False, 'name': '__ZSt15get_new_handlerv', 'type': 'U'} {'is_defined': False, 'name': '__ZSt15set_new_handlerPFvvE', 'type': 'U'} {'is_defined': False, 'name': '__ZSt9terminatev', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIDh', 'type': 'U'} {'is_defined': False, 'name': '__ZTIDi', 'type': 'U'} {'is_defined': False, 'name': '__ZTIDn', 'type': 'U'} {'is_defined': False, 'name': '__ZTIDs', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIDu', 'type': 'U'} 
+{'is_defined': False, 'name': '__ZTIN10__cxxabiv116__enum_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv116__shim_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv117__array_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv117__class_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv117__pbase_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv119__pointer_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv120__function_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv120__si_class_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv121__vmi_class_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv123__fundamental_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIN10__cxxabiv129__pointer_to_member_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPDh', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPDi', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPDn', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPDs', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPDu', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPKDh', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKDi', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKDn', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKDs', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPKDu', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKa', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKb', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKc', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKd', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKe', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKf', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPKg', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKh', 'type': 'U'} {'is_defined': False, 
'name': '__ZTIPKi', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKj', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKl', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKm', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPKn', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPKo', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKs', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKt', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPKv', 'type': 'U'} @@ -102,11 +122,14 @@ {'is_defined': False, 'name': '__ZTIPd', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPe', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPf', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPg', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPh', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPi', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPj', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPl', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPm', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPn', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIPo', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPs', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPt', 'type': 'U'} {'is_defined': False, 'name': '__ZTIPv', 'type': 'U'} @@ -135,21 +158,27 @@ {'is_defined': False, 'name': '__ZTId', 'type': 'U'} {'is_defined': False, 'name': '__ZTIe', 'type': 'U'} {'is_defined': False, 'name': '__ZTIf', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIg', 'type': 'U'} {'is_defined': False, 'name': '__ZTIh', 'type': 'U'} {'is_defined': False, 'name': '__ZTIi', 'type': 'U'} {'is_defined': False, 'name': '__ZTIj', 'type': 'U'} {'is_defined': False, 'name': '__ZTIl', 'type': 'U'} {'is_defined': False, 'name': '__ZTIm', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIn', 'type': 'U'} +{'is_defined': False, 'name': '__ZTIo', 'type': 'U'} {'is_defined': False, 'name': '__ZTIs', 'type': 'U'} {'is_defined': False, 'name': '__ZTIt', 'type': 'U'} {'is_defined': False, 'name': 
'__ZTIv', 'type': 'U'} {'is_defined': False, 'name': '__ZTIw', 'type': 'U'} {'is_defined': False, 'name': '__ZTIx', 'type': 'U'} {'is_defined': False, 'name': '__ZTIy', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSDh', 'type': 'U'} {'is_defined': False, 'name': '__ZTSDi', 'type': 'U'} {'is_defined': False, 'name': '__ZTSDn', 'type': 'U'} {'is_defined': False, 'name': '__ZTSDs', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSDu', 'type': 'U'} {'is_defined': False, 'name': '__ZTSN10__cxxabiv116__enum_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSN10__cxxabiv116__shim_type_infoE', 'type': 'U'} {'is_defined': False, 'name': '__ZTSN10__cxxabiv117__array_type_infoE', 'type': 'U'} {'is_defined': False, 'name': '__ZTSN10__cxxabiv117__class_type_infoE', 'type': 'U'} {'is_defined': False, 'name': '__ZTSN10__cxxabiv117__pbase_type_infoE', 'type': 'U'} @@ -159,23 +188,30 @@ {'is_defined': False, 'name': '__ZTSN10__cxxabiv121__vmi_class_type_infoE', 'type': 'U'} {'is_defined': False, 'name': '__ZTSN10__cxxabiv123__fundamental_type_infoE', 'type': 'U'} {'is_defined': False, 'name': '__ZTSN10__cxxabiv129__pointer_to_member_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPDh', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPDi', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPDn', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPDs', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPDu', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPKDh', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKDi', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKDn', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKDs', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPKDu', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKa', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKb', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKc', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKd', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKe', 
'type': 'U'} {'is_defined': False, 'name': '__ZTSPKf', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPKg', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKh', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKi', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKj', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKl', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKm', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPKn', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPKo', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKs', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKt', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPKv', 'type': 'U'} @@ -188,11 +224,14 @@ {'is_defined': False, 'name': '__ZTSPd', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPe', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPf', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPg', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPh', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPi', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPj', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPl', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPm', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPn', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSPo', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPs', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPt', 'type': 'U'} {'is_defined': False, 'name': '__ZTSPv', 'type': 'U'} @@ -221,11 +260,14 @@ {'is_defined': False, 'name': '__ZTSd', 'type': 'U'} {'is_defined': False, 'name': '__ZTSe', 'type': 'U'} {'is_defined': False, 'name': '__ZTSf', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSg', 'type': 'U'} {'is_defined': False, 'name': '__ZTSh', 'type': 'U'} {'is_defined': False, 'name': '__ZTSi', 'type': 'U'} {'is_defined': False, 'name': '__ZTSj', 'type': 'U'} {'is_defined': False, 'name': '__ZTSl', 'type': 'U'} {'is_defined': False, 'name': '__ZTSm', 'type': 'U'} +{'is_defined': False, 'name': '__ZTSn', 
'type': 'U'} +{'is_defined': False, 'name': '__ZTSo', 'type': 'U'} {'is_defined': False, 'name': '__ZTSs', 'type': 'U'} {'is_defined': False, 'name': '__ZTSt', 'type': 'U'} {'is_defined': False, 'name': '__ZTSv', 'type': 'U'} @@ -233,6 +275,7 @@ {'is_defined': False, 'name': '__ZTSx', 'type': 'U'} {'is_defined': False, 'name': '__ZTSy', 'type': 'U'} {'is_defined': False, 'name': '__ZTVN10__cxxabiv116__enum_type_infoE', 'type': 'U'} +{'is_defined': False, 'name': '__ZTVN10__cxxabiv116__shim_type_infoE', 'type': 'U'} {'is_defined': False, 'name': '__ZTVN10__cxxabiv117__array_type_infoE', 'type': 'U'} {'is_defined': False, 'name': '__ZTVN10__cxxabiv117__class_type_infoE', 'type': 'U'} {'is_defined': False, 'name': '__ZTVN10__cxxabiv117__pbase_type_infoE', 'type': 'U'} @@ -301,12 +344,16 @@ {'is_defined': False, 'name': '___cxa_guard_release', 'type': 'U'} {'is_defined': False, 'name': '___cxa_increment_exception_refcount', 'type': 'U'} {'is_defined': False, 'name': '___cxa_init_primary_exception', 'type': 'U'} +{'is_defined': False, 'name': '___cxa_new_handler', 'type': 'U'} {'is_defined': False, 'name': '___cxa_pure_virtual', 'type': 'U'} {'is_defined': False, 'name': '___cxa_rethrow', 'type': 'U'} {'is_defined': False, 'name': '___cxa_rethrow_primary_exception', 'type': 'U'} +{'is_defined': False, 'name': '___cxa_terminate_handler', 'type': 'U'} {'is_defined': False, 'name': '___cxa_throw', 'type': 'U'} {'is_defined': False, 'name': '___cxa_throw_bad_array_new_length', 'type': 'U'} +{'is_defined': False, 'name': '___cxa_uncaught_exception', 'type': 'U'} {'is_defined': False, 'name': '___cxa_uncaught_exceptions', 'type': 'U'} +{'is_defined': False, 'name': '___cxa_unexpected_handler', 'type': 'U'} {'is_defined': False, 'name': '___cxa_vec_cctor', 'type': 'U'} {'is_defined': False, 'name': '___cxa_vec_cleanup', 'type': 'U'} {'is_defined': False, 'name': '___cxa_vec_ctor', 'type': 'U'} @@ -1945,9 +1992,22 @@ {'is_defined': True, 'name': 
'__ZTCNSt3__19strstreamE0_NS_13basic_istreamIcNS_11char_traitsIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTCNSt3__19strstreamE0_NS_14basic_iostreamIcNS_11char_traitsIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTCNSt3__19strstreamE16_NS_13basic_ostreamIcNS_11char_traitsIcEEEE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTIDh', 'type': 'I'} {'is_defined': True, 'name': '__ZTIDi', 'type': 'I'} {'is_defined': True, 'name': '__ZTIDn', 'type': 'I'} {'is_defined': True, 'name': '__ZTIDs', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIDu', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv116__enum_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv116__shim_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv117__array_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv117__class_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv117__pbase_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv119__pointer_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv120__function_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv120__si_class_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv121__vmi_class_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv123__fundamental_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIN10__cxxabiv129__pointer_to_member_type_infoE', 'type': 'I'} {'is_defined': True, 'name': '__ZTINSt12experimental15fundamentals_v112bad_any_castE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt12experimental19bad_optional_accessE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__110__time_getE', 'size': 0, 'type': 'OBJECT'} @@ -2078,23 +2138,30 @@ {'is_defined': True, 'name': 
'__ZTINSt3__19money_putIwNS_19ostreambuf_iteratorIwNS_11char_traitsIwEEEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__19strstreamE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__19time_baseE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTIPDh', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPDi', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPDn', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPDs', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPDu', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPKDh', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKDi', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKDn', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKDs', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPKDu', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKa', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKb', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKc', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKd', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKe', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKf', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPKg', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKh', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKi', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKj', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKl', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKm', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPKn', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPKo', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKs', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKt', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPKv', 'type': 'I'} @@ -2107,11 +2174,14 @@ {'is_defined': True, 'name': '__ZTIPd', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPe', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPf', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPg', 'type': 'I'} {'is_defined': 
True, 'name': '__ZTIPh', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPi', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPj', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPl', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPm', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPn', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIPo', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPs', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPt', 'type': 'I'} {'is_defined': True, 'name': '__ZTIPv', 'type': 'I'} @@ -2144,21 +2214,27 @@ {'is_defined': True, 'name': '__ZTId', 'type': 'I'} {'is_defined': True, 'name': '__ZTIe', 'type': 'I'} {'is_defined': True, 'name': '__ZTIf', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIg', 'type': 'I'} {'is_defined': True, 'name': '__ZTIh', 'type': 'I'} {'is_defined': True, 'name': '__ZTIi', 'type': 'I'} {'is_defined': True, 'name': '__ZTIj', 'type': 'I'} {'is_defined': True, 'name': '__ZTIl', 'type': 'I'} {'is_defined': True, 'name': '__ZTIm', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIn', 'type': 'I'} +{'is_defined': True, 'name': '__ZTIo', 'type': 'I'} {'is_defined': True, 'name': '__ZTIs', 'type': 'I'} {'is_defined': True, 'name': '__ZTIt', 'type': 'I'} {'is_defined': True, 'name': '__ZTIv', 'type': 'I'} {'is_defined': True, 'name': '__ZTIw', 'type': 'I'} {'is_defined': True, 'name': '__ZTIx', 'type': 'I'} {'is_defined': True, 'name': '__ZTIy', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSDh', 'type': 'I'} {'is_defined': True, 'name': '__ZTSDi', 'type': 'I'} {'is_defined': True, 'name': '__ZTSDn', 'type': 'I'} {'is_defined': True, 'name': '__ZTSDs', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSDu', 'type': 'I'} {'is_defined': True, 'name': '__ZTSN10__cxxabiv116__enum_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSN10__cxxabiv116__shim_type_infoE', 'type': 'I'} {'is_defined': True, 'name': '__ZTSN10__cxxabiv117__array_type_infoE', 'type': 'I'} {'is_defined': True, 'name': 
'__ZTSN10__cxxabiv117__class_type_infoE', 'type': 'I'} {'is_defined': True, 'name': '__ZTSN10__cxxabiv117__pbase_type_infoE', 'type': 'I'} @@ -2261,23 +2337,30 @@ {'is_defined': True, 'name': '__ZTSNSt3__19money_putIwNS_19ostreambuf_iteratorIwNS_11char_traitsIwEEEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__19strstreamE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__19time_baseE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTSPDh', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPDi', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPDn', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPDs', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPDu', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPKDh', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKDi', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKDn', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKDs', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPKDu', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKa', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKb', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKc', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKd', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKe', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKf', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPKg', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKh', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKi', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKj', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKl', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKm', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPKn', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPKo', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKs', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKt', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPKv', 'type': 'I'} @@ -2290,11 +2373,14 @@ {'is_defined': True, 'name': 
'__ZTSPd', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPe', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPf', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPg', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPh', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPi', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPj', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPl', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPm', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPn', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSPo', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPs', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPt', 'type': 'I'} {'is_defined': True, 'name': '__ZTSPv', 'type': 'I'} @@ -2327,11 +2413,14 @@ {'is_defined': True, 'name': '__ZTSd', 'type': 'I'} {'is_defined': True, 'name': '__ZTSe', 'type': 'I'} {'is_defined': True, 'name': '__ZTSf', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSg', 'type': 'I'} {'is_defined': True, 'name': '__ZTSh', 'type': 'I'} {'is_defined': True, 'name': '__ZTSi', 'type': 'I'} {'is_defined': True, 'name': '__ZTSj', 'type': 'I'} {'is_defined': True, 'name': '__ZTSl', 'type': 'I'} {'is_defined': True, 'name': '__ZTSm', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSn', 'type': 'I'} +{'is_defined': True, 'name': '__ZTSo', 'type': 'I'} {'is_defined': True, 'name': '__ZTSs', 'type': 'I'} {'is_defined': True, 'name': '__ZTSt', 'type': 'I'} {'is_defined': True, 'name': '__ZTSv', 'type': 'I'} @@ -2352,6 +2441,7 @@ {'is_defined': True, 'name': '__ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTTNSt3__19strstreamE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVN10__cxxabiv116__enum_type_infoE', 'type': 'I'} +{'is_defined': True, 'name': '__ZTVN10__cxxabiv116__shim_type_infoE', 'type': 'I'} {'is_defined': True, 'name': '__ZTVN10__cxxabiv117__array_type_infoE', 'type': 'I'} {'is_defined': True, 'name': 
'__ZTVN10__cxxabiv117__class_type_infoE', 'type': 'I'} {'is_defined': True, 'name': '__ZTVN10__cxxabiv117__pbase_type_infoE', 'type': 'I'} @@ -2535,6 +2625,8 @@ {'is_defined': True, 'name': '___cxa_begin_catch', 'type': 'I'} {'is_defined': True, 'name': '___cxa_call_unexpected', 'type': 'I'} {'is_defined': True, 'name': '___cxa_current_exception_type', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_current_primary_exception', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_decrement_exception_refcount', 'type': 'I'} {'is_defined': True, 'name': '___cxa_deleted_virtual', 'type': 'I'} {'is_defined': True, 'name': '___cxa_demangle', 'type': 'I'} {'is_defined': True, 'name': '___cxa_end_catch', 'type': 'I'} @@ -2546,12 +2638,18 @@ {'is_defined': True, 'name': '___cxa_guard_abort', 'type': 'I'} {'is_defined': True, 'name': '___cxa_guard_acquire', 'type': 'I'} {'is_defined': True, 'name': '___cxa_guard_release', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_increment_exception_refcount', 'type': 'I'} {'is_defined': True, 'name': '___cxa_init_primary_exception', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_new_handler', 'type': 'I'} {'is_defined': True, 'name': '___cxa_pure_virtual', 'type': 'I'} {'is_defined': True, 'name': '___cxa_rethrow', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_rethrow_primary_exception', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_terminate_handler', 'type': 'I'} {'is_defined': True, 'name': '___cxa_throw', 'type': 'I'} {'is_defined': True, 'name': '___cxa_throw_bad_array_new_length', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_uncaught_exception', 'type': 'I'} {'is_defined': True, 'name': '___cxa_uncaught_exceptions', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_unexpected_handler', 'type': 'I'} {'is_defined': True, 'name': '___cxa_vec_cctor', 'type': 'I'} {'is_defined': True, 'name': '___cxa_vec_cleanup', 'type': 'I'} {'is_defined': True, 'name': '___cxa_vec_ctor', 'type': 'I'} diff --git 
a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt index cc6954a7bac37..07ffc8bfdaae3 100644 --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -231,19 +231,14 @@ if (LIBCXX_ENABLE_SHARED) # In particular, we don't re-export the symbols if libc++abi is merged statically # into libc++ because in that case there's no dylib to re-export from. if (APPLE AND LIBCXX_CXX_ABI MATCHES "libcxxabi$" - AND NOT DEFINED LIBCXX_OSX_REEXPORT_LIBCXXABI_SYMBOLS AND NOT LIBCXX_STATICALLY_LINK_ABI_IN_SHARED_LIBRARY) - set(LIBCXX_OSX_REEXPORT_LIBCXXABI_SYMBOLS ON) - endif() + target_link_libraries(cxx_shared PRIVATE cxxabi-reexports) - if (LIBCXX_OSX_REEXPORT_LIBCXXABI_SYMBOLS) + # TODO: These exports controls should not be tied to whether we re-export libc++abi symbols target_link_libraries(cxx_shared PRIVATE "-Wl,-unexported_symbols_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/libc++unexp.exp" - "-Wl,-reexported_symbols_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/libc++abi.exp" "-Wl,-force_symbols_not_weak_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/notweak.exp" "-Wl,-force_symbols_weak_list,${CMAKE_CURRENT_SOURCE_DIR}/../lib/weak.exp") - - target_link_libraries(cxx_shared PRIVATE $) endif() # Generate a linker script in place of a libc++.so symlink. 
diff --git a/libcxxabi/lib/cxxabiv1.exp b/libcxxabi/lib/cxxabiv1.exp new file mode 100644 index 0000000000000..b1bab45ef3347 --- /dev/null +++ b/libcxxabi/lib/cxxabiv1.exp @@ -0,0 +1,38 @@ +# Typeinfos for types from libc++abi +__ZTIN10__cxxabiv116__enum_type_infoE +__ZTIN10__cxxabiv116__shim_type_infoE +__ZTIN10__cxxabiv117__array_type_infoE +__ZTIN10__cxxabiv117__class_type_infoE +__ZTIN10__cxxabiv117__pbase_type_infoE +__ZTIN10__cxxabiv119__pointer_type_infoE +__ZTIN10__cxxabiv120__function_type_infoE +__ZTIN10__cxxabiv120__si_class_type_infoE +__ZTIN10__cxxabiv121__vmi_class_type_infoE +__ZTIN10__cxxabiv123__fundamental_type_infoE +__ZTIN10__cxxabiv129__pointer_to_member_type_infoE + +# Typeinfo names for types from libc++abi +__ZTSN10__cxxabiv116__enum_type_infoE +__ZTSN10__cxxabiv116__shim_type_infoE +__ZTSN10__cxxabiv117__array_type_infoE +__ZTSN10__cxxabiv117__class_type_infoE +__ZTSN10__cxxabiv117__pbase_type_infoE +__ZTSN10__cxxabiv119__pointer_type_infoE +__ZTSN10__cxxabiv120__function_type_infoE +__ZTSN10__cxxabiv120__si_class_type_infoE +__ZTSN10__cxxabiv121__vmi_class_type_infoE +__ZTSN10__cxxabiv123__fundamental_type_infoE +__ZTSN10__cxxabiv129__pointer_to_member_type_infoE + +# Vtables for libc++abi types +__ZTVN10__cxxabiv116__enum_type_infoE +__ZTVN10__cxxabiv116__shim_type_infoE +__ZTVN10__cxxabiv117__array_type_infoE +__ZTVN10__cxxabiv117__class_type_infoE +__ZTVN10__cxxabiv117__pbase_type_infoE +__ZTVN10__cxxabiv119__pointer_type_infoE +__ZTVN10__cxxabiv120__function_type_infoE +__ZTVN10__cxxabiv120__si_class_type_infoE +__ZTVN10__cxxabiv121__vmi_class_type_infoE +__ZTVN10__cxxabiv123__fundamental_type_infoE +__ZTVN10__cxxabiv129__pointer_to_member_type_infoE diff --git a/libcxxabi/lib/fundamental-types.exp b/libcxxabi/lib/fundamental-types.exp new file mode 100644 index 0000000000000..fe21022328fe4 --- /dev/null +++ b/libcxxabi/lib/fundamental-types.exp @@ -0,0 +1,153 @@ +# Typeinfos for fundamental types +__ZTIa +__ZTIb +__ZTIc +__ZTId 
+__ZTIDh +__ZTIDi +__ZTIDn +__ZTIDs +__ZTIDu +__ZTIe +__ZTIf +__ZTIg +__ZTIh +__ZTIi +__ZTIj +__ZTIl +__ZTIm +__ZTIn +__ZTIo +__ZTIPa +__ZTIPb +__ZTIPc +__ZTIPd +__ZTIPDh +__ZTIPDi +__ZTIPDn +__ZTIPDs +__ZTIPDu +__ZTIPe +__ZTIPf +__ZTIPg +__ZTIPh +__ZTIPi +__ZTIPj +__ZTIPKa +__ZTIPKb +__ZTIPKc +__ZTIPKd +__ZTIPKDh +__ZTIPKDi +__ZTIPKDn +__ZTIPKDs +__ZTIPKDu +__ZTIPKe +__ZTIPKf +__ZTIPKg +__ZTIPKh +__ZTIPKi +__ZTIPKj +__ZTIPKl +__ZTIPKm +__ZTIPKn +__ZTIPKo +__ZTIPKs +__ZTIPKt +__ZTIPKv +__ZTIPKw +__ZTIPKx +__ZTIPKy +__ZTIPl +__ZTIPm +__ZTIPn +__ZTIPo +__ZTIPs +__ZTIPt +__ZTIPv +__ZTIPw +__ZTIPx +__ZTIPy +__ZTIs +__ZTIt +__ZTIv +__ZTIw +__ZTIx +__ZTIy + +# Typeinfo names for fundamental types +__ZTSa +__ZTSb +__ZTSc +__ZTSd +__ZTSDh +__ZTSDi +__ZTSDn +__ZTSDs +__ZTSDu +__ZTSe +__ZTSf +__ZTSg +__ZTSh +__ZTSi +__ZTSj +__ZTSl +__ZTSm +__ZTSn +__ZTSo +__ZTSPa +__ZTSPb +__ZTSPc +__ZTSPd +__ZTSPDh +__ZTSPDi +__ZTSPDn +__ZTSPDs +__ZTSPDu +__ZTSPe +__ZTSPf +__ZTSPg +__ZTSPh +__ZTSPi +__ZTSPj +__ZTSPKa +__ZTSPKb +__ZTSPKc +__ZTSPKd +__ZTSPKDh +__ZTSPKDi +__ZTSPKDn +__ZTSPKDs +__ZTSPKDu +__ZTSPKe +__ZTSPKf +__ZTSPKg +__ZTSPKh +__ZTSPKi +__ZTSPKj +__ZTSPKl +__ZTSPKm +__ZTSPKn +__ZTSPKo +__ZTSPKs +__ZTSPKt +__ZTSPKv +__ZTSPKw +__ZTSPKx +__ZTSPKy +__ZTSPl +__ZTSPm +__ZTSPn +__ZTSPo +__ZTSPs +__ZTSPt +__ZTSPv +__ZTSPw +__ZTSPx +__ZTSPy +__ZTSs +__ZTSt +__ZTSv +__ZTSw +__ZTSx +__ZTSy diff --git a/libcxxabi/lib/itanium-base.exp b/libcxxabi/lib/itanium-base.exp index 1b14cb8fba175..002e062df423e 100644 --- a/libcxxabi/lib/itanium-base.exp +++ b/libcxxabi/lib/itanium-base.exp @@ -1,250 +1,3 @@ -# Typeinfos for fundamental types -__ZTIa -__ZTIb -__ZTIc -__ZTId -__ZTIDh -__ZTIDi -__ZTIDn -__ZTIDs -__ZTIDu -__ZTIe -__ZTIf -__ZTIg -__ZTIh -__ZTIi -__ZTIj -__ZTIl -__ZTIm -__ZTIn -__ZTIo -__ZTIPa -__ZTIPb -__ZTIPc -__ZTIPd -__ZTIPDh -__ZTIPDi -__ZTIPDn -__ZTIPDs -__ZTIPDu -__ZTIPe -__ZTIPf -__ZTIPg -__ZTIPh -__ZTIPi -__ZTIPj -__ZTIPKa -__ZTIPKb -__ZTIPKc -__ZTIPKd -__ZTIPKDh -__ZTIPKDi 
-__ZTIPKDn -__ZTIPKDs -__ZTIPKDu -__ZTIPKe -__ZTIPKf -__ZTIPKg -__ZTIPKh -__ZTIPKi -__ZTIPKj -__ZTIPKl -__ZTIPKm -__ZTIPKn -__ZTIPKo -__ZTIPKs -__ZTIPKt -__ZTIPKv -__ZTIPKw -__ZTIPKx -__ZTIPKy -__ZTIPl -__ZTIPm -__ZTIPn -__ZTIPo -__ZTIPs -__ZTIPt -__ZTIPv -__ZTIPw -__ZTIPx -__ZTIPy -__ZTIs -__ZTIt -__ZTIv -__ZTIw -__ZTIx -__ZTIy - -# Typeinfo names for fundamental types -__ZTSa -__ZTSb -__ZTSc -__ZTSd -__ZTSDh -__ZTSDi -__ZTSDn -__ZTSDs -__ZTSDu -__ZTSe -__ZTSf -__ZTSg -__ZTSh -__ZTSi -__ZTSj -__ZTSl -__ZTSm -__ZTSn -__ZTSo -__ZTSPa -__ZTSPb -__ZTSPc -__ZTSPd -__ZTSPDh -__ZTSPDi -__ZTSPDn -__ZTSPDs -__ZTSPDu -__ZTSPe -__ZTSPf -__ZTSPg -__ZTSPh -__ZTSPi -__ZTSPj -__ZTSPKa -__ZTSPKb -__ZTSPKc -__ZTSPKd -__ZTSPKDh -__ZTSPKDi -__ZTSPKDn -__ZTSPKDs -__ZTSPKDu -__ZTSPKe -__ZTSPKf -__ZTSPKg -__ZTSPKh -__ZTSPKi -__ZTSPKj -__ZTSPKl -__ZTSPKm -__ZTSPKn -__ZTSPKo -__ZTSPKs -__ZTSPKt -__ZTSPKv -__ZTSPKw -__ZTSPKx -__ZTSPKy -__ZTSPl -__ZTSPm -__ZTSPn -__ZTSPo -__ZTSPs -__ZTSPt -__ZTSPv -__ZTSPw -__ZTSPx -__ZTSPy -__ZTSs -__ZTSt -__ZTSv -__ZTSw -__ZTSx -__ZTSy - -# Typeinfos for types from libc++abi -__ZTIN10__cxxabiv116__enum_type_infoE -__ZTIN10__cxxabiv116__shim_type_infoE -__ZTIN10__cxxabiv117__array_type_infoE -__ZTIN10__cxxabiv117__class_type_infoE -__ZTIN10__cxxabiv117__pbase_type_infoE -__ZTIN10__cxxabiv119__pointer_type_infoE -__ZTIN10__cxxabiv120__function_type_infoE -__ZTIN10__cxxabiv120__si_class_type_infoE -__ZTIN10__cxxabiv121__vmi_class_type_infoE -__ZTIN10__cxxabiv123__fundamental_type_infoE -__ZTIN10__cxxabiv129__pointer_to_member_type_infoE - -# Typeinfo names for types from libc++abi -__ZTSN10__cxxabiv116__enum_type_infoE -__ZTSN10__cxxabiv116__shim_type_infoE -__ZTSN10__cxxabiv117__array_type_infoE -__ZTSN10__cxxabiv117__class_type_infoE -__ZTSN10__cxxabiv117__pbase_type_infoE -__ZTSN10__cxxabiv119__pointer_type_infoE -__ZTSN10__cxxabiv120__function_type_infoE -__ZTSN10__cxxabiv120__si_class_type_infoE -__ZTSN10__cxxabiv121__vmi_class_type_infoE 
-__ZTSN10__cxxabiv123__fundamental_type_infoE -__ZTSN10__cxxabiv129__pointer_to_member_type_infoE - -# Typeinfos for std:: exception types -__ZTISt10bad_typeid -__ZTISt11logic_error -__ZTISt11range_error -__ZTISt12domain_error -__ZTISt12length_error -__ZTISt12out_of_range -__ZTISt13bad_exception -__ZTISt13runtime_error -__ZTISt14overflow_error -__ZTISt15underflow_error -__ZTISt16invalid_argument -__ZTISt20bad_array_new_length -__ZTISt8bad_cast -__ZTISt9bad_alloc -__ZTISt9exception -__ZTISt9type_info - -# Typeinfo names for std:: exception types -__ZTSSt10bad_typeid -__ZTSSt11logic_error -__ZTSSt11range_error -__ZTSSt12domain_error -__ZTSSt12length_error -__ZTSSt12out_of_range -__ZTSSt13bad_exception -__ZTSSt13runtime_error -__ZTSSt14overflow_error -__ZTSSt15underflow_error -__ZTSSt16invalid_argument -__ZTSSt20bad_array_new_length -__ZTSSt8bad_cast -__ZTSSt9bad_alloc -__ZTSSt9exception -__ZTSSt9type_info - -# Vtables for libc++abi types -__ZTVN10__cxxabiv116__enum_type_infoE -__ZTVN10__cxxabiv116__shim_type_infoE -__ZTVN10__cxxabiv117__array_type_infoE -__ZTVN10__cxxabiv117__class_type_infoE -__ZTVN10__cxxabiv117__pbase_type_infoE -__ZTVN10__cxxabiv119__pointer_type_infoE -__ZTVN10__cxxabiv120__function_type_infoE -__ZTVN10__cxxabiv120__si_class_type_infoE -__ZTVN10__cxxabiv121__vmi_class_type_infoE -__ZTVN10__cxxabiv123__fundamental_type_infoE -__ZTVN10__cxxabiv129__pointer_to_member_type_infoE - -# Vtables for std:: exception types -__ZTVSt10bad_typeid -__ZTVSt11logic_error -__ZTVSt11range_error -__ZTVSt12domain_error -__ZTVSt12length_error -__ZTVSt12out_of_range -__ZTVSt13bad_exception -__ZTVSt13runtime_error -__ZTVSt14overflow_error -__ZTVSt15underflow_error -__ZTVSt16invalid_argument -__ZTVSt20bad_array_new_length -__ZTVSt8bad_cast -__ZTVSt9bad_alloc -__ZTVSt9exception -__ZTVSt9type_info - # Itanium C++ ABI requirements (minus most exception support) ___cxa_bad_cast ___cxa_bad_typeid @@ -277,81 +30,3 @@ ___dynamic_cast ___cxa_terminate_handler 
___cxa_unexpected_handler ___cxa_new_handler - -# ::what() functions for std:: exception types -__ZNKSt10bad_typeid4whatEv -__ZNKSt11logic_error4whatEv -__ZNKSt13bad_exception4whatEv -__ZNKSt13runtime_error4whatEv -__ZNKSt20bad_array_new_length4whatEv -__ZNKSt8bad_cast4whatEv -__ZNKSt9bad_alloc4whatEv -__ZNKSt9exception4whatEv - -# Default constructors and destructors for std:: exception types -__ZNSt10bad_typeidC1Ev -__ZNSt10bad_typeidC2Ev -__ZNSt10bad_typeidD0Ev -__ZNSt10bad_typeidD1Ev -__ZNSt10bad_typeidD2Ev -__ZNSt11logic_errorD0Ev -__ZNSt11logic_errorD1Ev -__ZNSt11logic_errorD2Ev -__ZNSt11range_errorD0Ev -__ZNSt11range_errorD1Ev -__ZNSt11range_errorD2Ev -__ZNSt12domain_errorD0Ev -__ZNSt12domain_errorD1Ev -__ZNSt12domain_errorD2Ev -__ZNSt12length_errorD0Ev -__ZNSt12length_errorD1Ev -__ZNSt12length_errorD2Ev -__ZNSt12out_of_rangeD0Ev -__ZNSt12out_of_rangeD1Ev -__ZNSt12out_of_rangeD2Ev -__ZNSt13bad_exceptionD0Ev -__ZNSt13bad_exceptionD1Ev -__ZNSt13bad_exceptionD2Ev -__ZNSt13runtime_errorD0Ev -__ZNSt13runtime_errorD1Ev -__ZNSt13runtime_errorD2Ev -__ZNSt14overflow_errorD0Ev -__ZNSt14overflow_errorD1Ev -__ZNSt14overflow_errorD2Ev -__ZNSt15underflow_errorD0Ev -__ZNSt15underflow_errorD1Ev -__ZNSt15underflow_errorD2Ev -__ZNSt16invalid_argumentD0Ev -__ZNSt16invalid_argumentD1Ev -__ZNSt16invalid_argumentD2Ev -__ZNSt20bad_array_new_lengthC1Ev -__ZNSt20bad_array_new_lengthC2Ev -__ZNSt20bad_array_new_lengthD0Ev -__ZNSt20bad_array_new_lengthD1Ev -__ZNSt20bad_array_new_lengthD2Ev -__ZNSt8bad_castC1Ev -__ZNSt8bad_castC2Ev -__ZNSt8bad_castD0Ev -__ZNSt8bad_castD1Ev -__ZNSt8bad_castD2Ev -__ZNSt9bad_allocC1Ev -__ZNSt9bad_allocC2Ev -__ZNSt9bad_allocD0Ev -__ZNSt9bad_allocD1Ev -__ZNSt9bad_allocD2Ev -__ZNSt9exceptionD0Ev -__ZNSt9exceptionD1Ev -__ZNSt9exceptionD2Ev -__ZNSt9type_infoD0Ev -__ZNSt9type_infoD1Ev -__ZNSt9type_infoD2Ev - -# Other std:: functions implemented in libc++abi -__ZSt10unexpectedv -__ZSt13get_terminatev -__ZSt13set_terminatePFvvE -__ZSt14get_unexpectedv 
-__ZSt14set_unexpectedPFvvE -__ZSt15get_new_handlerv -__ZSt15set_new_handlerPFvvE -__ZSt9terminatev diff --git a/libcxxabi/lib/exceptions.exp b/libcxxabi/lib/itanium-exceptions.exp similarity index 84% rename from libcxxabi/lib/exceptions.exp rename to libcxxabi/lib/itanium-exceptions.exp index 9dcfbdbd3598e..21b19edb92dfc 100644 --- a/libcxxabi/lib/exceptions.exp +++ b/libcxxabi/lib/itanium-exceptions.exp @@ -1,3 +1,4 @@ +# Itanium C++ ABI requirements related to exceptions ___cxa_allocate_dependent_exception ___cxa_allocate_exception ___cxa_begin_catch diff --git a/libcxxabi/lib/new-delete.exp b/libcxxabi/lib/new-delete.exp index 086d2fec24ea7..4f97de7e8bb6f 100644 --- a/libcxxabi/lib/new-delete.exp +++ b/libcxxabi/lib/new-delete.exp @@ -1,3 +1,4 @@ +# Symbols for all variants of global operator new and delete __Znwm __ZnwmRKSt9nothrow_t __ZnwmSt11align_val_t diff --git a/libcxx/lib/libc++abi.exp b/libcxxabi/lib/std-exceptions.exp similarity index 51% rename from libcxx/lib/libc++abi.exp rename to libcxxabi/lib/std-exceptions.exp index 6a3e6b9f0222e..bb4a2b2e93eab 100644 --- a/libcxx/lib/libc++abi.exp +++ b/libcxxabi/lib/std-exceptions.exp @@ -1,295 +1,121 @@ -___cxa_demangle -___cxa_get_globals -___cxa_get_globals_fast -___cxa_guard_abort -___cxa_guard_acquire -___cxa_guard_release -___cxa_pure_virtual -___cxa_deleted_virtual -___cxa_throw_bad_array_new_length -___cxa_uncaught_exceptions -___cxa_vec_cctor -___cxa_vec_cleanup -___cxa_vec_ctor -___cxa_vec_delete -___cxa_vec_delete2 -___cxa_vec_delete3 -___cxa_vec_dtor -___cxa_vec_new -___cxa_vec_new2 -___cxa_vec_new3 -___dynamic_cast -__ZTIDi -__ZTIDn -__ZTIDs -__ZTIPDi -__ZTIPDn -__ZTIPDs -__ZTIPKDi -__ZTIPKDn -__ZTIPKDs -__ZTSPm -__ZTSPl -__ZTSPj -__ZTSPi -__ZTSPh -__ZTSPf -__ZTSPe -__ZTSPd -__ZTSPc -__ZTSPb -__ZTSPa -__ZTSPKc -__ZTSPKy -__ZTSPKx -__ZTSPKw -__ZTSPKv -__ZTSPKt -__ZTSPKs -__ZTSPKm -__ZTSPKl -__ZTSPKi -__ZTSPKh -__ZTSPs -__ZTSPt -__ZTSPv -__ZTSPw -__ZTSPKa -__ZTSPx -__ZTSPy -__ZTSPKd -__ZTSPKe 
-__ZTSPKj -__ZTSPKb -__ZTSPKf -__ZTSv -__ZTSt -__ZTSs -__ZTSm -__ZTSl -__ZTSj -__ZTSi -__ZTSh -__ZTSf -__ZTSe -__ZTSd -__ZTSc -__ZTSw -__ZTSx -__ZTSy -__ZTSb -__ZTSa -__ZTIPKh -__ZTIPKf -__ZTIPKe -__ZTIPKd -__ZTIPKc -__ZTIPKb -__ZTIPKa -__ZTIPy -__ZTIPx -__ZTIPw -__ZTIPv -__ZTIPt -__ZTIPs -__ZTIPm -__ZTIPl -__ZTIPj -__ZTIPi -__ZTIPKi -__ZTIPKj -__ZTIPKl -__ZTIPKm -__ZTIPKs -__ZTIPKt -__ZTIPKv -__ZTIPKw -__ZTIPKx -__ZTIPKy -__ZTIPa -__ZTIPb -__ZTIPc -__ZTIPd -__ZTIPe -__ZTIPf -__ZTIPh -__ZTVN10__cxxabiv129__pointer_to_member_type_infoE -__ZTVN10__cxxabiv116__enum_type_infoE -__ZTVN10__cxxabiv117__array_type_infoE -__ZTVN10__cxxabiv117__class_type_infoE -__ZTVN10__cxxabiv117__pbase_type_infoE -__ZTVN10__cxxabiv119__pointer_type_infoE -__ZTVN10__cxxabiv120__function_type_infoE -__ZTVN10__cxxabiv120__si_class_type_infoE -__ZTVN10__cxxabiv121__vmi_class_type_infoE -__ZTVN10__cxxabiv123__fundamental_type_infoE -__ZTIa -__ZTIb -__ZTIc -__ZTId -__ZTIe -__ZTIf -__ZTIh -__ZTIi -__ZTIj -__ZTIl -__ZTIm -__ZTIs -__ZTIt -__ZTSN10__cxxabiv129__pointer_to_member_type_infoE -__ZTSN10__cxxabiv123__fundamental_type_infoE -__ZTSN10__cxxabiv121__vmi_class_type_infoE -__ZTSN10__cxxabiv120__si_class_type_infoE -__ZTSN10__cxxabiv120__function_type_infoE -__ZTSN10__cxxabiv119__pointer_type_infoE -__ZTSN10__cxxabiv117__pbase_type_infoE -__ZTSN10__cxxabiv117__class_type_infoE -__ZTSN10__cxxabiv117__array_type_infoE -__ZTSN10__cxxabiv116__enum_type_infoE -__ZTIy -__ZTIx -__ZTIw -__ZTIv -__ZSt13get_terminatev -__ZSt13set_terminatePFvvE -__ZSt14get_unexpectedv -__ZSt14set_unexpectedPFvvE -__ZSt15get_new_handlerv -__ZSt15set_new_handlerPFvvE -__ZSt9terminatev -__ZNSt9bad_allocD1Ev -__ZTISt9bad_alloc -__ZNSt9bad_allocC1Ev +# Typeinfos for std:: exception types +__ZTISt10bad_typeid +__ZTISt11logic_error +__ZTISt11range_error +__ZTISt12domain_error +__ZTISt12length_error +__ZTISt12out_of_range __ZTISt13bad_exception +__ZTISt13runtime_error +__ZTISt14overflow_error +__ZTISt15underflow_error 
+__ZTISt16invalid_argument +__ZTISt20bad_array_new_length +__ZTISt8bad_cast +__ZTISt9bad_alloc +__ZTISt9exception +__ZTISt9type_info + +# Typeinfo names for std:: exception types +__ZTSSt10bad_typeid +__ZTSSt11logic_error +__ZTSSt11range_error +__ZTSSt12domain_error +__ZTSSt12length_error +__ZTSSt12out_of_range +__ZTSSt13bad_exception +__ZTSSt13runtime_error +__ZTSSt14overflow_error +__ZTSSt15underflow_error +__ZTSSt16invalid_argument +__ZTSSt20bad_array_new_length +__ZTSSt8bad_cast +__ZTSSt9bad_alloc +__ZTSSt9exception +__ZTSSt9type_info + +# Vtables for std:: exception types __ZTVSt10bad_typeid -__ZTVSt9exception -__ZNSt10bad_typeidC1Ev -__ZNSt10bad_typeidC1Ev -__ZNKSt10bad_typeid4whatEv -__ZNSt10bad_typeidD1Ev +__ZTVSt11logic_error +__ZTVSt11range_error +__ZTVSt12domain_error +__ZTVSt12length_error +__ZTVSt12out_of_range +__ZTVSt13bad_exception +__ZTVSt13runtime_error +__ZTVSt14overflow_error +__ZTVSt15underflow_error +__ZTVSt16invalid_argument +__ZTVSt20bad_array_new_length __ZTVSt8bad_cast -__ZNSt8bad_castC1Ev -__ZNSt8bad_castC2Ev -__ZNSt8bad_castD0Ev -__ZNKSt8bad_cast4whatEv -__ZNSt8bad_castD1Ev -__ZNSt8bad_castD2Ev __ZTVSt9bad_alloc -__ZTVSt20bad_array_new_length -__ZTVSt13bad_exception -__ZNKSt9exception4whatEv -__ZNKSt9bad_alloc4whatEv -__ZNSt9bad_allocC2Ev -__ZNSt9bad_allocD0Ev -__ZNSt9bad_allocD2Ev -__ZNSt9exceptionD0Ev -__ZNSt20bad_array_new_lengthC1Ev +__ZTVSt9exception +__ZTVSt9type_info + +# ::what() functions for std:: exception types +__ZNKSt10bad_typeid4whatEv +__ZNKSt11logic_error4whatEv __ZNKSt13bad_exception4whatEv -__ZNSt9exceptionD1Ev +__ZNKSt13runtime_error4whatEv __ZNKSt20bad_array_new_length4whatEv -__ZNSt13bad_exceptionD1Ev -__ZNSt20bad_array_new_lengthD1Ev -__ZNSt9exceptionD2Ev -__ZNSt9type_infoD0Ev -__ZNSt9type_infoD1Ev -__ZNSt9type_infoD2Ev +__ZNKSt8bad_cast4whatEv +__ZNKSt9bad_alloc4whatEv +__ZNKSt9exception4whatEv + +# Default constructors and destructors for std:: exception types +__ZNSt10bad_typeidC1Ev __ZNSt10bad_typeidC2Ev 
__ZNSt10bad_typeidD0Ev +__ZNSt10bad_typeidD1Ev __ZNSt10bad_typeidD2Ev -__ZNSt13bad_exceptionD0Ev -__ZNSt13bad_exceptionD2Ev -__ZNSt20bad_array_new_lengthC2Ev -__ZNSt20bad_array_new_lengthD0Ev -__ZNSt20bad_array_new_lengthD2Ev -__ZSt10unexpectedv -__ZTISt10bad_typeid -__ZTISt8bad_cast -___cxa_bad_typeid -___cxa_bad_cast -__ZTISt9exception -__ZTISt9type_info -__ZTISt20bad_array_new_length - -__ZNKSt11logic_error4whatEv __ZNSt11logic_errorD0Ev __ZNSt11logic_errorD1Ev __ZNSt11logic_errorD2Ev -__ZTISt11logic_error -__ZTSSt11logic_error -__ZTVSt11logic_error - -__ZNKSt13runtime_error4whatEv -__ZNSt13runtime_errorD0Ev -__ZNSt13runtime_errorD1Ev -__ZNSt13runtime_errorD2Ev -__ZTISt13runtime_error -__ZTSSt13runtime_error -__ZTVSt13runtime_error - __ZNSt11range_errorD0Ev __ZNSt11range_errorD1Ev __ZNSt11range_errorD2Ev -__ZTISt11range_error -__ZTSSt11range_error -__ZTVSt11range_error - __ZNSt12domain_errorD0Ev __ZNSt12domain_errorD1Ev __ZNSt12domain_errorD2Ev -__ZTISt12domain_error -__ZTSSt12domain_error -__ZTVSt12domain_error - __ZNSt12length_errorD0Ev __ZNSt12length_errorD1Ev __ZNSt12length_errorD2Ev -__ZTISt12length_error -__ZTSSt12length_error -__ZTVSt12length_error - __ZNSt12out_of_rangeD0Ev __ZNSt12out_of_rangeD1Ev __ZNSt12out_of_rangeD2Ev -__ZTISt12out_of_range -__ZTSSt12out_of_range -__ZTVSt12out_of_range - +__ZNSt13bad_exceptionD0Ev +__ZNSt13bad_exceptionD1Ev +__ZNSt13bad_exceptionD2Ev +__ZNSt13runtime_errorD0Ev +__ZNSt13runtime_errorD1Ev +__ZNSt13runtime_errorD2Ev __ZNSt14overflow_errorD0Ev __ZNSt14overflow_errorD1Ev __ZNSt14overflow_errorD2Ev -__ZTISt14overflow_error -__ZTSSt14overflow_error -__ZTVSt14overflow_error - __ZNSt15underflow_errorD0Ev __ZNSt15underflow_errorD1Ev __ZNSt15underflow_errorD2Ev -__ZTISt15underflow_error -__ZTSSt15underflow_error -__ZTVSt15underflow_error - __ZNSt16invalid_argumentD0Ev __ZNSt16invalid_argumentD1Ev __ZNSt16invalid_argumentD2Ev -__ZTISt16invalid_argument -__ZTSSt16invalid_argument -__ZTVSt16invalid_argument - -__ZTSDi -__ZTSDn 
-__ZTSDs -__ZTSPDi -__ZTSPDn -__ZTSPDs -__ZTSPKDi -__ZTSPKDn -__ZTSPKDs - -__ZTSSt8bad_cast -__ZTSSt9bad_alloc -__ZTSSt9exception -__ZTSSt9type_info -__ZTSSt10bad_typeid -__ZTSSt13bad_exception -__ZTSSt20bad_array_new_length -__ZTVSt9type_info +__ZNSt20bad_array_new_lengthC1Ev +__ZNSt20bad_array_new_lengthC2Ev +__ZNSt20bad_array_new_lengthD0Ev +__ZNSt20bad_array_new_lengthD1Ev +__ZNSt20bad_array_new_lengthD2Ev +__ZNSt8bad_castC1Ev +__ZNSt8bad_castC2Ev +__ZNSt8bad_castD0Ev +__ZNSt8bad_castD1Ev +__ZNSt8bad_castD2Ev +__ZNSt9bad_allocC1Ev +__ZNSt9bad_allocC2Ev +__ZNSt9bad_allocD0Ev +__ZNSt9bad_allocD1Ev +__ZNSt9bad_allocD2Ev +__ZNSt9exceptionD0Ev +__ZNSt9exceptionD1Ev +__ZNSt9exceptionD2Ev +__ZNSt9type_infoD0Ev +__ZNSt9type_infoD1Ev +__ZNSt9type_infoD2Ev diff --git a/libcxxabi/lib/std-misc.exp b/libcxxabi/lib/std-misc.exp new file mode 100644 index 0000000000000..0ce6a29443984 --- /dev/null +++ b/libcxxabi/lib/std-misc.exp @@ -0,0 +1,9 @@ +# Other std:: functions implemented in libc++abi +__ZSt10unexpectedv +__ZSt13get_terminatev +__ZSt13set_terminatePFvvE +__ZSt14get_unexpectedv +__ZSt14set_unexpectedPFvvE +__ZSt15get_new_handlerv +__ZSt15set_new_handlerPFvvE +__ZSt9terminatev diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt index 4198827203fc8..0f17ea9184c82 100644 --- a/libcxxabi/src/CMakeLists.txt +++ b/libcxxabi/src/CMakeLists.txt @@ -213,31 +213,31 @@ if (LIBCXXABI_ENABLE_SHARED) endif() add_library(cxxabi-reexports INTERFACE) - - # -exported_symbols_list is only available on Apple platforms - if (APPLE) - function(export_symbols file) + function(reexport_symbols file) + # -exported_symbols_list is only available on Apple platforms + if (APPLE) target_link_libraries(cxxabi_shared PRIVATE "-Wl,-exported_symbols_list,${file}") - endfunction() - function(reexport_symbols file) - export_symbols("${file}") target_link_libraries(cxxabi-reexports INTERFACE "-Wl,-reexported_symbols_list,${file}") - endfunction() + endif() + endfunction() - 
export_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/itanium-base.exp") + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/cxxabiv1.exp") + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/fundamental-types.exp") + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/itanium-base.exp") + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/std-misc.exp") - if (LIBCXXABI_ENABLE_NEW_DELETE_DEFINITIONS) - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/new-delete.exp") - endif() + if (LIBCXXABI_ENABLE_NEW_DELETE_DEFINITIONS) + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/new-delete.exp") + endif() - if (LIBCXXABI_ENABLE_EXCEPTIONS) - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/exceptions.exp") + if (LIBCXXABI_ENABLE_EXCEPTIONS) + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/itanium-exceptions.exp") + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/std-exceptions.exp") - if ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "^(armv6|armv7|armv7s)$") - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-sjlj.exp") - else() - reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-v0.exp") - endif() + if ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "^(armv6|armv7|armv7s)$") + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-sjlj.exp") + else() + reexport_symbols("${CMAKE_CURRENT_SOURCE_DIR}/../lib/personality-v0.exp") endif() endif() endif() diff --git a/libcxxabi/test/uncaught_exception.pass.cpp b/libcxxabi/test/uncaught_exception.pass.cpp index eb80fcc14f459..9087059aeba54 100644 --- a/libcxxabi/test/uncaught_exception.pass.cpp +++ b/libcxxabi/test/uncaught_exception.pass.cpp @@ -11,6 +11,13 @@ // This tests that libc++abi still provides __cxa_uncaught_exception() for // ABI compatibility, even though the Standard doesn't require it to. +// __cxa_uncaught_exception was not re-exported from libc++ previously. 
This leads +// to undefined symbols when linking against a libc++ that re-exports the symbols, +// but running against a libc++ that doesn't. Fortunately, usage of __cxa_uncaught_exception() +// in the wild seems to be close to non-existent. +// XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx10.{{9|10|11|12|13|14|15}} +// XFAIL: stdlib=apple-libc++ && target={{.+}}-apple-macosx{{(11|12|13|14)([.][0-9]+)?}} + #include #include From 435e75db80e1ffd0f9752534d4544eba5e0610df Mon Sep 17 00:00:00 2001 From: Nilanjana Basu Date: Mon, 26 Feb 2024 12:19:21 -0800 Subject: [PATCH 367/546] [Tests][LoopDistribute] Re-adding target triple in a test (#82954) This test triple was removed earlier to fix build errors. To preserve the original test intention the triple is re-added with an explicit target requirement. --- llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll index 2b9f777e9f3b2..97ea2c6708dad 100644 --- a/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll +++ b/llvm/test/Transforms/LoopDistribute/basic-with-memchecks.ll @@ -1,3 +1,4 @@ +; REQUIRES: x86-registered-target ; RUN: opt -aa-pipeline=basic-aa -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S \ ; RUN: < %s | FileCheck %s @@ -18,6 +19,7 @@ ; } target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.10.0" @B = common global ptr null, align 8 @A = common global ptr null, align 8 @@ -77,6 +79,7 @@ entry: ; CHECK: for.end: +; VECTORIZE: mul <4 x i32> ; VECTORIZE: mul <4 x i32> ; VECTORIZE-NOT: mul <4 x i32> From 113052b2b022c4ce45c8003057ae4297d48ed843 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Thu, 9 Nov 2023 12:03:06 -0800 Subject: [PATCH 368/546] [AMDGPU] Prefer lower total register usage in regions with spilling Change-Id: 
Ia5c434b0945bdcbc357c5e06c3164118fc91df25 --- .../Target/AMDGPU/GCNIterativeScheduler.cpp | 12 +- llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 96 +++- llvm/lib/Target/AMDGPU/GCNRegPressure.h | 16 +- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 6 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 6 + .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4 +- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 4 + llvm/test/CodeGen/AMDGPU/bf16.ll | 436 +++++++++--------- .../CodeGen/AMDGPU/spill-regpressure-less.mir | 353 ++++++++++++++ 9 files changed, 707 insertions(+), 226 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index cdc9de7f65e3e..aebfe154b3139 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -409,9 +409,8 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule, // Sort recorded regions by pressure - highest at the front void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) { - const auto &ST = MF.getSubtarget(); - llvm::sort(Regions, [&ST, TargetOcc](const Region *R1, const Region *R2) { - return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc); + llvm::sort(Regions, [this, TargetOcc](const Region *R1, const Region *R2) { + return R2->MaxPressure.less(MF, R1->MaxPressure, TargetOcc); }); } @@ -517,26 +516,25 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( // Minimal Register Strategy void GCNIterativeScheduler::scheduleMinReg(bool force) { - const auto &ST = MF.getSubtarget(); const SIMachineFunctionInfo *MFI = MF.getInfo(); const auto TgtOcc = MFI->getOccupancy(); sortRegionsByPressure(TgtOcc); auto MaxPressure = Regions.front()->MaxPressure; for (auto *R : Regions) { - if (!force && R->MaxPressure.less(ST, MaxPressure, TgtOcc)) + if (!force && R->MaxPressure.less(MF, MaxPressure, TgtOcc)) break; BuildDAG 
DAG(*R, *this); const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this); const auto RP = getSchedulePressure(*R, MinSchedule); - LLVM_DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) { + LLVM_DEBUG(if (R->MaxPressure.less(MF, RP, TgtOcc)) { dbgs() << "\nWarning: Pressure becomes worse after minreg!"; printSchedRP(dbgs(), R->MaxPressure, RP); }); - if (!force && MaxPressure.less(ST, RP, TgtOcc)) + if (!force && MaxPressure.less(MF, RP, TgtOcc)) break; scheduleRegion(*R, MinSchedule, RP); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index fd8f0bebd3bec..5c394e6d6296d 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -88,9 +88,10 @@ void GCNRegPressure::inc(unsigned Reg, } } -bool GCNRegPressure::less(const GCNSubtarget &ST, - const GCNRegPressure& O, +bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, unsigned MaxOccupancy) const { + const GCNSubtarget &ST = MF.getSubtarget(); + const auto SGPROcc = std::min(MaxOccupancy, ST.getOccupancyWithNumSGPRs(getSGPRNum())); const auto VGPROcc = @@ -104,18 +105,103 @@ bool GCNRegPressure::less(const GCNSubtarget &ST, const auto Occ = std::min(SGPROcc, VGPROcc); const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc); + + // Give first precedence to the better occupancy. 
if (Occ != OtherOcc) return Occ > OtherOcc; + unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); + unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); + + // SGPR excess pressure conditions + unsigned ExcessSGPR = std::max(static_cast(getSGPRNum() - MaxSGPRs), 0); + unsigned OtherExcessSGPR = + std::max(static_cast(O.getSGPRNum() - MaxSGPRs), 0); + + auto WaveSize = ST.getWavefrontSize(); + // The number of virtual VGPRs required to handle excess SGPR + unsigned VGPRForSGPRSpills = (ExcessSGPR + (WaveSize - 1)) / WaveSize; + unsigned OtherVGPRForSGPRSpills = + (OtherExcessSGPR + (WaveSize - 1)) / WaveSize; + + unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs(); + + // Unified excess pressure conditions, accounting for VGPRs used for SGPR + // spills + unsigned ExcessVGPR = + std::max(static_cast(getVGPRNum(ST.hasGFX90AInsts()) + + VGPRForSGPRSpills - MaxVGPRs), + 0); + unsigned OtherExcessVGPR = + std::max(static_cast(O.getVGPRNum(ST.hasGFX90AInsts()) + + OtherVGPRForSGPRSpills - MaxVGPRs), + 0); + // Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR + // spills + unsigned ExcessArchVGPR = std::max( + static_cast(getVGPRNum(false) + VGPRForSGPRSpills - MaxArchVGPRs), + 0); + unsigned OtherExcessArchVGPR = + std::max(static_cast(O.getVGPRNum(false) + OtherVGPRForSGPRSpills - + MaxArchVGPRs), + 0); + // AGPR excess pressure conditions + unsigned ExcessAGPR = std::max( + static_cast(ST.hasGFX90AInsts() ? (getAGPRNum() - MaxArchVGPRs) + : (getAGPRNum() - MaxVGPRs)), + 0); + unsigned OtherExcessAGPR = std::max( + static_cast(ST.hasGFX90AInsts() ? (O.getAGPRNum() - MaxArchVGPRs) + : (O.getAGPRNum() - MaxVGPRs)), + 0); + + bool ExcessRP = ExcessSGPR || ExcessVGPR || ExcessArchVGPR || ExcessAGPR; + bool OtherExcessRP = OtherExcessSGPR || OtherExcessVGPR || + OtherExcessArchVGPR || OtherExcessAGPR; + + // Give second precedence to the reduced number of spills to hold the register + // pressure. 
+ if (ExcessRP || OtherExcessRP) { + // The difference in excess VGPR pressure, after including VGPRs used for + // SGPR spills + int VGPRDiff = ((OtherExcessVGPR + OtherExcessArchVGPR + OtherExcessAGPR) - + (ExcessVGPR + ExcessArchVGPR + ExcessAGPR)); + + int SGPRDiff = OtherExcessSGPR - ExcessSGPR; + + if (VGPRDiff != 0) + return VGPRDiff > 0; + if (SGPRDiff != 0) { + unsigned PureExcessVGPR = + std::max(static_cast(getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs), + 0) + + std::max(static_cast(getVGPRNum(false) - MaxArchVGPRs), 0); + unsigned OtherPureExcessVGPR = + std::max( + static_cast(O.getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs), + 0) + + std::max(static_cast(O.getVGPRNum(false) - MaxArchVGPRs), 0); + + // If we have a special case where there is a tie in excess VGPR, but one + // of the pressures has VGPR usage from SGPR spills, prefer the pressure + // with SGPR spills. + if (PureExcessVGPR != OtherPureExcessVGPR) + return SGPRDiff < 0; + // If both pressures have the same excess pressure before and after + // accounting for SGPR spills, prefer fewer SGPR spills. + return SGPRDiff > 0; + } + } + bool SGPRImportant = SGPROcc < VGPROcc; const bool OtherSGPRImportant = OtherSGPROcc < OtherVGPROcc; - // if both pressures disagree on what is more important compare vgprs + // If both pressures disagree on what is more important compare vgprs. if (SGPRImportant != OtherSGPRImportant) { SGPRImportant = false; } - // compare large regs pressure + // Give third precedence to lower register tuple pressure. bool SGPRFirst = SGPRImportant; for (int I = 2; I > 0; --I, SGPRFirst = !SGPRFirst) { if (SGPRFirst) { @@ -130,6 +216,8 @@ bool GCNRegPressure::less(const GCNSubtarget &ST, return VW < OtherVW; } } + + // Give final precedence to lower general RP. return SGPRImportant ? 
(getSGPRNum() < O.getSGPRNum()): (getVGPRNum(ST.hasGFX90AInsts()) < O.getVGPRNum(ST.hasGFX90AInsts())); diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 4100970fe1a96..752f53752fa68 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -74,8 +74,20 @@ struct GCNRegPressure { return getOccupancy(ST) > O.getOccupancy(ST); } - bool less(const GCNSubtarget &ST, const GCNRegPressure& O, - unsigned MaxOccupancy = std::numeric_limits::max()) const; + /// Compares \p this GCNRegpressure to \p O, returning true if \p this is + /// less. Since GCNRegpressure contains different types of pressures, and due + /// to target-specific pecularities (e.g. we care about occupancy rather than + /// raw register usage), we determine if \p this GCNRegPressure is less than + /// \p O based on the following tiered comparisons (in order order of + /// precedence): + /// 1. Better occupancy + /// 2. Less spilling (first preference to VGPR spills, then to SGPR spills) + /// 3. Less tuple register pressure (first preference to VGPR tuples if we + /// determine that SGPR pressure is not important) + /// 4. 
Less raw register pressure (first preference to VGPR tuples if we + /// determine that SGPR pressure is not important) + bool less(const MachineFunction &MF, const GCNRegPressure &O, + unsigned MaxOccupancy = std::numeric_limits::max()) const; bool operator==(const GCNRegPressure &O) const { return std::equal(&Value[0], &Value[TOTAL_KINDS], O.Value); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 3f3550d3029aa..f993ec8409c99 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -977,6 +977,7 @@ void GCNSchedStage::checkScheduling() { unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); + if (PressureAfter.getVGPRNum(false) > MaxVGPRs || PressureAfter.getAGPRNum() > MaxVGPRs || PressureAfter.getSGPRNum() > MaxSGPRs) { @@ -1199,9 +1200,8 @@ bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { } bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { - if (WavesAfter <= MFI.getMinWavesPerEU() && - !PressureAfter.less(ST, PressureBefore) && - isRegionWithExcessRP()) { + if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() && + !PressureAfter.less(MF, PressureBefore)) { LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n"); return true; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 46fceb8ae22c1..a933c16b6ed51 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1382,6 +1382,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return AMDGPU::IsaInfo::getTotalNumVGPRs(this); } + /// \returns Addressable number of architectural VGPRs supported by the + /// subtarget. + unsigned getAddressableNumArchVGPRs() const { + return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this); + } + /// \returns Addressable number of VGPRs supported by the subtarget. 
unsigned getAddressableNumVGPRs() const { return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index ce91e05e5cc81..177d99a0ac0ab 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1107,10 +1107,12 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) { return IsWave32 ? 1024 : 512; } +unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; } + unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) { if (STI->getFeatureBits().test(FeatureGFX90AInsts)) return 512; - return 256; + return getAddressableNumArchVGPRs(STI); } unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 6826cd2731950..9a6d0834679ea 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -295,6 +295,10 @@ unsigned getVGPREncodingGranule( /// \returns Total number of VGPRs for given subtarget \p STI. unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI); +/// \returns Addressable number of architectural VGPRs for a given subtarget \p +/// STI. +unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI); + /// \returns Addressable number of VGPRs for given subtarget \p STI. 
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI); diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 8ec7dfd93cd09..c773742a45950 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -37615,266 +37615,283 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GCN-LABEL: v_vselect_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 
s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 -; GCN-NEXT: v_and_b32_e32 v5, 1, v5 +; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_and_b32_e32 v36, 1, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:176 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:180 ; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 ; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:184 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188 ; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 ; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 +; GCN-NEXT: v_and_b32_e32 v53, 1, v26 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 +; 
GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:112 +; GCN-NEXT: v_and_b32_e32 v27, 1, v27 +; GCN-NEXT: v_and_b32_e32 v28, 1, v28 ; GCN-NEXT: v_and_b32_e32 v29, 1, v29 ; GCN-NEXT: v_and_b32_e32 v30, 1, v30 -; GCN-NEXT: v_and_b32_e32 v48, 1, v28 -; GCN-NEXT: v_and_b32_e32 v50, 1, v27 -; GCN-NEXT: v_and_b32_e32 v52, 1, v26 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:220 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:224 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:232 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:240 -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:248 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:252 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:128 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:252 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:248 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:244 +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:240 ; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v38 +; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v36 -; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: s_waitcnt vmcnt(5) ; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v43 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v56 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v44 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v30 ; GCN-NEXT: v_cndmask_b32_e64 v30, v37, v36, s[4:5] -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116 -; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:244 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:236 +; GCN-NEXT: s_waitcnt expcnt(5) +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:232 +; GCN-NEXT: s_waitcnt expcnt(4) ; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:228 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:224 ; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:220 ; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:212 +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:216 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:68 -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v46 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v47 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v44 -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v45 +; 
GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:212 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 +; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42 +; GCN-NEXT: s_waitcnt vmcnt(10) +; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v45 +; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 +; GCN-NEXT: s_waitcnt vmcnt(9) +; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v46 ; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 +; GCN-NEXT: s_waitcnt vmcnt(8) +; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v47 +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v29 -; GCN-NEXT: v_cndmask_b32_e64 v29, v46, v38, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v48 -; GCN-NEXT: v_cndmask_b32_e64 v36, v37, v36, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v50 -; GCN-NEXT: v_cndmask_b32_e64 v37, v45, v44, s[4:5] -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v52 -; GCN-NEXT: v_cndmask_b32_e64 v38, v40, v55, s[4:5] -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:136 -; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:140 -; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:144 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:152 -; GCN-NEXT: v_and_b32_e32 v9, 1, v9 -; GCN-NEXT: v_and_b32_e32 v17, 1, v17 -; GCN-NEXT: v_and_b32_e32 v21, 1, v21 -; GCN-NEXT: v_and_b32_e32 v25, 1, v25 -; GCN-NEXT: v_and_b32_e32 v24, 1, v24 -; GCN-NEXT: v_and_b32_e32 v23, 1, v23 +; GCN-NEXT: v_cndmask_b32_e64 v29, v43, v42, s[4:5] +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v28 +; GCN-NEXT: v_cndmask_b32_e64 v28, v44, v41, s[4:5] +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v27 +; GCN-NEXT: v_cndmask_b32_e64 v27, v45, v55, 
s[4:5] +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v53 +; GCN-NEXT: v_cndmask_b32_e64 v36, v36, v54, s[4:5] +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 +; GCN-NEXT: v_and_b32_e32 v3, 1, v3 +; GCN-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-NEXT: v_and_b32_e32 v5, 1, v5 +; GCN-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-NEXT: v_and_b32_e32 v18, 1, v18 ; GCN-NEXT: v_and_b32_e32 v22, 1, v22 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 +; GCN-NEXT: v_and_b32_e32 v23, 1, v23 +; GCN-NEXT: v_and_b32_e32 v24, 1, v24 +; GCN-NEXT: v_and_b32_e32 v25, 1, v25 +; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v56 +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; GCN-NEXT: s_waitcnt vmcnt(13) +; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v57 +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 ; GCN-NEXT: s_waitcnt vmcnt(12) -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v56 -; GCN-NEXT: s_waitcnt vmcnt(11) -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v57 +; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v58 ; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; GCN-NEXT: s_waitcnt vmcnt(11) +; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v59 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v25 -; GCN-NEXT: v_cndmask_b32_e64 v25, v54, v53, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v25, v46, v52, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v24 -; GCN-NEXT: v_cndmask_b32_e64 v24, v56, v47, s[4:5] +; GCN-NEXT: 
v_cndmask_b32_e64 v24, v47, v51, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v23 -; GCN-NEXT: v_cndmask_b32_e64 v23, v51, v49, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v23, v56, v50, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v22 -; GCN-NEXT: v_cndmask_b32_e64 v22, v39, v28, s[4:5] -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:72 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:200 -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:76 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:204 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:208 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:88 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:216 -; GCN-NEXT: v_and_b32_e32 v20, 1, v20 +; GCN-NEXT: v_cndmask_b32_e64 v22, v57, v49, s[4:5] +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:196 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:200 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:204 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:208 ; GCN-NEXT: v_and_b32_e32 v19, 1, v19 -; GCN-NEXT: v_and_b32_e32 v18, 1, v18 -; GCN-NEXT: v_and_b32_e32 v16, 1, v16 -; GCN-NEXT: v_and_b32_e32 v15, 1, v15 -; GCN-NEXT: v_and_b32_e32 v14, 1, v14 -; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v20, 1, v20 +; GCN-NEXT: v_and_b32_e32 v21, 1, v21 +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v60 +; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v61 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46 +; GCN-NEXT: s_waitcnt 
vmcnt(2) ; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47 -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v58 -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v59 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v21 -; GCN-NEXT: v_cndmask_b32_e64 v21, v56, v47, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v21, v58, v48, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20 -; GCN-NEXT: v_cndmask_b32_e64 v20, v58, v57, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v20, v59, v39, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v19 -; GCN-NEXT: v_cndmask_b32_e64 v19, v54, v53, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v19, v57, v56, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18 -; GCN-NEXT: v_cndmask_b32_e64 v18, v51, v49, s[4:5] -; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:180 -; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:164 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:148 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v60 -; GCN-NEXT: s_waitcnt vmcnt(7) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; GCN-NEXT: v_cndmask_b32_e64 v18, v47, v46, s[4:5] +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:148 +; GCN-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:152 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:156 +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 +; GCN-NEXT: v_and_b32_e32 v7, 1, v7 +; GCN-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN-NEXT: v_and_b32_e32 v9, 1, v9 +; GCN-NEXT: v_and_b32_e32 v10, 1, v10 +; GCN-NEXT: v_and_b32_e32 v14, 1, v14 +; GCN-NEXT: v_and_b32_e32 v15, 1, v15 +; GCN-NEXT: v_and_b32_e32 v16, 1, v16 +; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v17 -; GCN-NEXT: v_cndmask_b32_e64 v17, v39, v28, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v17, v52, v51, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; GCN-NEXT: v_cndmask_b32_e64 v16, v49, v59, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v16, v50, v49, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v15 ; GCN-NEXT: v_cndmask_b32_e64 v15, v35, v34, s[4:5] ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14 ; GCN-NEXT: v_cndmask_b32_e64 v14, v33, v32, s[4:5] -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:156 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:160 -; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:168 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164 
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:168 ; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 -; GCN-NEXT: v_and_b32_e32 v12, 1, v12 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:172 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; GCN-NEXT: v_and_b32_e32 v11, 1, v11 -; GCN-NEXT: v_and_b32_e32 v10, 1, v10 -; GCN-NEXT: v_and_b32_e32 v8, 1, v8 -; GCN-NEXT: v_and_b32_e32 v7, 1, v7 -; GCN-NEXT: v_and_b32_e32 v6, 1, v6 -; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_and_b32_e32 v3, 1, v3 -; GCN-NEXT: v_and_b32_e32 v2, 1, v2 -; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v41, v42, v41, vcc -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 -; GCN-NEXT: v_and_b32_e32 v43, 1, v43 -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 +; GCN-NEXT: v_and_b32_e32 v12, 1, v12 +; GCN-NEXT: v_cndmask_b32_e32 v38, v38, v40, vcc +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:256 +; GCN-NEXT: v_and_b32_e32 v26, 1, v26 +; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 +; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 +; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 +; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42 +; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 ; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v44 ; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v45 +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 ; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46 -; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: s_waitcnt vmcnt(13) +; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47 +; GCN-NEXT: s_waitcnt vmcnt(12) +; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56 +; GCN-NEXT: s_waitcnt vmcnt(11) +; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57 +; GCN-NEXT: s_waitcnt vmcnt(10) +; GCN-NEXT: v_mul_f32_e32 
v58, 1.0, v58 +; GCN-NEXT: s_waitcnt vmcnt(9) +; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v59 +; GCN-NEXT: s_waitcnt vmcnt(8) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; GCN-NEXT: s_waitcnt vmcnt(5) ; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 ; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v59 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 -; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57 ; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v58 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42 +; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 -; GCN-NEXT: v_cndmask_b32_e32 v12, v53, v51, vcc +; GCN-NEXT: v_cndmask_b32_e32 v12, v31, v13, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 -; GCN-NEXT: v_cndmask_b32_e32 v11, v31, v13, vcc +; 
GCN-NEXT: v_cndmask_b32_e32 v11, v52, v51, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GCN-NEXT: v_cndmask_b32_e32 v10, v59, v49, vcc +; GCN-NEXT: v_cndmask_b32_e32 v10, v50, v49, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 -; GCN-NEXT: v_cndmask_b32_e32 v9, v39, v35, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, v35, v34, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GCN-NEXT: v_cndmask_b32_e32 v8, v47, v54, vcc +; GCN-NEXT: v_cndmask_b32_e32 v8, v33, v32, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GCN-NEXT: v_cndmask_b32_e32 v7, v34, v33, vcc +; GCN-NEXT: v_cndmask_b32_e32 v7, v59, v58, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GCN-NEXT: v_cndmask_b32_e32 v6, v32, v28, vcc +; GCN-NEXT: v_cndmask_b32_e32 v6, v57, v56, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GCN-NEXT: v_cndmask_b32_e32 v5, v46, v45, vcc +; GCN-NEXT: v_cndmask_b32_e32 v5, v47, v46, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GCN-NEXT: v_cndmask_b32_e32 v4, v57, v56, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v48, v39, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GCN-NEXT: v_cndmask_b32_e32 v3, v55, v52, vcc +; GCN-NEXT: v_cndmask_b32_e32 v3, v45, v44, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v50, v48, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v43, v42, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v44, v40, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v41, v55, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v42, v58, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v43 -; GCN-NEXT: v_cndmask_b32_e32 v31, v27, v26, vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v54, v53, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26 +; GCN-NEXT: v_cndmask_b32_e32 v31, v40, v37, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -37888,7 +37905,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; 
GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v41 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 @@ -37901,25 +37918,26 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v38 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v37 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v36 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:300 ; 4-byte 
Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir b/llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir new file mode 100644 index 0000000000000..f50688240fe8b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/spill-regpressure-less.mir @@ -0,0 +1,353 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s + +--- | + define amdgpu_kernel void 
@spill_regpressure_less() #0 { + ret void + } + + attributes #0 = { "amdgpu-waves-per-eu"="8,8" } +... + +--- +name: spill_regpressure_less +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + occupancy: 8 +body: | + bb.0: + ; GCN-LABEL: name: spill_regpressure_less + ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF10:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF11:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF14:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF15:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF16:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF17:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF18:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF19:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF20:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF21:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF22:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF23:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF24:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF25:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF26:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF27:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF28:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF29:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF30:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: 
[[DEF31:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF32:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF33:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF34:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF35:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF36:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF37:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF38:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF39:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF40:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF41:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF42:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF43:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF44:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF45:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF46:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF47:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF48:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF49:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF50:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF51:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF52:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF53:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF54:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF55:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF56:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF57:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF58:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF59:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF60:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF61:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF62:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF63:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF64:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF65:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF66:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: INLINEASM &"", 1 /* 
sideeffect attdialect */, implicit [[DEF]], implicit [[DEF1]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]], implicit [[DEF9]], implicit [[DEF10]], implicit [[DEF11]], implicit [[DEF12]], implicit [[DEF13]], implicit [[DEF14]], implicit [[DEF15]], implicit [[DEF16]], implicit [[DEF17]], implicit [[DEF18]], implicit [[DEF19]], implicit [[DEF20]], implicit [[DEF21]], implicit [[DEF22]], implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]], implicit [[DEF28]], implicit [[DEF29]], implicit [[DEF30]], implicit [[DEF31]], implicit [[DEF32]], implicit [[DEF33]], implicit [[DEF34]], implicit [[DEF35]], implicit [[DEF36]], implicit [[DEF37]], implicit [[DEF38]], implicit [[DEF39]], implicit [[DEF40]], implicit [[DEF41]], implicit [[DEF42]], implicit [[DEF43]], implicit [[DEF44]], implicit [[DEF45]], implicit [[DEF46]], implicit [[DEF47]], implicit [[DEF48]], implicit [[DEF49]], implicit [[DEF50]], implicit [[DEF51]], implicit [[DEF52]], implicit [[DEF53]], implicit [[DEF54]], implicit [[DEF55]], implicit [[DEF56]], implicit [[DEF57]], implicit [[DEF58]], implicit [[DEF59]], implicit [[DEF60]], implicit [[DEF61]], implicit [[DEF62]], implicit [[DEF63]], implicit [[DEF64]], implicit [[DEF65]], implicit [[DEF66]] + ; GCN-NEXT: KILL [[DEF]] + ; GCN-NEXT: KILL [[DEF1]] + ; GCN-NEXT: KILL [[DEF10]] + ; GCN-NEXT: KILL [[DEF12]] + ; GCN-NEXT: KILL [[DEF13]] + ; GCN-NEXT: KILL [[DEF14]] + ; GCN-NEXT: KILL [[DEF15]] + ; GCN-NEXT: KILL [[DEF16]] + ; GCN-NEXT: [[DEF67:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: KILL [[DEF17]] + ; GCN-NEXT: [[DEF68:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF69:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF69]], implicit [[DEF23]], implicit [[DEF24]], implicit [[DEF25]], implicit [[DEF26]], implicit [[DEF27]], implicit [[DEF28]] + ; GCN-NEXT: KILL 
[[DEF2]] + ; GCN-NEXT: KILL [[DEF3]] + ; GCN-NEXT: KILL [[DEF4]] + ; GCN-NEXT: KILL [[DEF5]] + ; GCN-NEXT: KILL [[DEF6]] + ; GCN-NEXT: KILL [[DEF7]] + ; GCN-NEXT: KILL [[DEF8]] + ; GCN-NEXT: KILL [[DEF9]] + ; GCN-NEXT: KILL [[DEF18]] + ; GCN-NEXT: KILL [[DEF19]] + ; GCN-NEXT: [[DEF70:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF + ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, implicit [[DEF70]], implicit [[DEF2]], implicit [[DEF3]], implicit [[DEF4]], implicit [[DEF5]], implicit [[DEF6]], implicit [[DEF7]], implicit [[DEF8]], implicit [[DEF9]] + ; GCN-NEXT: KILL [[DEF69]], implicit-def %70, implicit-def %71, implicit-def %72, implicit-def %73, implicit-def %74, implicit-def %75, implicit-def %76, implicit-def %77 + ; GCN-NEXT: [[DEF71:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF72:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: KILL [[DEF20]] + ; GCN-NEXT: [[DEF73:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: KILL [[DEF11]] + ; GCN-NEXT: [[DEF74:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: KILL [[DEF21]] + ; GCN-NEXT: [[DEF75:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: KILL [[DEF22]] + ; GCN-NEXT: [[DEF76:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: KILL [[DEF23]] + ; GCN-NEXT: KILL [[DEF24]] + ; GCN-NEXT: KILL [[DEF25]] + ; GCN-NEXT: KILL [[DEF26]] + ; GCN-NEXT: KILL [[DEF27]] + ; GCN-NEXT: KILL [[DEF28]] + ; GCN-NEXT: KILL [[DEF29]] + ; GCN-NEXT: KILL [[DEF30]] + ; GCN-NEXT: KILL [[DEF31]] + ; GCN-NEXT: KILL [[DEF32]] + ; GCN-NEXT: KILL [[DEF33]] + ; GCN-NEXT: KILL [[DEF34]] + ; GCN-NEXT: KILL [[DEF35]] + ; GCN-NEXT: KILL [[DEF36]] + ; GCN-NEXT: KILL [[DEF37]] + ; GCN-NEXT: KILL [[DEF38]] + ; GCN-NEXT: KILL [[DEF39]] + ; GCN-NEXT: KILL [[DEF40]] + ; GCN-NEXT: KILL [[DEF41]] + ; GCN-NEXT: KILL [[DEF42]] + ; GCN-NEXT: KILL [[DEF43]] + ; GCN-NEXT: KILL [[DEF44]] + ; GCN-NEXT: KILL [[DEF45]] + ; GCN-NEXT: KILL [[DEF46]] + ; GCN-NEXT: KILL [[DEF47]] + ; GCN-NEXT: KILL [[DEF48]] + ; GCN-NEXT: KILL [[DEF49]] + ; GCN-NEXT: KILL [[DEF50]] + ; GCN-NEXT: KILL 
[[DEF51]] + ; GCN-NEXT: KILL [[DEF52]] + ; GCN-NEXT: KILL [[DEF53]] + ; GCN-NEXT: KILL [[DEF54]] + ; GCN-NEXT: KILL [[DEF55]] + ; GCN-NEXT: KILL [[DEF56]] + ; GCN-NEXT: KILL [[DEF57]] + ; GCN-NEXT: KILL [[DEF58]] + ; GCN-NEXT: KILL [[DEF59]] + ; GCN-NEXT: KILL [[DEF60]] + ; GCN-NEXT: KILL [[DEF61]] + ; GCN-NEXT: KILL [[DEF62]] + ; GCN-NEXT: KILL [[DEF63]] + ; GCN-NEXT: KILL [[DEF64]] + ; GCN-NEXT: KILL [[DEF65]] + ; GCN-NEXT: KILL [[DEF66]] + ; GCN-NEXT: KILL [[DEF67]] + ; GCN-NEXT: KILL [[DEF68]] + ; GCN-NEXT: KILL [[DEF71]] + ; GCN-NEXT: KILL [[DEF72]] + ; GCN-NEXT: KILL [[DEF73]] + ; GCN-NEXT: KILL [[DEF74]] + ; GCN-NEXT: KILL [[DEF75]] + ; GCN-NEXT: KILL [[DEF76]] + ; GCN-NEXT: KILL [[DEF70]] + ; GCN-NEXT: KILL %70 + ; GCN-NEXT: KILL %71 + ; GCN-NEXT: KILL %72 + ; GCN-NEXT: KILL %73 + ; GCN-NEXT: KILL %74 + ; GCN-NEXT: KILL %75 + ; GCN-NEXT: KILL %76 + ; GCN-NEXT: KILL %77 + %0:vgpr_32 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = IMPLICIT_DEF + %4:vgpr_32 = IMPLICIT_DEF + %5:vgpr_32 = IMPLICIT_DEF + %6:vgpr_32 = IMPLICIT_DEF + %7:vgpr_32 = IMPLICIT_DEF + %8:vgpr_32 = IMPLICIT_DEF + %9:vgpr_32 = IMPLICIT_DEF + %10:vgpr_32 = IMPLICIT_DEF + %11:vgpr_32 = IMPLICIT_DEF + %12:vgpr_32 = IMPLICIT_DEF + %13:vgpr_32 = IMPLICIT_DEF + %14:vgpr_32 = IMPLICIT_DEF + %15:vgpr_32 = IMPLICIT_DEF + %16:vgpr_32 = IMPLICIT_DEF + %17:vgpr_32 = IMPLICIT_DEF + %18:vgpr_32 = IMPLICIT_DEF + %19:vgpr_32 = IMPLICIT_DEF + %20:vgpr_32 = IMPLICIT_DEF + %21:vgpr_32 = IMPLICIT_DEF + %22:vgpr_32 = IMPLICIT_DEF + %23:vgpr_32 = IMPLICIT_DEF + %24:vgpr_32 = IMPLICIT_DEF + %25:vgpr_32 = IMPLICIT_DEF + %26:vgpr_32 = IMPLICIT_DEF + %27:vgpr_32 = IMPLICIT_DEF + %28:vgpr_32 = IMPLICIT_DEF + %29:vgpr_32 = IMPLICIT_DEF + %30:vgpr_32 = IMPLICIT_DEF + %31:vgpr_32 = IMPLICIT_DEF + %32:vgpr_32 = IMPLICIT_DEF + %33:vgpr_32 = IMPLICIT_DEF + %34:vgpr_32 = IMPLICIT_DEF + %35:vgpr_32 = IMPLICIT_DEF + %36:vgpr_32 = IMPLICIT_DEF + %37:vgpr_32 = IMPLICIT_DEF + %38:vgpr_32 = 
IMPLICIT_DEF + %39:vgpr_32 = IMPLICIT_DEF + %40:vgpr_32 = IMPLICIT_DEF + %41:vgpr_32 = IMPLICIT_DEF + %42:vgpr_32 = IMPLICIT_DEF + %43:vgpr_32 = IMPLICIT_DEF + %44:vgpr_32 = IMPLICIT_DEF + %45:vgpr_32 = IMPLICIT_DEF + %46:vgpr_32 = IMPLICIT_DEF + %47:vgpr_32 = IMPLICIT_DEF + %48:vgpr_32 = IMPLICIT_DEF + %49:vgpr_32 = IMPLICIT_DEF + %50:vgpr_32 = IMPLICIT_DEF + %51:vgpr_32 = IMPLICIT_DEF + %52:vgpr_32 = IMPLICIT_DEF + %53:vgpr_32 = IMPLICIT_DEF + %54:vgpr_32 = IMPLICIT_DEF + %55:vgpr_32 = IMPLICIT_DEF + %56:vgpr_32 = IMPLICIT_DEF + %57:vgpr_32 = IMPLICIT_DEF + %58:vgpr_32 = IMPLICIT_DEF + %59:vgpr_32 = IMPLICIT_DEF + %60:vgpr_32 = IMPLICIT_DEF + %61:vgpr_32 = IMPLICIT_DEF + %62:vgpr_32 = IMPLICIT_DEF + %63:vgpr_32 = IMPLICIT_DEF + %64:vgpr_32 = IMPLICIT_DEF + %65:vgpr_32 = IMPLICIT_DEF + %66:vgpr_32 = IMPLICIT_DEF + %67:vgpr_32 = IMPLICIT_DEF + %68:vgpr_32 = IMPLICIT_DEF + INLINEASM &"", 1, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11, implicit %12, implicit %13, implicit %14, implicit %15, implicit %16, implicit %17, implicit %18, implicit %19, implicit %20, implicit %21, implicit %22, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28, implicit %29, implicit %30, implicit %31, implicit %32, implicit %33, implicit %34, implicit %35, implicit %36, implicit %37, implicit %38, implicit %39, implicit %40, implicit %41, implicit %42, implicit %43, implicit %44, implicit %45, implicit %46, implicit %47, implicit %48, implicit %49, implicit %50, implicit %51, implicit %52, implicit %53, implicit %54, implicit %55, implicit %56, implicit %57, implicit %58, implicit %59, implicit %60, implicit %61, implicit %62, implicit %63, implicit %64, implicit %65, implicit %66 + %69:sgpr_128 = IMPLICIT_DEF + INLINEASM &"", 1, implicit %69, implicit %23, implicit %24, implicit %25, implicit %26, implicit %27, implicit %28 + KILL %0 + KILL %1 + 
KILL %2 + KILL %3 + KILL %4 + KILL %5 + KILL %6 + KILL %7 + KILL %8 + KILL %9 + KILL %10 + KILL %12 + KILL %13 + KILL %14 + KILL %15 + KILL %16 + KILL %17 + KILL %18 + KILL %19 + KILL %69:sgpr_128, implicit-def %77:vgpr_32, implicit-def %78:vgpr_32, implicit-def %79:vgpr_32, implicit-def %80:vgpr_32, implicit-def %81:vgpr_32, implicit-def %82:vgpr_32, implicit-def %83:vgpr_32, implicit-def %84:vgpr_32 + %70:vgpr_32 = IMPLICIT_DEF + %71:vgpr_32 = IMPLICIT_DEF + %72:vgpr_32 = IMPLICIT_DEF + %73:vgpr_32 = IMPLICIT_DEF + %74:vgpr_32 = IMPLICIT_DEF + %75:vgpr_32 = IMPLICIT_DEF + %76:sgpr_128 = IMPLICIT_DEF + INLINEASM &"", 1, implicit %76, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9 + KILL %20 + KILL %11 + KILL %21 + KILL %22 + KILL %23 + KILL %24 + KILL %25 + KILL %26 + KILL %27 + KILL %28 + KILL %29 + KILL %30 + KILL %31 + KILL %32 + KILL %33 + KILL %34 + KILL %35 + KILL %36 + KILL %37 + KILL %38 + KILL %39 + KILL %40 + KILL %41 + KILL %42 + KILL %43 + KILL %44 + KILL %45 + KILL %46 + KILL %47 + KILL %48 + KILL %49 + KILL %50 + KILL %51 + KILL %52 + KILL %53 + KILL %54 + KILL %55 + KILL %56 + KILL %57 + KILL %58 + KILL %59 + KILL %60 + KILL %61 + KILL %62 + KILL %63 + KILL %64 + KILL %65 + KILL %66 + KILL %67 + KILL %68 + KILL %70 + KILL %71 + KILL %72 + KILL %73 + KILL %74 + KILL %75 + KILL %76 + KILL %77 + KILL %78 + KILL %79 + KILL %80 + KILL %81 + KILL %82 + KILL %83 + KILL %84 +... +## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +# GCN: {{.*}} From 860b6edfa9b344fbf8c500c17158c8212ea87d1c Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Tue, 27 Feb 2024 05:08:58 +0800 Subject: [PATCH 369/546] MIPS: fix emitDirectiveCpsetup on N32 (#80534) In gas, .cpsetup may expand to one of two code sequences (one is related to `__gnu_local_gp`), depending on -mno-shared and -msym32. 
Since Clang doesn't support -mno-shared or -msym32, .cpsetup expands to one code sequence. The N32 condition incorrectly leads to the incorrect `__gnu_local_gp` code sequence. ``` 00000000 : 0: ffbc0008 sd gp,8(sp) 4: 3c1c0000 lui gp,0x0 4: R_MIPS_HI16 __gnu_local_gp 8: 279c0000 addiu gp,gp,0 8: R_MIPS_LO16 __gnu_local_gp ``` Fixes: #52785 --- .../Mips/MCTargetDesc/MipsTargetStreamer.cpp | 12 +++-- llvm/test/MC/Mips/cpsetup.s | 47 ++++++++++++++----- 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 27d7f0f261d10..adfcea7361583 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -1255,7 +1255,9 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, emitRRI(Mips::SD, GPReg, Mips::SP, RegOrOffset, SMLoc(), &STI); } - if (getABI().IsN32()) { +#if 0 + // We haven't support -mabicalls -mno-shared yet. 
+ if (-mno-shared) { MCSymbol *GPSym = MCA.getContext().getOrCreateSymbol("__gnu_local_gp"); const MipsMCExpr *HiExpr = MipsMCExpr::create( MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(GPSym, MCA.getContext()), @@ -1273,6 +1275,7 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, return; } +#endif const MipsMCExpr *HiExpr = MipsMCExpr::createGpOff( MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(&Sym, MCA.getContext()), @@ -1288,8 +1291,11 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, emitRRX(Mips::ADDiu, GPReg, GPReg, MCOperand::createExpr(LoExpr), SMLoc(), &STI); - // daddu $gp, $gp, $funcreg - emitRRR(Mips::DADDu, GPReg, GPReg, RegNo, SMLoc(), &STI); + // (d)addu $gp, $gp, $funcreg + if (getABI().IsN32()) + emitRRR(Mips::ADDu, GPReg, GPReg, RegNo, SMLoc(), &STI); + else + emitRRR(Mips::DADDu, GPReg, GPReg, RegNo, SMLoc(), &STI); } void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation, diff --git a/llvm/test/MC/Mips/cpsetup.s b/llvm/test/MC/Mips/cpsetup.s index 8e587aea3e7e6..4a027c6e796ae 100644 --- a/llvm/test/MC/Mips/cpsetup.s +++ b/llvm/test/MC/Mips/cpsetup.s @@ -4,8 +4,6 @@ # RUN: llvm-mc -triple mips-unknown-linux -target-abi o32 %s | \ # RUN: FileCheck -check-prefixes=ASM,ASM-O32 %s -# FIXME: Now we check .cpsetup expansion for `-mno-shared` case only. -# We also need to implement/check the `-mshared` case. 
# RUN: llvm-mc -triple mips64-unknown-linux -target-abi n32 -filetype=obj -o - %s | \ # RUN: llvm-objdump --no-print-imm-hex -d -r -z - | \ # RUN: FileCheck -check-prefixes=ALL,NXX,N32 %s @@ -35,11 +33,16 @@ t1: # NXX-NEXT: sd $gp, 8($sp) # NXX-NEXT: lui $gp, 0 -# N32-NEXT: R_MIPS_HI16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_HI16 # NXX-NEXT: addiu $gp, $gp, 0 -# N32-NEXT: R_MIPS_LO16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_LO16 +# N32-NEXT: addu $gp, $gp, $25 # N64-NEXT: daddu $gp, $gp, $25 # ASM-NEXT: .cpsetup $25, 8, __cerror @@ -64,11 +67,16 @@ t2: # NXX-NEXT: move $2, $gp # NXX-NEXT: lui $gp, 0 -# N32-NEXT: R_MIPS_HI16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_HI16 # NXX-NEXT: addiu $gp, $gp, 0 -# N32-NEXT: R_MIPS_LO16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_LO16 +# N32-NEXT: addu $gp, $gp, $25 # N64-NEXT: daddu $gp, $gp, $25 # ASM-NEXT: .cpsetup $25, $2, __cerror @@ -101,11 +109,16 @@ t3: # NXX-NEXT: move $2, $gp # NXX-NEXT: lui $gp, 0 -# N32-NEXT: {{^ *0+}}38: R_MIPS_HI16 __gnu_local_gp # N64-NEXT: {{^ *0+}}40: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 .text +# N32-NEXT: {{^ *0+}}40: R_MIPS_GPREL16 .text +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_HI16 # NXX-NEXT: addiu $gp, $gp, 0 -# N32-NEXT: {{^ *0+}}3c: R_MIPS_LO16 __gnu_local_gp # N64-NEXT: {{^ *0+}}44: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 .text +# N32-NEXT: {{^ *0+}}44: R_MIPS_GPREL16 .text +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_LO16 +# N32-NEXT: addu $gp, $gp, $25 # N64-NEXT: daddu $gp, $gp, $25 # NXX-NEXT: nop # NXX-NEXT: sub $3, $3, $2 @@ -158,11 
+171,16 @@ t5: # NXX-NEXT: sd $gp, 8($sp) # NXX-NEXT: lui $gp, 0 -# N32-NEXT: R_MIPS_HI16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_HI16 # NXX-NEXT: addiu $gp, $gp, 0 -# N32-NEXT: R_MIPS_LO16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_LO16 +# N32-NEXT: addu $gp, $gp, $25 # N64-NEXT: daddu $gp, $gp, $25 # ASM-NEXT: .cpsetup $25, 8, __cerror @@ -184,11 +202,16 @@ IMM_8 = 8 # NXX-NEXT: sd $gp, 8($sp) # NXX-NEXT: lui $gp, 0 -# N32-NEXT: R_MIPS_HI16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_HI16 # NXX-NEXT: addiu $gp, $gp, 0 -# N32-NEXT: R_MIPS_LO16 __gnu_local_gp # N64-NEXT: R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16 __cerror +# N32-NEXT: R_MIPS_GPREL16 __cerror +# N32-NEXT: R_MIPS_SUB +# N32-NEXT: R_MIPS_LO16 +# N32-NEXT: addu $gp, $gp, $25 # N64-NEXT: daddu $gp, $gp, $25 # ASM-NEXT: .cpsetup $25, 8, __cerror From 99335a646bc0b10066d77cec08ae8cab0162efde Mon Sep 17 00:00:00 2001 From: Michael Klemm Date: Mon, 26 Feb 2024 22:17:33 +0100 Subject: [PATCH 370/546] [flang][OpenMP] Add missing implementation for 'omp_is_initial_device' (#83056) Resolve issue #82047 --- openmp/libomptarget/DeviceRTL/src/State.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp index 40f99e07b44c1..a1e4fa2449d9a 100644 --- a/openmp/libomptarget/DeviceRTL/src/State.cpp +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -426,6 +426,8 @@ int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); } int omp_get_team_num() { return mapping::getBlockIdInKernel(); } int omp_get_initial_device(void) { return -1; } + +int omp_is_initial_device(void) { return 0; } } 
extern "C" { From acdd36e677e396909f700e5dfb519d907a6b4560 Mon Sep 17 00:00:00 2001 From: Ryan Prichard Date: Mon, 26 Feb 2024 13:23:30 -0800 Subject: [PATCH 371/546] [ItaniumDemangle] reject A-F in FP literals (#82864) The Itanium C++ ABI specifies that FP literals are encoded using a lowercase hexadecimal string. Previously, libc++abi allowed uppercase A-F characters but decoded them by subtracting 'a' from them, producing negative digit values. It is especially confusing to accept an 'E' digit because 'E' marks the end of the FP literal. --- libcxxabi/src/demangle/ItaniumDemangle.h | 2 +- libcxxabi/test/test_demangle.pass.cpp | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/libcxxabi/src/demangle/ItaniumDemangle.h b/libcxxabi/src/demangle/ItaniumDemangle.h index 04755e2be3c5d..4a0444d407ea7 100644 --- a/libcxxabi/src/demangle/ItaniumDemangle.h +++ b/libcxxabi/src/demangle/ItaniumDemangle.h @@ -5541,7 +5541,7 @@ Node *AbstractManglingParser::parseFloatingLiteral() { return nullptr; std::string_view Data(First, N); for (char C : Data) - if (!std::isxdigit(C)) + if (!(C >= '0' && C <= '9') && !(C >= 'a' && C <= 'f')) return nullptr; First += N; if (!consumeIf('E')) diff --git a/libcxxabi/test/test_demangle.pass.cpp b/libcxxabi/test/test_demangle.pass.cpp index b7e41099ebfc5..88637b84de016 100644 --- a/libcxxabi/test/test_demangle.pass.cpp +++ b/libcxxabi/test/test_demangle.pass.cpp @@ -30222,9 +30222,8 @@ struct FPLiteralCase { }}, #endif #if LDBL_FP128 - // This was found by libFuzzer+HWASan on aarch64 Android. 
- {"1\006ILeeeEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE", - {"\x6<-0x1.cecececececececececececececep+11983L>"}}, + // A 32-character FP literal of long double type + {"3FooILeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeEE", {"Foo<-0x1.eeeeeeeeeeeeeeeeeeeeeeeeeeeep+12015L>"}}, #endif }; const unsigned NF = sizeof(fp_literal_cases) / sizeof(fp_literal_cases[0]); @@ -30238,6 +30237,8 @@ const char* invalid_cases[] = "NSoERj5E=Y1[uM:ga", "Aon_PmKVPDk7?fg4XP5smMUL6;Vl<>IL8ayHpiVDDDXTY;^o9;i", "_ZNSt16allocator_traitsISaIN4llvm3sys2fs18directory_iteratorEEE9constructIS3_IS3_EEEDTcl12_S_constructfp_fp0_spcl7forwardIT0_Efp1_EEERS4_PT_DpOS7_", + "3FooILdaaaaaaaaaaAAAAaaEE", + "3FooILdaaaaaaaaaaaaaaEE", #if !LDBL_FP80 "_ZN5test01hIfEEvRAcvjplstT_Le4001a000000000000000E_c", #endif From e9cdd165d7bce695d13f10df3480f9f66fd28b21 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Mon, 26 Feb 2024 13:30:31 -0800 Subject: [PATCH 372/546] [docs] Remove the Packaging "Tips" which seems to be about pre-cmake ./configure (#82958) It was already marked outdated in 2012: 5b26461e0999 and hasn't been updated since. --- llvm/docs/Packaging.rst | 73 ----------------------------------------- 1 file changed, 73 deletions(-) delete mode 100644 llvm/docs/Packaging.rst diff --git a/llvm/docs/Packaging.rst b/llvm/docs/Packaging.rst deleted file mode 100644 index 176e5b391229b..0000000000000 --- a/llvm/docs/Packaging.rst +++ /dev/null @@ -1,73 +0,0 @@ -======================== -Advice on Packaging LLVM -======================== - -.. contents:: - :local: - -Overview -======== - -LLVM sets certain default configure options to make sure our developers don't -break things for constrained platforms. These settings are not optimal for most -desktop systems, and we hope that packagers (e.g., Redhat, Debian, MacPorts, -etc.) will tweak them. This document lists settings we suggest you tweak. 
- -LLVM's API changes with each release, so users are likely to want, for example, -both LLVM-2.6 and LLVM-2.7 installed at the same time to support apps developed -against each. - -Compile Flags -============= - -LLVM runs much more quickly when it's optimized and assertions are removed. -However, such a build is currently incompatible with users who build without -defining ``NDEBUG``, and the lack of assertions makes it hard to debug problems -in user code. We recommend allowing users to install both optimized and debug -versions of LLVM in parallel. The following configure flags are relevant: - -``--disable-assertions`` - Builds LLVM with ``NDEBUG`` defined. Changes the LLVM ABI. Also available - by setting ``DISABLE_ASSERTIONS=0|1`` in ``make``'s environment. This - defaults to enabled regardless of the optimization setting, but it slows - things down. - -``--enable-debug-symbols`` - Builds LLVM with ``-g``. Also available by setting ``DEBUG_SYMBOLS=0|1`` in - ``make``'s environment. This defaults to disabled when optimizing, so you - should turn it back on to let users debug their programs. - -``--enable-optimized`` - (For git checkouts) Builds LLVM with ``-O2`` and, by default, turns off - debug symbols. Also available by setting ``ENABLE_OPTIMIZED=0|1`` in - ``make``'s environment. This defaults to enabled when not in a - checkout. - -C++ Features -============ - -RTTI - LLVM disables RTTI by default. Add ``REQUIRES_RTTI=1`` to your environment - while running ``make`` to re-enable it. This will allow users to build with - RTTI enabled and still inherit from LLVM classes. - -Shared Library -============== - -Configure with ``--enable-shared`` to build -``libLLVM-..(so|dylib)`` and link the tools against it. This -saves lots of binary size at the cost of some startup time. - -Dependencies -============ - -``--enable-libffi`` - Depend on `libffi `_ to allow the LLVM - interpreter to call external functions. 
- -``--with-oprofile`` - - Depend on `libopagent - `_ (>=version 0.9.4) - to let the LLVM JIT tell oprofile about function addresses and line - numbers. From 15a7de697ae5ad88fd96ef7dc39ac479cc6e2eaf Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Fri, 23 Feb 2024 12:18:50 -0600 Subject: [PATCH 373/546] [SelectionDAG] Support sign tracking through `{S|U}INT_TO_FP` Just a minimal amount of easily provable tracking. Proofs: https://alive2.llvm.org/ce/z/RQYbdw Closes #82808 Alive2 to has an issue with `(sitofp i1)`, but it can be verified by hand: https://godbolt.org/z/qKr7hT7s9 --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 12 +++ .../fold-int-pow2-with-fmul-or-fdiv.ll | 10 +-- llvm/test/CodeGen/AMDGPU/bf16.ll | 80 ++++++++--------- .../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll | 85 ++----------------- .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 54 +++--------- 5 files changed, 75 insertions(+), 166 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 0ceda27d40665..e150f27240d7f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4004,6 +4004,18 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, break; } + case ISD::UINT_TO_FP: { + Known.makeNonNegative(); + break; + } + case ISD::SINT_TO_FP: { + Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + if (Known2.isNonNegative()) + Known.makeNonNegative(); + else if (Known2.isNegative()) + Known.makeNegative(); + break; + } case ISD::FP_TO_UINT_SAT: { // FP_TO_UINT_SAT produces an unsigned value that fits in the saturating VT. 
EVT VT = cast(Op.getOperand(1))->getVT(); diff --git a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll index 8f3100c82772b..ad062da5491da 100644 --- a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll @@ -525,12 +525,10 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind { define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind { ; CHECK-LABEL: fdiv_pow_shl_cnt: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #8 // =0x8 -; CHECK-NEXT: and x9, x0, #0x1f -; CHECK-NEXT: fmov s1, #-0.50000000 -; CHECK-NEXT: lsl x8, x8, x9 -; CHECK-NEXT: scvtf s0, x8 -; CHECK-NEXT: fdiv s0, s1, s0 +; CHECK-NEXT: mov w8, #-1115684864 // =0xbd800000 +; CHECK-NEXT: and w9, w0, #0x1f +; CHECK-NEXT: sub w8, w8, w9, lsl #23 +; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: ret %cnt = and i64 %cnt_in, 31 %shl = shl i64 8, %cnt diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index c773742a45950..ebb77c13c4af7 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -32379,7 +32379,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_i16_to_bf16: @@ -32387,7 +32387,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_i16_to_bf16: @@ -32455,8 +32455,8 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GCN-NEXT: 
v_and_b32_e32 v1, 0xffff, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_v2i16_to_v2bf16: @@ -32466,8 +32466,8 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_v2i16_to_v2bf16: @@ -32566,9 +32566,9 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_v3i16_to_v3bf16: @@ -32580,9 +32580,9 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_v3i16_to_v3bf16: @@ -32682,10 +32682,10 @@ define <4 x bfloat> 
@v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_v4i16_to_v4bf16: @@ -32699,10 +32699,10 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16: @@ -32857,14 +32857,14 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_i32_to_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_i32_to_bf16: @@ -32928,8 +32928,8 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_v2i32_to_v2bf16: @@ -32937,8 +32937,8 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_v2i32_to_v2bf16: @@ -33031,9 +33031,9 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_v3i32_to_v3bf16: @@ -33042,9 +33042,9 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_v3i32_to_v3bf16: @@ -33140,10 +33140,10 @@ define <4 x bfloat> 
@v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_uitofp_v4i32_to_v4bf16: @@ -33153,10 +33153,10 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16: diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll index b261b3129f3fe..7c5f6d5e33efe 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll @@ -2532,58 +2532,16 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 31, v0 -; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8 -; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; VI-NEXT: v_ffbh_i32_e32 v3, v1 -; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v3 -; VI-NEXT: v_min_u32_e32 v2, v3, v2 -; VI-NEXT: 
v_lshlrev_b64 v[0:1], v2, v[0:1] -; VI-NEXT: v_min_u32_e32 v0, 1, v0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_cvt_f32_i32_e32 v0, v0 -; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 -; VI-NEXT: v_ldexp_f32 v0, v0, v1 -; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -0.5 -; VI-NEXT: v_div_scale_f32 v2, vcc, -0.5, v0, -0.5 -; VI-NEXT: v_rcp_f32_e32 v3, v1 -; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; VI-NEXT: v_fma_f32 v3, v4, v3, v3 -; VI-NEXT: v_mul_f32_e32 v4, v2, v3 -; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 -; VI-NEXT: v_fma_f32 v4, v5, v3, v4 -; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 -; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; VI-NEXT: v_div_fixup_f32 v0, v1, v0, -0.5 +; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 0xbd800000, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: fdiv_pow_shl_cnt: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 31, v0 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8 -; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; GFX10-NEXT: v_ffbh_i32_e32 v3, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3 -; GFX10-NEXT: v_min_u32_e32 v2, v3, v2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] -; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2 -; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, -0.5 -; GFX10-NEXT: v_rcp_f32_e32 v2, v1 -; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, -0.5, v0, -0.5 -; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, -0.5 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 
v0, 0xbd800000, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: fdiv_pow_shl_cnt: @@ -2591,39 +2549,8 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 31, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8 -; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cls_i32_e32 v3, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v2, 32, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3 -; GFX11-NEXT: v_min_u32_e32 v2, v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] -; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, -0.5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v2, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, -0.5, v0, -0.5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, -0.5 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0xbd800000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %cnt = and i64 %cnt_in, 31 %shl = shl i64 8, %cnt diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 2001fddfaac40..5f326b6d6998f 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -1368,49 +1368,21 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind { define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind { ; CHECK-SSE-LABEL: fdiv_pow_shl_cnt: ; CHECK-SSE: # %bb.0: -; CHECK-SSE-NEXT: movq %rdi, %rcx -; CHECK-SSE-NEXT: andb $31, %cl -; CHECK-SSE-NEXT: movl $8, %eax -; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-SSE-NEXT: shlq %cl, %rax -; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1 -; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0] -; CHECK-SSE-NEXT: divss %xmm1, %xmm0 +; CHECK-SSE-NEXT: andl $31, %edi +; CHECK-SSE-NEXT: shll $23, %edi +; CHECK-SSE-NEXT: movl $-1115684864, %eax # imm = 0xBD800000 +; CHECK-SSE-NEXT: subl %edi, %eax +; CHECK-SSE-NEXT: movd %eax, %xmm0 ; CHECK-SSE-NEXT: retq ; -; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt: -; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: movq %rdi, %rcx -; CHECK-AVX2-NEXT: andb $31, %cl -; CHECK-AVX2-NEXT: movl $8, %eax -; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-AVX2-NEXT: shlq %cl, %rax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0] -; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: retq -; -; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt: -; CHECK-NO-FASTFMA: # %bb.0: -; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx -; 
CHECK-NO-FASTFMA-NEXT: andb $31, %cl -; CHECK-NO-FASTFMA-NEXT: movl $8, %eax -; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx -; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 -; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0] -; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NO-FASTFMA-NEXT: retq -; -; CHECK-FMA-LABEL: fdiv_pow_shl_cnt: -; CHECK-FMA: # %bb.0: -; CHECK-FMA-NEXT: andb $31, %dil -; CHECK-FMA-NEXT: movl $8, %eax -; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax -; CHECK-FMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 -; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0] -; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-FMA-NEXT: retq +; CHECK-AVX-LABEL: fdiv_pow_shl_cnt: +; CHECK-AVX: # %bb.0: +; CHECK-AVX-NEXT: andl $31, %edi +; CHECK-AVX-NEXT: shll $23, %edi +; CHECK-AVX-NEXT: movl $-1115684864, %eax # imm = 0xBD800000 +; CHECK-AVX-NEXT: subl %edi, %eax +; CHECK-AVX-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-NEXT: retq %cnt = and i64 %cnt_in, 31 %shl = shl i64 8, %cnt %conv = sitofp i64 %shl to float From 841a4168addba6931114e81d446c35114208eda2 Mon Sep 17 00:00:00 2001 From: Alexander Yermolovich <43973793+ayermolo@users.noreply.github.com> Date: Mon, 26 Feb 2024 13:43:39 -0800 Subject: [PATCH 374/546] [BOLT] Fix runtime/instrument-wrong-target.s test (#82858) Test was failing when only X86 was specified for LLVM_TARGETS_TO_BUILD. Changed so that it will now report unsupporeted. For "X86;AArch64" it still passes. 
For "X86" reports UNSUPPORTED: BOLT :: runtime/instrument-wrong-target.s (1 of 1) --- bolt/test/runtime/instrument-wrong-target.s | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bolt/test/runtime/instrument-wrong-target.s b/bolt/test/runtime/instrument-wrong-target.s index 31cae734a0bff..b25c924ffbcc0 100644 --- a/bolt/test/runtime/instrument-wrong-target.s +++ b/bolt/test/runtime/instrument-wrong-target.s @@ -1,7 +1,8 @@ # Test that BOLT errs when trying to instrument a binary with a different # architecture than the one BOLT is built for. -# REQUIRES: x86_64-linux,bolt-runtime,target=x86_64{{.*}} +# REQUIRES: x86_64-linux,bolt-runtime +# REQUIRES: target-x86_64 && aarch64-registered-target # RUN: llvm-mc -triple aarch64 -filetype=obj %s -o %t.o # RUN: ld.lld -q -pie -o %t.exe %t.o From 21d83324fbdbd91de0115d48f3b55979f57807b7 Mon Sep 17 00:00:00 2001 From: OverMighty Date: Mon, 26 Feb 2024 21:59:42 +0000 Subject: [PATCH 375/546] [clang] Implement __builtin_popcountg (#82359) Fixes #82058. --- clang/docs/LanguageExtensions.rst | 31 +++++++++++++ clang/include/clang/Basic/Builtins.td | 6 +++ .../clang/Basic/DiagnosticSemaKinds.td | 3 +- clang/lib/CodeGen/CGBuiltin.cpp | 3 +- clang/lib/Sema/SemaChecking.cpp | 22 ++++++++++ clang/test/CodeGen/builtins.c | 43 +++++++++++++++++++ clang/test/Sema/builtin-popcountg.c | 14 ++++++ 7 files changed, 120 insertions(+), 2 deletions(-) create mode 100644 clang/test/Sema/builtin-popcountg.c diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 711baf45f449a..2a177814c4df7 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -3473,6 +3473,37 @@ builtin, the mangler emits their usual pattern without any special treatment. // Computes a unique stable name for the given type. 
constexpr const char * __builtin_sycl_unique_stable_name( type-id ); +``__builtin_popcountg`` +----------------------- + +``__builtin_popcountg`` returns the number of 1 bits in the argument. The +argument can be of any integer type. + +**Syntax**: + +.. code-block:: c++ + + int __builtin_popcountg(type x) + +**Examples**: + +.. code-block:: c++ + + int x = 1; + int x_pop = __builtin_popcountg(x); + + unsigned long y = 3; + int y_pop = __builtin_popcountg(y); + + _BitInt(128) z = 7; + int z_pop = __builtin_popcountg(z); + +**Description**: + +``__builtin_popcountg`` is meant to be a type-generic alternative to the +``__builtin_popcount{,l,ll}`` builtins, with support for other integer types, +such as ``__int128`` and C23 ``_BitInt(N)``. + Multiprecision Arithmetic Builtins ---------------------------------- diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index e3432f7925ba1..3bc35c5bb38ec 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -688,6 +688,12 @@ def Popcount : Builtin, BitInt_Long_LongLongTemplate { let Prototype = "int(unsigned T)"; } +def Popcountg : Builtin { + let Spellings = ["__builtin_popcountg"]; + let Attributes = [NoThrow, Const]; + let Prototype = "int(...)"; +} + def Clrsb : Builtin, BitInt_Long_LongLongTemplate { let Spellings = ["__builtin_clrsb"]; let Attributes = [NoThrow, Const, Constexpr]; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 57784a4ba2e38..c8141fefb8edb 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11983,7 +11983,8 @@ def err_builtin_invalid_arg_type: Error < "pointer to a valid matrix element type|" "signed integer or floating point type|vector type|" "floating point type|" - "vector of integers}1 (was %2)">; + "vector of integers|" + "type of integer}1 (was %2)">; def 
err_builtin_matrix_disabled: Error< "matrix types extension is disabled. Pass -fenable-matrix to enable it">; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 54d7451a9d622..2d16e7cdc0605 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -3217,7 +3217,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__popcnt64: case Builtin::BI__builtin_popcount: case Builtin::BI__builtin_popcountl: - case Builtin::BI__builtin_popcountll: { + case Builtin::BI__builtin_popcountll: + case Builtin::BI__builtin_popcountg: { Value *ArgValue = EmitScalarExpr(E->getArg(0)); llvm::Type *ArgType = ArgValue->getType(); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 984088e345c80..7d8a2cf280c33 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2189,6 +2189,23 @@ static bool SemaBuiltinCpu(Sema &S, const TargetInfo &TI, CallExpr *TheCall, return false; } +/// Checks that __builtin_popcountg was called with a single argument, which is +/// an integer. 
+static bool SemaBuiltinPopcountg(Sema &S, CallExpr *TheCall) { + if (checkArgCount(S, TheCall, 1)) + return true; + + Expr *Arg = TheCall->getArg(0); + QualType ArgTy = Arg->getType(); + + if (!ArgTy->isIntegerType()) { + S.Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type) + << 1 << /*integer ty*/ 7 << ArgTy; + return true; + } + return false; +} + ExprResult Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, CallExpr *TheCall) { @@ -2959,7 +2976,12 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, diag::err_hip_invalid_args_builtin_mangled_name); return ExprError(); } + break; } + case Builtin::BI__builtin_popcountg: + if (SemaBuiltinPopcountg(*this, TheCall)) + return ExprError(); + break; } if (getLangOpts().HLSL && CheckHLSLBuiltinFunctionCall(BuiltinID, TheCall)) diff --git a/clang/test/CodeGen/builtins.c b/clang/test/CodeGen/builtins.c index 88282120283b8..73866116e07e7 100644 --- a/clang/test/CodeGen/builtins.c +++ b/clang/test/CodeGen/builtins.c @@ -940,4 +940,47 @@ void test_builtin_os_log_long_double(void *buf, long double ld) { // CHECK: %[[V3:.*]] = load i128, ptr %[[ARG0_ADDR]], align 16 // CHECK: store i128 %[[V3]], ptr %[[ARGDATA]], align 1 +// CHECK-LABEL: define{{.*}} void @test_builtin_popcountg +void test_builtin_popcountg(unsigned char uc, unsigned short us, + unsigned int ui, unsigned long ul, + unsigned long long ull, unsigned __int128 ui128, + unsigned _BitInt(128) ubi128) { + volatile int pop; + pop = __builtin_popcountg(uc); + // CHECK: %1 = load i8, ptr %uc.addr, align 1 + // CHECK-NEXT: %conv = zext i8 %1 to i32 + // CHECK-NEXT: %2 = call i32 @llvm.ctpop.i32(i32 %conv) + // CHECK-NEXT: store volatile i32 %2, ptr %pop, align 4 + pop = __builtin_popcountg(us); + // CHECK-NEXT: %3 = load i16, ptr %us.addr, align 2 + // CHECK-NEXT: %conv1 = zext i16 %3 to i32 + // CHECK-NEXT: %4 = call i32 @llvm.ctpop.i32(i32 %conv1) + // CHECK-NEXT: store volatile i32 %4, ptr %pop, align 4 + pop = 
__builtin_popcountg(ui); + // CHECK-NEXT: %5 = load i32, ptr %ui.addr, align 4 + // CHECK-NEXT: %6 = call i32 @llvm.ctpop.i32(i32 %5) + // CHECK-NEXT: store volatile i32 %6, ptr %pop, align 4 + pop = __builtin_popcountg(ul); + // CHECK-NEXT: %7 = load i64, ptr %ul.addr, align 8 + // CHECK-NEXT: %8 = call i64 @llvm.ctpop.i64(i64 %7) + // CHECK-NEXT: %cast = trunc i64 %8 to i32 + // CHECK-NEXT: store volatile i32 %cast, ptr %pop, align 4 + pop = __builtin_popcountg(ull); + // CHECK-NEXT: %9 = load i64, ptr %ull.addr, align 8 + // CHECK-NEXT: %10 = call i64 @llvm.ctpop.i64(i64 %9) + // CHECK-NEXT: %cast2 = trunc i64 %10 to i32 + // CHECK-NEXT: store volatile i32 %cast2, ptr %pop, align 4 + pop = __builtin_popcountg(ui128); + // CHECK-NEXT: %11 = load i128, ptr %ui128.addr, align 16 + // CHECK-NEXT: %12 = call i128 @llvm.ctpop.i128(i128 %11) + // CHECK-NEXT: %cast3 = trunc i128 %12 to i32 + // CHECK-NEXT: store volatile i32 %cast3, ptr %pop, align 4 + pop = __builtin_popcountg(ubi128); + // CHECK-NEXT: %13 = load i128, ptr %ubi128.addr, align 8 + // CHECK-NEXT: %14 = call i128 @llvm.ctpop.i128(i128 %13) + // CHECK-NEXT: %cast4 = trunc i128 %14 to i32 + // CHECK-NEXT: store volatile i32 %cast4, ptr %pop, align 4 + // CHECK-NEXT: ret void +} + #endif diff --git a/clang/test/Sema/builtin-popcountg.c b/clang/test/Sema/builtin-popcountg.c new file mode 100644 index 0000000000000..e18b910046ff0 --- /dev/null +++ b/clang/test/Sema/builtin-popcountg.c @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 -triple=x86_64-pc-linux-gnu -fsyntax-only -verify -Wpedantic %s + +typedef int int2 __attribute__((ext_vector_type(2))); + +void test_builtin_popcountg(int i, double d, int2 i2) { + __builtin_popcountg(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} + __builtin_popcountg(i, i); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} + __builtin_popcountg(d); + // expected-error@-1 {{1st argument must be a type of integer (was 
'double')}} + __builtin_popcountg(i2); + // expected-error@-1 {{1st argument must be a type of integer (was 'int2' (vector of 2 'int' values))}} +} From 6de5fcc74637116581d7b9b39c16fc252a5a54ef Mon Sep 17 00:00:00 2001 From: Alexander Yermolovich <43973793+ayermolo@users.noreply.github.com> Date: Mon, 26 Feb 2024 14:00:31 -0800 Subject: [PATCH 376/546] [BOLT][DWARF] Add support for .debug_names (#81062) DWARF5 spec supports the .debug_names acceleration table. This is the formalized version of combination of gdb-index/pubnames/types. Added implementation of it to BOLT. It supports both monolothic and split dwarf, with and without Type Units. It does not include parent indices. This will be in followup PR. Unlike LLVM output this will put all the CUs and TUs into one Module. --- bolt/include/bolt/Core/BinaryContext.h | 4 + bolt/include/bolt/Core/DIEBuilder.h | 7 +- bolt/include/bolt/Core/DebugData.h | 9 +- bolt/include/bolt/Core/DebugNames.h | 172 +++++ bolt/include/bolt/Rewrite/DWARFRewriter.h | 7 +- bolt/lib/Core/CMakeLists.txt | 1 + bolt/lib/Core/DIEBuilder.cpp | 31 +- bolt/lib/Core/DebugNames.cpp | 618 +++++++++++++++ bolt/lib/Rewrite/DWARFRewriter.cpp | 34 +- .../X86/Inputs/dwarf5-debug-names-helper.s | 487 ++++++++++++ .../test/X86/Inputs/dwarf5-debug-names-main.s | 712 ++++++++++++++++++ .../X86/Inputs/dwarf5-df-debug-names-helper.s | 326 ++++++++ .../X86/Inputs/dwarf5-df-debug-names-main.s | 493 ++++++++++++ .../dwarf5-df-types-debug-names-helper.s | 650 ++++++++++++++++ .../Inputs/dwarf5-df-types-debug-names-main.s | 626 +++++++++++++++ .../Inputs/dwarf5-types-debug-names-helper.s | 405 ++++++++++ .../Inputs/dwarf5-types-debug-names-main.s | 391 ++++++++++ ...arf5-debug-names-generate-debug-names.test | 154 ++++ bolt/test/X86/dwarf5-debug-names.test | 263 +++++++ ...5-df-debug-names-generate-debug-names.test | 192 +++++ bolt/test/X86/dwarf5-df-debug-names.test | 149 ++++ .../X86/dwarf5-df-one-cu-debug-names.test | 101 +++ 
.../test/X86/dwarf5-df-types-debug-names.test | 234 ++++++ .../dwarf5-df-types-one-cu-debug-names.test | 129 ++++ bolt/test/X86/dwarf5-one-cu-debug-names.test | 178 +++++ bolt/test/X86/dwarf5-types-debug-names.test | 129 ++++ .../X86/dwarf5-types-one-cu-debug-names.test | 98 +++ 27 files changed, 6571 insertions(+), 29 deletions(-) create mode 100644 bolt/include/bolt/Core/DebugNames.h create mode 100644 bolt/lib/Core/DebugNames.cpp create mode 100644 bolt/test/X86/Inputs/dwarf5-debug-names-helper.s create mode 100644 bolt/test/X86/Inputs/dwarf5-debug-names-main.s create mode 100644 bolt/test/X86/Inputs/dwarf5-df-debug-names-helper.s create mode 100644 bolt/test/X86/Inputs/dwarf5-df-debug-names-main.s create mode 100644 bolt/test/X86/Inputs/dwarf5-df-types-debug-names-helper.s create mode 100644 bolt/test/X86/Inputs/dwarf5-df-types-debug-names-main.s create mode 100644 bolt/test/X86/Inputs/dwarf5-types-debug-names-helper.s create mode 100644 bolt/test/X86/Inputs/dwarf5-types-debug-names-main.s create mode 100644 bolt/test/X86/dwarf5-debug-names-generate-debug-names.test create mode 100644 bolt/test/X86/dwarf5-debug-names.test create mode 100644 bolt/test/X86/dwarf5-df-debug-names-generate-debug-names.test create mode 100644 bolt/test/X86/dwarf5-df-debug-names.test create mode 100644 bolt/test/X86/dwarf5-df-one-cu-debug-names.test create mode 100644 bolt/test/X86/dwarf5-df-types-debug-names.test create mode 100644 bolt/test/X86/dwarf5-df-types-one-cu-debug-names.test create mode 100644 bolt/test/X86/dwarf5-one-cu-debug-names.test create mode 100644 bolt/test/X86/dwarf5-types-debug-names.test create mode 100644 bolt/test/X86/dwarf5-types-one-cu-debug-names.test diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 30336c4e3a74f..741b1a36af86f 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -997,6 +997,10 @@ class BinaryContext { return 
getUniqueSectionByName(".gdb_index"); } + ErrorOr getDebugNamesSection() const { + return getUniqueSectionByName(".debug_names"); + } + /// @} /// Register \p TargetFunction as a fragment of \p Function if checks pass: diff --git a/bolt/include/bolt/Core/DIEBuilder.h b/bolt/include/bolt/Core/DIEBuilder.h index f0db924e2ccbb..4debf93633547 100644 --- a/bolt/include/bolt/Core/DIEBuilder.h +++ b/bolt/include/bolt/Core/DIEBuilder.h @@ -16,6 +16,7 @@ #define BOLT_CORE_DIE_BUILDER_H #include "bolt/Core/BinaryContext.h" +#include "bolt/Core/DebugNames.h" #include "llvm/CodeGen/DIE.h" #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h" #include "llvm/DebugInfo/DWARF/DWARFDie.h" @@ -127,6 +128,7 @@ class DIEBuilder { DWARFUnit *SkeletonCU{nullptr}; uint64_t UnitSize{0}; llvm::DenseSet AllProcessed; + DWARF5AcceleratorTable &DebugNamesTable; /// Returns current state of the DIEBuilder State &getState() { return *BuilderState.get(); } @@ -206,8 +208,8 @@ class DIEBuilder { /// Update references once the layout is finalized. void updateReferences(); - /// Update the Offset and Size of DIE. - uint32_t computeDIEOffset(const DWARFUnit &CU, DIE &Die, uint32_t &CurOffset); + /// Update the Offset and Size of DIE, populate DebugNames table. + uint32_t finalizeDIEs(DWARFUnit &CU, DIE &Die, uint32_t &CurOffset); void registerUnit(DWARFUnit &DU, bool NeedSort); @@ -269,6 +271,7 @@ class DIEBuilder { public: DIEBuilder(BinaryContext &BC, DWARFContext *DwarfContext, + DWARF5AcceleratorTable &DebugNamesTable, DWARFUnit *SkeletonCU = nullptr); /// Returns enum to what we are currently processing. diff --git a/bolt/include/bolt/Core/DebugData.h b/bolt/include/bolt/Core/DebugData.h index 48b813a4ca11f..7d10b208dc83a 100644 --- a/bolt/include/bolt/Core/DebugData.h +++ b/bolt/include/bolt/Core/DebugData.h @@ -439,6 +439,8 @@ class DebugStrOffsetsWriter { /// Update Str offset in .debug_str in .debug_str_offsets. 
void updateAddressMap(uint32_t Index, uint32_t Address); + /// Get offset for given index in original .debug_str_offsets section. + uint64_t getOffset(uint32_t Index) const { return StrOffsets[Index]; } /// Writes out current sections entry into .debug_str_offsets. void finalizeSection(DWARFUnit &Unit, DIEBuilder &DIEBldr); @@ -463,7 +465,7 @@ class DebugStrOffsetsWriter { std::unique_ptr StrOffsetsBuffer; std::unique_ptr StrOffsetsStream; std::map IndexToAddressMap; - std::vector StrOffsets; + SmallVector StrOffsets; std::unordered_map ProcessedBaseOffsets; bool StrOffsetSectionWasModified = false; }; @@ -484,11 +486,12 @@ class DebugStrWriter { /// Returns False if no strings were added to .debug_str. bool isInitialized() const { return !StrBuffer->empty(); } + /// Initializes Buffer and Stream. + void initialize(); + private: /// Mutex used for parallel processing of debug info. std::mutex WriterMutex; - /// Initializes Buffer and Stream. - void initialize(); /// Creates internal data structures. void create(); std::unique_ptr StrBuffer; diff --git a/bolt/include/bolt/Core/DebugNames.h b/bolt/include/bolt/Core/DebugNames.h new file mode 100644 index 0000000000000..84c448aa1354c --- /dev/null +++ b/bolt/include/bolt/Core/DebugNames.h @@ -0,0 +1,172 @@ +//===- bolt/Core/DebugNames.h - Debug names support ---*- C++ +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains declaration of classes required for generation of +// .debug_names section. 
+// +//===----------------------------------------------------------------------===// + +#ifndef BOLT_CORE_DEBUG_NAMES_H +#define BOLT_CORE_DEBUG_NAMES_H + +#include "DebugData.h" +#include "llvm/CodeGen/AccelTable.h" + +namespace llvm { +namespace bolt { +class BOLTDWARF5AccelTableData : public DWARF5AccelTableData { +public: + BOLTDWARF5AccelTableData(const uint64_t DieOffset, + const std::optional DefiningParentOffset, + const unsigned DieTag, const unsigned UnitID, + const bool IsTU, + const std::optional SecondUnitID) + : DWARF5AccelTableData(DieOffset, DefiningParentOffset, DieTag, UnitID, + IsTU), + SecondUnitID(SecondUnitID) {} + + uint64_t getDieOffset() const { return DWARF5AccelTableData::getDieOffset(); } + unsigned getDieTag() const { return DWARF5AccelTableData::getDieTag(); } + unsigned getUnitID() const { return DWARF5AccelTableData::getUnitID(); } + bool isTU() const { return DWARF5AccelTableData::isTU(); } + std::optional getSecondUnitID() const { return SecondUnitID; } + +private: + std::optional SecondUnitID; +}; + +class DWARF5AcceleratorTable { +public: + DWARF5AcceleratorTable(const bool CreateDebugNames, BinaryContext &BC, + DebugStrWriter &MainBinaryStrWriter); + ~DWARF5AcceleratorTable() { + for (DebugNamesAbbrev *Abbrev : AbbreviationsVector) + Abbrev->~DebugNamesAbbrev(); + } + /// Add DWARF5 Accelerator table entry. + /// Input is DWARFUnit being processed, DIE that belongs to it, and potential + /// SkeletonCU if the Unit comes from a DWO section. + void addAccelTableEntry(DWARFUnit &Unit, const DIE &Die, + const std::optional &DWOID); + /// Set current unit being processed. + void setCurrentUnit(DWARFUnit &Unit, const uint64_t UnitStartOffset); + /// Emit Accelerator table. + void emitAccelTable(); + /// Returns true if the table was crated. + bool isCreated() const { return NeedToCreate; } + /// Returns buffer containing the accelerator table. 
+ std::unique_ptr releaseBuffer() { + return std::move(FullTableBuffer); + } + +private: + BinaryContext &BC; + bool NeedToCreate = false; + BumpPtrAllocator Allocator; + DebugStrWriter &MainBinaryStrWriter; + StringRef StrSection; + uint64_t CurrentUnitOffset = 0; + const DWARFUnit *CurrentUnit = nullptr; + std::unordered_map AbbrevTagToIndexMap; + + /// Represents a group of entries with identical name (and hence, hash value). + struct HashData { + uint64_t StrOffset; + uint32_t HashValue; + uint32_t EntryOffset; + std::vector Values; + }; + using HashList = std::vector; + using BucketList = std::vector; + /// Contains all the offsets of CUs. + SmallVector CUList; + /// Contains all the offsets of local TUs. + SmallVector LocalTUList; + /// Contains all the type hashes for split dwarf TUs. + SmallVector ForeignTUList; + using StringEntries = + MapVector>; + StringEntries Entries; + /// FoldingSet that uniques the abbreviations. + FoldingSet AbbreviationsSet; + /// Vector containing DebugNames abbreviations for iteration in order. + SmallVector AbbreviationsVector; + /// The bump allocator to use when creating DIEAbbrev objects in the uniqued + /// storage container. + BumpPtrAllocator Alloc; + uint32_t BucketCount = 0; + uint32_t UniqueHashCount = 0; + uint32_t AbbrevTableSize = 0; + uint32_t CUIndexEncodingSize = 4; + uint32_t TUIndexEncodingSize = 4; + uint32_t AugmentationStringSize = 0; + dwarf::Form CUIndexForm = dwarf::DW_FORM_data4; + dwarf::Form TUIndexForm = dwarf::DW_FORM_data4; + + BucketList Buckets; + + std::unique_ptr FullTableBuffer; + std::unique_ptr FullTableStream; + std::unique_ptr StrBuffer; + std::unique_ptr StrStream; + std::unique_ptr EntriesBuffer; + std::unique_ptr Entriestream; + std::unique_ptr AugStringBuffer; + std::unique_ptr AugStringtream; + llvm::DenseMap StrCacheToOffsetMap; + // Contains DWO ID to CUList Index. + llvm::DenseMap CUOffsetsToPatch; + /// Adds Unit to either CUList, LocalTUList or ForeignTUList. 
+ /// Input Unit being processed, and DWO ID if Unit is being processed comes + /// from a DWO section. + void addUnit(DWARFUnit &Unit, const std::optional &DWOID); + /// Returns number of buckets in .debug_name table. + ArrayRef getBuckets() const { return Buckets; } + /// Get encoding for a given attribute. + std::optional + getIndexForEntry(const BOLTDWARF5AccelTableData &Value) const; + /// Get encoding for a given attribute for second index. + /// Returns nullopt if there is no second index. + std::optional + getSecondIndexForEntry(const BOLTDWARF5AccelTableData &Value) const; + /// Uniquify Entries. + void finalize(); + /// Computes bucket count. + void computeBucketCount(); + /// Populate Abbreviations Map. + void populateAbbrevsMap(); + /// Write Entries. + void writeEntries(); + /// Write an Entry. + void writeEntry(const BOLTDWARF5AccelTableData &Entry); + /// Write augmentation_string for BOLT. + void writeAugmentationString(); + /// Emit out Header for DWARF5 Accelerator table. + void emitHeader() const; + /// Emit out CU list. + void emitCUList() const; + /// Emit out TU List. Combination of LocalTUList and ForeignTUList. + void emitTUList() const; + /// Emit buckets. + void emitBuckets() const; + /// Emit hashes for hash table. + void emitHashes() const; + /// Emit string offsets for hash table. + void emitStringOffsets() const; + /// Emit Entry Offsets for hash table. + void emitOffsets() const; + /// Emit abbreviation table. + void emitAbbrevs(); + /// Emit entries. + void emitData(); + /// Emit augmentation string. 
+ void emitAugmentationString() const; +}; +} // namespace bolt +} // namespace llvm +#endif diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h index ba6775f99ce68..20972f3d0b85a 100644 --- a/bolt/include/bolt/Rewrite/DWARFRewriter.h +++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h @@ -11,6 +11,7 @@ #include "bolt/Core/DIEBuilder.h" #include "bolt/Core/DebugData.h" +#include "bolt/Core/DebugNames.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/DIE.h" #include "llvm/DWP/DWP.h" @@ -140,8 +141,10 @@ class DWARFRewriter { const std::list &CUs); /// Finalize debug sections in the main binary. - void finalizeDebugSections(DIEBuilder &DIEBlder, DIEStreamer &Streamer, - raw_svector_ostream &ObjOS, CUOffsetMap &CUMap); + void finalizeDebugSections(DIEBuilder &DIEBlder, + DWARF5AcceleratorTable &DebugNamesTable, + DIEStreamer &Streamer, raw_svector_ostream &ObjOS, + CUOffsetMap &CUMap); /// Patches the binary for DWARF address ranges (e.g. in functions and lexical /// blocks) to be updated. 
diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt index c913179ebcc51..441df9fe08464 100644 --- a/bolt/lib/Core/CMakeLists.txt +++ b/bolt/lib/Core/CMakeLists.txt @@ -20,6 +20,7 @@ add_llvm_library(LLVMBOLTCore BinaryFunctionProfile.cpp BinarySection.cpp DebugData.cpp + DebugNames.cpp DIEBuilder.cpp DynoStats.cpp Exceptions.cpp diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp index e6104b81bf6c9..42287978a8576 100644 --- a/bolt/lib/Core/DIEBuilder.cpp +++ b/bolt/lib/Core/DIEBuilder.cpp @@ -19,20 +19,17 @@ #include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h" #include "llvm/DebugInfo/DWARF/DWARFUnit.h" #include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" -#include "llvm/ObjectYAML/DWARFYAML.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/LEB128.h" -#include "llvm/Support/ThreadPool.h" -#include "llvm/Support/YAMLTraits.h" #include #include #include #include -#include +#include #include #include #include @@ -179,8 +176,10 @@ void DIEBuilder::constructFromUnit(DWARFUnit &DU) { } DIEBuilder::DIEBuilder(BinaryContext &BC, DWARFContext *DwarfContext, + DWARF5AcceleratorTable &DebugNamesTable, DWARFUnit *SkeletonCU) - : BC(BC), DwarfContext(DwarfContext), SkeletonCU(SkeletonCU) {} + : BC(BC), DwarfContext(DwarfContext), SkeletonCU(SkeletonCU), + DebugNamesTable(DebugNamesTable) {} static unsigned int getCUNum(DWARFContext *DwarfContext, bool IsDWO) { unsigned int CUNum = IsDWO ? 
DwarfContext->getNumDWOCompileUnits() @@ -378,18 +377,20 @@ getUnitForOffset(DIEBuilder &Builder, DWARFContext &DWCtx, return nullptr; } -uint32_t DIEBuilder::computeDIEOffset(const DWARFUnit &CU, DIE &Die, - uint32_t &CurOffset) { +uint32_t DIEBuilder::finalizeDIEs(DWARFUnit &CU, DIE &Die, + uint32_t &CurOffset) { getState().DWARFDieAddressesParsed.erase(Die.getOffset()); uint32_t CurSize = 0; Die.setOffset(CurOffset); + DebugNamesTable.addAccelTableEntry( + CU, Die, SkeletonCU ? SkeletonCU->getDWOId() : std::nullopt); for (DIEValue &Val : Die.values()) CurSize += Val.sizeOf(CU.getFormParams()); CurSize += getULEB128Size(Die.getAbbrevNumber()); CurOffset += CurSize; for (DIE &Child : Die.children()) { - uint32_t ChildSize = computeDIEOffset(CU, Child, CurOffset); + uint32_t ChildSize = finalizeDIEs(CU, Child, CurOffset); CurSize += ChildSize; } // for children end mark. @@ -404,12 +405,12 @@ uint32_t DIEBuilder::computeDIEOffset(const DWARFUnit &CU, DIE &Die, } void DIEBuilder::finish() { - auto computeOffset = [&](const DWARFUnit &CU, - uint64_t &UnitStartOffset) -> void { + auto finalizeCU = [&](DWARFUnit &CU, uint64_t &UnitStartOffset) -> void { DIE *UnitDIE = getUnitDIEbyUnit(CU); uint32_t HeaderSize = CU.getHeaderSize(); uint32_t CurOffset = HeaderSize; - computeDIEOffset(CU, *UnitDIE, CurOffset); + DebugNamesTable.setCurrentUnit(CU, UnitStartOffset); + finalizeDIEs(CU, *UnitDIE, CurOffset); DWARFUnitInfo &CurUnitInfo = getUnitInfoByDwarfUnit(CU); CurUnitInfo.UnitOffset = UnitStartOffset; @@ -420,18 +421,18 @@ void DIEBuilder::finish() { // It's processed first when CU is registered so will be at the begginnig of // the vector. uint64_t TypeUnitStartOffset = 0; - for (const DWARFUnit *CU : getState().DUList) { + for (DWARFUnit *CU : getState().DUList) { // We process DWARF$ types first. 
if (!(CU->getVersion() < 5 && CU->isTypeUnit())) break; - computeOffset(*CU, TypeUnitStartOffset); + finalizeCU(*CU, TypeUnitStartOffset); } - for (const DWARFUnit *CU : getState().DUList) { + for (DWARFUnit *CU : getState().DUList) { // Skipping DWARF4 types. if (CU->getVersion() < 5 && CU->isTypeUnit()) continue; - computeOffset(*CU, UnitSize); + finalizeCU(*CU, UnitSize); } if (opts::Verbosity >= 1) { if (!getState().DWARFDieAddressesParsed.empty()) diff --git a/bolt/lib/Core/DebugNames.cpp b/bolt/lib/Core/DebugNames.cpp new file mode 100644 index 0000000000000..1a7792afbbd94 --- /dev/null +++ b/bolt/lib/Core/DebugNames.cpp @@ -0,0 +1,618 @@ +//===- bolt/Rewrite/DebugNames.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "bolt/Core/DebugNames.h" +#include "bolt/Core/BinaryContext.h" +#include "llvm/DebugInfo/DWARF/DWARFExpression.h" +#include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/Support/LEB128.h" +#include + +namespace llvm { +namespace bolt { +DWARF5AcceleratorTable::DWARF5AcceleratorTable( + const bool CreateDebugNames, BinaryContext &BC, + DebugStrWriter &MainBinaryStrWriter) + : BC(BC), MainBinaryStrWriter(MainBinaryStrWriter) { + NeedToCreate = CreateDebugNames || BC.getDebugNamesSection(); + if (!NeedToCreate) + return; + FullTableBuffer = std::make_unique(); + FullTableStream = std::make_unique(*FullTableBuffer); + StrBuffer = std::make_unique(); + StrStream = std::make_unique(*StrBuffer); + EntriesBuffer = std::make_unique(); + Entriestream = std::make_unique(*EntriesBuffer); + AugStringBuffer = std::make_unique(); + AugStringtream = std::make_unique(*AugStringBuffer); + + // Binary has split-dwarf 
CUs. + // Even thought for non-skeleton-cu all names are in .debug_str.dwo section, + // for the .debug_names contributions they are in .debug_str section. + if (BC.getNumDWOCUs()) { + DataExtractor StrData(BC.DwCtx->getDWARFObj().getStrSection(), + BC.DwCtx->isLittleEndian(), 0); + uint64_t Offset = 0; + uint64_t StrOffset = 0; + while (StrData.isValidOffset(Offset)) { + Error Err = Error::success(); + const char *CStr = StrData.getCStr(&Offset, &Err); + if (Err) { + NeedToCreate = false; + BC.errs() << "BOLT-WARNING: [internal-dwarf-error]: Could not extract " + "string from .debug_str section at offset: " + << Twine::utohexstr(StrOffset) << ".\n"; + return; + } + auto R = StrCacheToOffsetMap.try_emplace( + llvm::hash_value(llvm::StringRef(CStr)), StrOffset); + if (!R.second) + BC.errs() + << "BOLT-WARNING: [internal-dwarf-error]: collision occured on " + << CStr << " at offset : 0x" << Twine::utohexstr(StrOffset) + << ". Previous string offset is: 0x" + << Twine::utohexstr(R.first->second) << ".\n"; + StrOffset = Offset; + } + } +} + +void DWARF5AcceleratorTable::setCurrentUnit(DWARFUnit &Unit, + const uint64_t UnitStartOffset) { + CurrentUnit = nullptr; + CurrentUnitOffset = UnitStartOffset; + std::optional DWOID = Unit.getDWOId(); + // We process skeleton CUs after DWO Units for it. + // Patching offset in CU list to correct one. + if (!Unit.isDWOUnit() && DWOID) { + auto Iter = CUOffsetsToPatch.find(*DWOID); + // Check in case no entries were added from non skeleton DWO section. + if (Iter != CUOffsetsToPatch.end()) + CUList[Iter->second] = UnitStartOffset; + } +} + +void DWARF5AcceleratorTable::addUnit(DWARFUnit &Unit, + const std::optional &DWOID) { + constexpr uint32_t BADCUOFFSET = 0xBADBAD; + StrSection = Unit.getStringSection(); + if (Unit.isTypeUnit()) { + if (DWOID) { + // We adding an entry for a DWO TU. The DWO CU might not have any entries, + // so need to add it to the list pre-emptively. 
+ auto Iter = CUOffsetsToPatch.insert({*DWOID, CUList.size()}); + if (Iter.second) + CUList.push_back(BADCUOFFSET); + ForeignTUList.push_back(cast(&Unit)->getTypeHash()); + } else { + LocalTUList.push_back(CurrentUnitOffset); + } + } else { + if (DWOID) { + // This is a path for split dwarf without type units. + // We process DWO Units before Skeleton CU. So at this point we don't know + // the offset of Skeleton CU. Adding CULit index to a map to patch later + // with the correct offset. + auto Iter = CUOffsetsToPatch.insert({*DWOID, CUList.size()}); + if (Iter.second) + CUList.push_back(BADCUOFFSET); + } else { + CUList.push_back(CurrentUnitOffset); + } + } +} + +// Returns true if DW_TAG_variable should be included in .debug-names based on +// section 6.1.1.1 for DWARF5 spec. +static bool shouldIncludeVariable(const DWARFUnit &Unit, const DIE &Die) { + if (Die.findAttribute(dwarf::Attribute::DW_AT_declaration)) + return false; + const DIEValue LocAttrInfo = + Die.findAttribute(dwarf::Attribute::DW_AT_location); + if (!LocAttrInfo) + return false; + if (!(doesFormBelongToClass(LocAttrInfo.getForm(), DWARFFormValue::FC_Exprloc, + Unit.getVersion()) || + doesFormBelongToClass(LocAttrInfo.getForm(), DWARFFormValue::FC_Block, + Unit.getVersion()))) + return false; + std::vector Sblock; + auto constructVect = + [&](const DIEValueList::const_value_range &Iter) -> void { + for (const DIEValue &Val : Iter) + Sblock.push_back(Val.getDIEInteger().getValue()); + }; + if (doesFormBelongToClass(LocAttrInfo.getForm(), DWARFFormValue::FC_Exprloc, + Unit.getVersion())) + constructVect(LocAttrInfo.getDIELoc().values()); + else + constructVect(LocAttrInfo.getDIEBlock().values()); + ArrayRef Expr = ArrayRef(Sblock); + DataExtractor Data(StringRef((const char *)Expr.data(), Expr.size()), + Unit.getContext().isLittleEndian(), 0); + DWARFExpression LocExpr(Data, Unit.getAddressByteSize(), + Unit.getFormParams().Format); + for (const DWARFExpression::Operation &Expr : LocExpr) + if 
(Expr.getCode() == dwarf::DW_OP_addrx || + Expr.getCode() == dwarf::DW_OP_form_tls_address) + return true; + return false; +} + +/// Returns name offset in String Offset section. +static uint64_t getNameOffset(BinaryContext &BC, DWARFUnit &Unit, + const uint64_t Index) { + const DWARFSection &StrOffsetsSection = Unit.getStringOffsetSection(); + const std::optional &Contr = + Unit.getStringOffsetsTableContribution(); + if (!Contr) { + BC.errs() << "BOLT-WARNING: [internal-dwarf-warning]: Could not get " + "StringOffsetsTableContribution for unit at offset: " + << Twine::utohexstr(Unit.getOffset()) << ".\n"; + return 0; + } + + const uint8_t DwarfOffsetByteSize = Contr->getDwarfOffsetByteSize(); + return support::endian::read32le(StrOffsetsSection.Data.data() + Contr->Base + + Index * DwarfOffsetByteSize); +} + +void DWARF5AcceleratorTable::addAccelTableEntry( + DWARFUnit &Unit, const DIE &Die, const std::optional &DWOID) { + if (Unit.getVersion() < 5 || !NeedToCreate) + return; + std::string NameToUse = ""; + auto canProcess = [&](const DIE &Die) -> bool { + switch (Die.getTag()) { + case dwarf::DW_TAG_base_type: + case dwarf::DW_TAG_class_type: + case dwarf::DW_TAG_enumeration_type: + case dwarf::DW_TAG_imported_declaration: + case dwarf::DW_TAG_pointer_type: + case dwarf::DW_TAG_structure_type: + case dwarf::DW_TAG_typedef: + case dwarf::DW_TAG_unspecified_type: + if (Die.findAttribute(dwarf::Attribute::DW_AT_name)) + return true; + return false; + case dwarf::DW_TAG_namespace: + // According to DWARF5 spec namespaces without DW_AT_name needs to have + // "(anonymous namespace)" + if (!Die.findAttribute(dwarf::Attribute::DW_AT_name)) + NameToUse = "(anonymous namespace)"; + return true; + case dwarf::DW_TAG_inlined_subroutine: + case dwarf::DW_TAG_label: + case dwarf::DW_TAG_subprogram: + if (Die.findAttribute(dwarf::Attribute::DW_AT_low_pc) || + Die.findAttribute(dwarf::Attribute::DW_AT_high_pc) || + Die.findAttribute(dwarf::Attribute::DW_AT_ranges) || + 
Die.findAttribute(dwarf::Attribute::DW_AT_entry_pc)) + return true; + return false; + case dwarf::DW_TAG_variable: + return shouldIncludeVariable(Unit, Die); + default: + break; + } + return false; + }; + + auto getUnitID = [&](const DWARFUnit &Unit, bool &IsTU, + uint32_t &DieTag) -> uint32_t { + IsTU = Unit.isTypeUnit(); + DieTag = Die.getTag(); + if (IsTU) { + if (DWOID) + return ForeignTUList.size() - 1; + return LocalTUList.size() - 1; + } + return CUList.size() - 1; + }; + + if (!canProcess(Die)) + return; + + // Addes a Unit to either CU, LocalTU or ForeignTU list the first time we + // encounter it. + // Invoking it here so that we don't add Units that don't have any entries. + if (&Unit != CurrentUnit) { + CurrentUnit = &Unit; + addUnit(Unit, DWOID); + } + + auto addEntry = [&](DIEValue ValName) -> void { + if ((!ValName || ValName.getForm() == dwarf::DW_FORM_string) && + NameToUse.empty()) + return; + std::string Name = ""; + uint64_t NameIndexOffset = 0; + if (NameToUse.empty()) { + NameIndexOffset = ValName.getDIEInteger().getValue(); + if (ValName.getForm() != dwarf::DW_FORM_strp) + NameIndexOffset = getNameOffset(BC, Unit, NameIndexOffset); + // Counts on strings end with '\0'. + Name = std::string(&StrSection.data()[NameIndexOffset]); + } else { + Name = NameToUse; + } + auto &It = Entries[Name]; + if (It.Values.empty()) { + if (DWOID && NameToUse.empty()) { + // For DWO Unit the offset is in the .debug_str.dwo section. + // Need to find offset for the name in the .debug_str section. + llvm::hash_code Hash = llvm::hash_value(llvm::StringRef(Name)); + auto ItCache = StrCacheToOffsetMap.find(Hash); + if (ItCache == StrCacheToOffsetMap.end()) + NameIndexOffset = MainBinaryStrWriter.addString(Name); + else + NameIndexOffset = ItCache->second; + } + if (!NameToUse.empty()) + NameIndexOffset = MainBinaryStrWriter.addString(Name); + It.StrOffset = NameIndexOffset; + // This the same hash function used in DWARF5AccelTableData. 
+ It.HashValue = caseFoldingDjbHash(Name); + } + + bool IsTU = false; + uint32_t DieTag = 0; + uint32_t UnitID = getUnitID(Unit, IsTU, DieTag); + std::optional SecondIndex = std::nullopt; + if (IsTU && DWOID) { + auto Iter = CUOffsetsToPatch.find(*DWOID); + if (Iter == CUOffsetsToPatch.end()) + BC.errs() << "BOLT-WARNING: [internal-dwarf-warning]: Could not find " + "DWO ID in CU offsets for second Unit Index " + << Name << ". For DIE at offset: " + << Twine::utohexstr(CurrentUnitOffset + Die.getOffset()) + << ".\n"; + SecondIndex = Iter->second; + } + It.Values.push_back(new (Allocator) BOLTDWARF5AccelTableData( + Die.getOffset(), std::nullopt, DieTag, UnitID, IsTU, SecondIndex)); + }; + + addEntry(Die.findAttribute(dwarf::Attribute::DW_AT_name)); + addEntry(Die.findAttribute(dwarf::Attribute::DW_AT_linkage_name)); + return; +} + +/// Algorithm from llvm implementation. +void DWARF5AcceleratorTable::computeBucketCount() { + // First get the number of unique hashes. + std::vector Uniques; + Uniques.reserve(Entries.size()); + for (const auto &E : Entries) + Uniques.push_back(E.second.HashValue); + array_pod_sort(Uniques.begin(), Uniques.end()); + std::vector::iterator P = + std::unique(Uniques.begin(), Uniques.end()); + + UniqueHashCount = std::distance(Uniques.begin(), P); + + if (UniqueHashCount > 1024) + BucketCount = UniqueHashCount / 4; + else if (UniqueHashCount > 16) + BucketCount = UniqueHashCount / 2; + else + BucketCount = std::max(UniqueHashCount, 1); +} + +/// Bucket code as in: AccelTableBase::finalize() +void DWARF5AcceleratorTable::finalize() { + if (!NeedToCreate) + return; + // Figure out how many buckets we need, then compute the bucket contents and + // the final ordering. The hashes and offsets can be emitted by walking these + // data structures. + computeBucketCount(); + + // Compute bucket contents and final ordering. 
+ Buckets.resize(BucketCount); + for (auto &E : Entries) { + uint32_t Bucket = E.second.HashValue % BucketCount; + Buckets[Bucket].push_back(&E.second); + } + + // Sort the contents of the buckets by hash value so that hash collisions end + // up together. Stable sort makes testing easier and doesn't cost much more. + for (HashList &Bucket : Buckets) { + llvm::stable_sort(Bucket, [](const HashData *LHS, const HashData *RHS) { + return LHS->HashValue < RHS->HashValue; + }); + for (HashData *H : Bucket) + llvm::stable_sort(H->Values, [](const BOLTDWARF5AccelTableData *LHS, + const BOLTDWARF5AccelTableData *RHS) { + return LHS->getDieOffset() < RHS->getDieOffset(); + }); + } + + CUIndexForm = DIEInteger::BestForm(/*IsSigned*/ false, CUList.size() - 1); + TUIndexForm = DIEInteger::BestForm( + /*IsSigned*/ false, LocalTUList.size() + ForeignTUList.size() - 1); + const dwarf::FormParams FormParams{5, 4, dwarf::DwarfFormat::DWARF32, false}; + CUIndexEncodingSize = *dwarf::getFixedFormByteSize(CUIndexForm, FormParams); + TUIndexEncodingSize = *dwarf::getFixedFormByteSize(TUIndexForm, FormParams); +} + +std::optional +DWARF5AcceleratorTable::getIndexForEntry( + const BOLTDWARF5AccelTableData &Value) const { + if (Value.isTU()) + return {{Value.getUnitID(), {dwarf::DW_IDX_type_unit, TUIndexForm}}}; + if (CUList.size() > 1) + return {{Value.getUnitID(), {dwarf::DW_IDX_compile_unit, CUIndexForm}}}; + return std::nullopt; +} + +std::optional +DWARF5AcceleratorTable::getSecondIndexForEntry( + const BOLTDWARF5AccelTableData &Value) const { + if (Value.isTU() && CUList.size() > 1 && Value.getSecondUnitID()) + return { + {*Value.getSecondUnitID(), {dwarf::DW_IDX_compile_unit, CUIndexForm}}}; + return std::nullopt; +} + +void DWARF5AcceleratorTable::populateAbbrevsMap() { + for (auto &Bucket : getBuckets()) { + for (DWARF5AcceleratorTable::HashData *Hash : Bucket) { + for (BOLTDWARF5AccelTableData *Value : Hash->Values) { + const std::optional EntryRet = + getIndexForEntry(*Value); 
+ // For entries that need to refer to the foreign type units and to + // the CU. + const std::optional + SecondEntryRet = getSecondIndexForEntry(*Value); + DebugNamesAbbrev Abbrev(Value->getDieTag()); + if (EntryRet) + Abbrev.addAttribute(EntryRet->Encoding); + if (SecondEntryRet) + Abbrev.addAttribute(SecondEntryRet->Encoding); + Abbrev.addAttribute({dwarf::DW_IDX_die_offset, dwarf::DW_FORM_ref4}); + FoldingSetNodeID ID; + Abbrev.Profile(ID); + void *InsertPos; + if (DebugNamesAbbrev *Existing = + AbbreviationsSet.FindNodeOrInsertPos(ID, InsertPos)) { + Value->setAbbrevNumber(Existing->getNumber()); + continue; + } + DebugNamesAbbrev *NewAbbrev = + new (Alloc) DebugNamesAbbrev(std::move(Abbrev)); + AbbreviationsVector.push_back(NewAbbrev); + NewAbbrev->setNumber(AbbreviationsVector.size()); + AbbreviationsSet.InsertNode(NewAbbrev, InsertPos); + Value->setAbbrevNumber(NewAbbrev->getNumber()); + } + } + } +} + +void DWARF5AcceleratorTable::writeEntry(const BOLTDWARF5AccelTableData &Entry) { + const std::optional EntryRet = + getIndexForEntry(Entry); + // For forgeign type (FTU) units that need to refer to the FTU and to the CU. 
+ const std::optional SecondEntryRet = + getSecondIndexForEntry(Entry); + const unsigned AbbrevIndex = Entry.getAbbrevNumber() - 1; + assert(AbbrevIndex < AbbreviationsVector.size() && + "Entry abbrev index is outside of abbreviations vector range."); + const DebugNamesAbbrev *Abbrev = AbbreviationsVector[AbbrevIndex]; + encodeULEB128(Entry.getAbbrevNumber(), *Entriestream); + auto writeIndex = [&](uint32_t Index, uint32_t IndexSize) -> void { + switch (IndexSize) { + default: + llvm_unreachable("Unsupported Index Size!"); + break; + case 1: + support::endian::write(*Entriestream, static_cast(Index), + llvm::endianness::little); + break; + case 2: + support::endian::write(*Entriestream, static_cast(Index), + llvm::endianness::little); + break; + case 4: + support::endian::write(*Entriestream, static_cast(Index), + llvm::endianness::little); + break; + }; + }; + + for (const DebugNamesAbbrev::AttributeEncoding &AttrEnc : + Abbrev->getAttributes()) { + switch (AttrEnc.Index) { + default: { + llvm_unreachable("Unexpected index attribute!"); + break; + } + case dwarf::DW_IDX_compile_unit: { + const unsigned CUIndex = + SecondEntryRet ? 
SecondEntryRet->Index : EntryRet->Index; + writeIndex(CUIndex, CUIndexEncodingSize); + break; + } + case dwarf::DW_IDX_type_unit: { + writeIndex(EntryRet->Index, TUIndexEncodingSize); + break; + } + case dwarf::DW_IDX_die_offset: { + assert(AttrEnc.Form == dwarf::DW_FORM_ref4); + support::endian::write(*Entriestream, + static_cast(Entry.getDieOffset()), + llvm::endianness::little); + break; + } + } + } +} + +void DWARF5AcceleratorTable::writeEntries() { + for (auto &Bucket : getBuckets()) { + for (DWARF5AcceleratorTable::HashData *Hash : Bucket) { + Hash->EntryOffset = EntriesBuffer->size(); + for (const BOLTDWARF5AccelTableData *Value : Hash->Values) { + writeEntry(*Value); + } + support::endian::write(*Entriestream, static_cast(0), + llvm::endianness::little); + } + } +} + +void DWARF5AcceleratorTable::writeAugmentationString() { + // String needs to be multiple of 4 bytes. + *AugStringtream << "BOLT"; + AugmentationStringSize = AugStringBuffer->size(); +} + +/// Calculates size of .debug_names header without Length field. 
+static constexpr uint32_t getDebugNamesHeaderSize() { + constexpr uint16_t VersionLength = sizeof(uint16_t); + constexpr uint16_t PaddingLength = sizeof(uint16_t); + constexpr uint32_t CompUnitCountLength = sizeof(uint32_t); + constexpr uint32_t LocalTypeUnitCountLength = sizeof(uint32_t); + constexpr uint32_t ForeignTypeUnitCountLength = sizeof(uint32_t); + constexpr uint32_t BucketCountLength = sizeof(uint32_t); + constexpr uint32_t NameCountLength = sizeof(uint32_t); + constexpr uint32_t AbbrevTableSizeLength = sizeof(uint32_t); + constexpr uint32_t AugmentationStringSizeLenght = sizeof(uint32_t); + return VersionLength + PaddingLength + CompUnitCountLength + + LocalTypeUnitCountLength + ForeignTypeUnitCountLength + + BucketCountLength + NameCountLength + AbbrevTableSizeLength + + AugmentationStringSizeLenght; +} + +void DWARF5AcceleratorTable::emitHeader() const { + constexpr uint32_t HeaderSize = getDebugNamesHeaderSize(); + // Header Length + support::endian::write(*FullTableStream, + static_cast(HeaderSize + StrBuffer->size() + + AugmentationStringSize), + llvm::endianness::little); + // Version + support::endian::write(*FullTableStream, static_cast(5), + llvm::endianness::little); + // Padding + support::endian::write(*FullTableStream, static_cast(0), + llvm::endianness::little); + // Compilation Unit Count + support::endian::write(*FullTableStream, static_cast(CUList.size()), + llvm::endianness::little); + // Local Type Unit Count + support::endian::write(*FullTableStream, + static_cast(LocalTUList.size()), + llvm::endianness::little); + // Foreign Type Unit Count + support::endian::write(*FullTableStream, + static_cast(ForeignTUList.size()), + llvm::endianness::little); + // Bucket Count + support::endian::write(*FullTableStream, static_cast(BucketCount), + llvm::endianness::little); + // Name Count + support::endian::write(*FullTableStream, + static_cast(Entries.size()), + llvm::endianness::little); + // Abbrev Table Size + 
support::endian::write(*FullTableStream, + static_cast(AbbrevTableSize), + llvm::endianness::little); + // Augmentation String Size + support::endian::write(*FullTableStream, + static_cast(AugmentationStringSize), + llvm::endianness::little); + + emitAugmentationString(); + FullTableStream->write(StrBuffer->data(), StrBuffer->size()); +} + +void DWARF5AcceleratorTable::emitCUList() const { + for (const uint32_t CUID : CUList) + support::endian::write(*StrStream, CUID, llvm::endianness::little); +} +void DWARF5AcceleratorTable::emitTUList() const { + for (const uint32_t TUID : LocalTUList) + support::endian::write(*StrStream, TUID, llvm::endianness::little); + + for (const uint64_t TUID : ForeignTUList) + support::endian::write(*StrStream, TUID, llvm::endianness::little); +} +void DWARF5AcceleratorTable::emitBuckets() const { + uint32_t Index = 1; + for (const auto &Bucket : enumerate(getBuckets())) { + const uint32_t TempIndex = Bucket.value().empty() ? 0 : Index; + support::endian::write(*StrStream, TempIndex, llvm::endianness::little); + Index += Bucket.value().size(); + } +} +void DWARF5AcceleratorTable::emitHashes() const { + for (const auto &Bucket : getBuckets()) { + for (const DWARF5AcceleratorTable::HashData *Hash : Bucket) + support::endian::write(*StrStream, Hash->HashValue, + llvm::endianness::little); + } +} +void DWARF5AcceleratorTable::emitStringOffsets() const { + for (const auto &Bucket : getBuckets()) { + for (const DWARF5AcceleratorTable::HashData *Hash : Bucket) + support::endian::write(*StrStream, static_cast(Hash->StrOffset), + llvm::endianness::little); + } +} +void DWARF5AcceleratorTable::emitOffsets() const { + for (const auto &Bucket : getBuckets()) { + for (const DWARF5AcceleratorTable::HashData *Hash : Bucket) + support::endian::write(*StrStream, + static_cast(Hash->EntryOffset), + llvm::endianness::little); + } +} +void DWARF5AcceleratorTable::emitAbbrevs() { + const uint32_t AbbrevTableStart = StrBuffer->size(); + for (const auto 
*Abbrev : AbbreviationsVector) { + encodeULEB128(Abbrev->getNumber(), *StrStream); + encodeULEB128(Abbrev->getDieTag(), *StrStream); + for (const auto &AttrEnc : Abbrev->getAttributes()) { + encodeULEB128(AttrEnc.Index, *StrStream); + encodeULEB128(AttrEnc.Form, *StrStream); + } + encodeULEB128(0, *StrStream); + encodeULEB128(0, *StrStream); + } + encodeULEB128(0, *StrStream); + AbbrevTableSize = StrBuffer->size() - AbbrevTableStart; +} +void DWARF5AcceleratorTable::emitData() { + StrStream->write(EntriesBuffer->data(), EntriesBuffer->size()); +} +void DWARF5AcceleratorTable::emitAugmentationString() const { + FullTableStream->write(AugStringBuffer->data(), AugStringBuffer->size()); +} +void DWARF5AcceleratorTable::emitAccelTable() { + if (!NeedToCreate) + return; + finalize(); + populateAbbrevsMap(); + writeEntries(); + writeAugmentationString(); + emitCUList(); + emitTUList(); + emitBuckets(); + emitHashes(); + emitStringOffsets(); + emitOffsets(); + emitAbbrevs(); + emitData(); + emitHeader(); +} +} // namespace bolt +} // namespace llvm diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp index 849c363730ebb..ca9d24245ceb1 100644 --- a/bolt/lib/Rewrite/DWARFRewriter.cpp +++ b/bolt/lib/Rewrite/DWARFRewriter.cpp @@ -347,6 +347,12 @@ static cl::opt "multiple non-relocatable dwarf object files (dwo)."), cl::init(false), cl::cat(BoltCategory)); +static cl::opt CreateDebugNames( + "create-debug-names-section", + cl::desc("Creates .debug_names section, if the input binary doesn't have " + "it already, for DWARF5 CU/TUs."), + cl::init(false), cl::cat(BoltCategory)); + static cl::opt DebugSkeletonCu("debug-skeleton-cu", cl::desc("prints out offsetrs for abbrev and debu_info of " @@ -692,6 +698,8 @@ void DWARFRewriter::updateDebugInfo() { return ObjectName; }; + DWARF5AcceleratorTable DebugNamesTable(opts::CreateDebugNames, BC, + *StrWriter); DWPState State; if (opts::WriteDWP) initDWPState(State); @@ -709,7 +717,8 @@ void 
DWARFRewriter::updateDebugInfo() { : LegacyRangesSectionWriter.get(); // Skipping CUs that failed to load. if (SplitCU) { - DIEBuilder DWODIEBuilder(BC, &(*SplitCU)->getContext(), Unit); + DIEBuilder DWODIEBuilder(BC, &(*SplitCU)->getContext(), DebugNamesTable, + Unit); DWODIEBuilder.buildDWOUnit(**SplitCU); std::string DWOName = updateDWONameCompDir( *Unit, *DIEBlder, *DIEBlder->getUnitDIEbyUnit(*Unit)); @@ -749,7 +758,7 @@ void DWARFRewriter::updateDebugInfo() { AddrWriter->update(*DIEBlder, *Unit); }; - DIEBuilder DIEBlder(BC, BC.DwCtx.get()); + DIEBuilder DIEBlder(BC, BC.DwCtx.get(), DebugNamesTable); DIEBlder.buildTypeUnits(StrOffstsWriter.get()); SmallVector OutBuffer; std::unique_ptr ObjOS = @@ -781,10 +790,13 @@ void DWARFRewriter::updateDebugInfo() { ThreadPool.wait(); } + DebugNamesTable.emitAccelTable(); + if (opts::WriteDWP) finalizeDWP(State); - finalizeDebugSections(DIEBlder, *Streamer, *ObjOS, OffsetMap); + finalizeDebugSections(DIEBlder, DebugNamesTable, *Streamer, *ObjOS, + OffsetMap); updateGdbIndexSection(OffsetMap, CUIndex); } @@ -1515,10 +1527,9 @@ CUOffsetMap DWARFRewriter::finalizeTypeSections(DIEBuilder &DIEBlder, return CUMap; } -void DWARFRewriter::finalizeDebugSections(DIEBuilder &DIEBlder, - DIEStreamer &Streamer, - raw_svector_ostream &ObjOS, - CUOffsetMap &CUMap) { +void DWARFRewriter::finalizeDebugSections( + DIEBuilder &DIEBlder, DWARF5AcceleratorTable &DebugNamesTable, + DIEStreamer &Streamer, raw_svector_ostream &ObjOS, CUOffsetMap &CUMap) { if (StrWriter->isInitialized()) { RewriteInstance::addToDebugSectionsToOverwrite(".debug_str"); std::unique_ptr DebugStrSectionContents = @@ -1616,6 +1627,15 @@ void DWARFRewriter::finalizeDebugSections(DIEBuilder &DIEBlder, copyByteArray(ARangesContents), ARangesContents.size()); } + + if (DebugNamesTable.isCreated()) { + RewriteInstance::addToDebugSectionsToOverwrite(".debug_names"); + std::unique_ptr DebugNamesSectionContents = + DebugNamesTable.releaseBuffer(); + 
BC.registerOrUpdateNoteSection(".debug_names", + copyByteArray(*DebugNamesSectionContents), + DebugNamesSectionContents->size()); + } } void DWARFRewriter::finalizeCompileUnits(DIEBuilder &DIEBlder, diff --git a/bolt/test/X86/Inputs/dwarf5-debug-names-helper.s b/bolt/test/X86/Inputs/dwarf5-debug-names-helper.s new file mode 100644 index 0000000000000..f10417bd0cebe --- /dev/null +++ b/bolt/test/X86/Inputs/dwarf5-debug-names-helper.s @@ -0,0 +1,487 @@ +# clang++ -g2 -gdwarf-5 -gpubnames -fdebug-types-section -S +# header.h +# struct Foo2a { +# char *c1; +# char *c2; +# char *c3; +# }; +# helper.cpp +# #include "header.h" +# int fooint; +# struct Foo2Int { +# int *c1; +# int *c2; +# }; +# +# int foo() { +# Foo2Int fint; +# Foo2a f; +# return 0; +# } + + .text + .file "helper.cpp" + .file 0 "/typeDedup" "helper.cpp" md5 0xc33186b2db66a78883b1546aace9855d + .globl _Z3foov # -- Begin function _Z3foov + .p2align 4, 0x90 + .type _Z3foov,@function +_Z3foov: # @_Z3foov +.Lfunc_begin0: + .loc 0 8 0 # helper.cpp:8:0 + .cfi_startproc +# %bb.0: # %entry + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +.Ltmp0: + .loc 0 11 3 prologue_end # helper.cpp:11:3 + xorl %eax, %eax + .loc 0 11 3 epilogue_begin is_stmt 0 # helper.cpp:11:3 + popq %rbp + .cfi_def_cfa %rsp, 8 + retq +.Ltmp1: +.Lfunc_end0: + .size _Z3foov, .Lfunc_end0-_Z3foov + .cfi_endproc + # -- End function + .type fooint,@object # @fooint + .bss + .globl fooint + .p2align 2, 0x0 +fooint: + .long 0 # 0x0 + .size fooint, 4 + + .file 1 "." 
"header.h" md5 0xfea7bb1f22c47f129e15695f7137a1e7 + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 37 # DW_FORM_strx1 + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 115 # DW_AT_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 110 # DW_AT_linkage_name + .byte 37 # DW_FORM_strx1 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + 
.byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 1 # DW_CHILDREN_yes + .byte 54 # DW_AT_calling_convention + .byte 11 # DW_FORM_data1 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 7 # Abbreviation Code + .byte 13 # DW_TAG_member + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 56 # DW_AT_data_member_location + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 8 # Abbreviation Code + .byte 15 # DW_TAG_pointer_type + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 1 # Abbrev [1] 0xc:0x97 DW_TAG_compile_unit + .byte 0 # DW_AT_producer + .short 33 # DW_AT_language + .byte 1 # DW_AT_name + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .long .Lline_table_start0 # DW_AT_stmt_list + .byte 2 # DW_AT_comp_dir + .byte 1 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_addr_base + .byte 2 # Abbrev [2] 0x23:0xb DW_TAG_variable + .byte 3 # DW_AT_name + .long 46 # DW_AT_type + # DW_AT_external + .byte 0 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .byte 2 # DW_AT_location + .byte 161 + .byte 0 + .byte 3 # Abbrev [3] 0x2e:0x4 DW_TAG_base_type + .byte 4 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 4 # Abbrev [4] 0x32:0x27 DW_TAG_subprogram + .byte 1 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 5 # DW_AT_linkage_name + .byte 6 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 8 # DW_AT_decl_line + .long 46 # DW_AT_type + # DW_AT_external + .byte 5 # Abbrev [5] 0x42:0xb DW_TAG_variable + .byte 2 # DW_AT_location + .byte 145 + .byte 112 + .byte 7 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 9 # DW_AT_decl_line + .long 89 # DW_AT_type + .byte 5 # Abbrev [5] 0x4d:0xb DW_TAG_variable + .byte 2 # DW_AT_location + .byte 145 + .byte 88 + .byte 11 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 10 # DW_AT_decl_line + .long 119 # DW_AT_type + .byte 0 # End Of Children Mark + .byte 6 # Abbrev [6] 0x59:0x19 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 10 # DW_AT_name + .byte 16 # DW_AT_byte_size + .byte 0 # DW_AT_decl_file + .byte 3 # DW_AT_decl_line + .byte 7 # Abbrev [7] 0x5f:0x9 DW_TAG_member + .byte 8 # DW_AT_name + .long 114 # DW_AT_type + .byte 0 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 7 # Abbrev [7] 0x68:0x9 DW_TAG_member + .byte 9 # DW_AT_name + .long 114 # DW_AT_type + .byte 0 # 
DW_AT_decl_file + .byte 5 # DW_AT_decl_line + .byte 8 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 8 # Abbrev [8] 0x72:0x5 DW_TAG_pointer_type + .long 46 # DW_AT_type + .byte 6 # Abbrev [6] 0x77:0x22 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 14 # DW_AT_name + .byte 24 # DW_AT_byte_size + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .byte 7 # Abbrev [7] 0x7d:0x9 DW_TAG_member + .byte 8 # DW_AT_name + .long 153 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 7 # Abbrev [7] 0x86:0x9 DW_TAG_member + .byte 9 # DW_AT_name + .long 153 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 3 # DW_AT_decl_line + .byte 8 # DW_AT_data_member_location + .byte 7 # Abbrev [7] 0x8f:0x9 DW_TAG_member + .byte 13 # DW_AT_name + .long 153 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .byte 16 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 8 # Abbrev [8] 0x99:0x5 DW_TAG_pointer_type + .long 158 # DW_AT_type + .byte 3 # Abbrev [3] 0x9e:0x4 DW_TAG_base_type + .byte 12 # DW_AT_name + .byte 6 # DW_AT_encoding + .byte 1 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_end0: + .section .debug_str_offsets,"",@progbits + .long 64 # Length of String Offsets Set + .short 5 + .short 0 +.Lstr_offsets_base0: + .section .debug_str,"MS",@progbits,1 +.Linfo_string0: + .asciz "clang version 18.0.0git" # string offset=0 +.Linfo_string1: + .asciz "helper.cpp" # string offset=24 +.Linfo_string2: + .asciz "/home/ayermolo/local/tasks/T138552329/typeDedup" # string offset=35 +.Linfo_string3: + .asciz "fooint" # string offset=83 +.Linfo_string4: + .asciz "int" # string offset=90 +.Linfo_string5: + .asciz "foo" # string offset=94 +.Linfo_string6: + .asciz "_Z3foov" # string offset=98 +.Linfo_string7: + .asciz "fint" # string offset=106 +.Linfo_string8: + .asciz "Foo2Int" # string offset=111 +.Linfo_string9: + .asciz "c1" # 
string offset=119 +.Linfo_string10: + .asciz "c2" # string offset=122 +.Linfo_string11: + .asciz "f" # string offset=125 +.Linfo_string12: + .asciz "Foo2a" # string offset=127 +.Linfo_string13: + .asciz "char" # string offset=133 +.Linfo_string14: + .asciz "c3" # string offset=138 + .section .debug_str_offsets,"",@progbits + .long .Linfo_string0 + .long .Linfo_string1 + .long .Linfo_string2 + .long .Linfo_string3 + .long .Linfo_string4 + .long .Linfo_string6 + .long .Linfo_string5 + .long .Linfo_string7 + .long .Linfo_string9 + .long .Linfo_string10 + .long .Linfo_string8 + .long .Linfo_string11 + .long .Linfo_string13 + .long .Linfo_string14 + .long .Linfo_string12 + .section .debug_addr,"",@progbits + .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution +.Ldebug_addr_start0: + .short 5 # DWARF version number + .byte 8 # Address size + .byte 0 # Segment selector size +.Laddr_table_base0: + .quad fooint + .quad .Lfunc_begin0 +.Ldebug_addr_end0: + .section .debug_names,"",@progbits + .long .Lnames_end0-.Lnames_start0 # Header: unit length +.Lnames_start0: + .short 5 # Header: version + .short 0 # Header: padding + .long 1 # Header: compilation unit count + .long 0 # Header: local type unit count + .long 0 # Header: foreign type unit count + .long 7 # Header: bucket count + .long 7 # Header: name count + .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size + .long 8 # Header: augmentation string size + .ascii "LLVM0700" # Header: augmentation string + .long .Lcu_begin0 # Compilation unit 0 + .long 1 # Bucket 0 + .long 0 # Bucket 1 + .long 2 # Bucket 2 + .long 3 # Bucket 3 + .long 0 # Bucket 4 + .long 5 # Bucket 5 + .long 7 # Bucket 6 + .long -1257882357 # Hash in Bucket 0 + .long -1168750522 # Hash in Bucket 2 + .long 193495088 # Hash in Bucket 3 + .long 259227804 # Hash in Bucket 3 + .long 193491849 # Hash in Bucket 5 + .long 2090147939 # Hash in Bucket 5 + .long -35356620 # Hash in Bucket 6 + .long .Linfo_string6 # 
String in Bucket 0: _Z3foov + .long .Linfo_string8 # String in Bucket 2: Foo2Int + .long .Linfo_string4 # String in Bucket 3: int + .long .Linfo_string12 # String in Bucket 3: Foo2a + .long .Linfo_string5 # String in Bucket 5: foo + .long .Linfo_string13 # String in Bucket 5: char + .long .Linfo_string3 # String in Bucket 6: fooint + .long .Lnames3-.Lnames_entries0 # Offset in Bucket 0 + .long .Lnames4-.Lnames_entries0 # Offset in Bucket 2 + .long .Lnames0-.Lnames_entries0 # Offset in Bucket 3 + .long .Lnames5-.Lnames_entries0 # Offset in Bucket 3 + .long .Lnames2-.Lnames_entries0 # Offset in Bucket 5 + .long .Lnames6-.Lnames_entries0 # Offset in Bucket 5 + .long .Lnames1-.Lnames_entries0 # Offset in Bucket 6 +.Lnames_abbrev_start0: + .ascii "\230." # Abbrev code + .byte 46 # DW_TAG_subprogram + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 4 # DW_IDX_parent + .byte 25 # DW_FORM_flag_present + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\230\023" # Abbrev code + .byte 19 # DW_TAG_structure_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 4 # DW_IDX_parent + .byte 25 # DW_FORM_flag_present + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\230$" # Abbrev code + .byte 36 # DW_TAG_base_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 4 # DW_IDX_parent + .byte 25 # DW_FORM_flag_present + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\2304" # Abbrev code + .byte 52 # DW_TAG_variable + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 4 # DW_IDX_parent + .byte 25 # DW_FORM_flag_present + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .byte 0 # End of abbrev list +.Lnames_abbrev_end0: +.Lnames_entries0: +.Lnames3: +.L0: + .ascii "\230." 
# Abbreviation code + .long 50 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: _Z3foov +.Lnames4: +.L5: + .ascii "\230\023" # Abbreviation code + .long 89 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: Foo2Int +.Lnames0: +.L2: + .ascii "\230$" # Abbreviation code + .long 46 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: int +.Lnames5: +.L3: + .ascii "\230\023" # Abbreviation code + .long 119 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: Foo2a +.Lnames2: + .ascii "\230." # Abbreviation code + .long 50 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: foo +.Lnames6: +.L1: + .ascii "\230$" # Abbreviation code + .long 158 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: char +.Lnames1: +.L4: + .ascii "\2304" # Abbreviation code + .long 35 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: fooint + .p2align 2, 0x0 +.Lnames_end0: + .ident "clang version 18.0.0git" + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/Inputs/dwarf5-debug-names-main.s b/bolt/test/X86/Inputs/dwarf5-debug-names-main.s new file mode 100644 index 0000000000000..2d3f4d2d208f5 --- /dev/null +++ b/bolt/test/X86/Inputs/dwarf5-debug-names-main.s @@ -0,0 +1,712 @@ +# clang++ -g2 -gdwarf-5 -gpubnames -fdebug-types-section +# header.h +# struct Foo2a { +# char *c1; +# char *c2; +# char *c3; +# }; +# main.cpp +# #include "header.h" +# extern int fooint; +# namespace { +# struct t1 { +# int i; +# }; +# } +# template struct t2 { +# t1 v1; +# }; +# struct t3 { +# t2<&fooint> v1; +# }; +# t3 v1; +# +# struct Foo { +# char *c1; +# char *c2; +# char *c3; +# }; +# struct Foo2 { +# char *c1; +# char *c2; +# }; +# int main(int argc, char *argv[]) { +# Foo f; +# Foo2 f2; +# Foo2a f3; +# return 0; +# } + .text + .file "main.cpp" + .file 0 "/typeDedup" "main.cpp" md5 0x04e636082b2b8a95a6ca39dde52372ae + .globl main # -- Begin 
function main + .p2align 4, 0x90 + .type main,@function +main: # @main +.Lfunc_begin0: + .loc 0 25 0 # main.cpp:25:0 + .cfi_startproc +# %bb.0: # %entry + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + movl $0, -4(%rbp) + movl %edi, -8(%rbp) + movq %rsi, -16(%rbp) +.Ltmp0: + .loc 0 29 2 prologue_end # main.cpp:29:2 + xorl %eax, %eax + .loc 0 29 2 epilogue_begin is_stmt 0 # main.cpp:29:2 + popq %rbp + .cfi_def_cfa %rsp, 8 + retq +.Ltmp1: +.Lfunc_end0: + .size main, .Lfunc_end0-main + .cfi_endproc + # -- End function + .type v1,@object # @v1 + .bss + .globl v1 + .p2align 2, 0x0 +v1: + .zero 4 + .size v1, 4 + + .file 1 "." "header.h" md5 0xfea7bb1f22c47f129e15695f7137a1e7 + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 37 # DW_FORM_strx1 + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 115 # DW_AT_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 1 # DW_CHILDREN_yes + .byte 54 # 
DW_AT_calling_convention + .byte 11 # DW_FORM_data1 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 13 # DW_TAG_member + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 56 # DW_AT_data_member_location + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 48 # DW_TAG_template_value_parameter + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 15 # DW_TAG_pointer_type + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 7 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 8 # Abbreviation Code + .byte 57 # DW_TAG_namespace + .byte 1 # DW_CHILDREN_yes + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 9 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # 
DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 10 # Abbreviation Code + .byte 5 # DW_TAG_formal_parameter + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 11 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 1 # Abbrev [1] 0xc:0x11a DW_TAG_compile_unit + .byte 0 # DW_AT_producer + .short 33 # DW_AT_language + .byte 1 # DW_AT_name + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .long .Lline_table_start0 # DW_AT_stmt_list + .byte 2 # DW_AT_comp_dir + .byte 2 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_addr_base + .byte 2 # Abbrev [2] 0x23:0xb DW_TAG_variable + .byte 3 # DW_AT_name + .long 46 # DW_AT_type + # DW_AT_external + .byte 0 # DW_AT_decl_file + .byte 14 # DW_AT_decl_line + .byte 2 # DW_AT_location + .byte 161 + .byte 1 + .byte 3 # Abbrev [3] 0x2e:0x10 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 8 # DW_AT_name + .byte 4 # DW_AT_byte_size + .byte 0 # DW_AT_decl_file + .byte 11 # DW_AT_decl_line + .byte 4 # Abbrev [4] 0x34:0x9 DW_TAG_member + .byte 3 # DW_AT_name + .long 62 # DW_AT_type + .byte 0 # DW_AT_decl_file + .byte 12 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 3 # Abbrev [3] 0x3e:0x19 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 7 # DW_AT_name + .byte 4 # DW_AT_byte_size + .byte 0 # DW_AT_decl_file + .byte 8 # DW_AT_decl_line + .byte 5 # Abbrev [5] 0x44:0x9 DW_TAG_template_value_parameter + .long 87 # DW_AT_type + .byte 3 # DW_AT_location + .byte 161 + .byte 0 + .byte 159 + .byte 4 # Abbrev [4] 0x4d:0x9 DW_TAG_member + .byte 3 # DW_AT_name + .long 97 # DW_AT_type + .byte 0 # DW_AT_decl_file + .byte 9 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 6 # Abbrev [6] 0x57:0x5 DW_TAG_pointer_type + .long 92 # DW_AT_type + .byte 7 # Abbrev [7] 0x5c:0x4 DW_TAG_base_type + .byte 4 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 8 # Abbrev [8] 0x60:0x12 DW_TAG_namespace + .byte 3 # Abbrev [3] 0x61:0x10 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 6 # DW_AT_name + .byte 4 # DW_AT_byte_size + 
.byte 0 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .byte 4 # Abbrev [4] 0x67:0x9 DW_TAG_member + .byte 5 # DW_AT_name + .long 92 # DW_AT_type + .byte 0 # DW_AT_decl_file + .byte 5 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 0 # End Of Children Mark + .byte 9 # Abbrev [9] 0x72:0x48 DW_TAG_subprogram + .byte 2 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 9 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 25 # DW_AT_decl_line + .long 92 # DW_AT_type + # DW_AT_external + .byte 10 # Abbrev [10] 0x81:0xb DW_TAG_formal_parameter + .byte 2 # DW_AT_location + .byte 145 + .byte 120 + .byte 10 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 25 # DW_AT_decl_line + .long 92 # DW_AT_type + .byte 10 # Abbrev [10] 0x8c:0xb DW_TAG_formal_parameter + .byte 2 # DW_AT_location + .byte 145 + .byte 112 + .byte 11 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 25 # DW_AT_decl_line + .long 186 # DW_AT_type + .byte 11 # Abbrev [11] 0x97:0xb DW_TAG_variable + .byte 2 # DW_AT_location + .byte 145 + .byte 88 + .byte 13 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 26 # DW_AT_decl_line + .long 200 # DW_AT_type + .byte 11 # Abbrev [11] 0xa2:0xb DW_TAG_variable + .byte 2 # DW_AT_location + .byte 145 + .byte 72 + .byte 18 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 27 # DW_AT_decl_line + .long 234 # DW_AT_type + .byte 11 # Abbrev [11] 0xad:0xc DW_TAG_variable + .byte 3 # DW_AT_location + .byte 145 + .ascii "\260\177" + .byte 20 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 28 # DW_AT_decl_line + .long 259 # DW_AT_type + .byte 0 # End Of Children Mark + .byte 6 # Abbrev [6] 0xba:0x5 DW_TAG_pointer_type + .long 191 # DW_AT_type + .byte 6 # Abbrev [6] 0xbf:0x5 DW_TAG_pointer_type + .long 196 # DW_AT_type + .byte 7 # Abbrev [7] 0xc4:0x4 DW_TAG_base_type + .byte 12 # DW_AT_name + .byte 6 # DW_AT_encoding + .byte 1 # DW_AT_byte_size + .byte 3 # Abbrev [3] 
0xc8:0x22 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 17 # DW_AT_name + .byte 24 # DW_AT_byte_size + .byte 0 # DW_AT_decl_file + .byte 16 # DW_AT_decl_line + .byte 4 # Abbrev [4] 0xce:0x9 DW_TAG_member + .byte 14 # DW_AT_name + .long 191 # DW_AT_type + .byte 0 # DW_AT_decl_file + .byte 17 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 4 # Abbrev [4] 0xd7:0x9 DW_TAG_member + .byte 15 # DW_AT_name + .long 191 # DW_AT_type + .byte 0 # DW_AT_decl_file + .byte 18 # DW_AT_decl_line + .byte 8 # DW_AT_data_member_location + .byte 4 # Abbrev [4] 0xe0:0x9 DW_TAG_member + .byte 16 # DW_AT_name + .long 191 # DW_AT_type + .byte 0 # DW_AT_decl_file + .byte 19 # DW_AT_decl_line + .byte 16 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 3 # Abbrev [3] 0xea:0x19 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 19 # DW_AT_name + .byte 16 # DW_AT_byte_size + .byte 0 # DW_AT_decl_file + .byte 21 # DW_AT_decl_line + .byte 4 # Abbrev [4] 0xf0:0x9 DW_TAG_member + .byte 14 # DW_AT_name + .long 191 # DW_AT_type + .byte 0 # DW_AT_decl_file + .byte 22 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 4 # Abbrev [4] 0xf9:0x9 DW_TAG_member + .byte 15 # DW_AT_name + .long 191 # DW_AT_type + .byte 0 # DW_AT_decl_file + .byte 23 # DW_AT_decl_line + .byte 8 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 3 # Abbrev [3] 0x103:0x22 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 21 # DW_AT_name + .byte 24 # DW_AT_byte_size + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .byte 4 # Abbrev [4] 0x109:0x9 DW_TAG_member + .byte 14 # DW_AT_name + .long 191 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 4 # Abbrev [4] 0x112:0x9 DW_TAG_member + .byte 15 # DW_AT_name + .long 191 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 3 # DW_AT_decl_line + .byte 8 # DW_AT_data_member_location + .byte 4 # 
Abbrev [4] 0x11b:0x9 DW_TAG_member + .byte 16 # DW_AT_name + .long 191 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .byte 16 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 0 # End Of Children Mark +.Ldebug_info_end0: + .section .debug_str_offsets,"",@progbits + .long 92 # Length of String Offsets Set + .short 5 + .short 0 +.Lstr_offsets_base0: + .section .debug_str,"MS",@progbits,1 +.Linfo_string0: + .asciz "clang version 18.0.0git" # string offset=0 +.Linfo_string1: + .asciz "main.cpp" # string offset=24 +.Linfo_string2: + .asciz "/home/ayermolo/local/tasks/T138552329/typeDedup" # string offset=33 +.Linfo_string3: + .asciz "v1" # string offset=81 +.Linfo_string4: + .asciz "t3" # string offset=84 +.Linfo_string5: + .asciz "t2<&fooint>" # string offset=87 +.Linfo_string6: + .asciz "int" # string offset=99 +.Linfo_string7: + .asciz "(anonymous namespace)" # string offset=103 +.Linfo_string8: + .asciz "t1" # string offset=125 +.Linfo_string9: + .asciz "i" # string offset=128 +.Linfo_string10: + .asciz "main" # string offset=130 +.Linfo_string11: + .asciz "argc" # string offset=135 +.Linfo_string12: + .asciz "argv" # string offset=140 +.Linfo_string13: + .asciz "char" # string offset=145 +.Linfo_string14: + .asciz "f" # string offset=150 +.Linfo_string15: + .asciz "Foo" # string offset=152 +.Linfo_string16: + .asciz "c1" # string offset=156 +.Linfo_string17: + .asciz "c2" # string offset=159 +.Linfo_string18: + .asciz "c3" # string offset=162 +.Linfo_string19: + .asciz "f2" # string offset=165 +.Linfo_string20: + .asciz "Foo2" # string offset=168 +.Linfo_string21: + .asciz "f3" # string offset=173 +.Linfo_string22: + .asciz "Foo2a" # string offset=176 + .section .debug_str_offsets,"",@progbits + .long .Linfo_string0 + .long .Linfo_string1 + .long .Linfo_string2 + .long .Linfo_string3 + .long .Linfo_string6 + .long .Linfo_string9 + .long .Linfo_string8 + .long .Linfo_string5 + .long .Linfo_string4 + .long .Linfo_string10 
+ .long .Linfo_string11 + .long .Linfo_string12 + .long .Linfo_string13 + .long .Linfo_string14 + .long .Linfo_string16 + .long .Linfo_string17 + .long .Linfo_string18 + .long .Linfo_string15 + .long .Linfo_string19 + .long .Linfo_string20 + .long .Linfo_string21 + .long .Linfo_string22 + .section .debug_addr,"",@progbits + .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution +.Ldebug_addr_start0: + .short 5 # DWARF version number + .byte 8 # Address size + .byte 0 # Segment selector size +.Laddr_table_base0: + .quad fooint + .quad v1 + .quad .Lfunc_begin0 +.Ldebug_addr_end0: + .section .debug_names,"",@progbits + .long .Lnames_end0-.Lnames_start0 # Header: unit length +.Lnames_start0: + .short 5 # Header: version + .short 0 # Header: padding + .long 1 # Header: compilation unit count + .long 0 # Header: local type unit count + .long 0 # Header: foreign type unit count + .long 11 # Header: bucket count + .long 11 # Header: name count + .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size + .long 8 # Header: augmentation string size + .ascii "LLVM0700" # Header: augmentation string + .long .Lcu_begin0 # Compilation unit 0 + .long 1 # Bucket 0 + .long 3 # Bucket 1 + .long 5 # Bucket 2 + .long 0 # Bucket 3 + .long 0 # Bucket 4 + .long 6 # Bucket 5 + .long 8 # Bucket 6 + .long 9 # Bucket 7 + .long 11 # Bucket 8 + .long 0 # Bucket 9 + .long 0 # Bucket 10 + .long 259227804 # Hash in Bucket 0 + .long 2090147939 # Hash in Bucket 0 + .long 193491849 # Hash in Bucket 1 + .long 958480634 # Hash in Bucket 1 + .long 2090263771 # Hash in Bucket 2 + .long 5863786 # Hash in Bucket 5 + .long 5863852 # Hash in Bucket 5 + .long 193495088 # Hash in Bucket 6 + .long 5863788 # Hash in Bucket 7 + .long 2090499946 # Hash in Bucket 7 + .long -1929613044 # Hash in Bucket 8 + .long .Linfo_string22 # String in Bucket 0: Foo2a + .long .Linfo_string13 # String in Bucket 0: char + .long .Linfo_string15 # String in Bucket 1: Foo + .long .Linfo_string5 
# String in Bucket 1: t2<&fooint> + .long .Linfo_string20 # String in Bucket 2: Foo2 + .long .Linfo_string8 # String in Bucket 5: t1 + .long .Linfo_string3 # String in Bucket 5: v1 + .long .Linfo_string6 # String in Bucket 6: int + .long .Linfo_string4 # String in Bucket 7: t3 + .long .Linfo_string10 # String in Bucket 7: main + .long .Linfo_string7 # String in Bucket 8: (anonymous namespace) + .long .Lnames10-.Lnames_entries0 # Offset in Bucket 0 + .long .Lnames7-.Lnames_entries0 # Offset in Bucket 0 + .long .Lnames8-.Lnames_entries0 # Offset in Bucket 1 + .long .Lnames1-.Lnames_entries0 # Offset in Bucket 1 + .long .Lnames9-.Lnames_entries0 # Offset in Bucket 2 + .long .Lnames4-.Lnames_entries0 # Offset in Bucket 5 + .long .Lnames5-.Lnames_entries0 # Offset in Bucket 5 + .long .Lnames2-.Lnames_entries0 # Offset in Bucket 6 + .long .Lnames0-.Lnames_entries0 # Offset in Bucket 7 + .long .Lnames6-.Lnames_entries0 # Offset in Bucket 7 + .long .Lnames3-.Lnames_entries0 # Offset in Bucket 8 +.Lnames_abbrev_start0: + .ascii "\2309" # Abbrev code + .byte 57 # DW_TAG_namespace + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 4 # DW_IDX_parent + .byte 25 # DW_FORM_flag_present + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\270\023" # Abbrev code + .byte 19 # DW_TAG_structure_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 4 # DW_IDX_parent + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\230\023" # Abbrev code + .byte 19 # DW_TAG_structure_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 4 # DW_IDX_parent + .byte 25 # DW_FORM_flag_present + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\230$" # Abbrev code + .byte 36 # DW_TAG_base_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 4 # DW_IDX_parent + .byte 25 # DW_FORM_flag_present + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\2304" # Abbrev code + .byte 52 # 
DW_TAG_variable + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 4 # DW_IDX_parent + .byte 25 # DW_FORM_flag_present + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\230." # Abbrev code + .byte 46 # DW_TAG_subprogram + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 4 # DW_IDX_parent + .byte 25 # DW_FORM_flag_present + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .byte 0 # End of abbrev list +.Lnames_abbrev_end0: +.Lnames_entries0: +.Lnames10: +.L1: + .ascii "\230\023" # Abbreviation code + .long 259 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: Foo2a +.Lnames7: +.L8: + .ascii "\230$" # Abbreviation code + .long 196 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: char +.Lnames8: +.L0: + .ascii "\230\023" # Abbreviation code + .long 200 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: Foo +.Lnames1: +.L2: + .ascii "\230\023" # Abbreviation code + .long 62 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: t2<&fooint> +.Lnames9: +.L9: + .ascii "\230\023" # Abbreviation code + .long 234 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: Foo2 +.Lnames4: +.L5: + .ascii "\270\023" # Abbreviation code + .long 97 # DW_IDX_die_offset + .long .L3-.Lnames_entries0 # DW_IDX_parent + .byte 0 # End of list: t1 +.Lnames5: +.L7: + .ascii "\2304" # Abbreviation code + .long 35 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: v1 +.Lnames2: +.L10: + .ascii "\230$" # Abbreviation code + .long 92 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: int +.Lnames0: +.L6: + .ascii "\230\023" # Abbreviation code + .long 46 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: t3 +.Lnames6: +.L4: + .ascii "\230." 
# Abbreviation code + .long 114 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: main +.Lnames3: +.L3: + .ascii "\2309" # Abbreviation code + .long 96 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: (anonymous namespace) + .p2align 2, 0x0 +.Lnames_end0: + .ident "clang version 18.0.0git" + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/Inputs/dwarf5-df-debug-names-helper.s b/bolt/test/X86/Inputs/dwarf5-df-debug-names-helper.s new file mode 100644 index 0000000000000..8a53bda5f7124 --- /dev/null +++ b/bolt/test/X86/Inputs/dwarf5-df-debug-names-helper.s @@ -0,0 +1,326 @@ +# clang++ -gsplit-dwarf -g2 -gdwarf-5 -gpubnames -fdebug-compilation-dir='.' +# header.h +# struct Foo2a { +# char *c1; +# char *c2; +# char *c3; +# }; +# helper.cpp +# #include "header.h" +# struct Foo2Int { +# int *c1; +# int *c2; +# }; +# Foo2Int fint; +# const Foo2a f{nullptr, nullptr}; + + .text + .file "helper.cpp" + .file 0 "." "helper.cpp" md5 0x2804efac708fd4180d403e6d5dbcc54a + .type fint,@object # @fint + .bss + .globl fint + .p2align 3, 0x0 +fint: + .zero 16 + .size fint, 16 + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 74 # DW_TAG_skeleton_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 115 # DW_AT_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 4 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. 
Section + .quad 3223434782003797151 + .byte 1 # Abbrev [1] 0x14:0xf DW_TAG_skeleton_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .byte 0 # DW_AT_comp_dir + .byte 1 # DW_AT_dwo_name + .long .Laddr_table_base0 # DW_AT_addr_base +.Ldebug_info_end0: + .section .debug_str_offsets,"",@progbits + .long 12 # Length of String Offsets Set + .short 5 + .short 0 +.Lstr_offsets_base0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "Foo2Int" # string offset=2 +.Lskel_string2: + .asciz "int" # string offset=10 +.Lskel_string3: + .asciz "fint" # string offset=14 +.Lskel_string4: + .asciz "helper.dwo" # string offset=19 + .section .debug_str_offsets,"",@progbits + .long .Lskel_string0 + .long .Lskel_string4 + .section .debug_str_offsets.dwo,"e",@progbits + .long 36 # Length of String Offsets Set + .short 5 + .short 0 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "fint" # string offset=0 +.Linfo_string1: + .asciz "c1" # string offset=5 +.Linfo_string2: + .asciz "int" # string offset=8 +.Linfo_string3: + .asciz "c2" # string offset=12 +.Linfo_string4: + .asciz "Foo2Int" # string offset=15 +.Linfo_string5: + .asciz "clang version 19.0.0git (git@github.com:ayermolo/llvm-project.git da9e9277be64deca73370a90d22af33e5b37cc52)" # string offset=23 +.Linfo_string6: + .asciz "helper.cpp" # string offset=131 +.Linfo_string7: + .asciz "helper.dwo" # string offset=142 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 5 + .long 8 + .long 12 + .long 15 + .long 23 + .long 131 + .long 142 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 5 # DWARF version number + .byte 5 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. 
Section + .quad 3223434782003797151 + .byte 1 # Abbrev [1] 0x14:0x34 DW_TAG_compile_unit + .byte 5 # DW_AT_producer + .short 33 # DW_AT_language + .byte 6 # DW_AT_name + .byte 7 # DW_AT_dwo_name + .byte 2 # Abbrev [2] 0x1a:0xb DW_TAG_variable + .byte 0 # DW_AT_name + .long 37 # DW_AT_type + # DW_AT_external + .byte 0 # DW_AT_decl_file + .byte 7 # DW_AT_decl_line + .byte 2 # DW_AT_location + .byte 161 + .byte 0 + .byte 3 # Abbrev [3] 0x25:0x19 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 4 # DW_AT_name + .byte 16 # DW_AT_byte_size + .byte 0 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .byte 4 # Abbrev [4] 0x2b:0x9 DW_TAG_member + .byte 1 # DW_AT_name + .long 62 # DW_AT_type + .byte 0 # DW_AT_decl_file + .byte 3 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 4 # Abbrev [4] 0x34:0x9 DW_TAG_member + .byte 3 # DW_AT_name + .long 62 # DW_AT_type + .byte 0 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .byte 8 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 5 # Abbrev [5] 0x3e:0x5 DW_TAG_pointer_type + .long 67 # DW_AT_type + .byte 6 # Abbrev [6] 0x43:0x4 DW_TAG_base_type + .byte 2 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 37 # DW_FORM_strx1 + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # 
DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 1 # DW_CHILDREN_yes + .byte 54 # DW_AT_calling_convention + .byte 11 # DW_FORM_data1 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 13 # DW_TAG_member + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 56 # DW_AT_data_member_location + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 15 # DW_TAG_pointer_type + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits + .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution +.Ldebug_addr_start0: + .short 5 # DWARF version number + .byte 8 # Address size + .byte 0 # Segment selector size +.Laddr_table_base0: + .quad fint +.Ldebug_addr_end0: + .section .debug_names,"",@progbits + .long .Lnames_end0-.Lnames_start0 # Header: unit length +.Lnames_start0: + .short 5 # Header: version + .short 0 # Header: padding + .long 1 # Header: compilation unit count + .long 0 # Header: local type unit count + .long 0 # Header: foreign type 
unit count + .long 3 # Header: bucket count + .long 3 # Header: name count + .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size + .long 8 # Header: augmentation string size + .ascii "LLVM0700" # Header: augmentation string + .long .Lcu_begin0 # Compilation unit 0 + .long 1 # Bucket 0 + .long 2 # Bucket 1 + .long 3 # Bucket 2 + .long -1168750522 # Hash in Bucket 0 + .long 2090257270 # Hash in Bucket 1 + .long 193495088 # Hash in Bucket 2 + .long .Lskel_string1 # String in Bucket 0: Foo2Int + .long .Lskel_string3 # String in Bucket 1: fint + .long .Lskel_string2 # String in Bucket 2: int + .long .Lnames0-.Lnames_entries0 # Offset in Bucket 0 + .long .Lnames2-.Lnames_entries0 # Offset in Bucket 1 + .long .Lnames1-.Lnames_entries0 # Offset in Bucket 2 +.Lnames_abbrev_start0: + .ascii "\230\023" # Abbrev code + .byte 19 # DW_TAG_structure_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 4 # DW_IDX_parent + .byte 25 # DW_FORM_flag_present + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\2304" # Abbrev code + .byte 52 # DW_TAG_variable + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 4 # DW_IDX_parent + .byte 25 # DW_FORM_flag_present + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\230$" # Abbrev code + .byte 36 # DW_TAG_base_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 4 # DW_IDX_parent + .byte 25 # DW_FORM_flag_present + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .byte 0 # End of abbrev list +.Lnames_abbrev_end0: +.Lnames_entries0: +.Lnames0: +.L1: + .ascii "\230\023" # Abbreviation code + .long 37 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: Foo2Int +.Lnames2: +.L0: + .ascii "\2304" # Abbreviation code + .long 26 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: fint +.Lnames1: +.L2: + .ascii "\230$" # Abbreviation code + .long 67 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: int + .p2align 2, 
0x0 +.Lnames_end0: + .ident "clang version 19.0.0git (git@github.com:ayermolo/llvm-project.git da9e9277be64deca73370a90d22af33e5b37cc52)" + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/Inputs/dwarf5-df-debug-names-main.s b/bolt/test/X86/Inputs/dwarf5-df-debug-names-main.s new file mode 100644 index 0000000000000..7ca7213239175 --- /dev/null +++ b/bolt/test/X86/Inputs/dwarf5-df-debug-names-main.s @@ -0,0 +1,493 @@ +# clang++ -gsplit-dwarf -g2 -gdwarf-5 -gpubnames -fdebug-compilation-dir='.' +# header.h +# struct Foo2a { +# char *c1; +# char *c2; +# char *c3; +# }; +# main.cpp +# #include "header.h" +# struct Foo2 { +# char *c1; +# }; +# int main(int argc, char *argv[]) { +# Foo2 f2; +# Foo2a f3; +# return 0; +# } + + .text + .file "main.cpp" + .globl main # -- Begin function main + .p2align 4, 0x90 + .type main,@function +main: # @main +.Lfunc_begin0: + .file 0 "." "main.cpp" md5 0x9c5cea5bb78d3fc265cd175110bfe903 + .loc 0 5 0 # main.cpp:5:0 + .cfi_startproc +# %bb.0: # %entry + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + movl $0, -4(%rbp) + movl %edi, -8(%rbp) + movq %rsi, -16(%rbp) +.Ltmp0: + .loc 0 8 2 prologue_end # main.cpp:8:2 + xorl %eax, %eax + .loc 0 8 2 epilogue_begin is_stmt 0 # main.cpp:8:2 + popq %rbp + .cfi_def_cfa %rsp, 8 + retq +.Ltmp1: +.Lfunc_end0: + .size main, .Lfunc_end0-main + .cfi_endproc + # -- End function + .file 1 "." 
"header.h" md5 0xfea7bb1f22c47f129e15695f7137a1e7 + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 74 # DW_TAG_skeleton_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 115 # DW_AT_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 4 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .quad -5618023701701543936 + .byte 1 # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .byte 0 # DW_AT_comp_dir + .byte 1 # DW_AT_dwo_name + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_addr_base +.Ldebug_info_end0: + .section .debug_str_offsets,"",@progbits + .long 12 # Length of String Offsets Set + .short 5 + .short 0 +.Lstr_offsets_base0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." 
# string offset=0 +.Lskel_string1: + .asciz "main" # string offset=2 +.Lskel_string2: + .asciz "int" # string offset=7 +.Lskel_string3: + .asciz "char" # string offset=11 +.Lskel_string4: + .asciz "Foo2" # string offset=16 +.Lskel_string5: + .asciz "Foo2a" # string offset=21 +.Lskel_string6: + .asciz "main.dwo" # string offset=27 + .section .debug_str_offsets,"",@progbits + .long .Lskel_string0 + .long .Lskel_string6 + .section .debug_str_offsets.dwo,"e",@progbits + .long 64 # Length of String Offsets Set + .short 5 + .short 0 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "main" # string offset=0 +.Linfo_string1: + .asciz "int" # string offset=5 +.Linfo_string2: + .asciz "argc" # string offset=9 +.Linfo_string3: + .asciz "argv" # string offset=14 +.Linfo_string4: + .asciz "char" # string offset=19 +.Linfo_string5: + .asciz "f2" # string offset=24 +.Linfo_string6: + .asciz "c1" # string offset=27 +.Linfo_string7: + .asciz "Foo2" # string offset=30 +.Linfo_string8: + .asciz "f3" # string offset=35 +.Linfo_string9: + .asciz "c2" # string offset=38 +.Linfo_string10: + .asciz "c3" # string offset=41 +.Linfo_string11: + .asciz "Foo2a" # string offset=44 +.Linfo_string12: + .asciz "clang version 19.0.0git (git@github.com:ayermolo/llvm-project.git da9e9277be64deca73370a90d22af33e5b37cc52)" # string offset=50 +.Linfo_string13: + .asciz "main.cpp" # string offset=158 +.Linfo_string14: + .asciz "main.dwo" # string offset=167 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 5 + .long 9 + .long 14 + .long 19 + .long 24 + .long 27 + .long 30 + .long 35 + .long 38 + .long 41 + .long 44 + .long 50 + .long 158 + .long 167 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 5 # DWARF version number + .byte 5 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. 
Section + .quad -5618023701701543936 + .byte 1 # Abbrev [1] 0x14:0x87 DW_TAG_compile_unit + .byte 12 # DW_AT_producer + .short 33 # DW_AT_language + .byte 13 # DW_AT_name + .byte 14 # DW_AT_dwo_name + .byte 2 # Abbrev [2] 0x1a:0x3c DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 5 # DW_AT_decl_line + .long 86 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x29:0xb DW_TAG_formal_parameter + .byte 2 # DW_AT_location + .byte 145 + .byte 120 + .byte 2 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 5 # DW_AT_decl_line + .long 86 # DW_AT_type + .byte 3 # Abbrev [3] 0x34:0xb DW_TAG_formal_parameter + .byte 2 # DW_AT_location + .byte 145 + .byte 112 + .byte 3 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 5 # DW_AT_decl_line + .long 90 # DW_AT_type + .byte 4 # Abbrev [4] 0x3f:0xb DW_TAG_variable + .byte 2 # DW_AT_location + .byte 145 + .byte 104 + .byte 5 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 6 # DW_AT_decl_line + .long 104 # DW_AT_type + .byte 4 # Abbrev [4] 0x4a:0xb DW_TAG_variable + .byte 2 # DW_AT_location + .byte 145 + .byte 80 + .byte 8 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 7 # DW_AT_decl_line + .long 120 # DW_AT_type + .byte 0 # End Of Children Mark + .byte 5 # Abbrev [5] 0x56:0x4 DW_TAG_base_type + .byte 1 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 6 # Abbrev [6] 0x5a:0x5 DW_TAG_pointer_type + .long 95 # DW_AT_type + .byte 6 # Abbrev [6] 0x5f:0x5 DW_TAG_pointer_type + .long 100 # DW_AT_type + .byte 5 # Abbrev [5] 0x64:0x4 DW_TAG_base_type + .byte 4 # DW_AT_name + .byte 6 # DW_AT_encoding + .byte 1 # DW_AT_byte_size + .byte 7 # Abbrev [7] 0x68:0x10 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 7 # DW_AT_name + .byte 8 # DW_AT_byte_size + .byte 0 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .byte 8 # Abbrev [8] 0x6e:0x9 DW_TAG_member + 
.byte 6 # DW_AT_name + .long 95 # DW_AT_type + .byte 0 # DW_AT_decl_file + .byte 3 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 7 # Abbrev [7] 0x78:0x22 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 11 # DW_AT_name + .byte 24 # DW_AT_byte_size + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .byte 8 # Abbrev [8] 0x7e:0x9 DW_TAG_member + .byte 6 # DW_AT_name + .long 95 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 8 # Abbrev [8] 0x87:0x9 DW_TAG_member + .byte 9 # DW_AT_name + .long 95 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 3 # DW_AT_decl_line + .byte 8 # DW_AT_data_member_location + .byte 8 # Abbrev [8] 0x90:0x9 DW_TAG_member + .byte 10 # DW_AT_name + .long 95 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .byte 16 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 37 # DW_FORM_strx1 + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # 
Abbreviation Code + .byte 5 # DW_TAG_formal_parameter + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 15 # DW_TAG_pointer_type + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 7 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 1 # DW_CHILDREN_yes + .byte 54 # DW_AT_calling_convention + .byte 11 # DW_FORM_data1 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 8 # Abbreviation Code + .byte 13 # DW_TAG_member + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 56 # DW_AT_data_member_location + 
.byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits + .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution +.Ldebug_addr_start0: + .short 5 # DWARF version number + .byte 8 # Address size + .byte 0 # Segment selector size +.Laddr_table_base0: + .quad .Lfunc_begin0 +.Ldebug_addr_end0: + .section .debug_names,"",@progbits + .long .Lnames_end0-.Lnames_start0 # Header: unit length +.Lnames_start0: + .short 5 # Header: version + .short 0 # Header: padding + .long 1 # Header: compilation unit count + .long 0 # Header: local type unit count + .long 0 # Header: foreign type unit count + .long 5 # Header: bucket count + .long 5 # Header: name count + .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size + .long 8 # Header: augmentation string size + .ascii "LLVM0700" # Header: augmentation string + .long .Lcu_begin0 # Compilation unit 0 + .long 0 # Bucket 0 + .long 1 # Bucket 1 + .long 0 # Bucket 2 + .long 3 # Bucket 3 + .long 4 # Bucket 4 + .long 2090263771 # Hash in Bucket 1 + .long 2090499946 # Hash in Bucket 1 + .long 193495088 # Hash in Bucket 3 + .long 259227804 # Hash in Bucket 4 + .long 2090147939 # Hash in Bucket 4 + .long .Lskel_string4 # String in Bucket 1: Foo2 + .long .Lskel_string1 # String in Bucket 1: main + .long .Lskel_string2 # String in Bucket 3: int + .long .Lskel_string5 # String in Bucket 4: Foo2a + .long .Lskel_string3 # String in Bucket 4: char + .long .Lnames3-.Lnames_entries0 # Offset in Bucket 1 + .long .Lnames0-.Lnames_entries0 # Offset in Bucket 1 + .long .Lnames1-.Lnames_entries0 # Offset in Bucket 3 + .long .Lnames4-.Lnames_entries0 # Offset in Bucket 4 + .long .Lnames2-.Lnames_entries0 # Offset in Bucket 4 +.Lnames_abbrev_start0: + .ascii "\230\023" # Abbrev code + .byte 19 # DW_TAG_structure_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 4 # DW_IDX_parent + .byte 25 # DW_FORM_flag_present + .byte 0 # End of 
abbrev + .byte 0 # End of abbrev + .ascii "\230." # Abbrev code + .byte 46 # DW_TAG_subprogram + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 4 # DW_IDX_parent + .byte 25 # DW_FORM_flag_present + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\230$" # Abbrev code + .byte 36 # DW_TAG_base_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 4 # DW_IDX_parent + .byte 25 # DW_FORM_flag_present + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .byte 0 # End of abbrev list +.Lnames_abbrev_end0: +.Lnames_entries0: +.Lnames3: +.L4: + .ascii "\230\023" # Abbreviation code + .long 104 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: Foo2 +.Lnames0: +.L1: + .ascii "\230." # Abbreviation code + .long 26 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: main +.Lnames1: +.L3: + .ascii "\230$" # Abbreviation code + .long 86 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: int +.Lnames4: +.L2: + .ascii "\230\023" # Abbreviation code + .long 120 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: Foo2a +.Lnames2: +.L0: + .ascii "\230$" # Abbreviation code + .long 100 # DW_IDX_die_offset + .byte 0 # DW_IDX_parent + # End of list: char + .p2align 2, 0x0 +.Lnames_end0: + .ident "clang version 19.0.0git (git@github.com:ayermolo/llvm-project.git da9e9277be64deca73370a90d22af33e5b37cc52)" + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/Inputs/dwarf5-df-types-debug-names-helper.s b/bolt/test/X86/Inputs/dwarf5-df-types-debug-names-helper.s new file mode 100644 index 0000000000000..10842852eccdb --- /dev/null +++ b/bolt/test/X86/Inputs/dwarf5-df-types-debug-names-helper.s @@ -0,0 +1,650 @@ +# clang++ -gsplit-dwarf -g2 -gdwarf-5 -gpubnames -fdebug-types-section -fdebug-compilation-dir='.' 
-S +# header.h +# struct Foo2a { +# char *c1; +# char *c2; +# char *c3; +# }; +# #include "header.h" +# struct Foo2Int { +# int *c1; +# int *c2; +# }; +# Foo2Int fint; +# const Foo2a f{nullptr, nullptr}; + + .text + .file "helper.cpp" + .file 0 "." "helper.cpp" md5 0xc33186b2db66a78883b1546aace9855d + .globl _Z3foov # -- Begin function _Z3foov + .p2align 4, 0x90 + .type _Z3foov,@function +_Z3foov: # @_Z3foov +.Lfunc_begin0: + .loc 0 8 0 # helper.cpp:8:0 + .cfi_startproc +# %bb.0: # %entry + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +.Ltmp0: + .loc 0 11 3 prologue_end # helper.cpp:11:3 + xorl %eax, %eax + .loc 0 11 3 epilogue_begin is_stmt 0 # helper.cpp:11:3 + popq %rbp + .cfi_def_cfa %rsp, 8 + retq +.Ltmp1: +.Lfunc_end0: + .size _Z3foov, .Lfunc_end0-_Z3foov + .cfi_endproc + # -- End function + .type fooint,@object # @fooint + .bss + .globl fooint + .p2align 2, 0x0 +fooint: + .long 0 # 0x0 + .size fooint, 4 + + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 5 # DWARF version number + .byte 6 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. 
Section + .quad -3882554063269480080 # Type Signature + .long 33 # Type DIE Offset + .byte 1 # Abbrev [1] 0x18:0x2c DW_TAG_type_unit + .short 33 # DW_AT_language + .byte 5 # DW_AT_comp_dir + .byte 6 # DW_AT_dwo_name + .long 0 # DW_AT_stmt_list + .byte 2 # Abbrev [2] 0x21:0x19 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 9 # DW_AT_name + .byte 16 # DW_AT_byte_size + .byte 0 # DW_AT_decl_file + .byte 3 # DW_AT_decl_line + .byte 3 # Abbrev [3] 0x27:0x9 DW_TAG_member + .byte 7 # DW_AT_name + .long 58 # DW_AT_type + .byte 0 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 3 # Abbrev [3] 0x30:0x9 DW_TAG_member + .byte 8 # DW_AT_name + .long 58 # DW_AT_type + .byte 0 # DW_AT_decl_file + .byte 5 # DW_AT_decl_line + .byte 8 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 4 # Abbrev [4] 0x3a:0x5 DW_TAG_pointer_type + .long 63 # DW_AT_type + .byte 5 # Abbrev [5] 0x3f:0x4 DW_TAG_base_type + .byte 1 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .long .Ldebug_info_dwo_end1-.Ldebug_info_dwo_start1 # Length of Unit +.Ldebug_info_dwo_start1: + .short 5 # DWARF version number + .byte 6 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. 
Section + .quad 1175092228111723119 # Type Signature + .long 33 # Type DIE Offset + .byte 1 # Abbrev [1] 0x18:0x35 DW_TAG_type_unit + .short 33 # DW_AT_language + .byte 5 # DW_AT_comp_dir + .byte 6 # DW_AT_dwo_name + .long 0 # DW_AT_stmt_list + .byte 2 # Abbrev [2] 0x21:0x22 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 13 # DW_AT_name + .byte 24 # DW_AT_byte_size + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .byte 3 # Abbrev [3] 0x27:0x9 DW_TAG_member + .byte 7 # DW_AT_name + .long 67 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 3 # Abbrev [3] 0x30:0x9 DW_TAG_member + .byte 8 # DW_AT_name + .long 67 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 3 # DW_AT_decl_line + .byte 8 # DW_AT_data_member_location + .byte 3 # Abbrev [3] 0x39:0x9 DW_TAG_member + .byte 12 # DW_AT_name + .long 67 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .byte 16 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 4 # Abbrev [4] 0x43:0x5 DW_TAG_pointer_type + .long 72 # DW_AT_type + .byte 5 # Abbrev [5] 0x48:0x4 DW_TAG_base_type + .byte 11 # DW_AT_name + .byte 6 # DW_AT_encoding + .byte 1 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end1: + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 74 # DW_TAG_skeleton_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 115 # DW_AT_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # 
Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 4 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .quad 2142419470755914572 + .byte 1 # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .byte 0 # DW_AT_comp_dir + .byte 1 # DW_AT_dwo_name + .byte 1 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_addr_base +.Ldebug_info_end0: + .section .debug_str_offsets,"",@progbits + .long 12 # Length of String Offsets Set + .short 5 + .short 0 +.Lstr_offsets_base0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "int" # string offset=2 +.Lskel_string2: + .asciz "fooint" # string offset=6 +.Lskel_string3: + .asciz "foo" # string offset=13 +.Lskel_string4: + .asciz "_Z3foov" # string offset=17 +.Lskel_string5: + .asciz "Foo2Int" # string offset=25 +.Lskel_string6: + .asciz "Foo2a" # string offset=33 +.Lskel_string7: + .asciz "char" # string offset=39 +.Lskel_string8: + .asciz "helper.dwo" # string offset=44 + .section .debug_str_offsets,"",@progbits + .long .Lskel_string0 + .long .Lskel_string8 + .section .debug_str_offsets.dwo,"e",@progbits + .long 68 # Length of String Offsets Set + .short 5 + .short 0 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "fooint" # string offset=0 +.Linfo_string1: + .asciz "int" # string offset=7 +.Linfo_string2: + .asciz "_Z3foov" # string offset=11 +.Linfo_string3: + .asciz "foo" # string offset=19 +.Linfo_string4: + .asciz "fint" # string offset=23 +.Linfo_string5: + .asciz "." 
# string offset=28 +.Linfo_string6: + .asciz "helper.dwo" # string offset=30 +.Linfo_string7: + .asciz "c1" # string offset=41 +.Linfo_string8: + .asciz "c2" # string offset=44 +.Linfo_string9: + .asciz "Foo2Int" # string offset=47 +.Linfo_string10: + .asciz "f" # string offset=55 +.Linfo_string11: + .asciz "char" # string offset=57 +.Linfo_string12: + .asciz "c3" # string offset=62 +.Linfo_string13: + .asciz "Foo2a" # string offset=65 +.Linfo_string14: + .asciz "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)" # string offset=71 +.Linfo_string15: + .asciz "helper.cpp" # string offset=179 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 7 + .long 11 + .long 19 + .long 23 + .long 28 + .long 30 + .long 41 + .long 44 + .long 47 + .long 55 + .long 57 + .long 62 + .long 65 + .long 71 + .long 179 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end2-.Ldebug_info_dwo_start2 # Length of Unit +.Ldebug_info_dwo_start2: + .short 5 # DWARF version number + .byte 5 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. 
Section + .quad 2142419470755914572 + .byte 6 # Abbrev [6] 0x14:0x4f DW_TAG_compile_unit + .byte 14 # DW_AT_producer + .short 33 # DW_AT_language + .byte 15 # DW_AT_name + .byte 6 # DW_AT_dwo_name + .byte 7 # Abbrev [7] 0x1a:0xb DW_TAG_variable + .byte 0 # DW_AT_name + .long 37 # DW_AT_type + # DW_AT_external + .byte 0 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .byte 2 # DW_AT_location + .byte 161 + .byte 0 + .byte 5 # Abbrev [5] 0x25:0x4 DW_TAG_base_type + .byte 1 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 8 # Abbrev [8] 0x29:0x27 DW_TAG_subprogram + .byte 1 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 2 # DW_AT_linkage_name + .byte 3 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 8 # DW_AT_decl_line + .long 37 # DW_AT_type + # DW_AT_external + .byte 9 # Abbrev [9] 0x39:0xb DW_TAG_variable + .byte 2 # DW_AT_location + .byte 145 + .byte 112 + .byte 4 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 9 # DW_AT_decl_line + .long 80 # DW_AT_type + .byte 9 # Abbrev [9] 0x44:0xb DW_TAG_variable + .byte 2 # DW_AT_location + .byte 145 + .byte 88 + .byte 10 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 10 # DW_AT_decl_line + .long 89 # DW_AT_type + .byte 0 # End Of Children Mark + .byte 10 # Abbrev [10] 0x50:0x9 DW_TAG_structure_type + # DW_AT_declaration + .quad -3882554063269480080 # DW_AT_signature + .byte 10 # Abbrev [10] 0x59:0x9 DW_TAG_structure_type + # DW_AT_declaration + .quad 1175092228111723119 # DW_AT_signature + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end2: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 65 # DW_TAG_type_unit + .byte 1 # DW_CHILDREN_yes + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # 
EOM(2) + .byte 2 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 1 # DW_CHILDREN_yes + .byte 54 # DW_AT_calling_convention + .byte 11 # DW_FORM_data1 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 13 # DW_TAG_member + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 56 # DW_AT_data_member_location + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 15 # DW_TAG_pointer_type + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 37 # DW_FORM_strx1 + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 7 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 2 
# DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 8 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 110 # DW_AT_linkage_name + .byte 37 # DW_FORM_strx1 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 9 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 10 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 0 # DW_CHILDREN_no + .byte 60 # DW_AT_declaration + .byte 25 # DW_FORM_flag_present + .byte 105 # DW_AT_signature + .byte 32 # DW_FORM_ref_sig8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_line.dwo,"e",@progbits +.Ltmp2: + .long .Ldebug_line_end0-.Ldebug_line_start0 # unit length +.Ldebug_line_start0: + .short 5 + .byte 8 + .byte 0 + .long .Lprologue_end0-.Lprologue_start0 +.Lprologue_start0: + .byte 1 + .byte 1 + .byte 1 + .byte -5 + .byte 14 + .byte 1 + .byte 1 + .byte 1 + .byte 8 + .byte 2 + .byte 46 + .byte 0 + .byte 46 + .byte 0 + .byte 3 + .byte 1 + .byte 8 + .byte 2 + .byte 15 + .byte 5 + .byte 30 + .byte 2 + .ascii "helper.cpp" + .byte 0 + .byte 0 + .byte 0xc3, 0x31, 0x86, 0xb2 + .byte 0xdb, 0x66, 0xa7, 0x88 + .byte 0x83, 0xb1, 0x54, 0x6a + .byte 0xac, 0xe9, 0x85, 0x5d + 
.ascii "header.h" + .byte 0 + .byte 1 + .byte 0xfe, 0xa7, 0xbb, 0x1f + .byte 0x22, 0xc4, 0x7f, 0x12 + .byte 0x9e, 0x15, 0x69, 0x5f + .byte 0x71, 0x37, 0xa1, 0xe7 +.Lprologue_end0: +.Ldebug_line_end0: + .section .debug_addr,"",@progbits + .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution +.Ldebug_addr_start0: + .short 5 # DWARF version number + .byte 8 # Address size + .byte 0 # Segment selector size +.Laddr_table_base0: + .quad fooint + .quad .Lfunc_begin0 +.Ldebug_addr_end0: + .section .debug_names,"",@progbits + .long .Lnames_end0-.Lnames_start0 # Header: unit length +.Lnames_start0: + .short 5 # Header: version + .short 0 # Header: padding + .long 1 # Header: compilation unit count + .long 0 # Header: local type unit count + .long 2 # Header: foreign type unit count + .long 7 # Header: bucket count + .long 7 # Header: name count + .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size + .long 8 # Header: augmentation string size + .ascii "LLVM0700" # Header: augmentation string + .long .Lcu_begin0 # Compilation unit 0 + .quad -3882554063269480080 # Type unit 0 + .quad 1175092228111723119 # Type unit 1 + .long 1 # Bucket 0 + .long 0 # Bucket 1 + .long 2 # Bucket 2 + .long 3 # Bucket 3 + .long 0 # Bucket 4 + .long 5 # Bucket 5 + .long 7 # Bucket 6 + .long -1257882357 # Hash in Bucket 0 + .long -1168750522 # Hash in Bucket 2 + .long 193495088 # Hash in Bucket 3 + .long 259227804 # Hash in Bucket 3 + .long 193491849 # Hash in Bucket 5 + .long 2090147939 # Hash in Bucket 5 + .long -35356620 # Hash in Bucket 6 + .long .Lskel_string4 # String in Bucket 0: _Z3foov + .long .Lskel_string5 # String in Bucket 2: Foo2Int + .long .Lskel_string1 # String in Bucket 3: int + .long .Lskel_string6 # String in Bucket 3: Foo2a + .long .Lskel_string3 # String in Bucket 5: foo + .long .Lskel_string7 # String in Bucket 5: char + .long .Lskel_string2 # String in Bucket 6: fooint + .long .Lnames3-.Lnames_entries0 # Offset in Bucket 0 + .long 
.Lnames4-.Lnames_entries0 # Offset in Bucket 2 + .long .Lnames0-.Lnames_entries0 # Offset in Bucket 3 + .long .Lnames5-.Lnames_entries0 # Offset in Bucket 3 + .long .Lnames2-.Lnames_entries0 # Offset in Bucket 5 + .long .Lnames6-.Lnames_entries0 # Offset in Bucket 5 + .long .Lnames1-.Lnames_entries0 # Offset in Bucket 6 +.Lnames_abbrev_start0: + .ascii "\350\004" # Abbrev code + .byte 19 # DW_TAG_structure_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\354\004" # Abbrev code + .byte 19 # DW_TAG_structure_type + .byte 2 # DW_IDX_type_unit + .byte 11 # DW_FORM_data1 + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\310\013" # Abbrev code + .byte 46 # DW_TAG_subprogram + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\210\t" # Abbrev code + .byte 36 # DW_TAG_base_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\210\r" # Abbrev code + .byte 52 # DW_TAG_variable + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\214\t" # Abbrev code + .byte 36 # DW_TAG_base_type + .byte 2 # DW_IDX_type_unit + .byte 11 # DW_FORM_data1 + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .byte 0 # End of abbrev list +.Lnames_abbrev_end0: +.Lnames_entries0: +.Lnames3: + .ascii "\310\013" # Abbreviation code + .long 41 # DW_IDX_die_offset + .byte 0 # End of list: _Z3foov +.Lnames4: + .ascii "\354\004" # Abbreviation code + .byte 0 # DW_IDX_type_unit + .long 33 # DW_IDX_die_offset + .ascii "\350\004" # Abbreviation code + .long 80 # DW_IDX_die_offset + .byte 0 # End of list: Foo2Int +.Lnames0: + .ascii "\210\t" # Abbreviation code + .long 37 # DW_IDX_die_offset + .ascii "\214\t" # 
Abbreviation code + .byte 0 # DW_IDX_type_unit + .long 63 # DW_IDX_die_offset + .byte 0 # End of list: int +.Lnames5: + .ascii "\354\004" # Abbreviation code + .byte 1 # DW_IDX_type_unit + .long 33 # DW_IDX_die_offset + .ascii "\350\004" # Abbreviation code + .long 89 # DW_IDX_die_offset + .byte 0 # End of list: Foo2a +.Lnames2: + .ascii "\310\013" # Abbreviation code + .long 41 # DW_IDX_die_offset + .byte 0 # End of list: foo +.Lnames6: + .ascii "\214\t" # Abbreviation code + .byte 1 # DW_IDX_type_unit + .long 72 # DW_IDX_die_offset + .byte 0 # End of list: char +.Lnames1: + .ascii "\210\r" # Abbreviation code + .long 26 # DW_IDX_die_offset + .byte 0 # End of list: fooint + .p2align 2, 0x0 +.Lnames_end0: + .ident "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)" + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/Inputs/dwarf5-df-types-debug-names-main.s b/bolt/test/X86/Inputs/dwarf5-df-types-debug-names-main.s new file mode 100644 index 0000000000000..f89f28ec13f4e --- /dev/null +++ b/bolt/test/X86/Inputs/dwarf5-df-types-debug-names-main.s @@ -0,0 +1,626 @@ +# clang++ -gsplit-dwarf -g2 -gdwarf-5 -gpubnames -fdebug-types-section -fdebug-compilation-dir='.' -S +# header.h +# struct Foo2a { +# char *c1; +# char *c2; +# char *c3; +# }; +# include "header.h" +# struct Foo2 { +# char *c1; +# }; +# int main(int argc, char *argv[]) { +# Foo2 f2; +# Foo2a f3; +# return 0; +# } + + .text + .file "main.cpp" + .globl main # -- Begin function main + .p2align 4, 0x90 + .type main,@function +main: # @main +.Lfunc_begin0: + .file 0 "." 
"main.cpp" md5 0x9c5cea5bb78d3fc265cd175110bfe903 + .loc 0 5 0 # main.cpp:5:0 + .cfi_startproc +# %bb.0: # %entry + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + movl $0, -4(%rbp) + movl %edi, -8(%rbp) + movq %rsi, -16(%rbp) +.Ltmp0: + .loc 0 8 2 prologue_end # main.cpp:8:2 + xorl %eax, %eax + .loc 0 8 2 epilogue_begin is_stmt 0 # main.cpp:8:2 + popq %rbp + .cfi_def_cfa %rsp, 8 + retq +.Ltmp1: +.Lfunc_end0: + .size main, .Lfunc_end0-main + .cfi_endproc + # -- End function + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 5 # DWARF version number + .byte 6 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. Section + .quad 5322170643381124694 # Type Signature + .long 33 # Type DIE Offset + .byte 1 # Abbrev [1] 0x18:0x23 DW_TAG_type_unit + .short 33 # DW_AT_language + .byte 6 # DW_AT_comp_dir + .byte 7 # DW_AT_dwo_name + .long 0 # DW_AT_stmt_list + .byte 2 # Abbrev [2] 0x21:0x10 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 9 # DW_AT_name + .byte 8 # DW_AT_byte_size + .byte 0 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .byte 3 # Abbrev [3] 0x27:0x9 DW_TAG_member + .byte 8 # DW_AT_name + .long 49 # DW_AT_type + .byte 0 # DW_AT_decl_file + .byte 3 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 4 # Abbrev [4] 0x31:0x5 DW_TAG_pointer_type + .long 54 # DW_AT_type + .byte 5 # Abbrev [5] 0x36:0x4 DW_TAG_base_type + .byte 4 # DW_AT_name + .byte 6 # DW_AT_encoding + .byte 1 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .long .Ldebug_info_dwo_end1-.Ldebug_info_dwo_start1 # Length of Unit +.Ldebug_info_dwo_start1: + .short 5 # DWARF version number + .byte 6 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. 
Section + .quad 1175092228111723119 # Type Signature + .long 33 # Type DIE Offset + .byte 1 # Abbrev [1] 0x18:0x35 DW_TAG_type_unit + .short 33 # DW_AT_language + .byte 6 # DW_AT_comp_dir + .byte 7 # DW_AT_dwo_name + .long 0 # DW_AT_stmt_list + .byte 2 # Abbrev [2] 0x21:0x22 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 13 # DW_AT_name + .byte 24 # DW_AT_byte_size + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .byte 3 # Abbrev [3] 0x27:0x9 DW_TAG_member + .byte 8 # DW_AT_name + .long 67 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 3 # Abbrev [3] 0x30:0x9 DW_TAG_member + .byte 11 # DW_AT_name + .long 67 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 3 # DW_AT_decl_line + .byte 8 # DW_AT_data_member_location + .byte 3 # Abbrev [3] 0x39:0x9 DW_TAG_member + .byte 12 # DW_AT_name + .long 67 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 4 # DW_AT_decl_line + .byte 16 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 4 # Abbrev [4] 0x43:0x5 DW_TAG_pointer_type + .long 72 # DW_AT_type + .byte 5 # Abbrev [5] 0x48:0x4 DW_TAG_base_type + .byte 4 # DW_AT_name + .byte 6 # DW_AT_encoding + .byte 1 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end1: + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 74 # DW_TAG_skeleton_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 115 # DW_AT_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # 
Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 4 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .quad 5962099678818150071 + .byte 1 # Abbrev [1] 0x14:0x14 DW_TAG_skeleton_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .byte 0 # DW_AT_comp_dir + .byte 1 # DW_AT_dwo_name + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_addr_base +.Ldebug_info_end0: + .section .debug_str_offsets,"",@progbits + .long 12 # Length of String Offsets Set + .short 5 + .short 0 +.Lstr_offsets_base0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "main" # string offset=53 +.Lskel_string2: + .asciz "int" # string offset=58 +.Lskel_string3: + .asciz "char" # string offset=62 +.Lskel_string4: + .asciz "Foo2" # string offset=67 +.Lskel_string5: + .asciz "Foo2a" # string offset=72 +.Lskel_string6: + .asciz "main.dwo" # string offset=78 + .section .debug_str_offsets,"",@progbits + .long .Lskel_string0 + .long .Lskel_string6 + .section .debug_str_offsets.dwo,"e",@progbits + .long 68 # Length of String Offsets Set + .short 5 + .short 0 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "main" # string offset=0 +.Linfo_string1: + .asciz "int" # string offset=5 +.Linfo_string2: + .asciz "argc" # string offset=9 +.Linfo_string3: + .asciz "argv" # string offset=14 +.Linfo_string4: + .asciz "char" # string offset=19 +.Linfo_string5: + .asciz "f2" # string offset=24 +.Linfo_string6: + .asciz "/home/ayermolo/local/tasks/T138552329/typeDedupSplit" # string offset=27 +.Linfo_string7: + .asciz "main.dwo" # string offset=80 +.Linfo_string8: + .asciz "c1" # string offset=89 +.Linfo_string9: + .asciz "Foo2" # string offset=92 +.Linfo_string10: + .asciz "f3" # string offset=97 +.Linfo_string11: + .asciz "c2" # string 
offset=100 +.Linfo_string12: + .asciz "c3" # string offset=103 +.Linfo_string13: + .asciz "Foo2a" # string offset=106 +.Linfo_string14: + .asciz "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)" # string offset=112 +.Linfo_string15: + .asciz "main.cpp" # string offset=220 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 5 + .long 9 + .long 14 + .long 19 + .long 24 + .long 27 + .long 80 + .long 89 + .long 92 + .long 97 + .long 100 + .long 103 + .long 106 + .long 112 + .long 220 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end2-.Ldebug_info_dwo_start2 # Length of Unit +.Ldebug_info_dwo_start2: + .short 5 # DWARF version number + .byte 5 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. Section + .quad 5962099678818150071 + .byte 6 # Abbrev [6] 0x14:0x67 DW_TAG_compile_unit + .byte 14 # DW_AT_producer + .short 33 # DW_AT_language + .byte 15 # DW_AT_name + .byte 7 # DW_AT_dwo_name + .byte 7 # Abbrev [7] 0x1a:0x3c DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 5 # DW_AT_decl_line + .long 86 # DW_AT_type + # DW_AT_external + .byte 8 # Abbrev [8] 0x29:0xb DW_TAG_formal_parameter + .byte 2 # DW_AT_location + .byte 145 + .byte 120 + .byte 2 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 5 # DW_AT_decl_line + .long 86 # DW_AT_type + .byte 8 # Abbrev [8] 0x34:0xb DW_TAG_formal_parameter + .byte 2 # DW_AT_location + .byte 145 + .byte 112 + .byte 3 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 5 # DW_AT_decl_line + .long 90 # DW_AT_type + .byte 9 # Abbrev [9] 0x3f:0xb DW_TAG_variable + .byte 2 # DW_AT_location + .byte 145 + .byte 104 + .byte 5 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 6 # DW_AT_decl_line + .long 104 # DW_AT_type + .byte 9 # Abbrev [9] 0x4a:0xb DW_TAG_variable + .byte 2 # 
DW_AT_location + .byte 145 + .byte 80 + .byte 10 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 7 # DW_AT_decl_line + .long 113 # DW_AT_type + .byte 0 # End Of Children Mark + .byte 5 # Abbrev [5] 0x56:0x4 DW_TAG_base_type + .byte 1 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 4 # Abbrev [4] 0x5a:0x5 DW_TAG_pointer_type + .long 95 # DW_AT_type + .byte 4 # Abbrev [4] 0x5f:0x5 DW_TAG_pointer_type + .long 100 # DW_AT_type + .byte 5 # Abbrev [5] 0x64:0x4 DW_TAG_base_type + .byte 4 # DW_AT_name + .byte 6 # DW_AT_encoding + .byte 1 # DW_AT_byte_size + .byte 10 # Abbrev [10] 0x68:0x9 DW_TAG_structure_type + # DW_AT_declaration + .quad 5322170643381124694 # DW_AT_signature + .byte 10 # Abbrev [10] 0x71:0x9 DW_TAG_structure_type + # DW_AT_declaration + .quad 1175092228111723119 # DW_AT_signature + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end2: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 65 # DW_TAG_type_unit + .byte 1 # DW_CHILDREN_yes + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 1 # DW_CHILDREN_yes + .byte 54 # DW_AT_calling_convention + .byte 11 # DW_FORM_data1 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 13 # DW_TAG_member + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + 
.byte 56 # DW_AT_data_member_location + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 15 # DW_TAG_pointer_type + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 37 # DW_FORM_strx1 + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 7 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 8 # Abbreviation Code + .byte 5 # DW_TAG_formal_parameter + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 9 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 
# DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 10 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 0 # DW_CHILDREN_no + .byte 60 # DW_AT_declaration + .byte 25 # DW_FORM_flag_present + .byte 105 # DW_AT_signature + .byte 32 # DW_FORM_ref_sig8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_line.dwo,"e",@progbits +.Ltmp2: + .long .Ldebug_line_end0-.Ldebug_line_start0 # unit length +.Ldebug_line_start0: + .short 5 + .byte 8 + .byte 0 + .long .Lprologue_end0-.Lprologue_start0 +.Lprologue_start0: + .byte 1 + .byte 1 + .byte 1 + .byte -5 + .byte 14 + .byte 1 + .byte 1 + .byte 1 + .byte 8 + .byte 2 + .ascii "/home/ayermolo/local/tasks/T138552329/typeDedupSplit" + .byte 0 + .byte 46 + .byte 0 + .byte 3 + .byte 1 + .byte 8 + .byte 2 + .byte 15 + .byte 5 + .byte 30 + .byte 2 + .ascii "main.cpp" + .byte 0 + .byte 0 + .byte 0x9c, 0x5c, 0xea, 0x5b + .byte 0xb7, 0x8d, 0x3f, 0xc2 + .byte 0x65, 0xcd, 0x17, 0x51 + .byte 0x10, 0xbf, 0xe9, 0x03 + .ascii "header.h" + .byte 0 + .byte 1 + .byte 0xfe, 0xa7, 0xbb, 0x1f + .byte 0x22, 0xc4, 0x7f, 0x12 + .byte 0x9e, 0x15, 0x69, 0x5f + .byte 0x71, 0x37, 0xa1, 0xe7 +.Lprologue_end0: +.Ldebug_line_end0: + .section .debug_addr,"",@progbits + .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution +.Ldebug_addr_start0: + .short 5 # DWARF version number + .byte 8 # Address size + .byte 0 # Segment selector size +.Laddr_table_base0: + .quad .Lfunc_begin0 +.Ldebug_addr_end0: + .section .debug_names,"",@progbits + .long .Lnames_end0-.Lnames_start0 # Header: unit length +.Lnames_start0: + .short 5 # Header: version + .short 0 # Header: padding + .long 1 # Header: compilation unit count + .long 0 # Header: local type unit count + .long 2 # Header: foreign type unit count + 
.long 5 # Header: bucket count + .long 5 # Header: name count + .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size + .long 8 # Header: augmentation string size + .ascii "LLVM0700" # Header: augmentation string + .long .Lcu_begin0 # Compilation unit 0 + .quad 5322170643381124694 # Type unit 0 + .quad 1175092228111723119 # Type unit 1 + .long 0 # Bucket 0 + .long 1 # Bucket 1 + .long 0 # Bucket 2 + .long 3 # Bucket 3 + .long 4 # Bucket 4 + .long 2090263771 # Hash in Bucket 1 + .long 2090499946 # Hash in Bucket 1 + .long 193495088 # Hash in Bucket 3 + .long 259227804 # Hash in Bucket 4 + .long 2090147939 # Hash in Bucket 4 + .long .Lskel_string4 # String in Bucket 1: Foo2 + .long .Lskel_string1 # String in Bucket 1: main + .long .Lskel_string2 # String in Bucket 3: int + .long .Lskel_string5 # String in Bucket 4: Foo2a + .long .Lskel_string3 # String in Bucket 4: char + .long .Lnames3-.Lnames_entries0 # Offset in Bucket 1 + .long .Lnames0-.Lnames_entries0 # Offset in Bucket 1 + .long .Lnames1-.Lnames_entries0 # Offset in Bucket 3 + .long .Lnames4-.Lnames_entries0 # Offset in Bucket 4 + .long .Lnames2-.Lnames_entries0 # Offset in Bucket 4 +.Lnames_abbrev_start0: + .ascii "\350\004" # Abbrev code + .byte 19 # DW_TAG_structure_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\354\004" # Abbrev code + .byte 19 # DW_TAG_structure_type + .byte 2 # DW_IDX_type_unit + .byte 11 # DW_FORM_data1 + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\310\013" # Abbrev code + .byte 46 # DW_TAG_subprogram + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\210\t" # Abbrev code + .byte 36 # DW_TAG_base_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\214\t" # Abbrev code + 
.byte 36 # DW_TAG_base_type + .byte 2 # DW_IDX_type_unit + .byte 11 # DW_FORM_data1 + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .byte 0 # End of abbrev list +.Lnames_abbrev_end0: +.Lnames_entries0: +.Lnames3: + .ascii "\354\004" # Abbreviation code + .byte 0 # DW_IDX_type_unit + .long 33 # DW_IDX_die_offset + .ascii "\350\004" # Abbreviation code + .long 104 # DW_IDX_die_offset + .byte 0 # End of list: Foo2 +.Lnames0: + .ascii "\310\013" # Abbreviation code + .long 26 # DW_IDX_die_offset + .byte 0 # End of list: main +.Lnames1: + .ascii "\210\t" # Abbreviation code + .long 86 # DW_IDX_die_offset + .byte 0 # End of list: int +.Lnames4: + .ascii "\354\004" # Abbreviation code + .byte 1 # DW_IDX_type_unit + .long 33 # DW_IDX_die_offset + .ascii "\350\004" # Abbreviation code + .long 113 # DW_IDX_die_offset + .byte 0 # End of list: Foo2a +.Lnames2: + .ascii "\214\t" # Abbreviation code + .byte 0 # DW_IDX_type_unit + .long 54 # DW_IDX_die_offset + .ascii "\214\t" # Abbreviation code + .byte 1 # DW_IDX_type_unit + .long 72 # DW_IDX_die_offset + .ascii "\210\t" # Abbreviation code + .long 100 # DW_IDX_die_offset + .byte 0 # End of list: char + .p2align 2, 0x0 +.Lnames_end0: + .ident "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)" + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/Inputs/dwarf5-types-debug-names-helper.s b/bolt/test/X86/Inputs/dwarf5-types-debug-names-helper.s new file mode 100644 index 0000000000000..3e89382ed2d8a --- /dev/null +++ b/bolt/test/X86/Inputs/dwarf5-types-debug-names-helper.s @@ -0,0 +1,405 @@ +# clang++ helper.cpp -g2 -gdwarf-5 -gpubnames -fdebug-types-section +# header.h +# struct Foo2a { +# char *c1; +# }; +# helper.cpp +# #include "header.h" +# int foo() { +# Foo2a f; +# return 0; +# } + + + .text + .file "helper.cpp" + .globl 
_Z3foov # -- Begin function _Z3foov + .p2align 4, 0x90 + .type _Z3foov,@function +_Z3foov: # @_Z3foov +.Lfunc_begin0: + .file 0 "/typeDedupSmall" "helper.cpp" md5 0x305ec66c221c583021f8375b300e2591 + .loc 0 2 0 # helper.cpp:2:0 + .cfi_startproc +# %bb.0: # %entry + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp +.Ltmp0: + .loc 0 4 3 prologue_end # helper.cpp:4:3 + xorl %eax, %eax + .loc 0 4 3 epilogue_begin is_stmt 0 # helper.cpp:4:3 + popq %rbp + .cfi_def_cfa %rsp, 8 + retq +.Ltmp1: +.Lfunc_end0: + .size _Z3foov, .Lfunc_end0-_Z3foov + .cfi_endproc + # -- End function + .file 1 "." "header.h" md5 0x53699580704254cb1dd2a83230f8a7ea + .section .debug_info,"G",@progbits,1175092228111723119,comdat +.Ltu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 2 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. 
Section + .quad 1175092228111723119 # Type Signature + .long 35 # Type DIE Offset + .byte 1 # Abbrev [1] 0x18:0x25 DW_TAG_type_unit + .short 33 # DW_AT_language + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .byte 2 # Abbrev [2] 0x23:0x10 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 9 # DW_AT_name + .byte 8 # DW_AT_byte_size + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .byte 3 # Abbrev [3] 0x29:0x9 DW_TAG_member + .byte 7 # DW_AT_name + .long 51 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 4 # Abbrev [4] 0x33:0x5 DW_TAG_pointer_type + .long 56 # DW_AT_type + .byte 5 # Abbrev [5] 0x38:0x4 DW_TAG_base_type + .byte 8 # DW_AT_name + .byte 6 # DW_AT_encoding + .byte 1 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_end0: + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 65 # DW_TAG_type_unit + .byte 1 # DW_CHILDREN_yes + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 1 # DW_CHILDREN_yes + .byte 54 # DW_AT_calling_convention + .byte 11 # DW_FORM_data1 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 13 # DW_TAG_member + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + 
.byte 56 # DW_AT_data_member_location + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 15 # DW_TAG_pointer_type + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 37 # DW_FORM_strx1 + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 115 # DW_AT_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 7 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 110 # DW_AT_linkage_name + .byte 37 # DW_FORM_strx1 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 8 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name 
+ .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 9 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 0 # DW_CHILDREN_no + .byte 60 # DW_AT_declaration + .byte 25 # DW_FORM_flag_present + .byte 105 # DW_AT_signature + .byte 32 # DW_FORM_ref_sig8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit +.Ldebug_info_start1: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 6 # Abbrev [6] 0xc:0x41 DW_TAG_compile_unit + .byte 0 # DW_AT_producer + .short 33 # DW_AT_language + .byte 1 # DW_AT_name + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .long .Lline_table_start0 # DW_AT_stmt_list + .byte 2 # DW_AT_comp_dir + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_addr_base + .byte 7 # Abbrev [7] 0x23:0x1c DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 3 # DW_AT_linkage_name + .byte 4 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .long 63 # DW_AT_type + # DW_AT_external + .byte 8 # Abbrev [8] 0x33:0xb DW_TAG_variable + .byte 2 # DW_AT_location + .byte 145 + .byte 120 + .byte 6 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 3 # DW_AT_decl_line + .long 67 # DW_AT_type + .byte 0 # End Of Children Mark + .byte 5 # Abbrev [5] 0x3f:0x4 DW_TAG_base_type + .byte 5 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 9 # Abbrev [9] 0x43:0x9 DW_TAG_structure_type + # DW_AT_declaration + .quad 1175092228111723119 # DW_AT_signature + .byte 0 # End Of Children Mark 
+.Ldebug_info_end1: + .section .debug_str_offsets,"",@progbits + .long 44 # Length of String Offsets Set + .short 5 + .short 0 +.Lstr_offsets_base0: + .section .debug_str,"MS",@progbits,1 +.Linfo_string0: + .asciz "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)" # string offset=0 +.Linfo_string1: + .asciz "helper.cpp" # string offset=108 +.Linfo_string2: + .asciz "/home/typeDedupSmall" # string offset=119 +.Linfo_string3: + .asciz "foo" # string offset=172 +.Linfo_string4: + .asciz "_Z3foov" # string offset=176 +.Linfo_string5: + .asciz "int" # string offset=184 +.Linfo_string6: + .asciz "f" # string offset=188 +.Linfo_string7: + .asciz "Foo2a" # string offset=190 +.Linfo_string8: + .asciz "c1" # string offset=196 +.Linfo_string9: + .asciz "char" # string offset=199 + .section .debug_str_offsets,"",@progbits + .long .Linfo_string0 + .long .Linfo_string1 + .long .Linfo_string2 + .long .Linfo_string4 + .long .Linfo_string3 + .long .Linfo_string5 + .long .Linfo_string6 + .long .Linfo_string8 + .long .Linfo_string9 + .long .Linfo_string7 + .section .debug_addr,"",@progbits + .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution +.Ldebug_addr_start0: + .short 5 # DWARF version number + .byte 8 # Address size + .byte 0 # Segment selector size +.Laddr_table_base0: + .quad .Lfunc_begin0 +.Ldebug_addr_end0: + .section .debug_names,"",@progbits + .long .Lnames_end0-.Lnames_start0 # Header: unit length +.Lnames_start0: + .short 5 # Header: version + .short 0 # Header: padding + .long 1 # Header: compilation unit count + .long 1 # Header: local type unit count + .long 0 # Header: foreign type unit count + .long 5 # Header: bucket count + .long 5 # Header: name count + .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size + .long 8 # Header: augmentation string size + .ascii "LLVM0700" # Header: augmentation string + .long .Lcu_begin0 # Compilation unit 0 + .long .Ltu_begin0 
# Type unit 0 + .long 0 # Bucket 0 + .long 0 # Bucket 1 + .long 0 # Bucket 2 + .long 1 # Bucket 3 + .long 2 # Bucket 4 + .long 193495088 # Hash in Bucket 3 + .long 193491849 # Hash in Bucket 4 + .long 259227804 # Hash in Bucket 4 + .long 2090147939 # Hash in Bucket 4 + .long -1257882357 # Hash in Bucket 4 + .long .Linfo_string5 # String in Bucket 3: int + .long .Linfo_string3 # String in Bucket 4: foo + .long .Linfo_string7 # String in Bucket 4: Foo2a + .long .Linfo_string9 # String in Bucket 4: char + .long .Linfo_string4 # String in Bucket 4: _Z3foov + .long .Lnames2-.Lnames_entries0 # Offset in Bucket 3 + .long .Lnames0-.Lnames_entries0 # Offset in Bucket 4 + .long .Lnames3-.Lnames_entries0 # Offset in Bucket 4 + .long .Lnames4-.Lnames_entries0 # Offset in Bucket 4 + .long .Lnames1-.Lnames_entries0 # Offset in Bucket 4 +.Lnames_abbrev_start0: + .ascii "\350\004" # Abbrev code + .byte 19 # DW_TAG_structure_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\354\004" # Abbrev code + .byte 19 # DW_TAG_structure_type + .byte 2 # DW_IDX_type_unit + .byte 11 # DW_FORM_data1 + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\210\t" # Abbrev code + .byte 36 # DW_TAG_base_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\310\013" # Abbrev code + .byte 46 # DW_TAG_subprogram + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\214\t" # Abbrev code + .byte 36 # DW_TAG_base_type + .byte 2 # DW_IDX_type_unit + .byte 11 # DW_FORM_data1 + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .byte 0 # End of abbrev list +.Lnames_abbrev_end0: +.Lnames_entries0: +.Lnames2: + .ascii "\210\t" # Abbreviation code + .long 63 # DW_IDX_die_offset + .byte 0 
# End of list: int +.Lnames0: + .ascii "\310\013" # Abbreviation code + .long 35 # DW_IDX_die_offset + .byte 0 # End of list: foo +.Lnames3: + .ascii "\354\004" # Abbreviation code + .byte 0 # DW_IDX_type_unit + .long 35 # DW_IDX_die_offset + .ascii "\350\004" # Abbreviation code + .long 67 # DW_IDX_die_offset + .byte 0 # End of list: Foo2a +.Lnames4: + .ascii "\214\t" # Abbreviation code + .byte 0 # DW_IDX_type_unit + .long 56 # DW_IDX_die_offset + .byte 0 # End of list: char +.Lnames1: + .ascii "\310\013" # Abbreviation code + .long 35 # DW_IDX_die_offset + .byte 0 # End of list: _Z3foov + .p2align 2, 0x0 +.Lnames_end0: + .ident "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)" + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/Inputs/dwarf5-types-debug-names-main.s b/bolt/test/X86/Inputs/dwarf5-types-debug-names-main.s new file mode 100644 index 0000000000000..cb9d287081d7f --- /dev/null +++ b/bolt/test/X86/Inputs/dwarf5-types-debug-names-main.s @@ -0,0 +1,391 @@ +# clang++ -g2 -gdwarf-5 -gpubnames -fdebug-types-section +# header.h +# struct Foo2a { +# char *c1; +# }; +# main.cpp +# #include "header.h" +# int main() { +# Foo2a f3; +# return 0; +# } + + .text + .file "main.cpp" + .globl main # -- Begin function main + .p2align 4, 0x90 + .type main,@function +main: # @main +.Lfunc_begin0: + .file 0 "/typeDedupSmall" "main.cpp" md5 0x6e787b94dac2f817bb35cfde8006dc82 + .loc 0 2 0 # main.cpp:2:0 + .cfi_startproc +# %bb.0: # %entry + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset %rbp, -16 + movq %rsp, %rbp + .cfi_def_cfa_register %rbp + movl $0, -4(%rbp) +.Ltmp0: + .loc 0 4 2 prologue_end # main.cpp:4:2 + xorl %eax, %eax + .loc 0 4 2 epilogue_begin is_stmt 0 # main.cpp:4:2 + popq %rbp + .cfi_def_cfa %rsp, 8 + retq +.Ltmp1: +.Lfunc_end0: + .size main, .Lfunc_end0-main + .cfi_endproc + # -- End function + .file 1 "." 
"header.h" md5 0x53699580704254cb1dd2a83230f8a7ea + .section .debug_info,"G",@progbits,1175092228111723119,comdat +.Ltu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 2 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .quad 1175092228111723119 # Type Signature + .long 35 # Type DIE Offset + .byte 1 # Abbrev [1] 0x18:0x25 DW_TAG_type_unit + .short 33 # DW_AT_language + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .byte 2 # Abbrev [2] 0x23:0x10 DW_TAG_structure_type + .byte 5 # DW_AT_calling_convention + .byte 8 # DW_AT_name + .byte 8 # DW_AT_byte_size + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .byte 3 # Abbrev [3] 0x29:0x9 DW_TAG_member + .byte 6 # DW_AT_name + .long 51 # DW_AT_type + .byte 1 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .byte 0 # DW_AT_data_member_location + .byte 0 # End Of Children Mark + .byte 4 # Abbrev [4] 0x33:0x5 DW_TAG_pointer_type + .long 56 # DW_AT_type + .byte 5 # Abbrev [5] 0x38:0x4 DW_TAG_base_type + .byte 7 # DW_AT_name + .byte 6 # DW_AT_encoding + .byte 1 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_end0: + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 65 # DW_TAG_type_unit + .byte 1 # DW_CHILDREN_yes + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 1 # DW_CHILDREN_yes + .byte 54 # DW_AT_calling_convention + .byte 11 # DW_FORM_data1 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 
# DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 13 # DW_TAG_member + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 56 # DW_AT_data_member_location + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 15 # DW_TAG_pointer_type + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 37 # DW_FORM_strx1 + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 115 # DW_AT_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 7 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type 
+ .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 8 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 9 # Abbreviation Code + .byte 19 # DW_TAG_structure_type + .byte 0 # DW_CHILDREN_no + .byte 60 # DW_AT_declaration + .byte 25 # DW_FORM_flag_present + .byte 105 # DW_AT_signature + .byte 32 # DW_FORM_ref_sig8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit +.Ldebug_info_start1: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 6 # Abbrev [6] 0xc:0x40 DW_TAG_compile_unit + .byte 0 # DW_AT_producer + .short 33 # DW_AT_language + .byte 1 # DW_AT_name + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .long .Lline_table_start0 # DW_AT_stmt_list + .byte 2 # DW_AT_comp_dir + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_addr_base + .byte 7 # Abbrev [7] 0x23:0x1b DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 3 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .long 62 # DW_AT_type + # DW_AT_external + .byte 8 # Abbrev [8] 0x32:0xb DW_TAG_variable + .byte 2 # DW_AT_location + .byte 145 + .byte 112 + .byte 5 # DW_AT_name + .byte 0 # DW_AT_decl_file + .byte 3 # DW_AT_decl_line + .long 66 # DW_AT_type + .byte 0 # End Of Children Mark + .byte 5 # Abbrev [5] 0x3e:0x4 DW_TAG_base_type + .byte 4 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 9 # Abbrev [9] 0x42:0x9 DW_TAG_structure_type + # DW_AT_declaration + .quad 1175092228111723119 # DW_AT_signature + .byte 0 # End Of Children Mark +.Ldebug_info_end1: + .section .debug_str_offsets,"",@progbits + .long 40 # Length of String Offsets Set + .short 5 + .short 0 +.Lstr_offsets_base0: + .section .debug_str,"MS",@progbits,1 +.Linfo_string0: + .asciz "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)" # string offset=0 +.Linfo_string1: + .asciz "main.cpp" # string offset=108 +.Linfo_string2: + .asciz "/home/typeDedupSmall" # string offset=117 +.Linfo_string3: + .asciz "main" # string offset=170 +.Linfo_string4: + .asciz "int" # string offset=175 +.Linfo_string5: + .asciz "f3" # string offset=179 +.Linfo_string6: + .asciz "Foo2a" # string offset=182 +.Linfo_string7: + .asciz "c1" # string offset=188 +.Linfo_string8: + .asciz "char" # string offset=191 + .section 
.debug_str_offsets,"",@progbits + .long .Linfo_string0 + .long .Linfo_string1 + .long .Linfo_string2 + .long .Linfo_string3 + .long .Linfo_string4 + .long .Linfo_string5 + .long .Linfo_string7 + .long .Linfo_string8 + .long .Linfo_string6 + .section .debug_addr,"",@progbits + .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution +.Ldebug_addr_start0: + .short 5 # DWARF version number + .byte 8 # Address size + .byte 0 # Segment selector size +.Laddr_table_base0: + .quad .Lfunc_begin0 +.Ldebug_addr_end0: + .section .debug_names,"",@progbits + .long .Lnames_end0-.Lnames_start0 # Header: unit length +.Lnames_start0: + .short 5 # Header: version + .short 0 # Header: padding + .long 1 # Header: compilation unit count + .long 1 # Header: local type unit count + .long 0 # Header: foreign type unit count + .long 4 # Header: bucket count + .long 4 # Header: name count + .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size + .long 8 # Header: augmentation string size + .ascii "LLVM0700" # Header: augmentation string + .long .Lcu_begin0 # Compilation unit 0 + .long .Ltu_begin0 # Type unit 0 + .long 1 # Bucket 0 + .long 0 # Bucket 1 + .long 3 # Bucket 2 + .long 4 # Bucket 3 + .long 193495088 # Hash in Bucket 0 + .long 259227804 # Hash in Bucket 0 + .long 2090499946 # Hash in Bucket 2 + .long 2090147939 # Hash in Bucket 3 + .long .Linfo_string4 # String in Bucket 0: int + .long .Linfo_string6 # String in Bucket 0: Foo2a + .long .Linfo_string3 # String in Bucket 2: main + .long .Linfo_string8 # String in Bucket 3: char + .long .Lnames1-.Lnames_entries0 # Offset in Bucket 0 + .long .Lnames2-.Lnames_entries0 # Offset in Bucket 0 + .long .Lnames0-.Lnames_entries0 # Offset in Bucket 2 + .long .Lnames3-.Lnames_entries0 # Offset in Bucket 3 +.Lnames_abbrev_start0: + .ascii "\350\004" # Abbrev code + .byte 19 # DW_TAG_structure_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + 
.ascii "\354\004" # Abbrev code + .byte 19 # DW_TAG_structure_type + .byte 2 # DW_IDX_type_unit + .byte 11 # DW_FORM_data1 + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\210\t" # Abbrev code + .byte 36 # DW_TAG_base_type + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\310\013" # Abbrev code + .byte 46 # DW_TAG_subprogram + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .ascii "\214\t" # Abbrev code + .byte 36 # DW_TAG_base_type + .byte 2 # DW_IDX_type_unit + .byte 11 # DW_FORM_data1 + .byte 3 # DW_IDX_die_offset + .byte 19 # DW_FORM_ref4 + .byte 0 # End of abbrev + .byte 0 # End of abbrev + .byte 0 # End of abbrev list +.Lnames_abbrev_end0: +.Lnames_entries0: +.Lnames1: + .ascii "\210\t" # Abbreviation code + .long 62 # DW_IDX_die_offset + .byte 0 # End of list: int +.Lnames2: + .ascii "\354\004" # Abbreviation code + .byte 0 # DW_IDX_type_unit + .long 35 # DW_IDX_die_offset + .ascii "\350\004" # Abbreviation code + .long 66 # DW_IDX_die_offset + .byte 0 # End of list: Foo2a +.Lnames0: + .ascii "\310\013" # Abbreviation code + .long 35 # DW_IDX_die_offset + .byte 0 # End of list: main +.Lnames3: + .ascii "\214\t" # Abbreviation code + .byte 0 # DW_IDX_type_unit + .long 56 # DW_IDX_die_offset + .byte 0 # End of list: char + .p2align 2, 0x0 +.Lnames_end0: + .ident "clang version 18.0.0git (git@github.com:ayermolo/llvm-project.git db35fa8fc524127079662802c4735dbf397f86d0)" + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/dwarf5-debug-names-generate-debug-names.test b/bolt/test/X86/dwarf5-debug-names-generate-debug-names.test new file mode 100644 index 0000000000000..b8789a6ad2300 --- /dev/null +++ b/bolt/test/X86/dwarf5-debug-names-generate-debug-names.test @@ -0,0 +1,154 @@ +; 
REQUIRES: system-linux + +; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_main.s -o %tmain.o +; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_helper.s -o %thelper.o +; RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q +; RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --create-debug-names-section=true +; RUN: llvm-dwarfdump --debug-info -r 0 --debug-names %t.bolt > %t.txt +; RUN: cat %t.txt | FileCheck --check-prefix=BOLT %s + +;; Tests BOLT generates .debug_names with --create-debug-names-section. +;; Also applicable when binary has CUs that do not contribute to .debug_names pre-bolt. + +; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit +; BOLT: [[OFFSET2:0x[0-9a-f]*]]: Compile Unit +; BOLT: Name Index @ 0x0 { +; BOLT-NEXT: Header { +; BOLT-NEXT: Length: 0x103 +; BOLT-NEXT: Format: DWARF32 +; BOLT-NEXT: Version: 5 +; BOLT-NEXT: CU count: 2 +; BOLT-NEXT: Local TU count: 0 +; BOLT-NEXT: Foreign TU count: 0 +; BOLT-NEXT: Bucket count: 8 +; BOLT-NEXT: Name count: 8 +; BOLT-NEXT: Abbreviations table size: 0x19 +; BOLT-NEXT: Augmentation: 'BOLT' +; BOLT-NEXT: } +; BOLT-NEXT: Compilation Unit offsets [ +; BOLT-NEXT: CU[0]: [[OFFSET1]] +; BOLT-NEXT: CU[1]: [[OFFSET2]] +; BOLT-NEXT: ] +; BOLT-NEXT: Abbreviations [ +; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_variable +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 0 [ +; BOLT-NEXT: Name 1 { 
+; BOLT-NEXT: Hash: 0xB888030 +; BOLT-NEXT: String: {{.+}} "int" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000033 +; BOLT-NEXT: } +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000007f +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 1 [ +; BOLT-NEXT: Name 2 { +; BOLT-NEXT: Hash: 0xB887389 +; BOLT-NEXT: String: {{.+}} "foo" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000005e +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 3 { +; BOLT-NEXT: Hash: 0x8C06E589 +; BOLT-NEXT: String: {{.+}} "_Z3usePiS_" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000028 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 2 [ +; BOLT-NEXT: Name 4 { +; BOLT-NEXT: Hash: 0xB88B3D2 +; BOLT-NEXT: String: {{.+}} "use" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000028 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 5 { +; BOLT-NEXT: Hash: 0x7C9A7F6A +; BOLT-NEXT: String: {{.+}} "main" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000049 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 6 { +; BOLT-NEXT: Hash: 0xFDE4B5D2 +; BOLT-NEXT: String: {{.+}} "fooVar" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_variable +; 
BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000028 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 3 [ +; BOLT-NEXT: Name 7 { +; BOLT-NEXT: Hash: 0x7C952063 +; BOLT-NEXT: String: {{.+}} "char" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000092 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 4 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 5 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 6 [ +; BOLT-NEXT: Name 8 { +; BOLT-NEXT: Hash: 0xB5063CFE +; BOLT-NEXT: String: {{.+}} "_Z3fooi" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000005e +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 7 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: } diff --git a/bolt/test/X86/dwarf5-debug-names.test b/bolt/test/X86/dwarf5-debug-names.test new file mode 100644 index 0000000000000..de0e88d7a36ae --- /dev/null +++ b/bolt/test/X86/dwarf5-debug-names.test @@ -0,0 +1,263 @@ +; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-debug-names-main.s -o %tmain.o +; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-debug-names-helper.s -o %thelper.o +; RUN: %clang %cflags -gdwarf-5 %tmain.o %thelper.o -o %tmain.exe +; RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections +; RUN: llvm-dwarfdump --debug-info -r 0 --debug-names %tmain.exe.bolt > %tlog.txt +; RUN: cat %tlog.txt | FileCheck -check-prefix=BOLT %s + +;; Tests that BOLT correctly generates .debug_names section with two CUs + +; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit +; BOLT: [[OFFSET2:0x[0-9a-f]*]]: Compile Unit +; BOLT: Name Index @ 0x0 { +; BOLT-NEXT: 
Header { +; BOLT-NEXT: Length: 0x1C2 +; BOLT-NEXT: Format: DWARF32 +; BOLT-NEXT: Version: 5 +; BOLT-NEXT: CU count: 2 +; BOLT-NEXT: Local TU count: 0 +; BOLT-NEXT: Foreign TU count: 0 +; BOLT-NEXT: Bucket count: 14 +; BOLT-NEXT: Name count: 15 +; BOLT-NEXT: Abbreviations table size: 0x29 +; BOLT-NEXT: Augmentation: 'BOLT' +; BOLT-NEXT: } +; BOLT-NEXT: Compilation Unit offsets [ +; BOLT-NEXT: CU[0]: [[OFFSET1]] +; BOLT-NEXT: CU[1]: [[OFFSET2]] +; BOLT-NEXT: ] +; BOLT-NEXT: Abbreviations [ +; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_namespace +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV4:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV5:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_variable +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 0 [ +; BOLT-NEXT: Name 1 { +; BOLT-NEXT: Hash: 0x59796C +; BOLT-NEXT: String: {{.+}} "t3" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000002f +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 1 [ +; BOLT-NEXT: Name 2 { +; BOLT-NEXT: Hash: 0x7C96E4DB +; BOLT-NEXT: String: {{.+}} "Foo2" +; BOLT-NEXT: Entry @ 
{{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x000000eb +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 2 [ +; BOLT-NEXT: Name 3 { +; BOLT-NEXT: Hash: 0x8CFC710C +; BOLT-NEXT: String: {{.+}} "(anonymous namespace)" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_namespace +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000061 +; BOLT-NEXT: } +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_namespace +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000061 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 4 { +; BOLT-NEXT: Hash: 0xBA564846 +; BOLT-NEXT: String: {{.+}} "Foo2Int" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000005a +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 3 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 4 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 5 [ +; BOLT-NEXT: Name 5 { +; BOLT-NEXT: Hash: 0xB887389 +; BOLT-NEXT: String: {{.+}} "Foo" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x000000c9 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 6 { +; BOLT-NEXT: Hash: 0xB887389 +; BOLT-NEXT: String: {{.+}} "foo" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000033 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 7 { +; BOLT-NEXT: Hash: 0x7C952063 +; BOLT-NEXT: String: {{.+}} "char" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: 
Abbrev: [[ABBREV4]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000009f +; BOLT-NEXT: } +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV4]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x000000c5 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 6 [ +; BOLT-NEXT: Name 8 { +; BOLT-NEXT: Hash: 0x392140FA +; BOLT-NEXT: String: {{.+}} "t2<&fooint>" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000003f +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 9 { +; BOLT-NEXT: Hash: 0xFDE48034 +; BOLT-NEXT: String: {{.+}} "fooint" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV5]] +; BOLT-NEXT: Tag: DW_TAG_variable +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000024 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 7 [ +; BOLT-NEXT: Name 10 { +; BOLT-NEXT: Hash: 0xB5063D0B +; BOLT-NEXT: String: {{.+}} "_Z3foov" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000033 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 8 [ +; BOLT-NEXT: Name 11 { +; BOLT-NEXT: Hash: 0x5979AC +; BOLT-NEXT: String: {{.+}} "v1" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV5]] +; BOLT-NEXT: Tag: DW_TAG_variable +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000024 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 9 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 10 [ +; BOLT-NEXT: Name 12 { +; BOLT-NEXT: Hash: 0xB888030 +; BOLT-NEXT: String: {{.+}} "int" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV4]] +; 
BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000002f +; BOLT-NEXT: } +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV4]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000005d +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 13 { +; BOLT-NEXT: Hash: 0xF73809C +; BOLT-NEXT: String: {{.+}} "Foo2a" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000078 +; BOLT-NEXT: } +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000104 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 14 { +; BOLT-NEXT: Hash: 0x7C9A7F6A +; BOLT-NEXT: String: {{.+}} "main" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000073 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 11 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 12 [ +; BOLT-NEXT: Name 15 { +; BOLT-NEXT: Hash: 0x59796A +; BOLT-NEXT: String: {{.+}} "t1" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000062 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 13 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: } diff --git a/bolt/test/X86/dwarf5-df-debug-names-generate-debug-names.test b/bolt/test/X86/dwarf5-df-debug-names-generate-debug-names.test new file mode 100644 index 0000000000000..3be327b780e52 --- /dev/null +++ b/bolt/test/X86/dwarf5-df-debug-names-generate-debug-names.test @@ -0,0 +1,192 @@ +; RUN: rm -rf %t 
+; RUN: mkdir %t +; RUN: cd %t +; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-dualcu-main.s \ +; RUN: -split-dwarf-file=main.dwo -o main.o +; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-dualcu-helper.s \ +; RUN: -split-dwarf-file=helper.dwo -o helper.o +; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe -fno-pic -no-pie +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --create-debug-names-section=true +; RUN: llvm-dwarfdump --debug-info --debug-names main.exe.bolt > %t/foo.txt +; RUN: cat %t/foo.txt | FileCheck -check-prefix=BOLT %s + +;; Tests BOLT generates .debug_names with --create-debug-names-section. +;; Also applicable when binary has split dwarf CUs that do not contribute to .debug_names pre-bolt. + +; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit +; BOLT: [[OFFSET2:0x[0-9a-f]*]]: Compile Unit +; BOLT: Name Index @ 0x0 { +; BOLT-NEXT: Header { +; BOLT-NEXT: Length: 0x148 +; BOLT-NEXT: Format: DWARF32 +; BOLT-NEXT: Version: 5 +; BOLT-NEXT: CU count: 2 +; BOLT-NEXT: Local TU count: 0 +; BOLT-NEXT: Foreign TU count: 0 +; BOLT-NEXT: Bucket count: 11 +; BOLT-NEXT: Name count: 11 +; BOLT-NEXT: Abbreviations table size: 0x19 +; BOLT-NEXT: Augmentation: 'BOLT' +; BOLT-NEXT: } +; BOLT-NEXT: Compilation Unit offsets [ +; BOLT-NEXT: CU[0]: [[OFFSET1]] +; BOLT-NEXT: CU[1]: [[OFFSET2]] +; BOLT-NEXT: ] +; BOLT-NEXT: Abbreviations [ +; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_variable +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: 
DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 0 [ +; BOLT-NEXT: Name 1 { +; BOLT-NEXT: Hash: 0x2B61E +; BOLT-NEXT: String: {{.+}} "y" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_variable +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000029 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 2 { +; BOLT-NEXT: Hash: 0x7C952063 +; BOLT-NEXT: String: {{.+}} "char" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000008c +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 1 [ +; BOLT-NEXT: Name 3 { +; BOLT-NEXT: Hash: 0x2B609 +; BOLT-NEXT: String: {{.+}} "d" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_variable +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000029 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 4 { +; BOLT-NEXT: Hash: 0x2B61F +; BOLT-NEXT: String: {{.+}} "z" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_variable +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000001a +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 2 [ +; BOLT-NEXT: Name 5 { +; BOLT-NEXT: Hash: 0xB88B3D2 +; BOLT-NEXT: String: {{.+}} "use" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000034 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 3 [ +; BOLT-NEXT: Name 6 { +; BOLT-NEXT: Hash: 0x45A3B006 +; BOLT-NEXT: String: {{.+}} "_Z6helperii" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: 
DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000034 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 4 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 5 [ +; BOLT-NEXT: Name 7 { +; BOLT-NEXT: Hash: 0x8C06E589 +; BOLT-NEXT: String: {{.+}} "_Z3usePiS_" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000034 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 6 [ +; BOLT-NEXT: Name 8 { +; BOLT-NEXT: Hash: 0xB888030 +; BOLT-NEXT: String: {{.+}} "int" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000025 +; BOLT-NEXT: } +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000025 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 7 [ +; BOLT-NEXT: Name 9 { +; BOLT-NEXT: Hash: 0x1D853E5 +; BOLT-NEXT: String: {{.+}} "helper" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000034 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 10 { +; BOLT-NEXT: Hash: 0x7C9A7F6A +; BOLT-NEXT: String: {{.+}} "main" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000057 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 8 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 9 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 10 [ +; BOLT-NEXT: Name 11 { +; BOLT-NEXT: Hash: 0x2B61D +; BOLT-NEXT: String: {{.+}} "x" +; BOLT-NEXT: Entry @ 
{{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_variable +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000001a +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] diff --git a/bolt/test/X86/dwarf5-df-debug-names.test b/bolt/test/X86/dwarf5-df-debug-names.test new file mode 100644 index 0000000000000..0352d0ff72082 --- /dev/null +++ b/bolt/test/X86/dwarf5-df-debug-names.test @@ -0,0 +1,149 @@ +; RUN: rm -rf %t +; RUN: mkdir %t +; RUN: cd %t +; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-debug-names-main.s \ +; RUN: -split-dwarf-file=main.dwo -o main.o +; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-debug-names-helper.s \ +; RUN: -split-dwarf-file=helper.dwo -o helper.o +; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-dwarfdump --debug-info --debug-names main.exe.bolt > log.txt +; RUN: cat log.txt | FileCheck -check-prefix=BOLT %s + +;; Tests that BOLT correctly generates .debug_names section with two CUs for split dwarf. 
+ +; BOLT: [[OFFSET:0x[0-9a-f]*]]: Compile Unit +; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit +; BOLT: Name Index @ 0x0 { +; BOLT-NEXT: Header { +; BOLT-NEXT: Length: 0xF4 +; BOLT-NEXT: Format: DWARF32 +; BOLT-NEXT: Version: 5 +; BOLT-NEXT: CU count: 2 +; BOLT-NEXT: Local TU count: 0 +; BOLT-NEXT: Foreign TU count: 0 +; BOLT-NEXT: Bucket count: 7 +; BOLT-NEXT: Name count: 7 +; BOLT-NEXT: Abbreviations table size: 0x21 +; BOLT-NEXT: Augmentation: 'BOLT' +; BOLT-NEXT: } +; BOLT-NEXT: Compilation Unit offsets [ +; BOLT-NEXT: CU[0]: [[OFFSET]] +; BOLT-NEXT: CU[1]: [[OFFSET1]] +; BOLT-NEXT: ] +; BOLT-NEXT: Abbreviations [ +; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_variable +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV4:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 0 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 1 [ +; BOLT-NEXT: Name 1 { +; BOLT-NEXT: Hash: 0x7C96E4DB +; BOLT-NEXT: String: {{.+}} "Foo2" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000068 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 2 [ +; BOLT-NEXT: Name 2 { +; BOLT-NEXT: Hash: 0xBA564846 +; BOLT-NEXT: String: {{.+}} "Foo2Int" +; BOLT-NEXT: Entry 
@ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000025 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 3 [ +; BOLT-NEXT: Name 3 { +; BOLT-NEXT: Hash: 0xB888030 +; BOLT-NEXT: String: {{.+}} "int" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000043 +; BOLT-NEXT: } +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000056 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 4 { +; BOLT-NEXT: Hash: 0xF73809C +; BOLT-NEXT: String: {{.+}} "Foo2a" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000078 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 5 { +; BOLT-NEXT: Hash: 0x7C96CB76 +; BOLT-NEXT: String: {{.+}} "fint" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_variable +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000001a +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 6 { +; BOLT-NEXT: Hash: 0x7C9A7F6A +; BOLT-NEXT: String: {{.+}} "main" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV4]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000001a +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 4 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 5 [ +; BOLT-NEXT: Name 7 { +; BOLT-NEXT: Hash: 0x7C952063 +; BOLT-NEXT: String: {{.+}} "char" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: 
DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000064 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 6 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: } diff --git a/bolt/test/X86/dwarf5-df-one-cu-debug-names.test b/bolt/test/X86/dwarf5-df-one-cu-debug-names.test new file mode 100644 index 0000000000000..246ce7efb3bad --- /dev/null +++ b/bolt/test/X86/dwarf5-df-one-cu-debug-names.test @@ -0,0 +1,101 @@ +; RUN: rm -rf %t +; RUN: mkdir %t +; RUN: cd %t +; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-debug-names-main.s \ +; RUN: -split-dwarf-file=main.dwo -o main.o +; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o -o main.exe +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-dwarfdump --debug-info --debug-names main.exe.bolt > log.txt +; RUN: cat log.txt | FileCheck -check-prefix=BOLT %s + +;; Tests that BOLT correctly generates .debug_names section with one CU for split dwarf. 
+ +; BOLT: [[OFFSET:0x[0-9a-f]*]]: Compile Unit +; BOLT: Name Index @ 0x0 { +; BOLT-NEXT: Header { +; BOLT-NEXT: Length: 0xA9 +; BOLT-NEXT: Format: DWARF32 +; BOLT-NEXT: Version: 5 +; BOLT-NEXT: CU count: 1 +; BOLT-NEXT: Local TU count: 0 +; BOLT-NEXT: Foreign TU count: 0 +; BOLT-NEXT: Bucket count: 5 +; BOLT-NEXT: Name count: 5 +; BOLT-NEXT: Abbreviations table size: 0x13 +; BOLT-NEXT: Augmentation: 'BOLT' +; BOLT-NEXT: } +; BOLT-NEXT: Compilation Unit offsets [ +; BOLT-NEXT: CU[0]: 0x00000000 +; BOLT-NEXT: ] +; BOLT-NEXT: Abbreviations [ +; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 0 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 1 [ +; BOLT-NEXT: Name 1 { +; BOLT-NEXT: Hash: 0x7C96E4DB +; BOLT-NEXT: String: {{.+}} "Foo2" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_die_offset: 0x00000068 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 2 { +; BOLT-NEXT: Hash: 0x7C9A7F6A +; BOLT-NEXT: String: {{.+}} "main" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_die_offset: 0x0000001a +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 2 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 3 [ +; BOLT-NEXT: Name 3 { +; BOLT-NEXT: Hash: 0xB888030 +; BOLT-NEXT: String: {{.+}} "int" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_die_offset: 0x00000056 +; BOLT-NEXT: } 
+; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 4 [ +; BOLT-NEXT: Name 4 { +; BOLT-NEXT: Hash: 0xF73809C +; BOLT-NEXT: String: {{.+}} "Foo2a" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_die_offset: 0x00000078 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 5 { +; BOLT-NEXT: Hash: 0x7C952063 +; BOLT-NEXT: String: {{.+}} "char" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_die_offset: 0x00000064 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: } diff --git a/bolt/test/X86/dwarf5-df-types-debug-names.test b/bolt/test/X86/dwarf5-df-types-debug-names.test new file mode 100644 index 0000000000000..fdb962b80c751 --- /dev/null +++ b/bolt/test/X86/dwarf5-df-types-debug-names.test @@ -0,0 +1,234 @@ +; RUN: rm -rf %t +; RUN: mkdir %t +; RUN: cd %t +; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-types-debug-names-main.s \ +; RUN: -split-dwarf-file=main.dwo -o main.o +; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-types-debug-names-helper.s \ +; RUN: -split-dwarf-file=helper.dwo -o helper.o +; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-dwarfdump --debug-info -r 0 main.dwo.dwo > log.txt +; RUN: llvm-dwarfdump --debug-info -r 0 helper.dwo.dwo >> log.txt +; RUN: llvm-dwarfdump --debug-info --debug-names main.exe.bolt >> log.txt +; RUN: cat log.txt | FileCheck -check-prefix=BOLT %s + +;; Tests that BOLT correctly generates .debug_names section with two CUs and foreign TUs. 
+ +; BOLT: type_signature = [[TYPE:0x[0-9a-f]*]] +; BOLT: type_signature = [[TYPE1:0x[0-9a-f]*]] +; BOLT: Compile Unit +; BOLT: type_signature = [[TYPE2:0x[0-9a-f]*]] +; BOLT: type_signature = [[TYPE3:0x[0-9a-f]*]] +; BOLT: Compile Unit +; BOLT: [[OFFSET:0x[0-9a-f]*]]: Compile Unit +; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit + +; BOLT: Name Index @ 0x0 { +; BOLT-NEXT: Header { +; BOLT-NEXT: Length: 0x174 +; BOLT-NEXT: Format: DWARF32 +; BOLT-NEXT: Version: 5 +; BOLT-NEXT: CU count: 2 +; BOLT-NEXT: Local TU count: 0 +; BOLT-NEXT: Foreign TU count: 4 +; BOLT-NEXT: Bucket count: 9 +; BOLT-NEXT: Name count: 9 +; BOLT-NEXT: Abbreviations table size: 0x2D +; BOLT-NEXT: Augmentation: 'BOLT' +; BOLT-NEXT: } +; BOLT-NEXT: Compilation Unit offsets [ +; BOLT-NEXT: CU[0]: [[OFFSET]] +; BOLT-NEXT: CU[1]: [[OFFSET1]] +; BOLT-NEXT: ] +; BOLT-NEXT: Foreign Type Unit signatures [ +; BOLT-NEXT: ForeignTU[0]: [[TYPE]] +; BOLT-NEXT: ForeignTU[1]: [[TYPE1]] +; BOLT-NEXT: ForeignTU[2]: [[TYPE2]] +; BOLT-NEXT: ForeignTU[3]: [[TYPE3]] +; BOLT-NEXT: ] +; BOLT-NEXT: Abbreviations [ +; BOLT-NEXT: Abbreviation [[ABBREV:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_type_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_variable +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV4:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: 
DW_TAG_base_type +; BOLT-NEXT: DW_IDX_type_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 0 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 1 [ +; BOLT-NEXT: Name 1 { +; BOLT-NEXT: Hash: 0x7C96E4DB +; BOLT-NEXT: String: {{.+}} "Foo2" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_type_unit: 0x00 +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000021 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 2 { +; BOLT-NEXT: Hash: 0xB5063D0B +; BOLT-NEXT: String: {{.+}} "_Z3foov" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000029 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 3 { +; BOLT-NEXT: Hash: 0xFDE48034 +; BOLT-NEXT: String: {{.+}} "fooint" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_variable +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000001a +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 2 [ +; BOLT-NEXT: Name 4 { +; BOLT-NEXT: Hash: 0xB888030 +; BOLT-NEXT: String: {{.+}} "int" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000025 +; BOLT-NEXT: } +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: 0x5 +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_type_unit: 0x02 +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000003f +; BOLT-NEXT: } +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000056 +; 
BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 3 [ +; BOLT-NEXT: Name 5 { +; BOLT-NEXT: Hash: 0xB887389 +; BOLT-NEXT: String: {{.+}} "foo" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000029 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 6 { +; BOLT-NEXT: Hash: 0xF73809C +; BOLT-NEXT: String: {{.+}} "Foo2a" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_type_unit: 0x01 +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000021 +; BOLT-NEXT: } +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_type_unit: 0x03 +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000021 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 7 { +; BOLT-NEXT: Hash: 0xBA564846 +; BOLT-NEXT: String: {{.+}} "Foo2Int" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_type_unit: 0x02 +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000021 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 4 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 5 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 6 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 7 [ +; BOLT-NEXT: Name 8 { +; BOLT-NEXT: Hash: 0x7C9A7F6A +; BOLT-NEXT: String: {{.+}} "main" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x0000001a +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 8 [ +; BOLT-NEXT: Name 9 { +; BOLT-NEXT: Hash: 0x7C952063 +; BOLT-NEXT: String: {{.+}} "char" +; BOLT-NEXT: Entry 
@ {{.+}} { +; BOLT-NEXT: Abbrev: 0x5 +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_type_unit: 0x00 +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000036 +; BOLT-NEXT: } +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: 0x5 +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_type_unit: 0x01 +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000048 +; BOLT-NEXT: } +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: 0x5 +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_type_unit: 0x03 +; BOLT-NEXT: DW_IDX_compile_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000048 +; BOLT-NEXT: } +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_compile_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000064 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: } diff --git a/bolt/test/X86/dwarf5-df-types-one-cu-debug-names.test b/bolt/test/X86/dwarf5-df-types-one-cu-debug-names.test new file mode 100644 index 0000000000000..1c77583f50883 --- /dev/null +++ b/bolt/test/X86/dwarf5-df-types-one-cu-debug-names.test @@ -0,0 +1,129 @@ +; RUN: rm -rf %t +; RUN: mkdir %t +; RUN: cd %t +; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-types-debug-names-main.s \ +; RUN: -split-dwarf-file=main.dwo -o main.o +; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o -o main.exe +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-dwarfdump --debug-info -r 0 main.dwo.dwo > log.txt +; RUN: llvm-dwarfdump --debug-info --debug-names main.exe.bolt >> log.txt +; RUN: cat log.txt | FileCheck -check-prefix=BOLT %s + +;; Tests that BOLT correctly generates .debug_names section with one CU and foreign TUs. 
+ +; BOLT: type_signature = [[TYPE:0x[0-9a-f]*]] +; BOLT: type_signature = [[TYPE1:0x[0-9a-f]*]] +; BOLT: Compile Unit +; BOLT: [[OFFSET:0x[0-9a-f]*]]: Compile Unit +; BOLT: Name Index @ 0x0 { +; BOLT-NEXT: Header { +; BOLT-NEXT: Length: 0xD1 +; BOLT-NEXT: Format: DWARF32 +; BOLT-NEXT: Version: 5 +; BOLT-NEXT: CU count: 1 +; BOLT-NEXT: Local TU count: 0 +; BOLT-NEXT: Foreign TU count: 2 +; BOLT-NEXT: Bucket count: 5 +; BOLT-NEXT: Name count: 5 +; BOLT-NEXT: Abbreviations table size: 0x1D +; BOLT-NEXT: Augmentation: 'BOLT' +; BOLT-NEXT: } +; BOLT-NEXT: Compilation Unit offsets [ +; BOLT-NEXT: CU[0]: [[OFFSET]] +; BOLT-NEXT: ] +; BOLT-NEXT: Foreign Type Unit signatures [ +; BOLT-NEXT: ForeignTU[0]: [[TYPE]] +; BOLT-NEXT: ForeignTU[1]: [[TYPE1]] +; BOLT-NEXT: ] +; BOLT-NEXT: Abbreviations [ +; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_type_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV4:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_type_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 0 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 1 [ +; BOLT-NEXT: Name 1 { +; BOLT-NEXT: Hash: 0x7C96E4DB +; BOLT-NEXT: String: {{.+}} "Foo2" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_type_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000021 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 2 { +; BOLT-NEXT: Hash: 0x7C9A7F6A +; BOLT-NEXT: String: {{.+}} 
"main" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_die_offset: 0x0000001a +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 2 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 3 [ +; BOLT-NEXT: Name 3 { +; BOLT-NEXT: Hash: 0xB888030 +; BOLT-NEXT: String: {{.+}} "int" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_die_offset: 0x00000056 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 4 [ +; BOLT-NEXT: Name 4 { +; BOLT-NEXT: Hash: 0xF73809C +; BOLT-NEXT: String: {{.+}} "Foo2a" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_type_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000021 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 5 { +; BOLT-NEXT: Hash: 0x7C952063 +; BOLT-NEXT: String: {{.+}} "char" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV4]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_type_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000036 +; BOLT-NEXT: } +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV4]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_type_unit: 0x01 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000048 +; BOLT-NEXT: } +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_die_offset: 0x00000064 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: } diff --git a/bolt/test/X86/dwarf5-one-cu-debug-names.test b/bolt/test/X86/dwarf5-one-cu-debug-names.test new file mode 100644 index 0000000000000..e7754521d61cb --- /dev/null +++ b/bolt/test/X86/dwarf5-one-cu-debug-names.test @@ -0,0 +1,178 @@ +; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-debug-names-main.s -o %tmain.o +; RUN: %clang %cflags -gdwarf-5 
%tmain.o -o %tmain.exe +; RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections +; RUN: llvm-dwarfdump --debug-info -r 0 --debug-names %tmain.exe.bolt > %tlog.txt +; RUN: cat %tlog.txt | FileCheck -check-prefix=BOLT %s + +;; Tests that BOLT correctly generates .debug_names section with one CUs + +; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit +; BOLT: Name Index @ 0x0 { +; BOLT-NEXT: Header { +; BOLT-NEXT: Length: 0x13E +; BOLT-NEXT: Format: DWARF32 +; BOLT-NEXT: Version: 5 +; BOLT-NEXT: CU count: 1 +; BOLT-NEXT: Local TU count: 0 +; BOLT-NEXT: Foreign TU count: 0 +; BOLT-NEXT: Bucket count: 11 +; BOLT-NEXT: Name count: 11 +; BOLT-NEXT: Abbreviations table size: 0x1F +; BOLT-NEXT: Augmentation: 'BOLT' +; BOLT-NEXT: } +; BOLT-NEXT: Compilation Unit offsets [ +; BOLT-NEXT: CU[0]: [[OFFSET1]] +; BOLT-NEXT: ] +; BOLT-NEXT: Abbreviations [ +; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_variable +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV4:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV5:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_namespace +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 0 [ +; BOLT-NEXT: Name 1 { +; BOLT-NEXT: Hash: 0xF73809C +; BOLT-NEXT: String: {{.+}} "Foo2a" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_die_offset: 0x00000104 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 2 { +; BOLT-NEXT: Hash: 0x7C952063 +; 
BOLT-NEXT: String: {{.+}} "char" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_die_offset: 0x000000c5 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 1 [ +; BOLT-NEXT: Name 3 { +; BOLT-NEXT: Hash: 0xB887389 +; BOLT-NEXT: String: {{.+}} "Foo" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_die_offset: 0x000000c9 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 4 { +; BOLT-NEXT: Hash: 0x392140FA +; BOLT-NEXT: String: {{.+}} "t2<&fooint>" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_die_offset: 0x0000003f +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 2 [ +; BOLT-NEXT: Name 5 { +; BOLT-NEXT: Hash: 0x7C96E4DB +; BOLT-NEXT: String: {{.+}} "Foo2" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_die_offset: 0x000000eb +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 3 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 4 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 5 [ +; BOLT-NEXT: Name 6 { +; BOLT-NEXT: Hash: 0x59796A +; BOLT-NEXT: String: {{.+}} "t1" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_die_offset: 0x00000062 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 7 { +; BOLT-NEXT: Hash: 0x5979AC +; BOLT-NEXT: String: {{.+}} "v1" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_variable +; BOLT-NEXT: DW_IDX_die_offset: 0x00000024 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 6 [ +; BOLT-NEXT: Name 8 { +; BOLT-NEXT: Hash: 0xB888030 +; BOLT-NEXT: String: {{.+}} "int" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] 
+; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_die_offset: 0x0000005d +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 7 [ +; BOLT-NEXT: Name 9 { +; BOLT-NEXT: Hash: 0x59796C +; BOLT-NEXT: String: {{.+}} "t3" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_die_offset: 0x0000002f +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 10 { +; BOLT-NEXT: Hash: 0x7C9A7F6A +; BOLT-NEXT: String: {{.+}} "main" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV4]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_die_offset: 0x00000073 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 8 [ +; BOLT-NEXT: Name 11 { +; BOLT-NEXT: Hash: 0x8CFC710C +; BOLT-NEXT: String: {{.+}} "(anonymous namespace)" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV5]] +; BOLT-NEXT: Tag: DW_TAG_namespace +; BOLT-NEXT: DW_IDX_die_offset: 0x00000061 +; BOLT-NEXT: } +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV5]] +; BOLT-NEXT: Tag: DW_TAG_namespace +; BOLT-NEXT: DW_IDX_die_offset: 0x00000061 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 9 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 10 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: } diff --git a/bolt/test/X86/dwarf5-types-debug-names.test b/bolt/test/X86/dwarf5-types-debug-names.test new file mode 100644 index 0000000000000..6a26477d4cdb5 --- /dev/null +++ b/bolt/test/X86/dwarf5-types-debug-names.test @@ -0,0 +1,129 @@ +; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-types-debug-names-main.s -o %tmain.o +; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-types-debug-names-helper.s -o %thelper.o +; RUN: %clang %cflags -gdwarf-5 %tmain.o %thelper.o -o %tmain.exe +; RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections +; RUN: llvm-dwarfdump 
--debug-info --debug-names %tmain.exe.bolt > %tlog.txt +; RUN: cat %tlog.txt | FileCheck -check-prefix=BOLT %s + +;; Tests that BOLT correctly generates .debug_names section with two CUs and a local TU. + +; BOLT: [[OFFSET:0x[0-9a-f]*]]: Type Unit +; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit +; BOLT: [[OFFSET2:0x[0-9a-f]*]]: Compile Unit + + +; BOLT: Name Index @ 0x0 { +; BOLT: Header { +; BOLT: Length: 0xE1 +; BOLT: Format: DWARF32 +; BOLT: Version: 5 +; BOLT: CU count: 2 +; BOLT: Local TU count: 1 +; BOLT: Foreign TU count: 0 +; BOLT: Bucket count: 6 +; BOLT: Name count: 6 +; BOLT: Abbreviations table size: 0x21 +; BOLT: Augmentation: 'BOLT' +; BOLT: } +; BOLT: Compilation Unit offsets [ +; BOLT: CU[0]: [[OFFSET1]] +; BOLT: CU[1]: [[OFFSET2]] +; BOLT: ] +; BOLT: Local Type Unit offsets [ +; BOLT: LocalTU[0]: [[OFFSET]] +; BOLT: ] +; BOLT: Abbreviations [ +; BOLT: Abbreviation [[ABBREV:0x[0-9a-f]*]] { +; BOLT: Tag: DW_TAG_structure_type +; BOLT: DW_IDX_type_unit: DW_FORM_data1 +; BOLT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT: } +; BOLT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] { +; BOLT: Tag: DW_TAG_subprogram +; BOLT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT: } +; BOLT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] { +; BOLT: Tag: DW_TAG_base_type +; BOLT: DW_IDX_compile_unit: DW_FORM_data1 +; BOLT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT: } +; BOLT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] { +; BOLT: Tag: DW_TAG_base_type +; BOLT: DW_IDX_type_unit: DW_FORM_data1 +; BOLT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT: } +; BOLT: ] +; BOLT: Bucket 0 [ +; BOLT: Name 1 { +; BOLT: Hash: 0xF73809C +; BOLT: String: {{.+}} "Foo2a" +; BOLT: Entry @ {{.+}} { +; BOLT: Abbrev: [[ABBREV]] +; BOLT: Tag: DW_TAG_structure_type +; BOLT: DW_IDX_type_unit: 0x00 +; BOLT: DW_IDX_die_offset: 0x00000023 +; BOLT: } +; BOLT: } +; BOLT: ] +; BOLT: Bucket 1 [ +; BOLT: Name 2 { +; BOLT: Hash: 0xB5063D0B +; BOLT: String: {{.+}} "_Z3foov" +; BOLT: Entry @ {{.+}} { +; 
BOLT: Abbrev: [[ABBREV1]] +; BOLT: Tag: DW_TAG_subprogram +; BOLT: DW_IDX_compile_unit: 0x01 +; BOLT: DW_IDX_die_offset: 0x00000024 +; BOLT: } +; BOLT: } +; BOLT: ] +; BOLT: Bucket 2 [ +; BOLT: Name 3 { +; BOLT: Hash: 0xB888030 +; BOLT: String: {{.+}} "int" +; BOLT: Entry @ {{.+}} { +; BOLT: Abbrev: [[ABBREV2]] +; BOLT: Tag: DW_TAG_base_type +; BOLT: DW_IDX_compile_unit: 0x01 +; BOLT: DW_IDX_die_offset: 0x00000040 +; BOLT: } +; BOLT: } +; BOLT: ] +; BOLT: Bucket 3 [ +; BOLT: Name 4 { +; BOLT: Hash: 0xB887389 +; BOLT: String: {{.+}} "foo" +; BOLT: Entry @ {{.+}} { +; BOLT: Abbrev: [[ABBREV1]] +; BOLT: Tag: DW_TAG_subprogram +; BOLT: DW_IDX_compile_unit: 0x01 +; BOLT: DW_IDX_die_offset: 0x00000024 +; BOLT: } +; BOLT: } +; BOLT: ] +; BOLT: Bucket 4 [ +; BOLT: Name 5 { +; BOLT: Hash: 0x7C9A7F6A +; BOLT: String: {{.+}} "main" +; BOLT: Entry @ {{.+}} { +; BOLT: Abbrev: [[ABBREV1]] +; BOLT: Tag: DW_TAG_subprogram +; BOLT: DW_IDX_compile_unit: 0x00 +; BOLT: DW_IDX_die_offset: 0x00000024 +; BOLT: } +; BOLT: } +; BOLT: ] +; BOLT: Bucket 5 [ +; BOLT: Name 6 { +; BOLT: Hash: 0x7C952063 +; BOLT: String: {{.+}} "char" +; BOLT: Entry @ {{.+}} { +; BOLT: Abbrev: [[ABBREV3]] +; BOLT: Tag: DW_TAG_base_type +; BOLT: DW_IDX_type_unit: 0x00 +; BOLT: DW_IDX_die_offset: 0x00000038 +; BOLT: } +; BOLT: } +; BOLT: ] +; BOLT: } diff --git a/bolt/test/X86/dwarf5-types-one-cu-debug-names.test b/bolt/test/X86/dwarf5-types-one-cu-debug-names.test new file mode 100644 index 0000000000000..00a1319a55786 --- /dev/null +++ b/bolt/test/X86/dwarf5-types-one-cu-debug-names.test @@ -0,0 +1,98 @@ +; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-types-debug-names-main.s -o %tmain.o +; RUN: %clang %cflags -gdwarf-5 %tmain.o -o %tmain.exe +; RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections +; RUN: llvm-dwarfdump --debug-info --debug-names %tmain.exe.bolt > %tlog.txt +; RUN: cat %tlog.txt | FileCheck -check-prefix=BOLT %s + +;; Tests that BOLT 
correctly generates .debug_names section with one CU and a local TU. + +; BOLT: [[OFFSET:0x[0-9a-f]*]]: Type Unit +; BOLT: [[OFFSET1:0x[0-9a-f]*]]: Compile Unit + +; BOLT:Name Index @ 0x0 { +; BOLT-NEXT: Header { +; BOLT-NEXT: Length: 0xA3 +; BOLT-NEXT: Format: DWARF32 +; BOLT-NEXT: Version: 5 +; BOLT-NEXT: CU count: 1 +; BOLT-NEXT: Local TU count: 1 +; BOLT-NEXT: Foreign TU count: 0 +; BOLT-NEXT: Bucket count: 4 +; BOLT-NEXT: Name count: 4 +; BOLT-NEXT: Abbreviations table size: 0x1D +; BOLT-NEXT: Augmentation: 'BOLT' +; BOLT-NEXT: } +; BOLT-NEXT: Compilation Unit offsets [ +; BOLT-NEXT: CU[0]: [[OFFSET1]] +; BOLT-NEXT: ] +; BOLT-NEXT: Local Type Unit offsets [ +; BOLT-NEXT: LocalTU[0]: [[OFFSET]] +; BOLT-NEXT: ] +; BOLT-NEXT: Abbreviations [ +; BOLT-NEXT: Abbreviation [[ABBREV:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_type_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] { +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_type_unit: DW_FORM_data1 +; BOLT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 0 [ +; BOLT-NEXT: Name 1 { +; BOLT-NEXT: Hash: 0xB888030 +; BOLT-NEXT: String: {{.+}} "int" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_die_offset: 0x0000003f +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: Name 2 { +; BOLT-NEXT: Hash: 0xF73809C +; BOLT-NEXT: String: {{.+}} "Foo2a" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV1]] +; BOLT-NEXT: Tag: DW_TAG_structure_type +; BOLT-NEXT: DW_IDX_type_unit: 0x00 +; 
BOLT-NEXT: DW_IDX_die_offset: 0x00000023 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 1 [ +; BOLT-NEXT: EMPTY +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 2 [ +; BOLT-NEXT: Name 3 { +; BOLT-NEXT: Hash: 0x7C9A7F6A +; BOLT-NEXT: String: {{.+}} "main" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV2]] +; BOLT-NEXT: Tag: DW_TAG_subprogram +; BOLT-NEXT: DW_IDX_die_offset: 0x00000024 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT: Bucket 3 [ +; BOLT-NEXT: Name 4 { +; BOLT-NEXT: Hash: 0x7C952063 +; BOLT-NEXT: String: {{.+}} "char" +; BOLT-NEXT: Entry @ {{.+}} { +; BOLT-NEXT: Abbrev: [[ABBREV3]] +; BOLT-NEXT: Tag: DW_TAG_base_type +; BOLT-NEXT: DW_IDX_type_unit: 0x00 +; BOLT-NEXT: DW_IDX_die_offset: 0x00000038 +; BOLT-NEXT: } +; BOLT-NEXT: } +; BOLT-NEXT: ] +; BOLT-NEXT:} From e4604c35f5ccc4478c3b649edbc74b494098a442 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Mon, 26 Feb 2024 14:09:09 -0800 Subject: [PATCH 377/546] [flang] Added support for REAL16 math intrinsics in lowering and runtime. (#82860) This PR does not include support for COMPLEX(16) intrinsics. Note that (fp ** int) operations do not require Float128Math library, as they are implemented via basic F128 operations, which are supported by the build compilers' runtimes. 
--- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 55 ++++++++++++++++ flang/runtime/Float128Math/CMakeLists.txt | 33 ++++++++++ flang/runtime/Float128Math/acos.cpp | 22 +++++++ flang/runtime/Float128Math/acosh.cpp | 22 +++++++ flang/runtime/Float128Math/asin.cpp | 22 +++++++ flang/runtime/Float128Math/asinh.cpp | 22 +++++++ flang/runtime/Float128Math/atan.cpp | 22 +++++++ flang/runtime/Float128Math/atan2.cpp | 23 +++++++ flang/runtime/Float128Math/atanh.cpp | 22 +++++++ flang/runtime/Float128Math/ceil.cpp | 22 +++++++ flang/runtime/Float128Math/cos.cpp | 22 +++++++ flang/runtime/Float128Math/cosh.cpp | 22 +++++++ flang/runtime/Float128Math/erf.cpp | 22 +++++++ flang/runtime/Float128Math/erfc.cpp | 22 +++++++ flang/runtime/Float128Math/exp.cpp | 22 +++++++ flang/runtime/Float128Math/floor.cpp | 22 +++++++ flang/runtime/Float128Math/hypot.cpp | 23 +++++++ flang/runtime/Float128Math/j0.cpp | 22 +++++++ flang/runtime/Float128Math/j1.cpp | 22 +++++++ flang/runtime/Float128Math/jn.cpp | 22 +++++++ flang/runtime/Float128Math/lgamma.cpp | 22 +++++++ flang/runtime/Float128Math/llround.cpp | 22 +++++++ flang/runtime/Float128Math/log.cpp | 22 +++++++ flang/runtime/Float128Math/log10.cpp | 22 +++++++ flang/runtime/Float128Math/lround.cpp | 22 +++++++ flang/runtime/Float128Math/math-entries.h | 66 +++++++++++++++++++ flang/runtime/Float128Math/pow.cpp | 23 +++++++ flang/runtime/Float128Math/round.cpp | 26 ++++++++ flang/runtime/Float128Math/sinh.cpp | 22 +++++++ flang/runtime/Float128Math/tan.cpp | 22 +++++++ flang/runtime/Float128Math/tanh.cpp | 22 +++++++ flang/runtime/Float128Math/tgamma.cpp | 22 +++++++ flang/runtime/Float128Math/trunc.cpp | 26 ++++++++ flang/runtime/Float128Math/y0.cpp | 22 +++++++ flang/runtime/Float128Math/y1.cpp | 22 +++++++ flang/runtime/Float128Math/yn.cpp | 22 +++++++ flang/test/Lower/Intrinsics/acos_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/acosh_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/aint_real16.f90 | 9 +++ 
flang/test/Lower/Intrinsics/anint_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/asin_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/asinh_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/atan2_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/atan_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/atanh_real16.f90 | 9 +++ .../Lower/Intrinsics/bessel_j0_real16.f90 | 9 +++ .../Lower/Intrinsics/bessel_j1_real16.f90 | 9 +++ .../Lower/Intrinsics/bessel_jn_real16.f90 | 10 +++ .../Lower/Intrinsics/bessel_y0_real16.f90 | 9 +++ .../Lower/Intrinsics/bessel_y1_real16.f90 | 9 +++ .../Lower/Intrinsics/bessel_yn_real16.f90 | 10 +++ .../test/Lower/Intrinsics/ceiling_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/cos_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/cosh_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/erf_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/erfc_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/exp_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/floor_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/gamma_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/hypot_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/log10_real16.f90 | 9 +++ .../Lower/Intrinsics/log_gamma_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/log_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/nint_real16.f90 | 13 ++++ flang/test/Lower/Intrinsics/pow_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/powi_real16.f90 | 22 +++++++ flang/test/Lower/Intrinsics/sinh_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/tan_real16.f90 | 9 +++ flang/test/Lower/Intrinsics/tanh_real16.f90 | 9 +++ 69 files changed, 1207 insertions(+) create mode 100644 flang/runtime/Float128Math/acos.cpp create mode 100644 flang/runtime/Float128Math/acosh.cpp create mode 100644 flang/runtime/Float128Math/asin.cpp create mode 100644 flang/runtime/Float128Math/asinh.cpp create mode 100644 flang/runtime/Float128Math/atan.cpp create mode 100644 flang/runtime/Float128Math/atan2.cpp create mode 100644 flang/runtime/Float128Math/atanh.cpp create mode 
100644 flang/runtime/Float128Math/ceil.cpp create mode 100644 flang/runtime/Float128Math/cos.cpp create mode 100644 flang/runtime/Float128Math/cosh.cpp create mode 100644 flang/runtime/Float128Math/erf.cpp create mode 100644 flang/runtime/Float128Math/erfc.cpp create mode 100644 flang/runtime/Float128Math/exp.cpp create mode 100644 flang/runtime/Float128Math/floor.cpp create mode 100644 flang/runtime/Float128Math/hypot.cpp create mode 100644 flang/runtime/Float128Math/j0.cpp create mode 100644 flang/runtime/Float128Math/j1.cpp create mode 100644 flang/runtime/Float128Math/jn.cpp create mode 100644 flang/runtime/Float128Math/lgamma.cpp create mode 100644 flang/runtime/Float128Math/llround.cpp create mode 100644 flang/runtime/Float128Math/log.cpp create mode 100644 flang/runtime/Float128Math/log10.cpp create mode 100644 flang/runtime/Float128Math/lround.cpp create mode 100644 flang/runtime/Float128Math/pow.cpp create mode 100644 flang/runtime/Float128Math/round.cpp create mode 100644 flang/runtime/Float128Math/sinh.cpp create mode 100644 flang/runtime/Float128Math/tan.cpp create mode 100644 flang/runtime/Float128Math/tanh.cpp create mode 100644 flang/runtime/Float128Math/tgamma.cpp create mode 100644 flang/runtime/Float128Math/trunc.cpp create mode 100644 flang/runtime/Float128Math/y0.cpp create mode 100644 flang/runtime/Float128Math/y1.cpp create mode 100644 flang/runtime/Float128Math/yn.cpp create mode 100644 flang/test/Lower/Intrinsics/acos_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/acosh_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/aint_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/anint_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/asin_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/asinh_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/atan2_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/atan_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/atanh_real16.f90 create mode 
100644 flang/test/Lower/Intrinsics/bessel_j0_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/bessel_j1_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/bessel_jn_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/bessel_y0_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/bessel_y1_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/bessel_yn_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/ceiling_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/cos_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/cosh_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/erf_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/erfc_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/exp_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/floor_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/gamma_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/hypot_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/log10_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/log_gamma_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/log_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/nint_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/pow_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/powi_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/sinh_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/tan_real16.f90 create mode 100644 flang/test/Lower/Intrinsics/tanh_real16.f90 diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 3a82be895d37c..0344c6ba6cb3b 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -916,6 +916,16 @@ mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc, /// See https://gcc.gnu.org/onlinedocs/gcc-12.1.0/gfortran/\ /// Intrinsic-Procedures.html for a reference. 
constexpr auto FuncTypeReal16Real16 = genFuncType, Ty::Real<16>>; +constexpr auto FuncTypeReal16Real16Real16 = + genFuncType, Ty::Real<16>, Ty::Real<16>>; +constexpr auto FuncTypeReal16Integer4Real16 = + genFuncType, Ty::Integer<4>, Ty::Real<16>>; +constexpr auto FuncTypeReal16Integer8Real16 = + genFuncType, Ty::Integer<8>, Ty::Real<16>>; +constexpr auto FuncTypeInteger4Real16 = + genFuncType, Ty::Real<16>>; +constexpr auto FuncTypeInteger8Real16 = + genFuncType, Ty::Real<16>>; constexpr auto FuncTypeReal16Complex16 = genFuncType, Ty::Complex<16>>; @@ -933,10 +943,12 @@ static constexpr MathOperation mathOperations[] = { {"abs", RTNAME_STRING(CAbsF128), FuncTypeReal16Complex16, genLibF128Call}, {"acos", "acosf", genFuncType, Ty::Real<4>>, genLibCall}, {"acos", "acos", genFuncType, Ty::Real<8>>, genLibCall}, + {"acos", RTNAME_STRING(AcosF128), FuncTypeReal16Real16, genLibF128Call}, {"acos", "cacosf", genFuncType, Ty::Complex<4>>, genLibCall}, {"acos", "cacos", genFuncType, Ty::Complex<8>>, genLibCall}, {"acosh", "acoshf", genFuncType, Ty::Real<4>>, genLibCall}, {"acosh", "acosh", genFuncType, Ty::Real<8>>, genLibCall}, + {"acosh", RTNAME_STRING(AcoshF128), FuncTypeReal16Real16, genLibF128Call}, {"acosh", "cacoshf", genFuncType, Ty::Complex<4>>, genLibCall}, {"acosh", "cacosh", genFuncType, Ty::Complex<8>>, @@ -948,6 +960,7 @@ static constexpr MathOperation mathOperations[] = { genLibCall}, {"aint", "llvm.trunc.f80", genFuncType, Ty::Real<10>>, genLibCall}, + {"aint", RTNAME_STRING(TruncF128), FuncTypeReal16Real16, genLibF128Call}, // llvm.round behaves the same way as libm's round. 
{"anint", "llvm.round.f32", genFuncType, Ty::Real<4>>, genMathOp}, @@ -955,12 +968,15 @@ static constexpr MathOperation mathOperations[] = { genMathOp}, {"anint", "llvm.round.f80", genFuncType, Ty::Real<10>>, genMathOp}, + {"anint", RTNAME_STRING(RoundF128), FuncTypeReal16Real16, genLibF128Call}, {"asin", "asinf", genFuncType, Ty::Real<4>>, genLibCall}, {"asin", "asin", genFuncType, Ty::Real<8>>, genLibCall}, + {"asin", RTNAME_STRING(AsinF128), FuncTypeReal16Real16, genLibF128Call}, {"asin", "casinf", genFuncType, Ty::Complex<4>>, genLibCall}, {"asin", "casin", genFuncType, Ty::Complex<8>>, genLibCall}, {"asinh", "asinhf", genFuncType, Ty::Real<4>>, genLibCall}, {"asinh", "asinh", genFuncType, Ty::Real<8>>, genLibCall}, + {"asinh", RTNAME_STRING(AsinhF128), FuncTypeReal16Real16, genLibF128Call}, {"asinh", "casinhf", genFuncType, Ty::Complex<4>>, genLibCall}, {"asinh", "casinh", genFuncType, Ty::Complex<8>>, @@ -969,49 +985,64 @@ static constexpr MathOperation mathOperations[] = { genMathOp}, {"atan", "atan", genFuncType, Ty::Real<8>>, genMathOp}, + {"atan", RTNAME_STRING(AtanF128), FuncTypeReal16Real16, genLibF128Call}, {"atan", "catanf", genFuncType, Ty::Complex<4>>, genLibCall}, {"atan", "catan", genFuncType, Ty::Complex<8>>, genLibCall}, {"atan2", "atan2f", genFuncType, Ty::Real<4>, Ty::Real<4>>, genMathOp}, {"atan2", "atan2", genFuncType, Ty::Real<8>, Ty::Real<8>>, genMathOp}, + {"atan2", RTNAME_STRING(Atan2F128), FuncTypeReal16Real16Real16, + genLibF128Call}, {"atanh", "atanhf", genFuncType, Ty::Real<4>>, genLibCall}, {"atanh", "atanh", genFuncType, Ty::Real<8>>, genLibCall}, + {"atanh", RTNAME_STRING(AtanhF128), FuncTypeReal16Real16, genLibF128Call}, {"atanh", "catanhf", genFuncType, Ty::Complex<4>>, genLibCall}, {"atanh", "catanh", genFuncType, Ty::Complex<8>>, genLibCall}, {"bessel_j0", "j0f", genFuncType, Ty::Real<4>>, genLibCall}, {"bessel_j0", "j0", genFuncType, Ty::Real<8>>, genLibCall}, + {"bessel_j0", RTNAME_STRING(J0F128), FuncTypeReal16Real16, 
genLibF128Call}, {"bessel_j1", "j1f", genFuncType, Ty::Real<4>>, genLibCall}, {"bessel_j1", "j1", genFuncType, Ty::Real<8>>, genLibCall}, + {"bessel_j1", RTNAME_STRING(J1F128), FuncTypeReal16Real16, genLibF128Call}, {"bessel_jn", "jnf", genFuncType, Ty::Integer<4>, Ty::Real<4>>, genLibCall}, {"bessel_jn", "jn", genFuncType, Ty::Integer<4>, Ty::Real<8>>, genLibCall}, + {"bessel_jn", RTNAME_STRING(JnF128), FuncTypeReal16Integer4Real16, + genLibF128Call}, {"bessel_y0", "y0f", genFuncType, Ty::Real<4>>, genLibCall}, {"bessel_y0", "y0", genFuncType, Ty::Real<8>>, genLibCall}, + {"bessel_y0", RTNAME_STRING(Y0F128), FuncTypeReal16Real16, genLibF128Call}, {"bessel_y1", "y1f", genFuncType, Ty::Real<4>>, genLibCall}, {"bessel_y1", "y1", genFuncType, Ty::Real<8>>, genLibCall}, + {"bessel_y1", RTNAME_STRING(Y1F128), FuncTypeReal16Real16, genLibF128Call}, {"bessel_yn", "ynf", genFuncType, Ty::Integer<4>, Ty::Real<4>>, genLibCall}, {"bessel_yn", "yn", genFuncType, Ty::Integer<4>, Ty::Real<8>>, genLibCall}, + {"bessel_yn", RTNAME_STRING(YnF128), FuncTypeReal16Integer4Real16, + genLibF128Call}, // math::CeilOp returns a real, while Fortran CEILING returns integer. 
{"ceil", "ceilf", genFuncType, Ty::Real<4>>, genMathOp}, {"ceil", "ceil", genFuncType, Ty::Real<8>>, genMathOp}, + {"ceil", RTNAME_STRING(CeilF128), FuncTypeReal16Real16, genLibF128Call}, {"cos", "cosf", genFuncType, Ty::Real<4>>, genMathOp}, {"cos", "cos", genFuncType, Ty::Real<8>>, genMathOp}, + {"cos", RTNAME_STRING(CosF128), FuncTypeReal16Real16, genLibF128Call}, {"cos", "ccosf", genFuncType, Ty::Complex<4>>, genComplexMathOp}, {"cos", "ccos", genFuncType, Ty::Complex<8>>, genComplexMathOp}, {"cosh", "coshf", genFuncType, Ty::Real<4>>, genLibCall}, {"cosh", "cosh", genFuncType, Ty::Real<8>>, genLibCall}, + {"cosh", RTNAME_STRING(CoshF128), FuncTypeReal16Real16, genLibF128Call}, {"cosh", "ccoshf", genFuncType, Ty::Complex<4>>, genLibCall}, {"cosh", "ccosh", genFuncType, Ty::Complex<8>>, genLibCall}, {"divc", @@ -1038,12 +1069,15 @@ static constexpr MathOperation mathOperations[] = { genMathOp}, {"erf", "erf", genFuncType, Ty::Real<8>>, genMathOp}, + {"erf", RTNAME_STRING(ErfF128), FuncTypeReal16Real16, genLibF128Call}, {"erfc", "erfcf", genFuncType, Ty::Real<4>>, genLibCall}, {"erfc", "erfc", genFuncType, Ty::Real<8>>, genLibCall}, + {"erfc", RTNAME_STRING(ErfcF128), FuncTypeReal16Real16, genLibF128Call}, {"exp", "expf", genFuncType, Ty::Real<4>>, genMathOp}, {"exp", "exp", genFuncType, Ty::Real<8>>, genMathOp}, + {"exp", RTNAME_STRING(ExpF128), FuncTypeReal16Real16, genLibF128Call}, {"exp", "cexpf", genFuncType, Ty::Complex<4>>, genComplexMathOp}, {"exp", "cexp", genFuncType, Ty::Complex<8>>, @@ -1074,6 +1108,7 @@ static constexpr MathOperation mathOperations[] = { genMathOp}, {"floor", "floor", genFuncType, Ty::Real<8>>, genMathOp}, + {"floor", RTNAME_STRING(FloorF128), FuncTypeReal16Real16, genLibF128Call}, {"fma", "llvm.fma.f32", genFuncType, Ty::Real<4>, Ty::Real<4>, Ty::Real<4>>, genMathOp}, @@ -1082,14 +1117,18 @@ static constexpr MathOperation mathOperations[] = { genMathOp}, {"gamma", "tgammaf", genFuncType, Ty::Real<4>>, genLibCall}, {"gamma", 
"tgamma", genFuncType, Ty::Real<8>>, genLibCall}, + {"gamma", RTNAME_STRING(TgammaF128), FuncTypeReal16Real16, genLibF128Call}, {"hypot", "hypotf", genFuncType, Ty::Real<4>, Ty::Real<4>>, genLibCall}, {"hypot", "hypot", genFuncType, Ty::Real<8>, Ty::Real<8>>, genLibCall}, + {"hypot", RTNAME_STRING(HypotF128), FuncTypeReal16Real16Real16, + genLibF128Call}, {"log", "logf", genFuncType, Ty::Real<4>>, genMathOp}, {"log", "log", genFuncType, Ty::Real<8>>, genMathOp}, + {"log", RTNAME_STRING(LogF128), FuncTypeReal16Real16, genLibF128Call}, {"log", "clogf", genFuncType, Ty::Complex<4>>, genComplexMathOp}, {"log", "clog", genFuncType, Ty::Complex<8>>, @@ -1098,17 +1137,23 @@ static constexpr MathOperation mathOperations[] = { genMathOp}, {"log10", "log10", genFuncType, Ty::Real<8>>, genMathOp}, + {"log10", RTNAME_STRING(Log10F128), FuncTypeReal16Real16, genLibF128Call}, {"log_gamma", "lgammaf", genFuncType, Ty::Real<4>>, genLibCall}, {"log_gamma", "lgamma", genFuncType, Ty::Real<8>>, genLibCall}, + {"log_gamma", RTNAME_STRING(LgammaF128), FuncTypeReal16Real16, + genLibF128Call}, // llvm.lround behaves the same way as libm's lround. 
{"nint", "llvm.lround.i64.f64", genFuncType, Ty::Real<8>>, genLibCall}, {"nint", "llvm.lround.i64.f32", genFuncType, Ty::Real<4>>, genLibCall}, + {"nint", RTNAME_STRING(LlroundF128), FuncTypeInteger8Real16, + genLibF128Call}, {"nint", "llvm.lround.i32.f64", genFuncType, Ty::Real<8>>, genLibCall}, {"nint", "llvm.lround.i32.f32", genFuncType, Ty::Real<4>>, genLibCall}, + {"nint", RTNAME_STRING(LroundF128), FuncTypeInteger4Real16, genLibF128Call}, {"pow", {}, genFuncType, Ty::Integer<1>, Ty::Integer<1>>, @@ -1129,6 +1174,7 @@ static constexpr MathOperation mathOperations[] = { genMathOp}, {"pow", "pow", genFuncType, Ty::Real<8>, Ty::Real<8>>, genMathOp}, + {"pow", RTNAME_STRING(PowF128), FuncTypeReal16Real16Real16, genLibF128Call}, {"pow", "cpowf", genFuncType, Ty::Complex<4>, Ty::Complex<4>>, genComplexMathOp}, @@ -1140,12 +1186,18 @@ static constexpr MathOperation mathOperations[] = { {"pow", RTNAME_STRING(FPow8i), genFuncType, Ty::Real<8>, Ty::Integer<4>>, genMathOp}, + {"pow", RTNAME_STRING(FPow16i), + genFuncType, Ty::Real<16>, Ty::Integer<4>>, + genMathOp}, {"pow", RTNAME_STRING(FPow4k), genFuncType, Ty::Real<4>, Ty::Integer<8>>, genMathOp}, {"pow", RTNAME_STRING(FPow8k), genFuncType, Ty::Real<8>, Ty::Integer<8>>, genMathOp}, + {"pow", RTNAME_STRING(FPow16k), + genFuncType, Ty::Real<16>, Ty::Integer<8>>, + genMathOp}, {"pow", RTNAME_STRING(cpowi), genFuncType, Ty::Complex<4>, Ty::Integer<4>>, genLibCall}, {"pow", RTNAME_STRING(zpowi), @@ -1174,6 +1226,7 @@ static constexpr MathOperation mathOperations[] = { genComplexMathOp}, {"sinh", "sinhf", genFuncType, Ty::Real<4>>, genLibCall}, {"sinh", "sinh", genFuncType, Ty::Real<8>>, genLibCall}, + {"sinh", RTNAME_STRING(SinhF128), FuncTypeReal16Real16, genLibF128Call}, {"sinh", "csinhf", genFuncType, Ty::Complex<4>>, genLibCall}, {"sinh", "csinh", genFuncType, Ty::Complex<8>>, genLibCall}, {"sqrt", "sqrtf", genFuncType, Ty::Real<4>>, @@ -1189,6 +1242,7 @@ static constexpr MathOperation mathOperations[] = { genMathOp}, 
{"tan", "tan", genFuncType, Ty::Real<8>>, genMathOp}, + {"tan", RTNAME_STRING(TanF128), FuncTypeReal16Real16, genLibF128Call}, {"tan", "ctanf", genFuncType, Ty::Complex<4>>, genComplexMathOp}, {"tan", "ctan", genFuncType, Ty::Complex<8>>, @@ -1197,6 +1251,7 @@ static constexpr MathOperation mathOperations[] = { genMathOp}, {"tanh", "tanh", genFuncType, Ty::Real<8>>, genMathOp}, + {"tanh", RTNAME_STRING(TanhF128), FuncTypeReal16Real16, genLibF128Call}, {"tanh", "ctanhf", genFuncType, Ty::Complex<4>>, genComplexMathOp}, {"tanh", "ctanh", genFuncType, Ty::Complex<8>>, diff --git a/flang/runtime/Float128Math/CMakeLists.txt b/flang/runtime/Float128Math/CMakeLists.txt index f8da4d7ca1a9f..2e20f4fd612f2 100644 --- a/flang/runtime/Float128Math/CMakeLists.txt +++ b/flang/runtime/Float128Math/CMakeLists.txt @@ -33,9 +33,42 @@ else() endif() set(sources + acos.cpp + acosh.cpp + asin.cpp + asinh.cpp + atan.cpp + atan2.cpp + atanh.cpp cabs.cpp + ceil.cpp + cos.cpp + cosh.cpp + erf.cpp + erfc.cpp + exp.cpp + floor.cpp + hypot.cpp + j0.cpp + j1.cpp + jn.cpp + lgamma.cpp + llround.cpp + log.cpp + log10.cpp + lround.cpp + pow.cpp + round.cpp sin.cpp + sinh.cpp sqrt.cpp + tan.cpp + tanh.cpp + tgamma.cpp + trunc.cpp + y0.cpp + y1.cpp + yn.cpp ) include_directories(AFTER "${CMAKE_CURRENT_SOURCE_DIR}/..") diff --git a/flang/runtime/Float128Math/acos.cpp b/flang/runtime/Float128Math/acos.cpp new file mode 100644 index 0000000000000..531c79c7444bd --- /dev/null +++ b/flang/runtime/Float128Math/acos.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/acos.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(AcosF128)( + CppTypeFor x) { + return Acos::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/acosh.cpp b/flang/runtime/Float128Math/acosh.cpp new file mode 100644 index 0000000000000..1495120edd1a0 --- /dev/null +++ b/flang/runtime/Float128Math/acosh.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/acosh.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(AcoshF128)( + CppTypeFor x) { + return Acosh::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/asin.cpp b/flang/runtime/Float128Math/asin.cpp new file mode 100644 index 0000000000000..2fb8c6c5e97d7 --- /dev/null +++ b/flang/runtime/Float128Math/asin.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/asin.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(AsinF128)( + CppTypeFor x) { + return Asin::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/asinh.cpp b/flang/runtime/Float128Math/asinh.cpp new file mode 100644 index 0000000000000..3630a77be42b2 --- /dev/null +++ b/flang/runtime/Float128Math/asinh.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/asinh.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(AsinhF128)( + CppTypeFor x) { + return Asinh::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/atan.cpp b/flang/runtime/Float128Math/atan.cpp new file mode 100644 index 0000000000000..4609343e9d127 --- /dev/null +++ b/flang/runtime/Float128Math/atan.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/atan.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(AtanF128)( + CppTypeFor x) { + return Atan::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/atan2.cpp b/flang/runtime/Float128Math/atan2.cpp new file mode 100644 index 0000000000000..c0175e67ec71b --- /dev/null +++ b/flang/runtime/Float128Math/atan2.cpp @@ -0,0 +1,23 @@ +//===-- runtime/Float128Math/atan2.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(Atan2F128)( + CppTypeFor x, + CppTypeFor y) { + return Atan2::invoke(x, y); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/atanh.cpp b/flang/runtime/Float128Math/atanh.cpp new file mode 100644 index 0000000000000..bfacb967117d7 --- /dev/null +++ b/flang/runtime/Float128Math/atanh.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/atanh.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(AtanhF128)( + CppTypeFor x) { + return Atanh::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/ceil.cpp b/flang/runtime/Float128Math/ceil.cpp new file mode 100644 index 0000000000000..a53a2c27c616b --- /dev/null +++ b/flang/runtime/Float128Math/ceil.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/ceil.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(CeilF128)( + CppTypeFor x) { + return Ceil::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/cos.cpp b/flang/runtime/Float128Math/cos.cpp new file mode 100644 index 0000000000000..845c970bd8e63 --- /dev/null +++ b/flang/runtime/Float128Math/cos.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/cos.cpp --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(CosF128)( + CppTypeFor x) { + return Cos::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/cosh.cpp b/flang/runtime/Float128Math/cosh.cpp new file mode 100644 index 0000000000000..acf6ff4130ee3 --- /dev/null +++ b/flang/runtime/Float128Math/cosh.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/cosh.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(CoshF128)( + CppTypeFor x) { + return Cosh::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/erf.cpp b/flang/runtime/Float128Math/erf.cpp new file mode 100644 index 0000000000000..862f3b9741187 --- /dev/null +++ b/flang/runtime/Float128Math/erf.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/erf.cpp --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(ErfF128)( + CppTypeFor x) { + return Erf::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/erfc.cpp b/flang/runtime/Float128Math/erfc.cpp new file mode 100644 index 0000000000000..0ac0b94556374 --- /dev/null +++ b/flang/runtime/Float128Math/erfc.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/erfc.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(ErfcF128)( + CppTypeFor x) { + return Erfc::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/exp.cpp b/flang/runtime/Float128Math/exp.cpp new file mode 100644 index 0000000000000..50386fdbfb644 --- /dev/null +++ b/flang/runtime/Float128Math/exp.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/exp.cpp --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(ExpF128)( + CppTypeFor x) { + return Exp::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/floor.cpp b/flang/runtime/Float128Math/floor.cpp new file mode 100644 index 0000000000000..48cf4e0144807 --- /dev/null +++ b/flang/runtime/Float128Math/floor.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/floor.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(FloorF128)( + CppTypeFor x) { + return Floor::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/hypot.cpp b/flang/runtime/Float128Math/hypot.cpp new file mode 100644 index 0000000000000..33c83a1654993 --- /dev/null +++ b/flang/runtime/Float128Math/hypot.cpp @@ -0,0 +1,23 @@ +//===-- runtime/Float128Math/hypot.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(HypotF128)( + CppTypeFor x, + CppTypeFor y) { + return Hypot::invoke(x, y); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/j0.cpp b/flang/runtime/Float128Math/j0.cpp new file mode 100644 index 0000000000000..f8f3fe71d8a61 --- /dev/null +++ b/flang/runtime/Float128Math/j0.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/j0.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(J0F128)( + CppTypeFor x) { + return J0::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/j1.cpp b/flang/runtime/Float128Math/j1.cpp new file mode 100644 index 0000000000000..9a51b973e1cf8 --- /dev/null +++ b/flang/runtime/Float128Math/j1.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/j1.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(J1F128)( + CppTypeFor x) { + return J1::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/jn.cpp b/flang/runtime/Float128Math/jn.cpp new file mode 100644 index 0000000000000..644a66863c0d2 --- /dev/null +++ b/flang/runtime/Float128Math/jn.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/jn.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(JnF128)( + int n, CppTypeFor x) { + return Jn::invoke(n, x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/lgamma.cpp b/flang/runtime/Float128Math/lgamma.cpp new file mode 100644 index 0000000000000..fff7dfcb9c15d --- /dev/null +++ b/flang/runtime/Float128Math/lgamma.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/lgamma.cpp -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(LgammaF128)( + CppTypeFor x) { + return Lgamma::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/llround.cpp b/flang/runtime/Float128Math/llround.cpp new file mode 100644 index 0000000000000..00c62818af19d --- /dev/null +++ b/flang/runtime/Float128Math/llround.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/llround.cpp ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(LlroundF128)( + CppTypeFor x) { + return Llround::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/log.cpp b/flang/runtime/Float128Math/log.cpp new file mode 100644 index 0000000000000..0cfe329c6f7f5 --- /dev/null +++ b/flang/runtime/Float128Math/log.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/log.cpp --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(LogF128)( + CppTypeFor x) { + return Log::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/log10.cpp b/flang/runtime/Float128Math/log10.cpp new file mode 100644 index 0000000000000..cd8bf27fcb121 --- /dev/null +++ b/flang/runtime/Float128Math/log10.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/log10.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(Log10F128)( + CppTypeFor x) { + return Log10::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/lround.cpp b/flang/runtime/Float128Math/lround.cpp new file mode 100644 index 0000000000000..6ced66a1b2d3a --- /dev/null +++ b/flang/runtime/Float128Math/lround.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/lround.cpp -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(LroundF128)( + CppTypeFor x) { + return Lround::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h index 91c14b008b576..d7896ac827913 100644 --- a/flang/runtime/Float128Math/math-entries.h +++ b/flang/runtime/Float128Math/math-entries.h @@ -52,9 +52,42 @@ namespace Fortran::runtime { }; // Define fallback callers. +DEFINE_FALLBACK(Acos) +DEFINE_FALLBACK(Acosh) +DEFINE_FALLBACK(Asin) +DEFINE_FALLBACK(Asinh) +DEFINE_FALLBACK(Atan) +DEFINE_FALLBACK(Atan2) +DEFINE_FALLBACK(Atanh) DEFINE_FALLBACK(CAbs) +DEFINE_FALLBACK(Ceil) +DEFINE_FALLBACK(Cos) +DEFINE_FALLBACK(Cosh) +DEFINE_FALLBACK(Erf) +DEFINE_FALLBACK(Erfc) +DEFINE_FALLBACK(Exp) +DEFINE_FALLBACK(Floor) +DEFINE_FALLBACK(Hypot) +DEFINE_FALLBACK(J0) +DEFINE_FALLBACK(J1) +DEFINE_FALLBACK(Jn) +DEFINE_FALLBACK(Lgamma) +DEFINE_FALLBACK(Llround) +DEFINE_FALLBACK(Lround) +DEFINE_FALLBACK(Log) +DEFINE_FALLBACK(Log10) +DEFINE_FALLBACK(Pow) +DEFINE_FALLBACK(Round) DEFINE_FALLBACK(Sin) +DEFINE_FALLBACK(Sinh) DEFINE_FALLBACK(Sqrt) +DEFINE_FALLBACK(Tan) +DEFINE_FALLBACK(Tanh) +DEFINE_FALLBACK(Tgamma) +DEFINE_FALLBACK(Trunc) +DEFINE_FALLBACK(Y0) +DEFINE_FALLBACK(Y1) +DEFINE_FALLBACK(Yn) // Define ComplexF128 type that is compatible with // the type of results/arguments of libquadmath. @@ -68,9 +101,42 @@ typedef _Complex float __attribute__((mode(KC))) ComplexF128; #if HAS_QUADMATHLIB // Define wrapper callers for libquadmath. 
#include "quadmath.h" +DEFINE_SIMPLE_ALIAS(Acos, acosq) +DEFINE_SIMPLE_ALIAS(Acosh, acoshq) +DEFINE_SIMPLE_ALIAS(Asin, asinq) +DEFINE_SIMPLE_ALIAS(Asinh, asinhq) +DEFINE_SIMPLE_ALIAS(Atan, atanq) +DEFINE_SIMPLE_ALIAS(Atan2, atan2q) +DEFINE_SIMPLE_ALIAS(Atanh, atanhq) DEFINE_SIMPLE_ALIAS(CAbs, cabsq) +DEFINE_SIMPLE_ALIAS(Ceil, ceilq) +DEFINE_SIMPLE_ALIAS(Cos, cosq) +DEFINE_SIMPLE_ALIAS(Cosh, coshq) +DEFINE_SIMPLE_ALIAS(Erf, erfq) +DEFINE_SIMPLE_ALIAS(Erfc, erfcq) +DEFINE_SIMPLE_ALIAS(Exp, expq) +DEFINE_SIMPLE_ALIAS(Floor, floorq) +DEFINE_SIMPLE_ALIAS(Hypot, hypotq) +DEFINE_SIMPLE_ALIAS(J0, j0q) +DEFINE_SIMPLE_ALIAS(J1, j1q) +DEFINE_SIMPLE_ALIAS(Jn, jnq) +DEFINE_SIMPLE_ALIAS(Lgamma, lgammaq) +DEFINE_SIMPLE_ALIAS(Llround, llroundq) +DEFINE_SIMPLE_ALIAS(Lround, lroundq) +DEFINE_SIMPLE_ALIAS(Log, logq) +DEFINE_SIMPLE_ALIAS(Log10, log10q) +DEFINE_SIMPLE_ALIAS(Pow, powq) +DEFINE_SIMPLE_ALIAS(Round, roundq) DEFINE_SIMPLE_ALIAS(Sin, sinq) +DEFINE_SIMPLE_ALIAS(Sinh, sinhq) DEFINE_SIMPLE_ALIAS(Sqrt, sqrtq) +DEFINE_SIMPLE_ALIAS(Tan, tanq) +DEFINE_SIMPLE_ALIAS(Tanh, tanhq) +DEFINE_SIMPLE_ALIAS(Tgamma, tgammaq) +DEFINE_SIMPLE_ALIAS(Trunc, truncq) +DEFINE_SIMPLE_ALIAS(Y0, y0q) +DEFINE_SIMPLE_ALIAS(Y1, y1q) +DEFINE_SIMPLE_ALIAS(Yn, ynq) #endif } // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/pow.cpp b/flang/runtime/Float128Math/pow.cpp new file mode 100644 index 0000000000000..02958a890e522 --- /dev/null +++ b/flang/runtime/Float128Math/pow.cpp @@ -0,0 +1,23 @@ +//===-- runtime/Float128Math/pow.cpp --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(PowF128)( + CppTypeFor x, + CppTypeFor y) { + return Pow::invoke(x, y); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/round.cpp b/flang/runtime/Float128Math/round.cpp new file mode 100644 index 0000000000000..43ab57768cb77 --- /dev/null +++ b/flang/runtime/Float128Math/round.cpp @@ -0,0 +1,26 @@ +//===-- runtime/Float128Math/round.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Round to nearest integer, away from zero. +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(RoundF128)( + CppTypeFor x) { + return Round::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/sinh.cpp b/flang/runtime/Float128Math/sinh.cpp new file mode 100644 index 0000000000000..9c907041fd7eb --- /dev/null +++ b/flang/runtime/Float128Math/sinh.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/sinh.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(SinhF128)( + CppTypeFor x) { + return Sinh::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/tan.cpp b/flang/runtime/Float128Math/tan.cpp new file mode 100644 index 0000000000000..01d3c7bdd2e85 --- /dev/null +++ b/flang/runtime/Float128Math/tan.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/tan.cpp --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(TanF128)( + CppTypeFor x) { + return Tan::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/tanh.cpp b/flang/runtime/Float128Math/tanh.cpp new file mode 100644 index 0000000000000..fedc1a4120caf --- /dev/null +++ b/flang/runtime/Float128Math/tanh.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/tanh.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(TanhF128)( + CppTypeFor x) { + return Tanh::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/tgamma.cpp b/flang/runtime/Float128Math/tgamma.cpp new file mode 100644 index 0000000000000..329defff38cf9 --- /dev/null +++ b/flang/runtime/Float128Math/tgamma.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/tgamma.cpp -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(TgammaF128)( + CppTypeFor x) { + return Tgamma::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/trunc.cpp b/flang/runtime/Float128Math/trunc.cpp new file mode 100644 index 0000000000000..3cab219ce31c2 --- /dev/null +++ b/flang/runtime/Float128Math/trunc.cpp @@ -0,0 +1,26 @@ +//===-- runtime/Float128Math/trunc.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Round to integer, toward zero. 
+// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(TruncF128)( + CppTypeFor x) { + return Trunc::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/y0.cpp b/flang/runtime/Float128Math/y0.cpp new file mode 100644 index 0000000000000..f3e2ee454aeab --- /dev/null +++ b/flang/runtime/Float128Math/y0.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/y0.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(Y0F128)( + CppTypeFor x) { + return Y0::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/y1.cpp b/flang/runtime/Float128Math/y1.cpp new file mode 100644 index 0000000000000..c117bbcb2b5a8 --- /dev/null +++ b/flang/runtime/Float128Math/y1.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/y1.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(Y1F128)( + CppTypeFor x) { + return Y1::invoke(x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/yn.cpp b/flang/runtime/Float128Math/yn.cpp new file mode 100644 index 0000000000000..237bc2866a0d5 --- /dev/null +++ b/flang/runtime/Float128Math/yn.cpp @@ -0,0 +1,22 @@ +//===-- runtime/Float128Math/yn.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" + +namespace Fortran::runtime { +extern "C" { + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CppTypeFor RTDEF(YnF128)( + int n, CppTypeFor x) { + return Yn::invoke(n, x); +} +#endif + +} // extern "C" +} // namespace Fortran::runtime diff --git a/flang/test/Lower/Intrinsics/acos_real16.f90 b/flang/test/Lower/Intrinsics/acos_real16.f90 new file mode 100644 index 0000000000000..2a09bfe94a801 --- /dev/null +++ b/flang/test/Lower/Intrinsics/acos_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! 
CHECK: fir.call @_FortranAAcosF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = acos(a) +end diff --git a/flang/test/Lower/Intrinsics/acosh_real16.f90 b/flang/test/Lower/Intrinsics/acosh_real16.f90 new file mode 100644 index 0000000000000..de787e3d2b0fb --- /dev/null +++ b/flang/test/Lower/Intrinsics/acosh_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranAAcoshF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = acosh(a) +end diff --git a/flang/test/Lower/Intrinsics/aint_real16.f90 b/flang/test/Lower/Intrinsics/aint_real16.f90 new file mode 100644 index 0000000000000..b8b80ea3097cf --- /dev/null +++ b/flang/test/Lower/Intrinsics/aint_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranATruncF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = aint(a) +end diff --git a/flang/test/Lower/Intrinsics/anint_real16.f90 b/flang/test/Lower/Intrinsics/anint_real16.f90 new file mode 100644 index 0000000000000..677240dc41df0 --- /dev/null +++ b/flang/test/Lower/Intrinsics/anint_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! 
CHECK: fir.call @_FortranARoundF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = anint(a) +end diff --git a/flang/test/Lower/Intrinsics/asin_real16.f90 b/flang/test/Lower/Intrinsics/asin_real16.f90 new file mode 100644 index 0000000000000..cb32d0a6af70d --- /dev/null +++ b/flang/test/Lower/Intrinsics/asin_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranAAsinF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = asin(a) +end diff --git a/flang/test/Lower/Intrinsics/asinh_real16.f90 b/flang/test/Lower/Intrinsics/asinh_real16.f90 new file mode 100644 index 0000000000000..9ab16f19d933e --- /dev/null +++ b/flang/test/Lower/Intrinsics/asinh_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranAAsinhF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = asinh(a) +end diff --git a/flang/test/Lower/Intrinsics/atan2_real16.f90 b/flang/test/Lower/Intrinsics/atan2_real16.f90 new file mode 100644 index 0000000000000..5d0bf3069342c --- /dev/null +++ b/flang/test/Lower/Intrinsics/atan2_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! 
CHECK: fir.call @_FortranAAtan2F128({{.*}}){{.*}}: (f128, f128) -> f128 + real(16) :: a, b + b = atan2(a, b) +end diff --git a/flang/test/Lower/Intrinsics/atan_real16.f90 b/flang/test/Lower/Intrinsics/atan_real16.f90 new file mode 100644 index 0000000000000..5c0c262711c61 --- /dev/null +++ b/flang/test/Lower/Intrinsics/atan_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranAAtanF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = atan(a) +end diff --git a/flang/test/Lower/Intrinsics/atanh_real16.f90 b/flang/test/Lower/Intrinsics/atanh_real16.f90 new file mode 100644 index 0000000000000..0d60aecd08e1b --- /dev/null +++ b/flang/test/Lower/Intrinsics/atanh_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranAAtanhF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = atanh(a) +end diff --git a/flang/test/Lower/Intrinsics/bessel_j0_real16.f90 b/flang/test/Lower/Intrinsics/bessel_j0_real16.f90 new file mode 100644 index 0000000000000..f1c07f6137d81 --- /dev/null +++ b/flang/test/Lower/Intrinsics/bessel_j0_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! 
CHECK: fir.call @_FortranAJ0F128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = bessel_j0(a) +end diff --git a/flang/test/Lower/Intrinsics/bessel_j1_real16.f90 b/flang/test/Lower/Intrinsics/bessel_j1_real16.f90 new file mode 100644 index 0000000000000..c41e7b5246ade --- /dev/null +++ b/flang/test/Lower/Intrinsics/bessel_j1_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranAJ1F128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = bessel_j1(a) +end diff --git a/flang/test/Lower/Intrinsics/bessel_jn_real16.f90 b/flang/test/Lower/Intrinsics/bessel_jn_real16.f90 new file mode 100644 index 0000000000000..1ec9cc719a418 --- /dev/null +++ b/flang/test/Lower/Intrinsics/bessel_jn_real16.f90 @@ -0,0 +1,10 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranAJnF128({{.*}}){{.*}}: (i32, f128) -> f128 + integer :: n + real(16) :: a, b + b = bessel_jn(n, a) +end diff --git a/flang/test/Lower/Intrinsics/bessel_y0_real16.f90 b/flang/test/Lower/Intrinsics/bessel_y0_real16.f90 new file mode 100644 index 0000000000000..459d2f4e7315f --- /dev/null +++ b/flang/test/Lower/Intrinsics/bessel_y0_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! 
CHECK: fir.call @_FortranAY0F128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = bessel_y0(a) +end diff --git a/flang/test/Lower/Intrinsics/bessel_y1_real16.f90 b/flang/test/Lower/Intrinsics/bessel_y1_real16.f90 new file mode 100644 index 0000000000000..869b2dc2c961b --- /dev/null +++ b/flang/test/Lower/Intrinsics/bessel_y1_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranAY1F128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = bessel_y1(a) +end diff --git a/flang/test/Lower/Intrinsics/bessel_yn_real16.f90 b/flang/test/Lower/Intrinsics/bessel_yn_real16.f90 new file mode 100644 index 0000000000000..53be23ab82d1a --- /dev/null +++ b/flang/test/Lower/Intrinsics/bessel_yn_real16.f90 @@ -0,0 +1,10 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranAYnF128({{.*}}){{.*}}: (i32, f128) -> f128 + integer :: n + real(16) :: a, b + b = bessel_yn(n, a) +end diff --git a/flang/test/Lower/Intrinsics/ceiling_real16.f90 b/flang/test/Lower/Intrinsics/ceiling_real16.f90 new file mode 100644 index 0000000000000..21dc221d6b35c --- /dev/null +++ b/flang/test/Lower/Intrinsics/ceiling_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! 
CHECK: fir.call @_FortranACeilF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = ceiling(a) +end diff --git a/flang/test/Lower/Intrinsics/cos_real16.f90 b/flang/test/Lower/Intrinsics/cos_real16.f90 new file mode 100644 index 0000000000000..859d4a5671e8c --- /dev/null +++ b/flang/test/Lower/Intrinsics/cos_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranACosF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = cos(a) +end diff --git a/flang/test/Lower/Intrinsics/cosh_real16.f90 b/flang/test/Lower/Intrinsics/cosh_real16.f90 new file mode 100644 index 0000000000000..cab85365661ef --- /dev/null +++ b/flang/test/Lower/Intrinsics/cosh_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranACoshF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = cosh(a) +end diff --git a/flang/test/Lower/Intrinsics/erf_real16.f90 b/flang/test/Lower/Intrinsics/erf_real16.f90 new file mode 100644 index 0000000000000..da40816946171 --- /dev/null +++ b/flang/test/Lower/Intrinsics/erf_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranAErfF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = erf(a) +end diff --git a/flang/test/Lower/Intrinsics/erfc_real16.f90 b/flang/test/Lower/Intrinsics/erfc_real16.f90 new file mode 100644 index 0000000000000..7e3daa27768c7 --- /dev/null +++ b/flang/test/Lower/Intrinsics/erfc_real16.f90 @@ -0,0 +1,9 @@ +! 
REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranAErfcF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = erfc(a) +end diff --git a/flang/test/Lower/Intrinsics/exp_real16.f90 b/flang/test/Lower/Intrinsics/exp_real16.f90 new file mode 100644 index 0000000000000..aed7319a9eb2d --- /dev/null +++ b/flang/test/Lower/Intrinsics/exp_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranAExpF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = exp(a) +end diff --git a/flang/test/Lower/Intrinsics/floor_real16.f90 b/flang/test/Lower/Intrinsics/floor_real16.f90 new file mode 100644 index 0000000000000..536c14106dd6f --- /dev/null +++ b/flang/test/Lower/Intrinsics/floor_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranAFloorF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = floor(a) +end diff --git a/flang/test/Lower/Intrinsics/gamma_real16.f90 b/flang/test/Lower/Intrinsics/gamma_real16.f90 new file mode 100644 index 0000000000000..aabf7fb73f0b2 --- /dev/null +++ b/flang/test/Lower/Intrinsics/gamma_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! 
CHECK: fir.call @_FortranATgammaF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = gamma(a) +end diff --git a/flang/test/Lower/Intrinsics/hypot_real16.f90 b/flang/test/Lower/Intrinsics/hypot_real16.f90 new file mode 100644 index 0000000000000..753148ede29cd --- /dev/null +++ b/flang/test/Lower/Intrinsics/hypot_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranAHypotF128({{.*}}){{.*}}: (f128, f128) -> f128 + real(16) :: a, b + b = hypot(a, b) +end diff --git a/flang/test/Lower/Intrinsics/log10_real16.f90 b/flang/test/Lower/Intrinsics/log10_real16.f90 new file mode 100644 index 0000000000000..3a6e1d1af9112 --- /dev/null +++ b/flang/test/Lower/Intrinsics/log10_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranALog10F128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = log10(a) +end diff --git a/flang/test/Lower/Intrinsics/log_gamma_real16.f90 b/flang/test/Lower/Intrinsics/log_gamma_real16.f90 new file mode 100644 index 0000000000000..771ec4e5d931d --- /dev/null +++ b/flang/test/Lower/Intrinsics/log_gamma_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! 
CHECK: fir.call @_FortranALgammaF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = log_gamma(a) +end diff --git a/flang/test/Lower/Intrinsics/log_real16.f90 b/flang/test/Lower/Intrinsics/log_real16.f90 new file mode 100644 index 0000000000000..a57b8cc8e9698 --- /dev/null +++ b/flang/test/Lower/Intrinsics/log_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranALogF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = log(a) +end diff --git a/flang/test/Lower/Intrinsics/nint_real16.f90 b/flang/test/Lower/Intrinsics/nint_real16.f90 new file mode 100644 index 0000000000000..c4bbacd0347c0 --- /dev/null +++ b/flang/test/Lower/Intrinsics/nint_real16.f90 @@ -0,0 +1,13 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranALroundF128({{.*}}){{.*}}: (f128) -> i32 +! CHECK: fir.call @_FortranALlroundF128({{.*}}){{.*}}: (f128) -> i64 + real(16) :: a + integer(4) :: b + integer(8) :: c + b = nint(a, 4) + c = nint(a, 8) +end diff --git a/flang/test/Lower/Intrinsics/pow_real16.f90 b/flang/test/Lower/Intrinsics/pow_real16.f90 new file mode 100644 index 0000000000000..869422381401d --- /dev/null +++ b/flang/test/Lower/Intrinsics/pow_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! 
CHECK: fir.call @_FortranAPowF128({{.*}}){{.*}}: (f128, f128) -> f128 + real(16) :: a, b + b = a ** b +end diff --git a/flang/test/Lower/Intrinsics/powi_real16.f90 b/flang/test/Lower/Intrinsics/powi_real16.f90 new file mode 100644 index 0000000000000..9e7d0f828b5cd --- /dev/null +++ b/flang/test/Lower/Intrinsics/powi_real16.f90 @@ -0,0 +1,22 @@ +! RUN: bbc -emit-fir %s -o - | FileCheck %s --check-prefix=CHECK-FAST +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s --check-prefix=CHECK-PRECISE +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s --check-prefix=CHECK-FAST + +! CHECK-PRECISE: fir.call @_FortranAFPow16i({{.*}}){{.*}}: (f128, i32) -> f128 +! CHECK-PRECISE: fir.call @_FortranAFPow16i({{.*}}){{.*}}: (f128, i32) -> f128 +! CHECK-PRECISE: fir.call @_FortranAFPow16i({{.*}}){{.*}}: (f128, i32) -> f128 +! CHECK-PRECISE: fir.call @_FortranAFPow16k({{.*}}){{.*}}: (f128, i64) -> f128 +! CHECK-FAST: math.fpowi {{.*}} : f128, i32 +! CHECK-FAST: math.fpowi {{.*}} : f128, i32 +! CHECK-FAST: math.fpowi {{.*}} : f128, i32 +! CHECK-FAST: math.fpowi {{.*}} : f128, i64 + real(16) :: a + integer(1) :: e1 + integer(2) :: e2 + integer(4) :: e3 + integer(8) :: e4 + a = a ** e1 + a = a ** e2 + a = a ** e3 + a = a ** e4 +end diff --git a/flang/test/Lower/Intrinsics/sinh_real16.f90 b/flang/test/Lower/Intrinsics/sinh_real16.f90 new file mode 100644 index 0000000000000..79aa71940aff8 --- /dev/null +++ b/flang/test/Lower/Intrinsics/sinh_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! 
CHECK: fir.call @_FortranASinhF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = sinh(a) +end diff --git a/flang/test/Lower/Intrinsics/tan_real16.f90 b/flang/test/Lower/Intrinsics/tan_real16.f90 new file mode 100644 index 0000000000000..62aa403fe58ce --- /dev/null +++ b/flang/test/Lower/Intrinsics/tan_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranATanF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = tan(a) +end diff --git a/flang/test/Lower/Intrinsics/tanh_real16.f90 b/flang/test/Lower/Intrinsics/tanh_real16.f90 new file mode 100644 index 0000000000000..aff7e7b31d754 --- /dev/null +++ b/flang/test/Lower/Intrinsics/tanh_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranATanhF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b + b = tanh(a) +end From a870a48c09ea32a5c179e827ed60e53e748cbc49 Mon Sep 17 00:00:00 2001 From: Farzon Lotfi <1802579+farzonl@users.noreply.github.com> Date: Mon, 26 Feb 2024 17:12:13 -0500 Subject: [PATCH 378/546] [HLSL] Fix casting asserts (#82827) There are two issues here. first `ICK_Floating_Integral` were always defaulting to `CK_FloatingToIntegral` for vectors regardless of direction of cast. Check was scalar only so added a vec float check to the conditional. Second issue was float to int casts were resolving to ICK_Integral_Promotion when they need to be resolving to CK_FloatingToIntegral. This was fixed by changing the ordering of conversion checks. 
This fixes #82826 --- clang/lib/Sema/SemaExprCXX.cpp | 2 +- clang/lib/Sema/SemaOverload.cpp | 14 +++--- .../SemaHLSL/VectorOverloadResolution.hlsl | 44 +++++++++++++++++++ 3 files changed, 52 insertions(+), 8 deletions(-) diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 322bd1c87b1d7..59758d3bd6d1a 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -4843,7 +4843,7 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, .get(); break; case ICK_Floating_Integral: - if (ToType->isRealFloatingType()) + if (ToType->hasFloatingRepresentation()) From = ImpCastExprToType(From, ToType, CK_IntegralToFloating, VK_PRValue, /*BasePath=*/nullptr, CCK) diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index f7645422348b6..ecad2b9681655 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1884,6 +1884,13 @@ static bool IsVectorElementConversion(Sema &S, QualType FromType, return true; } + if ((FromType->isRealFloatingType() && ToType->isIntegralType(S.Context)) || + (FromType->isIntegralOrUnscopedEnumerationType() && + ToType->isRealFloatingType())) { + ICK = ICK_Floating_Integral; + return true; + } + if (S.IsIntegralPromotion(From, FromType, ToType)) { ICK = ICK_Integral_Promotion; return true; @@ -1895,13 +1902,6 @@ static bool IsVectorElementConversion(Sema &S, QualType FromType, return true; } - if ((FromType->isRealFloatingType() && ToType->isIntegralType(S.Context)) || - (FromType->isIntegralOrUnscopedEnumerationType() && - ToType->isRealFloatingType())) { - ICK = ICK_Floating_Integral; - return true; - } - return false; } diff --git a/clang/test/SemaHLSL/VectorOverloadResolution.hlsl b/clang/test/SemaHLSL/VectorOverloadResolution.hlsl index e07391f803f89..81fedc2de3157 100644 --- a/clang/test/SemaHLSL/VectorOverloadResolution.hlsl +++ b/clang/test/SemaHLSL/VectorOverloadResolution.hlsl @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -triple 
dxil-unknown-shadermodel6.6-library -S -fnative-half-type -finclude-default-header -o - -ast-dump %s | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s --check-prefixes=CHECKIR void Fn(double2 D); void Fn(half2 H); @@ -28,3 +29,46 @@ void Fn2(int16_t2 S); void Call2(int2 I) { Fn2(I); } + +void Fn3( int64_t2 p0); + +// CHECK: FunctionDecl {{.*}} Call3 'void (half2)' +// CHECK: CallExpr {{.*}} 'void' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(int64_t2)' +// CHECK-NEXT: DeclRefExpr {{.*}} 'void (int64_t2)' lvalue Function {{.*}} 'Fn3' 'void (int64_t2)' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int64_t2':'long __attribute__((ext_vector_type(2)))' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'half2':'half __attribute__((ext_vector_type(2)))' +// CHECK-NEXT: DeclRefExpr {{.*}} 'half2':'half __attribute__((ext_vector_type(2)))' lvalue ParmVar {{.*}} 'p0' 'half2':'half __attribute__((ext_vector_type(2)))' +// CHECKIR-LABEL: Call3 +// CHECKIR: %conv = fptosi <2 x half> {{.*}} to <2 x i64> +void Call3(half2 p0) { + Fn3(p0); +} + +// CHECK: FunctionDecl {{.*}} Call4 'void (float2)' +// CHECK: CallExpr {{.*}} 'void' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(int64_t2)' +// CHECK-NEXT: DeclRefExpr {{.*}} 'void (int64_t2)' lvalue Function {{.*}} 'Fn3' 'void (int64_t2)' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int64_t2':'long __attribute__((ext_vector_type(2)))' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2':'float __attribute__((ext_vector_type(2)))' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float2':'float __attribute__((ext_vector_type(2)))' lvalue ParmVar {{.*}} 'p0' 'float2':'float __attribute__((ext_vector_type(2)))' +// CHECKIR-LABEL: Call4 +// CHECKIR: {{.*}} = fptosi <2 x float> {{.*}} to <2 x i64> +void Call4(float2 p0) { + Fn3(p0); +} + +void Fn4( float2 p0); + +// CHECK: FunctionDecl {{.*}} Call5 'void (int64_t2)' +// CHECK: CallExpr {{.*}} 
'void' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(float2)' +// CHECK-NEXT: DeclRefExpr {{.*}} 'void (float2)' lvalue Function {{.*}} 'Fn4' 'void (float2)' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float2':'float __attribute__((ext_vector_type(2)))' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int64_t2':'long __attribute__((ext_vector_type(2)))' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int64_t2':'long __attribute__((ext_vector_type(2)))' lvalue ParmVar {{.*}} 'p0' 'int64_t2':'long __attribute__((ext_vector_type(2)))' +// CHECKIR-LABEL: Call5 +// CHECKIR: {{.*}} = sitofp <2 x i64> {{.*}} to <2 x float> +void Call5(int64_t2 p0) { + Fn4(p0); +} From b94913b8adbc0508762809a167e356becae92021 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 26 Feb 2024 22:15:36 +0000 Subject: [PATCH 379/546] [AArch64] Vector insert zero upper tests. NFC --- .../implicitly-set-zero-high-64-bits.ll | 99 +++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll b/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll index ddf98b2c971e4..1eb9eab1c21e6 100644 --- a/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll +++ b/llvm/test/CodeGen/AArch64/implicitly-set-zero-high-64-bits.ll @@ -134,5 +134,104 @@ entry: } +define <16 x i8> @insertzero_v8i8(<8 x i8> %a) { +; CHECK-LABEL: insertzero_v8i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret +entry: + %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %shuffle.i +} + +define <8 x i16> @insertzero_v4i16(<4 x i16> %a) { +; CHECK-LABEL: insertzero_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret +entry: + %shuffle.i = shufflevector <4 x i16> %a, <4 x 
i16> zeroinitializer, <8 x i32> + ret <8 x i16> %shuffle.i +} + +define <4 x i32> @insertzero_v2i32(<2 x i32> %a) { +; CHECK-LABEL: insertzero_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret +entry: + %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %shuffle.i +} + +define <2 x i64> @insertzero_v1i64(<1 x i64> %a) { +; CHECK-LABEL: insertzero_v1i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret +entry: + %shuffle.i = shufflevector <1 x i64> %a, <1 x i64> zeroinitializer, <2 x i32> + ret <2 x i64> %shuffle.i +} + +define <8 x half> @insertzero_v4f16(<4 x half> %a) { +; CHECK-LABEL: insertzero_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret +entry: + %shuffle.i = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <8 x i32> + ret <8 x half> %shuffle.i +} + +define <8 x bfloat> @insertzero_v4bf16(<4 x bfloat> %a) { +; CHECK-LABEL: insertzero_v4bf16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d4, #0000000000000000 +; CHECK-NEXT: movi d5, #0000000000000000 +; CHECK-NEXT: movi d6, #0000000000000000 +; CHECK-NEXT: movi d7, #0000000000000000 +; CHECK-NEXT: ret +entry: + %shuffle.i = shufflevector <4 x bfloat> %a, <4 x bfloat> zeroinitializer, <8 x i32> + ret <8 x bfloat> %shuffle.i +} + +define <4 x float> @insertzero_v2f32(<2 x float> %a) { +; CHECK-LABEL: insertzero_v2f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret +entry: + %shuffle.i = shufflevector <2 x float> %a, <2 x float> 
zeroinitializer, <4 x i32> + ret <4 x float> %shuffle.i +} + +define <2 x double> @insertzero_v1f64(<1 x double> %a) { +; CHECK-LABEL: insertzero_v1f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret +entry: + %shuffle.i = shufflevector <1 x double> %a, <1 x double> zeroinitializer, <2 x i32> + ret <2 x double> %shuffle.i +} + + + declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) From b1916599befa8575c43349ef0819389b8b5c3ab4 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 26 Feb 2024 22:17:16 +0000 Subject: [PATCH 380/546] [gn build] Port 6de5fcc74637 --- llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn index eeabaffc7d005..210dd12785095 100644 --- a/llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn +++ b/llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn @@ -25,6 +25,7 @@ static_library("Core") { "BinarySection.cpp", "DIEBuilder.cpp", "DebugData.cpp", + "DebugNames.cpp", "DynoStats.cpp", "Exceptions.cpp", "FunctionLayout.cpp", From e87724560f5f155e662cbdda17409361b98d42d6 Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Mon, 26 Feb 2024 14:27:22 -0800 Subject: [PATCH 381/546] [docs] Remove ref to deleted "Packaging.rst" This is a fix for commit e9cdd165d7bc ("[docs] Remove the Packaging "Tips" which seems to be about pre-cmake ./configure (#82958)"). The doc was removed but not the references to it. 
--- llvm/docs/GettingInvolved.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index 811b324ebad9d..f89483904ab73 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -69,7 +69,6 @@ Information about LLVM's development process. Projects HowToReleaseLLVM - Packaging ReleaseProcess HowToAddABuilder ReleaseNotes @@ -89,9 +88,6 @@ Information about LLVM's development process. :doc:`HowToAddABuilder` Instructions for adding new builder to LLVM buildbot master. -:doc:`Packaging` - Advice on packaging LLVM into a distribution. - :doc:`Release notes for the current release ` This describes new features, known bugs, and other limitations. From 796d26a37d70374e41766df659700a826dc62e34 Mon Sep 17 00:00:00 2001 From: Pete Steinfeld <47540744+psteinfeld@users.noreply.github.com> Date: Mon, 26 Feb 2024 14:29:49 -0800 Subject: [PATCH 382/546] [flang] Fix ISO_Fortran_binding.h to work better with C++ code (#82556) [flang] Fix ISO_Fortran_binding.h to work better with C++ code This stems from working on LANL's "dopey" project -- https://github.com/lanl/dopey. That project contains C++ code which includes the header file "ISO_Fortran_binding.h". The dopey code wraps that include with an 'extern "C"' clause since the standard (18.5.1, paragraph 1) says that the file" shall contain C structure definitions, typedef declarations, ...". But the clang++ compiler emits error messages objecting to the fact that ISO_Fortran_binding.h contains templates. This change fixes that by preceding the problematic code in ISO_Fortran_binding.h with language linkage clauses that specify that they contain C++ code rather than C code. In the accompanying test, I needed to account for the fact that some people build the compiler by doing a `make check-flang`. In this case, the clang compiler which is required by the test will not be built. 
Here's an example of a C++ program that shows the problem: ``` extern "C" { #include "ISO_Fortran_binding.h" } int main() { return 0; } ``` --- flang/include/flang/ISO_Fortran_binding.h | 8 ++--- .../test/Integration/iso-fortran-binding.cpp | 33 +++++++++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) create mode 100644 flang/test/Integration/iso-fortran-binding.cpp diff --git a/flang/include/flang/ISO_Fortran_binding.h b/flang/include/flang/ISO_Fortran_binding.h index 4a28d3322a38f..3f74a7e56f175 100644 --- a/flang/include/flang/ISO_Fortran_binding.h +++ b/flang/include/flang/ISO_Fortran_binding.h @@ -125,7 +125,7 @@ namespace cfi_internal { // The below structure emulates a flexible array. This structure does not take // care of getting the memory storage. Note that it already contains one element // because a struct cannot be empty. -template struct FlexibleArray : T { +extern "C++" template struct FlexibleArray : T { RT_API_ATTRS T &operator[](int index) { return *(this + index); } const RT_API_ATTRS T &operator[](int index) const { return *(this + index); } RT_API_ATTRS operator T *() { return this; } @@ -163,12 +163,12 @@ typedef struct CFI_cdesc_t { // needed, for C++'s CFI_cdesc_t's emulated flexible // dim[] array. 
namespace cfi_internal { -template struct CdescStorage : public CFI_cdesc_t { +extern "C++" template struct CdescStorage : public CFI_cdesc_t { static_assert((r > 1 && r <= CFI_MAX_RANK), "CFI_INVALID_RANK"); CFI_dim_t dim[r - 1]; }; -template <> struct CdescStorage<1> : public CFI_cdesc_t {}; -template <> struct CdescStorage<0> : public CFI_cdesc_t {}; +extern "C++" template <> struct CdescStorage<1> : public CFI_cdesc_t {}; +extern "C++" template <> struct CdescStorage<0> : public CFI_cdesc_t {}; } // namespace cfi_internal #define CFI_CDESC_T(rank) \ FORTRAN_ISO_NAMESPACE_::cfi_internal::CdescStorage diff --git a/flang/test/Integration/iso-fortran-binding.cpp b/flang/test/Integration/iso-fortran-binding.cpp new file mode 100644 index 0000000000000..aaafd7cccd07d --- /dev/null +++ b/flang/test/Integration/iso-fortran-binding.cpp @@ -0,0 +1,33 @@ +// UNSUPPORTED: system-windows +// RUN: split-file %s %t +// RUN: chmod +x %t/runtest.sh +// RUN: %t/runtest.sh %t %t/cppfile.cpp %flang | FileCheck %s + +//--- cppfile.cpp +extern "C" { +#include "ISO_Fortran_binding.h" +} +#include + +int main() { + std::cout << "PASS\n"; + return 0; +} + +// CHECK: PASS +// clang-format off +//--- runtest.sh +#!/bin/bash +TMPDIR=$1 +CPPFILE=$2 +FLANG=$3 +BINDIR=`dirname $FLANG` +CPPCOMP=$BINDIR/clang++ +if [ -x $CPPCOMP ] +then + $CPPCOMP $CPPFILE -o $TMPDIR/a.out + $TMPDIR/a.out # should print "PASS" +else + # No clang compiler, just pass by default + echo "PASS" +fi From 7789fb6604e5319ae46896285a264920015f8771 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 26 Feb 2024 16:42:25 -0600 Subject: [PATCH 383/546] [libc][NFC] Move 'sleep_briefly' function to common header (#83074) Summary: The https://github.com/llvm/llvm-project/pull/83026 patch has another use for this function. Additionally add support for the Arm instruction barrier if this is ever used by other targets. 
--- libc/src/__support/RPC/rpc_util.h | 17 ++---------- libc/src/__support/threads/CMakeLists.txt | 6 ++++ libc/src/__support/threads/sleep.h | 34 +++++++++++++++++++++++ 3 files changed, 42 insertions(+), 15 deletions(-) create mode 100644 libc/src/__support/threads/sleep.h diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h index cc2a11a1108e0..11d2f751355d3 100644 --- a/libc/src/__support/RPC/rpc_util.h +++ b/libc/src/__support/RPC/rpc_util.h @@ -11,28 +11,15 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/GPU/utils.h" -#include "src/__support/macros/attributes.h" // LIBC_INLINE +#include "src/__support/macros/attributes.h" #include "src/__support/macros/properties/architectures.h" +#include "src/__support/threads/sleep.h" #include "src/string/memory_utils/generic/byte_per_byte.h" #include "src/string/memory_utils/inline_memcpy.h" namespace LIBC_NAMESPACE { namespace rpc { -/// Suspend the thread briefly to assist the thread scheduler during busy loops. -LIBC_INLINE void sleep_briefly() { -#if defined(LIBC_TARGET_ARCH_IS_NVPTX) - if (__nvvm_reflect("__CUDA_ARCH") >= 700) - LIBC_INLINE_ASM("nanosleep.u32 64;" ::: "memory"); -#elif defined(LIBC_TARGET_ARCH_IS_AMDGPU) - __builtin_amdgcn_s_sleep(2); -#elif defined(LIBC_TARGET_ARCH_IS_X86) - __builtin_ia32_pause(); -#else - // Simply do nothing if sleeping isn't supported on this platform. -#endif -} - /// Conditional to indicate if this process is running on the GPU. 
LIBC_INLINE constexpr bool is_process_gpu() { #if defined(LIBC_TARGET_ARCH_IS_GPU) diff --git a/libc/src/__support/threads/CMakeLists.txt b/libc/src/__support/threads/CMakeLists.txt index 0feeda0c179b9..731adf6f9c8e4 100644 --- a/libc/src/__support/threads/CMakeLists.txt +++ b/libc/src/__support/threads/CMakeLists.txt @@ -4,6 +4,12 @@ add_header_library( mutex_common.h ) +add_header_library( + sleep + HDRS + sleep.h +) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) add_subdirectory(${LIBC_TARGET_OS}) endif() diff --git a/libc/src/__support/threads/sleep.h b/libc/src/__support/threads/sleep.h new file mode 100644 index 0000000000000..9a2dff598ece8 --- /dev/null +++ b/libc/src/__support/threads/sleep.h @@ -0,0 +1,34 @@ +//===-- Utilities for suspending threads ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_THREADS_SLEEP_H +#define LLVM_LIBC_SRC___SUPPORT_THREADS_SLEEP_H + +#include "src/__support/macros/attributes.h" + +namespace LIBC_NAMESPACE { + +/// Suspend the thread briefly to assist the thread scheduler during busy loops. +LIBC_INLINE void sleep_briefly() { +#if defined(LIBC_TARGET_ARCH_IS_NVPTX) + if (__nvvm_reflect("__CUDA_ARCH") >= 700) + LIBC_INLINE_ASM("nanosleep.u32 64;" ::: "memory"); +#elif defined(LIBC_TARGET_ARCH_IS_AMDGPU) + __builtin_amdgcn_s_sleep(2); +#elif defined(LIBC_TARGET_ARCH_IS_X86) + __builtin_ia32_pause(); +#elif defined(LIBC_TARGET_ARCH_IS_AARCH64) + __builtin_arm_isb(0xf); +#else + // Simply do nothing if sleeping isn't supported on this platform. 
+#endif +} + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC___SUPPORT_THREADS_SLEEP_H From 23f895f6567e0a4cef45cfc9d96d817a454b6e8f Mon Sep 17 00:00:00 2001 From: gulfemsavrun Date: Mon, 26 Feb 2024 14:44:55 -0800 Subject: [PATCH 384/546] [InstrProf] Single byte counters in coverage (#75425) This patch inserts 1-byte counters instead of an 8-byte counters into llvm profiles for source-based code coverage. The origial idea was proposed as block-cov for PGO, and this patch repurposes that idea for coverage: https://groups.google.com/g/llvm-dev/c/r03Z6JoN7d4 The current 8-byte counters mechanism add counters to minimal regions, and infer the counters in the remaining regions via adding or subtracting counters. For example, it infers the counter in the if.else region by subtracting the counters between if.entry and if.then regions in an if statement. Whenever there is a control-flow merge, it adds the counters from all the incoming regions. However, we are not going to be able to infer counters by subtracting two execution counts when using single-byte counters. Therefore, this patch conservatively inserts additional counters for the cases where we need to add or subtract counters. 
RFC: https://discourse.llvm.org/t/rfc-single-byte-counters-for-source-based-code-coverage/75685 --- clang/lib/CodeGen/CGExprAgg.cpp | 13 +- clang/lib/CodeGen/CGExprComplex.cpp | 14 +- clang/lib/CodeGen/CGExprScalar.cpp | 32 ++- clang/lib/CodeGen/CGStmt.cpp | 73 ++++++- clang/lib/CodeGen/CodeGenFunction.cpp | 9 +- clang/lib/CodeGen/CodeGenFunction.h | 2 +- clang/lib/CodeGen/CodeGenModule.cpp | 1 + clang/lib/CodeGen/CodeGenPGO.cpp | 150 ++++++++++++- clang/lib/CodeGen/CodeGenPGO.h | 6 +- clang/lib/CodeGen/CoverageMappingGen.cpp | 206 +++++++++++++----- .../CoverageMapping/single-byte-counters.cpp | 169 ++++++++++++++ compiler-rt/lib/profile/InstrProfiling.h | 3 +- .../ProfileData/Coverage/CoverageMapping.h | 20 +- .../llvm/ProfileData/InstrProfWriter.h | 4 + .../ProfileData/Coverage/CoverageMapping.cpp | 13 +- 15 files changed, 623 insertions(+), 92 deletions(-) create mode 100644 clang/test/CoverageMapping/single-byte-counters.cpp diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp index d0d6202974fe9..5190b22bcc162 100644 --- a/clang/lib/CodeGen/CGExprAgg.cpp +++ b/clang/lib/CodeGen/CGExprAgg.cpp @@ -33,6 +33,10 @@ using namespace CodeGen; // Aggregate Expression Emitter //===----------------------------------------------------------------------===// +namespace llvm { +extern cl::opt EnableSingleByteCoverage; +} // namespace llvm + namespace { class AggExprEmitter : public StmtVisitor { CodeGenFunction &CGF; @@ -1279,7 +1283,10 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { eval.begin(CGF); CGF.EmitBlock(LHSBlock); - CGF.incrementProfileCounter(E); + if (llvm::EnableSingleByteCoverage) + CGF.incrementProfileCounter(E->getTrueExpr()); + else + CGF.incrementProfileCounter(E); Visit(E->getTrueExpr()); eval.end(CGF); @@ -1294,6 +1301,8 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { eval.begin(CGF); CGF.EmitBlock(RHSBlock); + if (llvm::EnableSingleByteCoverage) + 
CGF.incrementProfileCounter(E->getFalseExpr()); Visit(E->getFalseExpr()); eval.end(CGF); @@ -1302,6 +1311,8 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { E->getType()); CGF.EmitBlock(ContBlock); + if (llvm::EnableSingleByteCoverage) + CGF.incrementProfileCounter(E); } void AggExprEmitter::VisitChooseExpr(const ChooseExpr *CE) { diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp index 176a7e00141f9..0266ba934da62 100644 --- a/clang/lib/CodeGen/CGExprComplex.cpp +++ b/clang/lib/CodeGen/CGExprComplex.cpp @@ -28,6 +28,10 @@ using namespace CodeGen; // Complex Expression Emitter //===----------------------------------------------------------------------===// +namespace llvm { +extern cl::opt EnableSingleByteCoverage; +} // namespace llvm + typedef CodeGenFunction::ComplexPairTy ComplexPairTy; /// Return the complex type that we are meant to emit. @@ -1330,7 +1334,11 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { eval.begin(CGF); CGF.EmitBlock(LHSBlock); - CGF.incrementProfileCounter(E); + if (llvm::EnableSingleByteCoverage) + CGF.incrementProfileCounter(E->getTrueExpr()); + else + CGF.incrementProfileCounter(E); + ComplexPairTy LHS = Visit(E->getTrueExpr()); LHSBlock = Builder.GetInsertBlock(); CGF.EmitBranch(ContBlock); @@ -1338,9 +1346,13 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { eval.begin(CGF); CGF.EmitBlock(RHSBlock); + if (llvm::EnableSingleByteCoverage) + CGF.incrementProfileCounter(E->getFalseExpr()); ComplexPairTy RHS = Visit(E->getFalseExpr()); RHSBlock = Builder.GetInsertBlock(); CGF.EmitBlock(ContBlock); + if (llvm::EnableSingleByteCoverage) + CGF.incrementProfileCounter(E); eval.end(CGF); // Create a PHI node for the real part. 
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 10b7457522044..8536570087ad0 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -52,6 +52,10 @@ using llvm::Value; // Scalar Expression Emitter //===----------------------------------------------------------------------===// +namespace llvm { +extern cl::opt EnableSingleByteCoverage; +} // namespace llvm + namespace { /// Determine whether the given binary operation may overflow. @@ -4925,8 +4929,13 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { // If the dead side doesn't have labels we need, just emit the Live part. if (!CGF.ContainsLabel(dead)) { - if (CondExprBool) + if (CondExprBool) { + if (llvm::EnableSingleByteCoverage) { + CGF.incrementProfileCounter(lhsExpr); + CGF.incrementProfileCounter(rhsExpr); + } CGF.incrementProfileCounter(E); + } Value *Result = Visit(live); // If the live part is a throw expression, it acts like it has a void @@ -5005,7 +5014,12 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { llvm::Value *CondV = CGF.EvaluateExprAsBool(condExpr); llvm::Value *StepV = Builder.CreateZExtOrBitCast(CondV, CGF.Int64Ty); - CGF.incrementProfileCounter(E, StepV); + if (llvm::EnableSingleByteCoverage) { + CGF.incrementProfileCounter(lhsExpr); + CGF.incrementProfileCounter(rhsExpr); + CGF.incrementProfileCounter(E); + } else + CGF.incrementProfileCounter(E, StepV); llvm::Value *LHS = Visit(lhsExpr); llvm::Value *RHS = Visit(rhsExpr); @@ -5037,7 +5051,11 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { if (CGF.MCDCLogOpStack.empty()) CGF.maybeUpdateMCDCTestVectorBitmap(condExpr); - CGF.incrementProfileCounter(E); + if (llvm::EnableSingleByteCoverage) + CGF.incrementProfileCounter(lhsExpr); + else + CGF.incrementProfileCounter(E); + eval.begin(CGF); Value *LHS = Visit(lhsExpr); eval.end(CGF); @@ -5053,6 +5071,9 @@ 
VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { if (CGF.MCDCLogOpStack.empty()) CGF.maybeUpdateMCDCTestVectorBitmap(condExpr); + if (llvm::EnableSingleByteCoverage) + CGF.incrementProfileCounter(rhsExpr); + eval.begin(CGF); Value *RHS = Visit(rhsExpr); eval.end(CGF); @@ -5071,6 +5092,11 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { PN->addIncoming(LHS, LHSBlock); PN->addIncoming(RHS, RHSBlock); + // When single byte coverage mode is enabled, add a counter to continuation + // block. + if (llvm::EnableSingleByteCoverage) + CGF.incrementProfileCounter(E); + return PN; } diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index af51875782c9f..d0a3a716ad75e 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -43,6 +43,10 @@ using namespace CodeGen; // Statement Emission //===----------------------------------------------------------------------===// +namespace llvm { +extern cl::opt EnableSingleByteCoverage; +} // namespace llvm + void CodeGenFunction::EmitStopPoint(const Stmt *S) { if (CGDebugInfo *DI = getDebugInfo()) { SourceLocation Loc; @@ -856,7 +860,10 @@ void CodeGenFunction::EmitIfStmt(const IfStmt &S) { // Emit the 'then' code. EmitBlock(ThenBlock); - incrementProfileCounter(&S); + if (llvm::EnableSingleByteCoverage) + incrementProfileCounter(S.getThen()); + else + incrementProfileCounter(&S); { RunCleanupsScope ThenScope(*this); EmitStmt(S.getThen()); @@ -870,6 +877,9 @@ void CodeGenFunction::EmitIfStmt(const IfStmt &S) { auto NL = ApplyDebugLocation::CreateEmpty(*this); EmitBlock(ElseBlock); } + // When single byte coverage mode is enabled, add a counter to else block. + if (llvm::EnableSingleByteCoverage) + incrementProfileCounter(Else); { RunCleanupsScope ElseScope(*this); EmitStmt(Else); @@ -883,6 +893,11 @@ void CodeGenFunction::EmitIfStmt(const IfStmt &S) { // Emit the continuation block for code after the if. 
EmitBlock(ContBlock, true); + + // When single byte coverage mode is enabled, add a counter to continuation + // block. + if (llvm::EnableSingleByteCoverage) + incrementProfileCounter(&S); } void CodeGenFunction::EmitWhileStmt(const WhileStmt &S, @@ -927,6 +942,10 @@ void CodeGenFunction::EmitWhileStmt(const WhileStmt &S, SourceLocToDebugLoc(R.getEnd()), checkIfLoopMustProgress(CondIsConstInt)); + // When single byte coverage mode is enabled, add a counter to loop condition. + if (llvm::EnableSingleByteCoverage) + incrementProfileCounter(S.getCond()); + // As long as the condition is true, go to the loop body. llvm::BasicBlock *LoopBody = createBasicBlock("while.body"); if (EmitBoolCondBranch) { @@ -959,7 +978,11 @@ void CodeGenFunction::EmitWhileStmt(const WhileStmt &S, { RunCleanupsScope BodyScope(*this); EmitBlock(LoopBody); - incrementProfileCounter(&S); + // When single byte coverage mode is enabled, add a counter to the body. + if (llvm::EnableSingleByteCoverage) + incrementProfileCounter(S.getBody()); + else + incrementProfileCounter(&S); EmitStmt(S.getBody()); } @@ -981,6 +1004,11 @@ void CodeGenFunction::EmitWhileStmt(const WhileStmt &S, // a branch, try to erase it. if (!EmitBoolCondBranch) SimplifyForwardingBlocks(LoopHeader.getBlock()); + + // When single byte coverage mode is enabled, add a counter to continuation + // block. + if (llvm::EnableSingleByteCoverage) + incrementProfileCounter(&S); } void CodeGenFunction::EmitDoStmt(const DoStmt &S, @@ -996,13 +1024,19 @@ void CodeGenFunction::EmitDoStmt(const DoStmt &S, // Emit the body of the loop. llvm::BasicBlock *LoopBody = createBasicBlock("do.body"); - EmitBlockWithFallThrough(LoopBody, &S); + if (llvm::EnableSingleByteCoverage) + EmitBlockWithFallThrough(LoopBody, S.getBody()); + else + EmitBlockWithFallThrough(LoopBody, &S); { RunCleanupsScope BodyScope(*this); EmitStmt(S.getBody()); } EmitBlock(LoopCond.getBlock()); + // When single byte coverage mode is enabled, add a counter to loop condition. 
+ if (llvm::EnableSingleByteCoverage) + incrementProfileCounter(S.getCond()); // C99 6.8.5.2: "The evaluation of the controlling expression takes place // after each execution of the loop body." @@ -1043,6 +1077,11 @@ void CodeGenFunction::EmitDoStmt(const DoStmt &S, // emitting a branch, try to erase it. if (!EmitBoolCondBranch) SimplifyForwardingBlocks(LoopCond.getBlock()); + + // When single byte coverage mode is enabled, add a counter to continuation + // block. + if (llvm::EnableSingleByteCoverage) + incrementProfileCounter(&S); } void CodeGenFunction::EmitForStmt(const ForStmt &S, @@ -1101,6 +1140,11 @@ void CodeGenFunction::EmitForStmt(const ForStmt &S, BreakContinueStack.back().ContinueBlock = Continue; } + // When single byte coverage mode is enabled, add a counter to loop + // condition. + if (llvm::EnableSingleByteCoverage) + incrementProfileCounter(S.getCond()); + llvm::BasicBlock *ExitBlock = LoopExit.getBlock(); // If there are any cleanups between here and the loop-exit scope, // create a block to stage a loop exit along. @@ -1131,8 +1175,12 @@ void CodeGenFunction::EmitForStmt(const ForStmt &S, // Treat it as a non-zero constant. Don't even create a new block for the // body, just fall into it. } - incrementProfileCounter(&S); + // When single byte coverage mode is enabled, add a counter to the body. + if (llvm::EnableSingleByteCoverage) + incrementProfileCounter(S.getBody()); + else + incrementProfileCounter(&S); { // Create a separate cleanup scope for the body, in case it is not // a compound statement. @@ -1144,6 +1192,8 @@ void CodeGenFunction::EmitForStmt(const ForStmt &S, if (S.getInc()) { EmitBlock(Continue.getBlock()); EmitStmt(S.getInc()); + if (llvm::EnableSingleByteCoverage) + incrementProfileCounter(S.getInc()); } BreakContinueStack.pop_back(); @@ -1159,6 +1209,11 @@ void CodeGenFunction::EmitForStmt(const ForStmt &S, // Emit the fall-through block. 
EmitBlock(LoopExit.getBlock(), true); + + // When single byte coverage mode is enabled, add a counter to continuation + // block. + if (llvm::EnableSingleByteCoverage) + incrementProfileCounter(&S); } void @@ -1211,7 +1266,10 @@ CodeGenFunction::EmitCXXForRangeStmt(const CXXForRangeStmt &S, } EmitBlock(ForBody); - incrementProfileCounter(&S); + if (llvm::EnableSingleByteCoverage) + incrementProfileCounter(S.getBody()); + else + incrementProfileCounter(&S); // Create a block for the increment. In case of a 'continue', we jump there. JumpDest Continue = getJumpDestInCurrentScope("for.inc"); @@ -1241,6 +1299,11 @@ CodeGenFunction::EmitCXXForRangeStmt(const CXXForRangeStmt &S, // Emit the fall-through block. EmitBlock(LoopExit.getBlock(), true); + + // When single byte coverage mode is enabled, add a counter to continuation + // block. + if (llvm::EnableSingleByteCoverage) + incrementProfileCounter(&S); } void CodeGenFunction::EmitReturnOfRValue(RValue RV, QualType Ty) { diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 1ad905078d349..b87fc86f4e635 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -52,6 +52,10 @@ using namespace clang; using namespace CodeGen; +namespace llvm { +extern cl::opt EnableSingleByteCoverage; +} // namespace llvm + /// shouldEmitLifetimeMarkers - Decide whether we need emit the life-time /// markers. static bool shouldEmitLifetimeMarkers(const CodeGenOptions &CGOpts, @@ -1270,7 +1274,10 @@ void CodeGenFunction::EmitFunctionBody(const Stmt *Body) { void CodeGenFunction::EmitBlockWithFallThrough(llvm::BasicBlock *BB, const Stmt *S) { llvm::BasicBlock *SkipCountBB = nullptr; - if (HaveInsertPoint() && CGM.getCodeGenOpts().hasProfileClangInstr()) { + // Do not skip over the instrumentation when single byte coverage mode is + // enabled. 
+ if (HaveInsertPoint() && CGM.getCodeGenOpts().hasProfileClangInstr() && + !llvm::EnableSingleByteCoverage) { // When instrumenting for profiling, the fallthrough to certain // statements needs to skip over the instrumentation code so that we // get an accurate count. diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index b2800f699ff4b..06327a1847177 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -1545,7 +1545,7 @@ class CodeGenFunction : public CodeGenTypeCache { if (CGM.getCodeGenOpts().hasProfileClangInstr() && !CurFn->hasFnAttribute(llvm::Attribute::NoProfile) && !CurFn->hasFnAttribute(llvm::Attribute::SkipProfile)) - PGO.emitCounterIncrement(Builder, S, StepV); + PGO.emitCounterSetOrIncrement(Builder, S, StepV); PGO.setCurrentStmt(S); } diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 95e457bef28ed..1550b000a89a3 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -858,6 +858,7 @@ void CodeGenModule::Release() { checkAliases(); EmitDeferredUnusedCoverageMappings(); CodeGenPGO(*this).setValueProfilingFlag(getModule()); + CodeGenPGO(*this).setProfileVersion(getModule()); if (CoverageMapping) CoverageMapping->emit(); if (CodeGenOpts.SanitizeCfiCrossDso) { diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index 8aebd3557690a..2619edfeb7dc7 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -23,6 +23,10 @@ #include "llvm/Support/MD5.h" #include +namespace llvm { +extern cl::opt EnableSingleByteCoverage; +} // namespace llvm + static llvm::cl::opt EnableValueProfiling("enable-value-profiling", llvm::cl::desc("Enable value profiling"), @@ -346,6 +350,14 @@ struct MapRegionCounters : public RecursiveASTVisitor { return Base::VisitBinaryOperator(S); } + bool VisitConditionalOperator(ConditionalOperator *S) { + if 
(llvm::EnableSingleByteCoverage && S->getTrueExpr()) + CounterMap[S->getTrueExpr()] = NextCounter++; + if (llvm::EnableSingleByteCoverage && S->getFalseExpr()) + CounterMap[S->getFalseExpr()] = NextCounter++; + return Base::VisitConditionalOperator(S); + } + /// Include \p S in the function hash. bool VisitStmt(Stmt *S) { auto Type = updateCounterMappings(S); @@ -361,8 +373,21 @@ struct MapRegionCounters : public RecursiveASTVisitor { if (Hash.getHashVersion() == PGO_HASH_V1) return Base::TraverseIfStmt(If); + // When single byte coverage mode is enabled, add a counter to then and + // else. + bool NoSingleByteCoverage = !llvm::EnableSingleByteCoverage; + for (Stmt *CS : If->children()) { + if (!CS || NoSingleByteCoverage) + continue; + if (CS == If->getThen()) + CounterMap[If->getThen()] = NextCounter++; + else if (CS == If->getElse()) + CounterMap[If->getElse()] = NextCounter++; + } + // Otherwise, keep track of which branch we're in while traversing. VisitStmt(If); + for (Stmt *CS : If->children()) { if (!CS) continue; @@ -376,6 +401,81 @@ struct MapRegionCounters : public RecursiveASTVisitor { return true; } + bool TraverseWhileStmt(WhileStmt *While) { + // When single byte coverage mode is enabled, add a counter to condition and + // body. + bool NoSingleByteCoverage = !llvm::EnableSingleByteCoverage; + for (Stmt *CS : While->children()) { + if (!CS || NoSingleByteCoverage) + continue; + if (CS == While->getCond()) + CounterMap[While->getCond()] = NextCounter++; + else if (CS == While->getBody()) + CounterMap[While->getBody()] = NextCounter++; + } + + Base::TraverseWhileStmt(While); + if (Hash.getHashVersion() != PGO_HASH_V1) + Hash.combine(PGOHash::EndOfScope); + return true; + } + + bool TraverseDoStmt(DoStmt *Do) { + // When single byte coverage mode is enabled, add a counter to condition and + // body. 
+ bool NoSingleByteCoverage = !llvm::EnableSingleByteCoverage; + for (Stmt *CS : Do->children()) { + if (!CS || NoSingleByteCoverage) + continue; + if (CS == Do->getCond()) + CounterMap[Do->getCond()] = NextCounter++; + else if (CS == Do->getBody()) + CounterMap[Do->getBody()] = NextCounter++; + } + + Base::TraverseDoStmt(Do); + if (Hash.getHashVersion() != PGO_HASH_V1) + Hash.combine(PGOHash::EndOfScope); + return true; + } + + bool TraverseForStmt(ForStmt *For) { + // When single byte coverage mode is enabled, add a counter to condition, + // increment and body. + bool NoSingleByteCoverage = !llvm::EnableSingleByteCoverage; + for (Stmt *CS : For->children()) { + if (!CS || NoSingleByteCoverage) + continue; + if (CS == For->getCond()) + CounterMap[For->getCond()] = NextCounter++; + else if (CS == For->getInc()) + CounterMap[For->getInc()] = NextCounter++; + else if (CS == For->getBody()) + CounterMap[For->getBody()] = NextCounter++; + } + + Base::TraverseForStmt(For); + if (Hash.getHashVersion() != PGO_HASH_V1) + Hash.combine(PGOHash::EndOfScope); + return true; + } + + bool TraverseCXXForRangeStmt(CXXForRangeStmt *ForRange) { + // When single byte coverage mode is enabled, add a counter to body. + bool NoSingleByteCoverage = !llvm::EnableSingleByteCoverage; + for (Stmt *CS : ForRange->children()) { + if (!CS || NoSingleByteCoverage) + continue; + if (CS == ForRange->getBody()) + CounterMap[ForRange->getBody()] = NextCounter++; + } + + Base::TraverseCXXForRangeStmt(ForRange); + if (Hash.getHashVersion() != PGO_HASH_V1) + Hash.combine(PGOHash::EndOfScope); + return true; + } + // If the statement type \p N is nestable, and its nesting impacts profile // stability, define a custom traversal which tracks the end of the statement // in the hash (provided we're not using the V1 hash). 
@@ -387,10 +487,6 @@ struct MapRegionCounters : public RecursiveASTVisitor { return true; \ } - DEFINE_NESTABLE_TRAVERSAL(WhileStmt) - DEFINE_NESTABLE_TRAVERSAL(DoStmt) - DEFINE_NESTABLE_TRAVERSAL(ForStmt) - DEFINE_NESTABLE_TRAVERSAL(CXXForRangeStmt) DEFINE_NESTABLE_TRAVERSAL(ObjCForCollectionStmt) DEFINE_NESTABLE_TRAVERSAL(CXXTryStmt) DEFINE_NESTABLE_TRAVERSAL(CXXCatchStmt) @@ -1094,8 +1190,8 @@ CodeGenPGO::applyFunctionAttributes(llvm::IndexedInstrProfReader *PGOReader, Fn->setEntryCount(FunctionCount); } -void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S, - llvm::Value *StepV) { +void CodeGenPGO::emitCounterSetOrIncrement(CGBuilderTy &Builder, const Stmt *S, + llvm::Value *StepV) { if (!RegionCounterMap || !Builder.GetInsertBlock()) return; @@ -1105,13 +1201,19 @@ void CodeGenPGO::emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S, Builder.getInt64(FunctionHash), Builder.getInt32(NumRegionCounters), Builder.getInt32(Counter), StepV}; - if (!StepV) - Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::instrprof_increment), + + if (llvm::EnableSingleByteCoverage) + Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::instrprof_cover), ArrayRef(Args, 4)); - else - Builder.CreateCall( - CGM.getIntrinsic(llvm::Intrinsic::instrprof_increment_step), - ArrayRef(Args)); + else { + if (!StepV) + Builder.CreateCall(CGM.getIntrinsic(llvm::Intrinsic::instrprof_increment), + ArrayRef(Args, 4)); + else + Builder.CreateCall( + CGM.getIntrinsic(llvm::Intrinsic::instrprof_increment_step), + ArrayRef(Args)); + } } bool CodeGenPGO::canEmitMCDCCoverage(const CGBuilderTy &Builder) { @@ -1222,6 +1324,30 @@ void CodeGenPGO::setValueProfilingFlag(llvm::Module &M) { uint32_t(EnableValueProfiling)); } +void CodeGenPGO::setProfileVersion(llvm::Module &M) { + if (CGM.getCodeGenOpts().hasProfileClangInstr() && + llvm::EnableSingleByteCoverage) { + const StringRef VarName(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR)); + llvm::Type *IntTy64 = 
llvm::Type::getInt64Ty(M.getContext()); + uint64_t ProfileVersion = + (INSTR_PROF_RAW_VERSION | VARIANT_MASK_BYTE_COVERAGE); + + auto IRLevelVersionVariable = new llvm::GlobalVariable( + M, IntTy64, true, llvm::GlobalValue::WeakAnyLinkage, + llvm::Constant::getIntegerValue(IntTy64, + llvm::APInt(64, ProfileVersion)), + VarName); + + IRLevelVersionVariable->setVisibility(llvm::GlobalValue::DefaultVisibility); + llvm::Triple TT(M.getTargetTriple()); + if (TT.supportsCOMDAT()) { + IRLevelVersionVariable->setLinkage(llvm::GlobalValue::ExternalLinkage); + IRLevelVersionVariable->setComdat(M.getOrInsertComdat(VarName)); + } + IRLevelVersionVariable->setDSOLocal(true); + } +} + // This method either inserts a call to the profile run-time during // instrumentation or puts profile data into metadata for PGO use. void CodeGenPGO::valueProfile(CGBuilderTy &Builder, uint32_t ValueKind, diff --git a/clang/lib/CodeGen/CodeGenPGO.h b/clang/lib/CodeGen/CodeGenPGO.h index d3c2b277238fc..036fbf6815a49 100644 --- a/clang/lib/CodeGen/CodeGenPGO.h +++ b/clang/lib/CodeGen/CodeGenPGO.h @@ -94,6 +94,8 @@ class CodeGenPGO { // Set a module flag indicating if value profiling is enabled. 
void setValueProfilingFlag(llvm::Module &M); + void setProfileVersion(llvm::Module &M); + private: void setFuncName(llvm::Function *Fn); void setFuncName(StringRef Name, llvm::GlobalValue::LinkageTypes Linkage); @@ -108,8 +110,8 @@ class CodeGenPGO { bool canEmitMCDCCoverage(const CGBuilderTy &Builder); public: - void emitCounterIncrement(CGBuilderTy &Builder, const Stmt *S, - llvm::Value *StepV); + void emitCounterSetOrIncrement(CGBuilderTy &Builder, const Stmt *S, + llvm::Value *StepV); void emitMCDCTestVectorBitmapUpdate(CGBuilderTy &Builder, const Expr *S, Address MCDCCondBitmapAddr); void emitMCDCParameters(CGBuilderTy &Builder); diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp index e25a92758f32b..71215da362d3d 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.cpp +++ b/clang/lib/CodeGen/CoverageMappingGen.cpp @@ -31,6 +31,14 @@ // is textually included. #define COVMAP_V3 +namespace llvm { +cl::opt + EnableSingleByteCoverage("enable-single-byte-coverage", + llvm::cl::ZeroOrMore, + llvm::cl::desc("Enable single byte coverage"), + llvm::cl::Hidden, llvm::cl::init(false)); +} // namespace llvm + static llvm::cl::opt EmptyLineCommentCoverage( "emptyline-comment-coverage", llvm::cl::desc("Emit emptylines and comment lines as skipped regions (only " @@ -832,16 +840,22 @@ struct CounterCoverageMappingBuilder /// Return a counter for the subtraction of \c RHS from \c LHS Counter subtractCounters(Counter LHS, Counter RHS, bool Simplify = true) { + assert(!llvm::EnableSingleByteCoverage && + "cannot add counters when single byte coverage mode is enabled"); return Builder.subtract(LHS, RHS, Simplify); } /// Return a counter for the sum of \c LHS and \c RHS. 
Counter addCounters(Counter LHS, Counter RHS, bool Simplify = true) { + assert(!llvm::EnableSingleByteCoverage && + "cannot add counters when single byte coverage mode is enabled"); return Builder.add(LHS, RHS, Simplify); } Counter addCounters(Counter C1, Counter C2, Counter C3, bool Simplify = true) { + assert(!llvm::EnableSingleByteCoverage && + "cannot add counters when single byte coverage mode is enabled"); return addCounters(addCounters(C1, C2, Simplify), C3, Simplify); } @@ -1443,8 +1457,9 @@ struct CounterCoverageMappingBuilder void VisitBreakStmt(const BreakStmt *S) { assert(!BreakContinueStack.empty() && "break not in a loop or switch!"); - BreakContinueStack.back().BreakCount = addCounters( - BreakContinueStack.back().BreakCount, getRegion().getCounter()); + if (!llvm::EnableSingleByteCoverage) + BreakContinueStack.back().BreakCount = addCounters( + BreakContinueStack.back().BreakCount, getRegion().getCounter()); // FIXME: a break in a switch should terminate regions for all preceding // case statements, not just the most recent one. terminateRegion(S); @@ -1452,8 +1467,9 @@ struct CounterCoverageMappingBuilder void VisitContinueStmt(const ContinueStmt *S) { assert(!BreakContinueStack.empty() && "continue stmt not in a loop!"); - BreakContinueStack.back().ContinueCount = addCounters( - BreakContinueStack.back().ContinueCount, getRegion().getCounter()); + if (!llvm::EnableSingleByteCoverage) + BreakContinueStack.back().ContinueCount = addCounters( + BreakContinueStack.back().ContinueCount, getRegion().getCounter()); terminateRegion(S); } @@ -1471,7 +1487,9 @@ struct CounterCoverageMappingBuilder extendRegion(S); Counter ParentCount = getRegion().getCounter(); - Counter BodyCount = getRegionCounter(S); + Counter BodyCount = llvm::EnableSingleByteCoverage + ? getRegionCounter(S->getBody()) + : getRegionCounter(S); // Handle the body first so that we can get the backedge count. 
BreakContinueStack.push_back(BreakContinue()); @@ -1484,7 +1502,9 @@ struct CounterCoverageMappingBuilder // Go back to handle the condition. Counter CondCount = - addCounters(ParentCount, BackedgeCount, BC.ContinueCount); + llvm::EnableSingleByteCoverage + ? getRegionCounter(S->getCond()) + : addCounters(ParentCount, BackedgeCount, BC.ContinueCount); propagateCounts(CondCount, S->getCond()); adjustForOutOfOrderTraversal(getEnd(S)); @@ -1494,7 +1514,11 @@ struct CounterCoverageMappingBuilder fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), BodyCount); Counter OutCount = - addCounters(BC.BreakCount, subtractCounters(CondCount, BodyCount)); + llvm::EnableSingleByteCoverage + ? getRegionCounter(S) + : addCounters(BC.BreakCount, + subtractCounters(CondCount, BodyCount)); + if (OutCount != ParentCount) { pushRegion(OutCount); GapRegionCounter = OutCount; @@ -1503,38 +1527,53 @@ struct CounterCoverageMappingBuilder } // Create Branch Region around condition. - createBranchRegion(S->getCond(), BodyCount, - subtractCounters(CondCount, BodyCount)); + if (!llvm::EnableSingleByteCoverage) + createBranchRegion(S->getCond(), BodyCount, + subtractCounters(CondCount, BodyCount)); } void VisitDoStmt(const DoStmt *S) { extendRegion(S); Counter ParentCount = getRegion().getCounter(); - Counter BodyCount = getRegionCounter(S); + Counter BodyCount = llvm::EnableSingleByteCoverage + ? 
getRegionCounter(S->getBody()) + : getRegionCounter(S); BreakContinueStack.push_back(BreakContinue()); extendRegion(S->getBody()); - Counter BackedgeCount = - propagateCounts(addCounters(ParentCount, BodyCount), S->getBody()); + + Counter BackedgeCount; + if (llvm::EnableSingleByteCoverage) + propagateCounts(BodyCount, S->getBody()); + else + BackedgeCount = + propagateCounts(addCounters(ParentCount, BodyCount), S->getBody()); + BreakContinue BC = BreakContinueStack.pop_back_val(); bool BodyHasTerminateStmt = HasTerminateStmt; HasTerminateStmt = false; - Counter CondCount = addCounters(BackedgeCount, BC.ContinueCount); + Counter CondCount = llvm::EnableSingleByteCoverage + ? getRegionCounter(S->getCond()) + : addCounters(BackedgeCount, BC.ContinueCount); propagateCounts(CondCount, S->getCond()); Counter OutCount = - addCounters(BC.BreakCount, subtractCounters(CondCount, BodyCount)); + llvm::EnableSingleByteCoverage + ? getRegionCounter(S) + : addCounters(BC.BreakCount, + subtractCounters(CondCount, BodyCount)); if (OutCount != ParentCount) { pushRegion(OutCount); GapRegionCounter = OutCount; } // Create Branch Region around condition. - createBranchRegion(S->getCond(), BodyCount, - subtractCounters(CondCount, BodyCount)); + if (!llvm::EnableSingleByteCoverage) + createBranchRegion(S->getCond(), BodyCount, + subtractCounters(CondCount, BodyCount)); if (BodyHasTerminateStmt) HasTerminateStmt = true; @@ -1546,7 +1585,9 @@ struct CounterCoverageMappingBuilder Visit(S->getInit()); Counter ParentCount = getRegion().getCounter(); - Counter BodyCount = getRegionCounter(S); + Counter BodyCount = llvm::EnableSingleByteCoverage + ? getRegionCounter(S->getBody()) + : getRegionCounter(S); // The loop increment may contain a break or continue. if (S->getInc()) @@ -1565,14 +1606,23 @@ struct CounterCoverageMappingBuilder // the count for all the continue statements. 
BreakContinue IncrementBC; if (const Stmt *Inc = S->getInc()) { - propagateCounts(addCounters(BackedgeCount, BodyBC.ContinueCount), Inc); + Counter IncCount; + if (llvm::EnableSingleByteCoverage) + IncCount = getRegionCounter(S->getInc()); + else + IncCount = addCounters(BackedgeCount, BodyBC.ContinueCount); + propagateCounts(IncCount, Inc); IncrementBC = BreakContinueStack.pop_back_val(); } // Go back to handle the condition. - Counter CondCount = addCounters( - addCounters(ParentCount, BackedgeCount, BodyBC.ContinueCount), - IncrementBC.ContinueCount); + Counter CondCount = + llvm::EnableSingleByteCoverage + ? getRegionCounter(S->getCond()) + : addCounters( + addCounters(ParentCount, BackedgeCount, BodyBC.ContinueCount), + IncrementBC.ContinueCount); + if (const Expr *Cond = S->getCond()) { propagateCounts(CondCount, Cond); adjustForOutOfOrderTraversal(getEnd(S)); @@ -1583,8 +1633,11 @@ struct CounterCoverageMappingBuilder if (Gap) fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), BodyCount); - Counter OutCount = addCounters(BodyBC.BreakCount, IncrementBC.BreakCount, - subtractCounters(CondCount, BodyCount)); + Counter OutCount = + llvm::EnableSingleByteCoverage + ? getRegionCounter(S) + : addCounters(BodyBC.BreakCount, IncrementBC.BreakCount, + subtractCounters(CondCount, BodyCount)); if (OutCount != ParentCount) { pushRegion(OutCount); GapRegionCounter = OutCount; @@ -1593,8 +1646,9 @@ struct CounterCoverageMappingBuilder } // Create Branch Region around condition. 
- createBranchRegion(S->getCond(), BodyCount, - subtractCounters(CondCount, BodyCount)); + if (!llvm::EnableSingleByteCoverage) + createBranchRegion(S->getCond(), BodyCount, + subtractCounters(CondCount, BodyCount)); } void VisitCXXForRangeStmt(const CXXForRangeStmt *S) { @@ -1605,7 +1659,9 @@ struct CounterCoverageMappingBuilder Visit(S->getRangeStmt()); Counter ParentCount = getRegion().getCounter(); - Counter BodyCount = getRegionCounter(S); + Counter BodyCount = llvm::EnableSingleByteCoverage + ? getRegionCounter(S->getBody()) + : getRegionCounter(S); BreakContinueStack.push_back(BreakContinue()); extendRegion(S->getBody()); @@ -1620,10 +1676,15 @@ struct CounterCoverageMappingBuilder if (Gap) fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), BodyCount); - Counter LoopCount = - addCounters(ParentCount, BackedgeCount, BC.ContinueCount); - Counter OutCount = - addCounters(BC.BreakCount, subtractCounters(LoopCount, BodyCount)); + Counter OutCount; + Counter LoopCount; + if (llvm::EnableSingleByteCoverage) + OutCount = getRegionCounter(S); + else { + LoopCount = addCounters(ParentCount, BackedgeCount, BC.ContinueCount); + OutCount = + addCounters(BC.BreakCount, subtractCounters(LoopCount, BodyCount)); + } if (OutCount != ParentCount) { pushRegion(OutCount); GapRegionCounter = OutCount; @@ -1632,8 +1693,9 @@ struct CounterCoverageMappingBuilder } // Create Branch Region around condition. 
- createBranchRegion(S->getCond(), BodyCount, - subtractCounters(LoopCount, BodyCount)); + if (!llvm::EnableSingleByteCoverage) + createBranchRegion(S->getCond(), BodyCount, + subtractCounters(LoopCount, BodyCount)); } void VisitObjCForCollectionStmt(const ObjCForCollectionStmt *S) { @@ -1694,7 +1756,7 @@ struct CounterCoverageMappingBuilder propagateCounts(Counter::getZero(), Body); BreakContinue BC = BreakContinueStack.pop_back_val(); - if (!BreakContinueStack.empty()) + if (!BreakContinueStack.empty() && !llvm::EnableSingleByteCoverage) BreakContinueStack.back().ContinueCount = addCounters( BreakContinueStack.back().ContinueCount, BC.ContinueCount); @@ -1709,6 +1771,11 @@ struct CounterCoverageMappingBuilder MostRecentLocation = getStart(S); handleFileExit(ExitLoc); + // When single byte coverage mode is enabled, do not create branch region by + // early returning. + if (llvm::EnableSingleByteCoverage) + return; + // Create a Branch Region around each Case. Subtract the case's // counter from the Parent counter to track the "False" branch count. Counter CaseCountSum; @@ -1741,8 +1808,10 @@ struct CounterCoverageMappingBuilder extendRegion(S); SourceMappingRegion &Parent = getRegion(); + Counter Count = llvm::EnableSingleByteCoverage + ? getRegionCounter(S) + : addCounters(Parent.getCounter(), getRegionCounter(S)); - Counter Count = addCounters(Parent.getCounter(), getRegionCounter(S)); // Reuse the existing region if it starts at our label. This is typical of // the first case in a switch. if (Parent.hasStartLoc() && Parent.getBeginLoc() == getStart(S)) @@ -1860,7 +1929,9 @@ struct CounterCoverageMappingBuilder extendRegion(S->getCond()); Counter ParentCount = getRegion().getCounter(); - Counter ThenCount = getRegionCounter(S); + Counter ThenCount = llvm::EnableSingleByteCoverage + ? 
getRegionCounter(S->getThen()) + : getRegionCounter(S); // Emitting a counter for the condition makes it easier to interpret the // counter for the body when looking at the coverage. @@ -1874,7 +1945,12 @@ struct CounterCoverageMappingBuilder extendRegion(S->getThen()); Counter OutCount = propagateCounts(ThenCount, S->getThen()); - Counter ElseCount = subtractCounters(ParentCount, ThenCount); + + Counter ElseCount; + if (!llvm::EnableSingleByteCoverage) + ElseCount = subtractCounters(ParentCount, ThenCount); + else if (S->getElse()) + ElseCount = getRegionCounter(S->getElse()); if (const Stmt *Else = S->getElse()) { bool ThenHasTerminateStmt = HasTerminateStmt; @@ -1885,21 +1961,28 @@ struct CounterCoverageMappingBuilder if (Gap) fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), ElseCount); extendRegion(Else); - OutCount = addCounters(OutCount, propagateCounts(ElseCount, Else)); + + Counter ElseOutCount = propagateCounts(ElseCount, Else); + if (!llvm::EnableSingleByteCoverage) + OutCount = addCounters(OutCount, ElseOutCount); if (ThenHasTerminateStmt) HasTerminateStmt = true; - } else + } else if (!llvm::EnableSingleByteCoverage) OutCount = addCounters(OutCount, ElseCount); + if (llvm::EnableSingleByteCoverage) + OutCount = getRegionCounter(S); + if (OutCount != ParentCount) { pushRegion(OutCount); GapRegionCounter = OutCount; } - // Create Branch Region around condition. - createBranchRegion(S->getCond(), ThenCount, - subtractCounters(ParentCount, ThenCount)); + if (!S->isConsteval() && !llvm::EnableSingleByteCoverage) + // Create Branch Region around condition. + createBranchRegion(S->getCond(), ThenCount, + subtractCounters(ParentCount, ThenCount)); } void VisitCXXTryStmt(const CXXTryStmt *S) { @@ -1925,7 +2008,9 @@ struct CounterCoverageMappingBuilder extendRegion(E); Counter ParentCount = getRegion().getCounter(); - Counter TrueCount = getRegionCounter(E); + Counter TrueCount = llvm::EnableSingleByteCoverage + ? 
getRegionCounter(E->getTrueExpr()) + : getRegionCounter(E); propagateCounts(ParentCount, E->getCond()); Counter OutCount; @@ -1944,9 +2029,15 @@ struct CounterCoverageMappingBuilder } extendRegion(E->getFalseExpr()); - OutCount = addCounters( - OutCount, propagateCounts(subtractCounters(ParentCount, TrueCount), - E->getFalseExpr())); + Counter FalseCount = llvm::EnableSingleByteCoverage + ? getRegionCounter(E->getFalseExpr()) + : subtractCounters(ParentCount, TrueCount); + + Counter FalseOutCount = propagateCounts(FalseCount, E->getFalseExpr()); + if (llvm::EnableSingleByteCoverage) + OutCount = getRegionCounter(E); + else + OutCount = addCounters(OutCount, FalseOutCount); if (OutCount != ParentCount) { pushRegion(OutCount); @@ -1954,8 +2045,9 @@ struct CounterCoverageMappingBuilder } // Create Branch Region around condition. - createBranchRegion(E->getCond(), TrueCount, - subtractCounters(ParentCount, TrueCount)); + if (!llvm::EnableSingleByteCoverage) + createBranchRegion(E->getCond(), TrueCount, + subtractCounters(ParentCount, TrueCount)); } void createDecision(const BinaryOperator *E) { @@ -2002,12 +2094,14 @@ struct CounterCoverageMappingBuilder Counter ParentCnt = getRegion().getCounter(); // Create Branch Region around LHS condition. - createBranchRegion(E->getLHS(), RHSExecCnt, - subtractCounters(ParentCnt, RHSExecCnt), DecisionLHS); + if (!llvm::EnableSingleByteCoverage) + createBranchRegion(E->getLHS(), RHSExecCnt, + subtractCounters(ParentCnt, RHSExecCnt), DecisionLHS); // Create Branch Region around RHS condition. - createBranchRegion(E->getRHS(), RHSTrueCnt, - subtractCounters(RHSExecCnt, RHSTrueCnt), DecisionRHS); + if (!llvm::EnableSingleByteCoverage) + createBranchRegion(E->getRHS(), RHSTrueCnt, + subtractCounters(RHSExecCnt, RHSTrueCnt), DecisionRHS); // Create MCDC Decision Region if at top-level (root). 
if (IsRootNode) @@ -2058,12 +2152,14 @@ struct CounterCoverageMappingBuilder Counter ParentCnt = getRegion().getCounter(); // Create Branch Region around LHS condition. - createBranchRegion(E->getLHS(), subtractCounters(ParentCnt, RHSExecCnt), - RHSExecCnt, DecisionLHS); + if (!llvm::EnableSingleByteCoverage) + createBranchRegion(E->getLHS(), subtractCounters(ParentCnt, RHSExecCnt), + RHSExecCnt, DecisionLHS); // Create Branch Region around RHS condition. - createBranchRegion(E->getRHS(), subtractCounters(RHSExecCnt, RHSFalseCnt), - RHSFalseCnt, DecisionRHS); + if (!llvm::EnableSingleByteCoverage) + createBranchRegion(E->getRHS(), subtractCounters(RHSExecCnt, RHSFalseCnt), + RHSFalseCnt, DecisionRHS); // Create MCDC Decision Region if at top-level (root). if (IsRootNode) diff --git a/clang/test/CoverageMapping/single-byte-counters.cpp b/clang/test/CoverageMapping/single-byte-counters.cpp new file mode 100644 index 0000000000000..8e9b613dcc68f --- /dev/null +++ b/clang/test/CoverageMapping/single-byte-counters.cpp @@ -0,0 +1,169 @@ +// RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -mllvm -enable-single-byte-coverage=true -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name single-byte-counters.cpp %s | FileCheck %s + +// CHECK: testIf +int testIf(int x) { // CHECK-NEXT: File 0, [[@LINE]]:19 -> [[@LINE+10]]:2 = #0 + // CHECK-NEXT: File 0, [[@LINE+5]]:7 -> [[@LINE+5]]:13 = #0 + // CHECK-NEXT: Gap,File 0, [[@LINE+4]]:14 -> [[@LINE+5]]:5 = #1 + // CHECK-NEXT: File 0, [[@LINE+4]]:5 -> [[@LINE+4]]:16 = #1 + // CHECK-NEXT: File 0, [[@LINE+5]]:3 -> [[@LINE+5]]:16 = #2 + int result = 0; + if (x == 0) + result = -1; + + return result; +} + +// CHECK-NEXT: testIfElse +int testIfElse(int x) { // CHECK-NEXT: File 0, [[@LINE]]:23 -> [[@LINE+13]]:2 = #0 + // CHECK-NEXT: File 0, [[@LINE+7]]:7 -> [[@LINE+7]]:12 = #0 + // CHECK-NEXT: Gap,File 0, [[@LINE+6]]:13 -> [[@LINE+7]]:5 = #1 + // CHECK-NEXT: File 0, [[@LINE+6]]:5 -> 
[[@LINE+6]]:15 = #1 + // CHECK-NEXT: Gap,File 0, [[@LINE+5]]:16 -> [[@LINE+7]]:5 = #2 + // CHECK-NEXT: File 0, [[@LINE+6]]:5 -> [[@LINE+6]]:19 = #2 + // CHECK-NEXT: File 0, [[@LINE+6]]:3 -> [[@LINE+6]]:16 = #3 + int result = 0; + if (x < 0) + result = 0; + else + result = x * x; + return result; +} + +// CHECK-NEXT: testIfElseReturn +int testIfElseReturn(int x) { // CHECK-NEXT: File 0, [[@LINE]]:29 -> [[@LINE+14]]:2 = #0 + // CHECK-NEXT: File 0, [[@LINE+8]]:7 -> [[@LINE+8]]:12 = #0 + // CHECK-NEXT: Gap,File 0, [[@LINE+7]]:13 -> [[@LINE+8]]:5 = #1 + // CHECK-NEXT: File 0, [[@LINE+7]]:5 -> [[@LINE+7]]:19 = #1 + // CHECK-NEXT: Gap,File 0, [[@LINE+6]]:20 -> [[@LINE+8]]:5 = #2 + // CHECK-NEXT: File 0, [[@LINE+7]]:5 -> [[@LINE+7]]:13 = #2 + // CHECK-NEXT: Gap,File 0, [[@LINE+6]]:14 -> [[@LINE+7]]:3 = #3 + // CHECK-NEXT: File 0, [[@LINE+6]]:3 -> [[@LINE+6]]:16 = #3 + int result = 0; + if (x > 0) + result = x * x; + else + return 0; + return result; +} + +// CHECK-NEXT: testSwitch +int testSwitch(int x) { // CHECK-NEXT: File 0, [[@LINE]]:23 -> [[@LINE+22]]:2 = #0 + // CHECK-NEXT: Gap,File 0, [[@LINE+9]]:14 -> [[@LINE+17]]:15 = 0 + // CHECK-NEXT: File 0, [[@LINE+9]]:3 -> [[@LINE+11]]:10 = #2 + // CHECK-NEXT: Gap,File 0, [[@LINE+10]]:11 -> [[@LINE+11]]:3 = 0 + // CHECK-NEXT: File 0, [[@LINE+10]]:3 -> [[@LINE+12]]:10 = #3 + // CHECK-NEXT: Gap,File 0, [[@LINE+11]]:11 -> [[@LINE+12]]:3 = 0 + // CHECK-NEXT: File 0, [[@LINE+11]]:3 -> [[@LINE+12]]:15 = #4 + // CHECK-NEXT: Gap,File 0, [[@LINE+12]]:4 -> [[@LINE+14]]:3 = #1 + // CHECK-NEXT: File 0, [[@LINE+13]]:3 -> [[@LINE+13]]:16 = #1 + int result; + switch (x) { + case 1: + result = 1; + break; + case 2: + result = 2; + break; + default: + result = 0; + } + + return result; +} + +// CHECK-NEXT: testWhile +int testWhile() { // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+13]]:2 = #0 + // CHECK-NEXT: File 0, [[@LINE+6]]:10 -> [[@LINE+6]]:16 = #1 + // CHECK-NEXT: Gap,File 0, [[@LINE+5]]:17 -> [[@LINE+5]]:18 = #2 + // CHECK-NEXT: File 
0, [[@LINE+4]]:18 -> [[@LINE+7]]:4 = #2 + // CHECK-NEXT: File 0, [[@LINE+8]]:3 -> [[@LINE+8]]:13 = #3 + int i = 0; + int sum = 0; + while (i < 10) { + sum += i; + i++; + } + + return sum; +} + +// CHECK-NEXT: testContinue +int testContinue() { // CHECK-NEXT: File 0, [[@LINE]]:20 -> [[@LINE+21]]:2 = #0 + // CHECK-NEXT: File 0, [[@LINE+12]]:10 -> [[@LINE+12]]:16 = #1 + // CHECK-NEXT: Gap,File 0, [[@LINE+11]]:17 -> [[@LINE+11]]:18 = #2 + // CHECK-NEXT: File 0, [[@LINE+10]]:18 -> [[@LINE+15]]:4 = #2 + // CHECK-NEXT: File 0, [[@LINE+10]]:9 -> [[@LINE+10]]:15 = #2 + // CHECK-NEXT: Gap,File 0, [[@LINE+9]]:16 -> [[@LINE+10]]:7 = #4 + // CHECK-NEXT: File 0, [[@LINE+9]]:7 -> [[@LINE+9]]:15 = #4 + // CHECK-NEXT: Gap,File 0, [[@LINE+8]]:16 -> [[@LINE+9]]:5 = #5 + // CHECK-NEXT: File 0, [[@LINE+8]]:5 -> [[@LINE+10]]:4 = #5 + // CHECK-NEXT: Gap,File 0, [[@LINE+9]]:4 -> [[@LINE+11]]:3 = #3 + // CHECK-NEXT: File 0, [[@LINE+10]]:3 -> [[@LINE+10]]:13 = #3 + int i = 0; + int sum = 0; + while (i < 10) { + if (i == 4) + continue; + sum += i; + i++; + } + + return sum; +} + +// CHECK-NEXT: testFor +int testFor() { // CHECK-NEXT: File 0, [[@LINE]]:15 -> [[@LINE+13]]:2 = #0 + // CHECK-NEXT: File 0, [[@LINE+7]]:19 -> [[@LINE+7]]:25 = #1 + // CHECK-NEXT: File 0, [[@LINE+6]]:27 -> [[@LINE+6]]:30 = #2 + // CHECK-NEXT: Gap,File 0, [[@LINE+5]]:31 -> [[@LINE+5]]:32 = #3 + // CHECK-NEXT: File 0, [[@LINE+4]]:32 -> [[@LINE+6]]:4 = #3 + // CHECK-NEXT: File 0, [[@LINE+7]]:3 -> [[@LINE+7]]:13 = #4 + int i; + int sum = 0; + for (int i = 0; i < 10; i++) { + sum += i; + } + + return sum; +} + +// CHECK-NEXT: testForRange +int testForRange() { // CHECK-NEXT: File 0, [[@LINE]]:20 -> [[@LINE+12]]:2 = #0 + // CHECK-NEXT: Gap,File 0, [[@LINE+6]]:28 -> [[@LINE+6]]:29 = #1 + // CHECK-NEXT: File 0, [[@LINE+5]]:29 -> [[@LINE+7]]:4 = #1 + // CHECK-NEXT: File 0, [[@LINE+8]]:3 -> [[@LINE+8]]:13 = #2 + int sum = 0; + int array[] = {1, 2, 3, 4, 5}; + + for (int element : array) { + sum += element; + } + + return sum; 
+} + +// CHECK-NEXT: testDo +int testDo() { // CHECK-NEXT: File 0, [[@LINE]]:14 -> [[@LINE+12]]:2 = #0 + // CHECK-NEXT: File 0, [[@LINE+5]]:6 -> [[@LINE+8]]:4 = #1 + // CHECK-NEXT: File 0, [[@LINE+7]]:12 -> [[@LINE+7]]:17 = #2 + // CHECK-NEXT: File 0, [[@LINE+8]]:3 -> [[@LINE+8]]:13 = #3 + int i = 0; + int sum = 0; + do { + sum += i; + i++; + } while (i < 5); + + return sum; +} + +// CHECK-NEXT: testConditional +int testConditional(int x) { // CHECK-NEXT: File 0, [[@LINE]]:28 -> [[@LINE+8]]:2 = #0 + // CHECK-NEXT: File 0, [[@LINE+5]]:15 -> [[@LINE+5]]:22 = #0 + // CHECK-NEXT: Gap,File 0, [[@LINE+4]]:24 -> [[@LINE+4]]:25 = #2 + // CHECK-NEXT: File 0, [[@LINE+3]]:25 -> [[@LINE+3]]:26 = #2 + // CHECK-NEXT: File 0, [[@LINE+2]]:29 -> [[@LINE+2]]:31 = #3 + // CHECK-NEXT: File 0, [[@LINE+2]]:2 -> [[@LINE+2]]:15 = #1 + int result = (x > 0) ? 1 : -1; + return result; +} diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index 0123908336918..705b54fddcd7b 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -294,7 +294,8 @@ COMPILER_RT_VISIBILITY extern int INSTR_PROF_PROFILE_RUNTIME_VAR; * variable is defined as weak so that compiler can emit an overriding * definition depending on user option. */ -extern uint64_t INSTR_PROF_RAW_VERSION_VAR; /* __llvm_profile_raw_version */ +COMPILER_RT_VISIBILITY extern uint64_t + INSTR_PROF_RAW_VERSION_VAR; /* __llvm_profile_raw_version */ /*! * This variable is a weak symbol defined in InstrProfiling.c. 
It allows diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h index b39f262c3d976..f8f549eea2e09 100644 --- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -359,15 +359,19 @@ struct CountedRegion : public CounterMappingRegion { uint64_t ExecutionCount; uint64_t FalseExecutionCount; bool Folded; + bool HasSingleByteCoverage; - CountedRegion(const CounterMappingRegion &R, uint64_t ExecutionCount) + CountedRegion(const CounterMappingRegion &R, uint64_t ExecutionCount, + bool HasSingleByteCoverage) : CounterMappingRegion(R), ExecutionCount(ExecutionCount), - FalseExecutionCount(0), Folded(false) {} + FalseExecutionCount(0), Folded(false), + HasSingleByteCoverage(HasSingleByteCoverage) {} CountedRegion(const CounterMappingRegion &R, uint64_t ExecutionCount, - uint64_t FalseExecutionCount) + uint64_t FalseExecutionCount, bool HasSingleByteCoverage) : CounterMappingRegion(R), ExecutionCount(ExecutionCount), - FalseExecutionCount(FalseExecutionCount), Folded(false) {} + FalseExecutionCount(FalseExecutionCount), Folded(false), + HasSingleByteCoverage(HasSingleByteCoverage) {} }; /// MCDC Record grouping all information together. @@ -661,10 +665,11 @@ struct FunctionRecord { } void pushRegion(CounterMappingRegion Region, uint64_t Count, - uint64_t FalseCount) { + uint64_t FalseCount, bool HasSingleByteCoverage) { if (Region.Kind == CounterMappingRegion::BranchRegion || Region.Kind == CounterMappingRegion::MCDCBranchRegion) { - CountedBranchRegions.emplace_back(Region, Count, FalseCount); + CountedBranchRegions.emplace_back(Region, Count, FalseCount, + HasSingleByteCoverage); // If both counters are hard-coded to zero, then this region represents a // constant-folded branch. 
if (Region.Count.isZero() && Region.FalseCount.isZero()) @@ -673,7 +678,8 @@ struct FunctionRecord { } if (CountedRegions.empty()) ExecutionCount = Count; - CountedRegions.emplace_back(Region, Count, FalseCount); + CountedRegions.emplace_back(Region, Count, FalseCount, + HasSingleByteCoverage); } }; diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h index 047b14f223bd9..7a806fd7fcf34 100644 --- a/llvm/include/llvm/ProfileData/InstrProfWriter.h +++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h @@ -168,6 +168,10 @@ class InstrProfWriter { InstrProfKind getProfileKind() const { return ProfileKind; } + bool hasSingleByteCoverage() const { + return static_cast(ProfileKind & InstrProfKind::SingleByteCoverage); + } + // Internal interface for testing purpose only. void setValueProfDataEndianness(llvm::endianness Endianness); void setOutputSparse(bool Sparse); diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index 334f5dce879e5..3f892d7aff2f4 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -860,7 +860,8 @@ Error CoverageMapping::loadFunctionRecord( consumeError(std::move(E)); return Error::success(); } - Function.pushRegion(Region, *ExecutionCount, *AltExecutionCount); + Function.pushRegion(Region, *ExecutionCount, *AltExecutionCount, + ProfileReader.hasSingleByteCoverage()); // Record ExpansionRegion. if (Region.Kind == CounterMappingRegion::ExpansionRegion) { @@ -1282,8 +1283,14 @@ class SegmentBuilder { // value for that area. // We add counts of the regions of the same kind as the active region // to handle the both situations. 
- if (I->Kind == Active->Kind) - Active->ExecutionCount += I->ExecutionCount; + if (I->Kind == Active->Kind) { + assert(I->HasSingleByteCoverage == Active->HasSingleByteCoverage && + "Regions are generated in different coverage modes"); + if (I->HasSingleByteCoverage) + Active->ExecutionCount = Active->ExecutionCount || I->ExecutionCount; + else + Active->ExecutionCount += I->ExecutionCount; + } } return Regions.drop_back(std::distance(++Active, End)); } From 3d2a918831e7bcf1285641ee446ac1640819819f Mon Sep 17 00:00:00 2001 From: PiJoules <6019989+PiJoules@users.noreply.github.com> Date: Mon, 26 Feb 2024 14:47:16 -0800 Subject: [PATCH 385/546] [clang] Fixes inf loop parsing fixed point literal (#83071) Clang was incorrectly finding the start of the exponent in a fixed point hex literal. It would unconditionally find the first `e/E/p/P` in a constant regardless of if it were hex or not and parser the remaining digits as an APInt. In a debug build, this would be caught by an assertion, but in a release build, the assertion is removed and we'd end up in an infinite loop. 
Fixes #83050 --- clang/lib/Lex/LiteralSupport.cpp | 9 ++++++--- clang/test/Frontend/fixed_point_declarations.c | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 571a984884029..438c6d772e6e0 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -1513,8 +1513,10 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) { : APFloat::opInvalidOp; } -static inline bool IsExponentPart(char c) { - return c == 'p' || c == 'P' || c == 'e' || c == 'E'; +static inline bool IsExponentPart(char c, bool isHex) { + if (isHex) + return c == 'p' || c == 'P'; + return c == 'e' || c == 'E'; } bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) { @@ -1533,7 +1535,8 @@ bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Sc if (saw_exponent) { const char *Ptr = DigitsBegin; - while (!IsExponentPart(*Ptr)) ++Ptr; + while (!IsExponentPart(*Ptr, radix == 16)) + ++Ptr; ExponentBegin = Ptr; ++Ptr; NegativeExponent = *Ptr == '-'; diff --git a/clang/test/Frontend/fixed_point_declarations.c b/clang/test/Frontend/fixed_point_declarations.c index 04ed25d227ca5..c804576744273 100644 --- a/clang/test/Frontend/fixed_point_declarations.c +++ b/clang/test/Frontend/fixed_point_declarations.c @@ -125,3 +125,21 @@ long _Fract long_fract_zero = 0.0lr; // CHECK-DAG: @long_fract_ unsigned short _Fract u_short_fract_zero = 0.0uhr; // CHECK-DAG: @u_short_fract_zero = {{.*}}global i8 0, align 1 unsigned _Fract u_fract_zero = 0.0ur; // CHECK-DAG: @u_fract_zero = {{.*}}global i16 0, align 2 unsigned long _Fract u_long_fract_zero = 0.0ulr; // CHECK-DAG: @u_long_fract_zero = {{.*}}global i32 0, align 4 + +// Hex exponent suffix with E in hex digit sequence. 
+unsigned short _Fract e1 = 0x1.e8p-1uhr; // CHECK-DAG: @e1 = {{.*}}global i8 -12 +unsigned short _Fract e2 = 0x1.8ep-1uhr; // CHECK-DAG: @e2 = {{.*}}global i8 -57 +unsigned short _Fract e3 = 0x1.ep-1uhr; // CHECK-DAG: @e3 = {{.*}}global i8 -16 +unsigned _Accum e4 = 0xep-1uk; // CHECK-DAG: @e4 = {{.*}}global i32 458752 +unsigned _Accum e5 = 0xe.1p-1uk; // CHECK-DAG: @e5 = {{.*}}global i32 460800 +unsigned _Accum e6 = 0xe.ep-1uk; // CHECK-DAG: @e6 = {{.*}}global i32 487424 +unsigned _Accum e7 = 0xe.e8p-1uk; // CHECK-DAG: @e7 = {{.*}}global i32 488448 +unsigned _Accum e8 = 0xe.8ep-1uk; // CHECK-DAG: @e8 = {{.*}}global i32 476928 +unsigned short _Fract E1 = 0x1.E8p-1uhr; // CHECK-DAG: @E1 = {{.*}}global i8 -12 +unsigned short _Fract E2 = 0x1.8Ep-1uhr; // CHECK-DAG: @E2 = {{.*}}global i8 -57 +unsigned short _Fract E3 = 0x1.Ep-1uhr; // CHECK-DAG: @E3 = {{.*}}global i8 -16 +unsigned _Accum E4 = 0xEp-1uk; // CHECK-DAG: @E4 = {{.*}}global i32 458752 +unsigned _Accum E5 = 0xE.1p-1uk; // CHECK-DAG: @E5 = {{.*}}global i32 460800 +unsigned _Accum E6 = 0xE.Ep-1uk; // CHECK-DAG: @E6 = {{.*}}global i32 487424 +unsigned _Accum E7 = 0xE.E8p-1uk; // CHECK-DAG: @E7 = {{.*}}global i32 488448 +unsigned _Accum E8 = 0xE.8Ep-1uk; // CHECK-DAG: @E8 = {{.*}}global i32 476928 From 54cff50791dec977feb0badb74919d97dff5b859 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 26 Feb 2024 22:50:49 +0000 Subject: [PATCH 386/546] [TBAA] Add !tbaa.struct test with unnamed bitfields. Extra tests with unnamed bitfields for https://github.com/llvm/llvm-project/pull/82922. 
--- clang/test/CodeGen/tbaa-struct.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/clang/test/CodeGen/tbaa-struct.cpp b/clang/test/CodeGen/tbaa-struct.cpp index e25fbc1a77810..28c7d396121af 100644 --- a/clang/test/CodeGen/tbaa-struct.cpp +++ b/clang/test/CodeGen/tbaa-struct.cpp @@ -134,6 +134,23 @@ void copy9(NamedBitfields2 *a1, NamedBitfields2 *a2) { *a1 = *a2; } +// Test with unnamed bitfield at the start and in between named ones.. +struct NamedBitfields3 { + unsigned : 11; + signed f0 : 9; + char : 2; + int f1 : 2; + double f2; +}; + +void copy10(NamedBitfields3 *a1, NamedBitfields3 *a2) { +// CHECK-LABEL: _Z6copy10P15NamedBitfields3S0_ +// CHECK: tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) %a1, ptr noundef nonnull align 8 dereferenceable(16) %a2, i64 16, i1 false), +// CHECK-OLD-SAME: !tbaa.struct [[TS8:!.*]] +// CHECK-NEW-SAME: !tbaa [[TAG_NamedBitfields3:!.+]], !tbaa.struct + *a1 = *a2; +} + // CHECK-OLD: [[TS]] = !{i64 0, i64 2, !{{.*}}, i64 4, i64 4, !{{.*}}, i64 8, i64 1, !{{.*}}, i64 12, i64 4, !{{.*}}} // CHECK-OLD: [[CHAR:!.*]] = !{!"omnipotent char", !{{.*}}} // CHECK-OLD: [[TAG_INT:!.*]] = !{[[INT:!.*]], [[INT]], i64 0} @@ -149,6 +166,7 @@ void copy9(NamedBitfields2 *a1, NamedBitfields2 *a2) { // CHECK-OLD: [[TAG_DOUBLE]] = !{[[DOUBLE:!.+]], [[DOUBLE]], i64 0} // CHECK-OLD [[DOUBLE]] = !{!"double", [[CHAR]], i64 0} // CHECK-OLD: [[TS7]] = !{i64 0, i64 1, [[TAG_CHAR]], i64 1, i64 1, [[TAG_CHAR]], i64 2, i64 1, [[TAG_CHAR]], i64 3, i64 4, [[TAG_INT]], i64 3, i64 4, [[TAG_INT]], i64 4, i64 1, [[TAG_CHAR]], i64 8, i64 8, [[TAG_DOUBLE]], i64 16, i64 4, [[TAG_INT]]} +// CHECK-OLD: [[TS8]] = !{i64 1, i64 4, [[TAG_INT]], i64 2, i64 4, [[TAG_INT]], i64 8, i64 8, [[TAG_DOUBLE]]} // CHECK-NEW-DAG: [[TYPE_char:!.*]] = !{{{.*}}, i64 1, !"omnipotent char"} // CHECK-NEW-DAG: [[TAG_char]] = !{[[TYPE_char]], [[TYPE_char]], i64 0, i64 0} @@ -168,3 +186,5 @@ void copy9(NamedBitfields2 *a1, 
NamedBitfields2 *a2) { // CHECK-NEW-DAG: [[TYPE_double]] = !{[[TYPE_char]], i64 8, !"double"} // CHECK-NEW-DAG: [[TAG_NamedBitfields2]] = !{[[TYPE_NamedBitfields2:!.+]], [[TYPE_NamedBitfields2]], i64 0, i64 24} // CHECK-NEW-DAG: [[TYPE_NamedBitfields2]] = !{[[TYPE_char]], i64 24, !"_ZTS15NamedBitfields2", [[TYPE_char]], i64 0, i64 1, [[TYPE_char]], i64 1, i64 1, [[TYPE_char]], i64 2, i64 1, [[TYPE_int]], i64 3, i64 4, [[TYPE_int]], i64 3, i64 4, [[TYPE_char]], i64 4, i64 1, [[TYPE_double]], i64 8, i64 8, [[TYPE_int]], i64 16, i64 4} +// CHECK-NEW-DAG: [[TAG_NamedBitfields3]] = !{[[TYPE_NamedBitfields3:!.+]], [[TYPE_NamedBitfields3]], i64 0, i64 16} +// CHECK-NEW-DAG: [[TYPE_NamedBitfields3]] = !{[[TYPE_char]], i64 16, !"_ZTS15NamedBitfields3", [[TYPE_int]], i64 1, i64 4, [[TYPE_int]], i64 2, i64 4, [[TYPE_double]], i64 8, i64 8} From cda413087c59ed5da46ca83e8a2e07c0ebd2e3f9 Mon Sep 17 00:00:00 2001 From: Fabio D'Urso Date: Tue, 27 Feb 2024 00:00:20 +0100 Subject: [PATCH 387/546] [scudo] Do not unmap the memory containing the this object in unmapRingBuffer (#83034) --- compiler-rt/lib/scudo/standalone/combined.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index cd5a07be1576e..fa6077384d982 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -1610,8 +1610,12 @@ class Allocator { // is very important. RB->RawStackDepotMap.unmap(RB->RawStackDepotMap.getBase(), RB->RawStackDepotMap.getCapacity()); - RB->RawRingBufferMap.unmap(RB->RawRingBufferMap.getBase(), - RB->RawRingBufferMap.getCapacity()); + // Note that the `RB->RawRingBufferMap` is stored on the pages managed by + // itself. Take over the ownership before calling unmap() so that any + // operation along with unmap() won't touch inaccessible pages. 
+ MemMapT RawRingBufferMap = RB->RawRingBufferMap; + RawRingBufferMap.unmap(RawRingBufferMap.getBase(), + RawRingBufferMap.getCapacity()); atomic_store(&RingBufferAddress, 0, memory_order_release); } From 0339ce06c1813cebf03aea931256901101e28ba5 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Mon, 26 Feb 2024 15:00:46 -0800 Subject: [PATCH 388/546] [NFC][flang] Removed unused constexpr var. --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 0344c6ba6cb3b..c84fb27cb38da 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -920,8 +920,6 @@ constexpr auto FuncTypeReal16Real16Real16 = genFuncType, Ty::Real<16>, Ty::Real<16>>; constexpr auto FuncTypeReal16Integer4Real16 = genFuncType, Ty::Integer<4>, Ty::Real<16>>; -constexpr auto FuncTypeReal16Integer8Real16 = - genFuncType, Ty::Integer<8>, Ty::Real<16>>; constexpr auto FuncTypeInteger4Real16 = genFuncType, Ty::Real<16>>; constexpr auto FuncTypeInteger8Real16 = From dc5dfc102ffc3b870f7565fb4a90d53b31ec92f8 Mon Sep 17 00:00:00 2001 From: nikitalita <69168929+nikitalita@users.noreply.github.com> Date: Mon, 26 Feb 2024 15:05:02 -0800 Subject: [PATCH 389/546] [lldb] python-bindings: fix `SBTarget.get_target_watchpoints()` (#82295) Fixes erroneous usage of `bkpts` instead of `watchpoints` (probably introduced from copying and pasting `get_target_bkpts()`). 
--- lldb/bindings/interface/SBTargetExtensions.i | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/bindings/interface/SBTargetExtensions.i b/lldb/bindings/interface/SBTargetExtensions.i index c80dadfc0c5ca..d756a351a810a 100644 --- a/lldb/bindings/interface/SBTargetExtensions.i +++ b/lldb/bindings/interface/SBTargetExtensions.i @@ -172,7 +172,7 @@ STRING_EXTENSION_LEVEL_OUTSIDE(SBTarget, lldb::eDescriptionLevelBrief) '''An accessor function that returns a list() that contains all watchpoints in a lldb.SBtarget object.''' watchpoints = [] for idx in range(self.GetNumWatchpoints()): - bkpts.append(self.GetWatchpointAtIndex(idx)) + watchpoints.append(self.GetWatchpointAtIndex(idx)) return watchpoints modules = property(get_modules_array, None, doc='''A read only property that returns a list() of lldb.SBModule objects contained in this target. This list is a list all modules that the target currently is tracking (the main executable and all dependent shared libraries).''') From 056d62be38c5db3d8332ac300c4ff29214126697 Mon Sep 17 00:00:00 2001 From: ChiaHungDuan Date: Mon, 26 Feb 2024 15:21:32 -0800 Subject: [PATCH 390/546] Revert "[scudo] Store more blocks in each TransferBatch" (#83078) Reverts llvm/llvm-project#70390 There's a bug caught by `ScudoCombinedTestReallocateInPlaceStress_DefaultConfig.ReallocateInPlaceStress` with gwp asan. 
It's an easy fix but given that this is a major change, I would like to revert it first --- .../lib/scudo/standalone/allocator_common.h | 7 - compiler-rt/lib/scudo/standalone/primary32.h | 106 ++++++------ compiler-rt/lib/scudo/standalone/primary64.h | 153 ++++++++---------- .../scudo/standalone/tests/primary_test.cpp | 34 ++-- 4 files changed, 141 insertions(+), 159 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/allocator_common.h b/compiler-rt/lib/scudo/standalone/allocator_common.h index 46dc7c0f3b914..95f4776ac596d 100644 --- a/compiler-rt/lib/scudo/standalone/allocator_common.h +++ b/compiler-rt/lib/scudo/standalone/allocator_common.h @@ -40,7 +40,6 @@ template struct TransferBatch { B->Count = static_cast(B->Count - N); } void clear() { Count = 0; } - bool empty() { return Count == 0; } void add(CompactPtrT P) { DCHECK_LT(Count, MaxNumCached); Batch[Count++] = P; @@ -49,12 +48,6 @@ template struct TransferBatch { memcpy(Array, Batch, sizeof(Batch[0]) * Count); clear(); } - - void moveNToArray(CompactPtrT *Array, u16 N) { - DCHECK_LE(N, Count); - memcpy(Array, Batch + Count - N, sizeof(Batch[0]) * Count); - Count -= N; - } u16 getCount() const { return Count; } bool isEmpty() const { return Count == 0U; } CompactPtrT get(u16 I) const { diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h index c86e75b8fd66a..4d03b282d000d 100644 --- a/compiler-rt/lib/scudo/standalone/primary32.h +++ b/compiler-rt/lib/scudo/standalone/primary32.h @@ -191,21 +191,38 @@ template class SizeClassAllocator32 { return BlockSize > PageSize; } + // Note that the `MaxBlockCount` will be used when we support arbitrary blocks + // count. Now it's the same as the number of blocks stored in the + // `TransferBatch`. 
u16 popBlocks(CacheT *C, uptr ClassId, CompactPtrT *ToArray, - const u16 MaxBlockCount) { + UNUSED const u16 MaxBlockCount) { + TransferBatchT *B = popBatch(C, ClassId); + if (!B) + return 0; + + const u16 Count = B->getCount(); + DCHECK_GT(Count, 0U); + B->moveToArray(ToArray); + + if (ClassId != SizeClassMap::BatchClassId) + C->deallocate(SizeClassMap::BatchClassId, B); + + return Count; + } + + TransferBatchT *popBatch(CacheT *C, uptr ClassId) { DCHECK_LT(ClassId, NumClasses); SizeClassInfo *Sci = getSizeClassInfo(ClassId); ScopedLock L(Sci->Mutex); - - u16 PopCount = popBlocksImpl(C, ClassId, Sci, ToArray, MaxBlockCount); - if (UNLIKELY(PopCount == 0)) { + TransferBatchT *B = popBatchImpl(C, ClassId, Sci); + if (UNLIKELY(!B)) { if (UNLIKELY(!populateFreeList(C, ClassId, Sci))) - return 0U; - PopCount = popBlocksImpl(C, ClassId, Sci, ToArray, MaxBlockCount); - DCHECK_NE(PopCount, 0U); + return nullptr; + B = popBatchImpl(C, ClassId, Sci); + // if `populateFreeList` succeeded, we are supposed to get free blocks. + DCHECK_NE(B, nullptr); } - - return PopCount; + return B; } // Push the array of free blocks to the designated batch group. @@ -493,7 +510,7 @@ template class SizeClassAllocator32 { // by TransferBatch is also free for use. We don't need to recycle the // TransferBatch. Note that the correctness is maintained by the invariant, // - // Each popBlocks() request returns the entire TransferBatch. Returning + // The unit of each popBatch() request is entire TransferBatch. Return // part of the blocks in a TransferBatch is invalid. 
// // This ensures that TransferBatch won't leak the address itself while it's @@ -617,7 +634,7 @@ template class SizeClassAllocator32 { BG->Batches.push_front(TB); BG->PushedBlocks = 0; BG->BytesInBGAtLastCheckpoint = 0; - BG->MaxCachedPerBatch = TransferBatchT::MaxNumCached; + BG->MaxCachedPerBatch = CacheT::getMaxCached(getSizeByClassId(ClassId)); return BG; }; @@ -709,11 +726,14 @@ template class SizeClassAllocator32 { InsertBlocks(Cur, Array + Size - Count, Count); } - u16 popBlocksImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci, - CompactPtrT *ToArray, const u16 MaxBlockCount) + // Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest + // group id will be considered first. + // + // The region mutex needs to be held while calling this method. + TransferBatchT *popBatchImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci) REQUIRES(Sci->Mutex) { if (Sci->FreeListInfo.BlockList.empty()) - return 0U; + return nullptr; SinglyLinkedList &Batches = Sci->FreeListInfo.BlockList.front()->Batches; @@ -726,57 +746,33 @@ template class SizeClassAllocator32 { // Block used by `BatchGroup` is from BatchClassId. Turn the block into // `TransferBatch` with single block. TransferBatchT *TB = reinterpret_cast(BG); - ToArray[0] = - compactPtr(SizeClassMap::BatchClassId, reinterpret_cast(TB)); + TB->clear(); + TB->add( + compactPtr(SizeClassMap::BatchClassId, reinterpret_cast(TB))); Sci->FreeListInfo.PoppedBlocks += 1; - return 1U; + return TB; } - // So far, instead of always filling the blocks to `MaxBlockCount`, we only - // examine single `TransferBatch` to minimize the time spent on the primary - // allocator. Besides, the sizes of `TransferBatch` and - // `CacheT::getMaxCached()` may also impact the time spent on accessing the - // primary allocator. - // TODO(chiahungduan): Evaluate if we want to always prepare `MaxBlockCount` - // blocks and/or adjust the size of `TransferBatch` according to - // `CacheT::getMaxCached()`. 
TransferBatchT *B = Batches.front(); + Batches.pop_front(); DCHECK_NE(B, nullptr); DCHECK_GT(B->getCount(), 0U); - // BachClassId should always take all blocks in the TransferBatch. Read the - // comment in `pushBatchClassBlocks()` for more details. - const u16 PopCount = ClassId == SizeClassMap::BatchClassId - ? B->getCount() - : Min(MaxBlockCount, B->getCount()); - B->moveNToArray(ToArray, PopCount); - - // TODO(chiahungduan): The deallocation of unused BatchClassId blocks can be - // done without holding `Mutex`. - if (B->empty()) { - Batches.pop_front(); - // `TransferBatch` of BatchClassId is self-contained, no need to - // deallocate. Read the comment in `pushBatchClassBlocks()` for more - // details. + if (Batches.empty()) { + BatchGroupT *BG = Sci->FreeListInfo.BlockList.front(); + Sci->FreeListInfo.BlockList.pop_front(); + + // We don't keep BatchGroup with zero blocks to avoid empty-checking while + // allocating. Note that block used by constructing BatchGroup is recorded + // as free blocks in the last element of BatchGroup::Batches. Which means, + // once we pop the last TransferBatch, the block is implicitly + // deallocated. if (ClassId != SizeClassMap::BatchClassId) - C->deallocate(SizeClassMap::BatchClassId, B); - - if (Batches.empty()) { - BatchGroupT *BG = Sci->FreeListInfo.BlockList.front(); - Sci->FreeListInfo.BlockList.pop_front(); - - // We don't keep BatchGroup with zero blocks to avoid empty-checking - // while allocating. Note that block used for constructing BatchGroup is - // recorded as free blocks in the last element of BatchGroup::Batches. - // Which means, once we pop the last TransferBatch, the block is - // implicitly deallocated. 
- if (ClassId != SizeClassMap::BatchClassId) - C->deallocate(SizeClassMap::BatchClassId, BG); - } + C->deallocate(SizeClassMap::BatchClassId, BG); } - Sci->FreeListInfo.PoppedBlocks += PopCount; - return PopCount; + Sci->FreeListInfo.PoppedBlocks += B->getCount(); + return B; } NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, SizeClassInfo *Sci) diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 971a65ee15f0f..9a642d23620e1 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -12,7 +12,6 @@ #include "allocator_common.h" #include "bytemap.h" #include "common.h" -#include "condition_variable.h" #include "list.h" #include "local_cache.h" #include "mem_map.h" @@ -23,6 +22,8 @@ #include "string_utils.h" #include "thread_annotations.h" +#include "condition_variable.h" + namespace scudo { // SizeClassAllocator64 is an allocator tuned for 64-bit address space. @@ -220,24 +221,41 @@ template class SizeClassAllocator64 { DCHECK_EQ(BlocksInUse, BatchClassUsedInFreeLists); } + // Note that the `MaxBlockCount` will be used when we support arbitrary blocks + // count. Now it's the same as the number of blocks stored in the + // `TransferBatch`. 
u16 popBlocks(CacheT *C, uptr ClassId, CompactPtrT *ToArray, - const u16 MaxBlockCount) { + UNUSED const u16 MaxBlockCount) { + TransferBatchT *B = popBatch(C, ClassId); + if (!B) + return 0; + + const u16 Count = B->getCount(); + DCHECK_GT(Count, 0U); + B->moveToArray(ToArray); + + if (ClassId != SizeClassMap::BatchClassId) + C->deallocate(SizeClassMap::BatchClassId, B); + + return Count; + } + + TransferBatchT *popBatch(CacheT *C, uptr ClassId) { DCHECK_LT(ClassId, NumClasses); RegionInfo *Region = getRegionInfo(ClassId); - u16 PopCount = 0; { ScopedLock L(Region->FLLock); - PopCount = popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount); - if (PopCount != 0U) - return PopCount; + TransferBatchT *B = popBatchImpl(C, ClassId, Region); + if (LIKELY(B)) + return B; } bool ReportRegionExhausted = false; + TransferBatchT *B = nullptr; if (conditionVariableEnabled()) { - PopCount = popBlocksWithCV(C, ClassId, Region, ToArray, MaxBlockCount, - ReportRegionExhausted); + B = popBatchWithCV(C, ClassId, Region, ReportRegionExhausted); } else { while (true) { // When two threads compete for `Region->MMLock`, we only want one of @@ -246,15 +264,13 @@ template class SizeClassAllocator64 { ScopedLock ML(Region->MMLock); { ScopedLock FL(Region->FLLock); - PopCount = popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount); - if (PopCount != 0U) - return PopCount; + if ((B = popBatchImpl(C, ClassId, Region))) + break; } const bool RegionIsExhausted = Region->Exhausted; if (!RegionIsExhausted) - PopCount = populateFreeListAndPopBlocks(C, ClassId, Region, ToArray, - MaxBlockCount); + B = populateFreeListAndPopBatch(C, ClassId, Region); ReportRegionExhausted = !RegionIsExhausted && Region->Exhausted; break; } @@ -270,7 +286,7 @@ template class SizeClassAllocator64 { reportOutOfBatchClass(); } - return PopCount; + return B; } // Push the array of free blocks to the designated batch group. 
@@ -624,7 +640,7 @@ template class SizeClassAllocator64 { // by TransferBatch is also free for use. We don't need to recycle the // TransferBatch. Note that the correctness is maintained by the invariant, // - // Each popBlocks() request returns the entire TransferBatch. Returning + // The unit of each popBatch() request is entire TransferBatch. Return // part of the blocks in a TransferBatch is invalid. // // This ensures that TransferBatch won't leak the address itself while it's @@ -747,7 +763,7 @@ template class SizeClassAllocator64 { BG->Batches.push_front(TB); BG->PushedBlocks = 0; BG->BytesInBGAtLastCheckpoint = 0; - BG->MaxCachedPerBatch = TransferBatchT::MaxNumCached; + BG->MaxCachedPerBatch = CacheT::getMaxCached(getSizeByClassId(ClassId)); return BG; }; @@ -839,10 +855,9 @@ template class SizeClassAllocator64 { InsertBlocks(Cur, Array + Size - Count, Count); } - u16 popBlocksWithCV(CacheT *C, uptr ClassId, RegionInfo *Region, - CompactPtrT *ToArray, const u16 MaxBlockCount, - bool &ReportRegionExhausted) { - u16 PopCount = 0; + TransferBatchT *popBatchWithCV(CacheT *C, uptr ClassId, RegionInfo *Region, + bool &ReportRegionExhausted) { + TransferBatchT *B = nullptr; while (true) { // We only expect one thread doing the freelist refillment and other @@ -863,8 +878,7 @@ template class SizeClassAllocator64 { const bool RegionIsExhausted = Region->Exhausted; if (!RegionIsExhausted) - PopCount = populateFreeListAndPopBlocks(C, ClassId, Region, ToArray, - MaxBlockCount); + B = populateFreeListAndPopBatch(C, ClassId, Region); ReportRegionExhausted = !RegionIsExhausted && Region->Exhausted; { @@ -891,8 +905,7 @@ template class SizeClassAllocator64 { // blocks were used up right after the refillment. Therefore, we have to // check if someone is still populating the freelist. 
ScopedLock FL(Region->FLLock); - PopCount = popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount); - if (PopCount != 0U) + if (LIKELY(B = popBatchImpl(C, ClassId, Region))) break; if (!Region->isPopulatingFreeList) @@ -905,19 +918,21 @@ template class SizeClassAllocator64 { // `pushBatchClassBlocks()` and `mergeGroupsToReleaseBack()`. Region->FLLockCV.wait(Region->FLLock); - PopCount = popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount); - if (PopCount != 0U) + if (LIKELY(B = popBatchImpl(C, ClassId, Region))) break; } - return PopCount; + return B; } - u16 popBlocksImpl(CacheT *C, uptr ClassId, RegionInfo *Region, - CompactPtrT *ToArray, const u16 MaxBlockCount) + // Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest + // group id will be considered first. + // + // The region mutex needs to be held while calling this method. + TransferBatchT *popBatchImpl(CacheT *C, uptr ClassId, RegionInfo *Region) REQUIRES(Region->FLLock) { if (Region->FreeListInfo.BlockList.empty()) - return 0U; + return nullptr; SinglyLinkedList &Batches = Region->FreeListInfo.BlockList.front()->Batches; @@ -930,64 +945,39 @@ template class SizeClassAllocator64 { // Block used by `BatchGroup` is from BatchClassId. Turn the block into // `TransferBatch` with single block. TransferBatchT *TB = reinterpret_cast(BG); - ToArray[0] = - compactPtr(SizeClassMap::BatchClassId, reinterpret_cast(TB)); + TB->clear(); + TB->add( + compactPtr(SizeClassMap::BatchClassId, reinterpret_cast(TB))); Region->FreeListInfo.PoppedBlocks += 1; - return 1U; + return TB; } - // So far, instead of always fill blocks to `MaxBlockCount`, we only exmaine - // single `TransferBatch` to minimize the time spent in the primary - // allocator. Besides, the sizes of `TransferBatch` and - // `CacheT::getMaxCached()` may also impact the time spent on accessing the - // primary allocator. 
- // TODO(chiahungduan): Evaluate if we want to always prepare `MaxBlockCount` - // blocks and/or adjust the size of `TransferBatch` according to - // `CacheT::getMaxCached()`. TransferBatchT *B = Batches.front(); + Batches.pop_front(); DCHECK_NE(B, nullptr); DCHECK_GT(B->getCount(), 0U); - // BachClassId should always take all blocks in the TransferBatch. Read the - // comment in `pushBatchClassBlocks()` for more details. - const u16 PopCount = ClassId == SizeClassMap::BatchClassId - ? B->getCount() - : Min(MaxBlockCount, B->getCount()); - B->moveNToArray(ToArray, PopCount); - - // TODO(chiahungduan): The deallocation of unused BatchClassId blocks can be - // done without holding `FLLock`. - if (B->empty()) { - Batches.pop_front(); - // `TransferBatch` of BatchClassId is self-contained, no need to - // deallocate. Read the comment in `pushBatchClassBlocks()` for more - // details. - if (ClassId != SizeClassMap::BatchClassId) - C->deallocate(SizeClassMap::BatchClassId, B); - - if (Batches.empty()) { - BatchGroupT *BG = Region->FreeListInfo.BlockList.front(); - Region->FreeListInfo.BlockList.pop_front(); + if (Batches.empty()) { + BatchGroupT *BG = Region->FreeListInfo.BlockList.front(); + Region->FreeListInfo.BlockList.pop_front(); - // We don't keep BatchGroup with zero blocks to avoid empty-checking - // while allocating. Note that block used for constructing BatchGroup is - // recorded as free blocks in the last element of BatchGroup::Batches. - // Which means, once we pop the last TransferBatch, the block is - // implicitly deallocated. - if (ClassId != SizeClassMap::BatchClassId) - C->deallocate(SizeClassMap::BatchClassId, BG); - } + // We don't keep BatchGroup with zero blocks to avoid empty-checking while + // allocating. Note that block used by constructing BatchGroup is recorded + // as free blocks in the last element of BatchGroup::Batches. Which means, + // once we pop the last TransferBatch, the block is implicitly + // deallocated. 
+ if (ClassId != SizeClassMap::BatchClassId) + C->deallocate(SizeClassMap::BatchClassId, BG); } - Region->FreeListInfo.PoppedBlocks += PopCount; + Region->FreeListInfo.PoppedBlocks += B->getCount(); - return PopCount; + return B; } - NOINLINE u16 populateFreeListAndPopBlocks(CacheT *C, uptr ClassId, - RegionInfo *Region, - CompactPtrT *ToArray, - const u16 MaxBlockCount) + // Refill the freelist and return one batch. + NOINLINE TransferBatchT *populateFreeListAndPopBatch(CacheT *C, uptr ClassId, + RegionInfo *Region) REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) { const uptr Size = getSizeByClassId(ClassId); const u16 MaxCount = CacheT::getMaxCached(Size); @@ -1004,7 +994,7 @@ template class SizeClassAllocator64 { const uptr RegionBase = RegionBeg - getRegionBaseByClassId(ClassId); if (UNLIKELY(RegionBase + MappedUser + MapSize > RegionSize)) { Region->Exhausted = true; - return 0U; + return nullptr; } if (UNLIKELY(!Region->MemMapInfo.MemMap.remap( @@ -1012,7 +1002,7 @@ template class SizeClassAllocator64 { MAP_ALLOWNOMEM | MAP_RESIZABLE | (useMemoryTagging(Options.load()) ? 
MAP_MEMTAG : 0)))) { - return 0U; + return nullptr; } Region->MemMapInfo.MappedUser += MapSize; C->getStats().add(StatMapped, MapSize); @@ -1059,9 +1049,8 @@ template class SizeClassAllocator64 { pushBatchClassBlocks(Region, ShuffleArray, NumberOfBlocks); } - const u16 PopCount = - popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount); - DCHECK_NE(PopCount, 0U); + TransferBatchT *B = popBatchImpl(C, ClassId, Region); + DCHECK_NE(B, nullptr); // Note that `PushedBlocks` and `PoppedBlocks` are supposed to only record // the requests from `PushBlocks` and `PopBatch` which are external @@ -1073,7 +1062,7 @@ template class SizeClassAllocator64 { C->getStats().add(StatFree, AllocatedUser); Region->MemMapInfo.AllocatedUser += AllocatedUser; - return PopCount; + return B; } void getStats(ScopedString *Str, uptr ClassId, RegionInfo *Region) @@ -1197,7 +1186,7 @@ template class SizeClassAllocator64 { } // Note that we have extracted the `GroupsToRelease` from region freelist. - // It's safe to let pushBlocks()/popBlocks() access the remaining region + // It's safe to let pushBlocks()/popBatches() access the remaining region // freelist. In the steps 3 and 4, we will temporarily release the FLLock // and lock it again before step 5. diff --git a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp index f64a5143b30d4..18171511758a1 100644 --- a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp @@ -237,6 +237,7 @@ struct SmallRegionsConfig { // For the 32-bit one, it requires actually exhausting memory, so we skip it. 
TEST(ScudoPrimaryTest, Primary64OOM) { using Primary = scudo::SizeClassAllocator64; + using TransferBatch = Primary::TransferBatchT; Primary Allocator; Allocator.init(/*ReleaseToOsInterval=*/-1); typename Primary::CacheT Cache; @@ -244,26 +245,29 @@ TEST(ScudoPrimaryTest, Primary64OOM) { Stats.init(); Cache.init(&Stats, &Allocator); bool AllocationFailed = false; - std::vector Blocks; + std::vector Batches; const scudo::uptr ClassId = Primary::SizeClassMap::LargestClassId; const scudo::uptr Size = Primary::getSizeByClassId(ClassId); - const scudo::u16 MaxCachedBlockCount = Primary::CacheT::getMaxCached(Size); + typename Primary::CacheT::CompactPtrT Blocks[TransferBatch::MaxNumCached]; for (scudo::uptr I = 0; I < 10000U; I++) { - for (scudo::uptr J = 0; J < MaxCachedBlockCount; ++J) { - void *Ptr = Cache.allocate(ClassId); - if (Ptr == nullptr) { - AllocationFailed = true; - break; - } - memset(Ptr, 'B', Size); - Blocks.push_back(Ptr); + TransferBatch *B = Allocator.popBatch(&Cache, ClassId); + if (!B) { + AllocationFailed = true; + break; } + for (scudo::u16 J = 0; J < B->getCount(); J++) + memset(Allocator.decompactPtr(ClassId, B->get(J)), 'B', Size); + Batches.push_back(B); + } + while (!Batches.empty()) { + TransferBatch *B = Batches.back(); + Batches.pop_back(); + const scudo::u16 Count = B->getCount(); + B->moveToArray(Blocks); + Allocator.pushBlocks(&Cache, ClassId, Blocks, Count); + Cache.deallocate(Primary::SizeClassMap::BatchClassId, B); } - - for (auto *Ptr : Blocks) - Cache.deallocate(ClassId, Ptr); - Cache.destroy(nullptr); Allocator.releaseToOS(scudo::ReleaseToOS::Force); scudo::ScopedString Str; @@ -338,7 +342,7 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, PrimaryThreaded) { V.push_back(std::make_pair(ClassId, P)); } - // Try to interleave pushBlocks(), popBlocks() and releaseToOS(). + // Try to interleave pushBlocks(), popBatch() and releaseToOS(). 
Allocator->releaseToOS(scudo::ReleaseToOS::Force); while (!V.empty()) { From 371e6d0f24dd02b7c8a115c07e74954e448570f6 Mon Sep 17 00:00:00 2001 From: Enna1 Date: Tue, 27 Feb 2024 07:45:06 +0800 Subject: [PATCH 391/546] [NFC][hwasan] Do not include sanitizer_placement_new.h in header files (#82993) Do not include sanitizer_placement_new.h into header files, only into source files. --- compiler-rt/lib/hwasan/hwasan_report.cpp | 1 + compiler-rt/lib/hwasan/hwasan_thread_list.cpp | 1 + compiler-rt/lib/hwasan/hwasan_thread_list.h | 1 - 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index 016ec182dea0f..9fbf38ae6a1f4 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -27,6 +27,7 @@ #include "sanitizer_common/sanitizer_flags.h" #include "sanitizer_common/sanitizer_internal_defs.h" #include "sanitizer_common/sanitizer_mutex.h" +#include "sanitizer_common/sanitizer_placement_new.h" #include "sanitizer_common/sanitizer_report_decorator.h" #include "sanitizer_common/sanitizer_stackdepot.h" #include "sanitizer_common/sanitizer_stacktrace_printer.h" diff --git a/compiler-rt/lib/hwasan/hwasan_thread_list.cpp b/compiler-rt/lib/hwasan/hwasan_thread_list.cpp index 7df4dd3d78518..e56d19aad2673 100644 --- a/compiler-rt/lib/hwasan/hwasan_thread_list.cpp +++ b/compiler-rt/lib/hwasan/hwasan_thread_list.cpp @@ -1,5 +1,6 @@ #include "hwasan_thread_list.h" +#include "sanitizer_common/sanitizer_placement_new.h" #include "sanitizer_common/sanitizer_thread_arg_retval.h" namespace __hwasan { diff --git a/compiler-rt/lib/hwasan/hwasan_thread_list.h b/compiler-rt/lib/hwasan/hwasan_thread_list.h index 82f6c70a03f80..d0eebd1b373a3 100644 --- a/compiler-rt/lib/hwasan/hwasan_thread_list.h +++ b/compiler-rt/lib/hwasan/hwasan_thread_list.h @@ -47,7 +47,6 @@ #include "hwasan_allocator.h" #include "hwasan_flags.h" #include "hwasan_thread.h" -#include 
"sanitizer_common/sanitizer_placement_new.h" #include "sanitizer_common/sanitizer_thread_arg_retval.h" namespace __hwasan { From 56d58295dd492b8e8a011fa60798d9be4f36ec3c Mon Sep 17 00:00:00 2001 From: Peiming Liu <36770114+PeimingLiu@users.noreply.github.com> Date: Mon, 26 Feb 2024 18:08:28 -0600 Subject: [PATCH 392/546] [mlir][sparse] Introduce batch level format. (#83082) --- mlir/include/mlir-c/Dialect/SparseTensor.h | 9 +++--- .../mlir/Dialect/SparseTensor/IR/Enums.h | 29 +++++++++++++++---- .../SparseTensor/IR/SparseTensorAttrDefs.td | 3 +- .../SparseTensor/IR/Detail/LvlTypeParser.cpp | 2 ++ .../SparseTensor/IR/SparseTensorDialect.cpp | 4 +++ .../Transforms/Utils/SparseTensorLevel.cpp | 2 ++ mlir/test/CAPI/sparse_tensor.c | 4 +-- .../SparseTensor/invalid_encoding.mlir | 6 ++++ .../SparseTensor/roundtrip_encoding.mlir | 11 +++++++ .../SparseTensor/sparse_fill_zero.mlir | 2 +- .../python/dialects/sparse_tensor/dialect.py | 8 ++--- 11 files changed, 62 insertions(+), 18 deletions(-) diff --git a/mlir/include/mlir-c/Dialect/SparseTensor.h b/mlir/include/mlir-c/Dialect/SparseTensor.h index 898d2f12779e3..52ca7ba8a1618 100644 --- a/mlir/include/mlir-c/Dialect/SparseTensor.h +++ b/mlir/include/mlir-c/Dialect/SparseTensor.h @@ -29,10 +29,11 @@ typedef uint64_t MlirSparseTensorLevelType; enum MlirSparseTensorLevelFormat { MLIR_SPARSE_TENSOR_LEVEL_DENSE = 0x000000010000, - MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED = 0x000000020000, - MLIR_SPARSE_TENSOR_LEVEL_SINGLETON = 0x000000040000, - MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED = 0x000000080000, - MLIR_SPARSE_TENSOR_LEVEL_N_OUT_OF_M = 0x000000100000, + MLIR_SPARSE_TENSOR_LEVEL_BATCH = 0x000000020000, + MLIR_SPARSE_TENSOR_LEVEL_COMPRESSED = 0x000000040000, + MLIR_SPARSE_TENSOR_LEVEL_SINGLETON = 0x000000080000, + MLIR_SPARSE_TENSOR_LEVEL_LOOSE_COMPRESSED = 0x000000100000, + MLIR_SPARSE_TENSOR_LEVEL_N_OUT_OF_M = 0x000000200000, }; enum MlirSparseTensorLevelPropertyNondefault { diff --git 
a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h index 1c81d80ea7ec4..cc134e7d953ec 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h @@ -154,12 +154,26 @@ enum class Action : uint32_t { enum class LevelFormat : uint64_t { Undef = 0x00000000, Dense = 0x00010000, - Compressed = 0x00020000, - Singleton = 0x00040000, - LooseCompressed = 0x00080000, - NOutOfM = 0x00100000, + Batch = 0x00020000, + Compressed = 0x00040000, + Singleton = 0x00080000, + LooseCompressed = 0x00100000, + NOutOfM = 0x00200000, }; +constexpr bool encPowOfTwo(LevelFormat fmt) { + auto enc = static_cast>(fmt); + return (enc & (enc - 1)) == 0; +} + +// All LevelFormats must have only one bit set (power of two). +static_assert(encPowOfTwo(LevelFormat::Dense) && + encPowOfTwo(LevelFormat::Batch) && + encPowOfTwo(LevelFormat::Compressed) && + encPowOfTwo(LevelFormat::Singleton) && + encPowOfTwo(LevelFormat::LooseCompressed) && + encPowOfTwo(LevelFormat::NOutOfM)); + template constexpr bool isAnyOfFmt(LevelFormat fmt) { return (... || (targets == fmt)); @@ -172,6 +186,8 @@ constexpr const char *toFormatString(LevelFormat lvlFmt) { return "undef"; case LevelFormat::Dense: return "dense"; + case LevelFormat::Batch: + return "batch"; case LevelFormat::Compressed: return "compressed"; case LevelFormat::Singleton: @@ -225,10 +241,10 @@ struct LevelType { static constexpr bool isValidLvlBits(uint64_t lvlBits) { auto fmt = static_cast(lvlBits & 0xffff0000); const uint64_t propertyBits = lvlBits & 0xffff; - // If undefined/dense/NOutOfM, then must be unique and ordered. + // If undefined/dense/batch/NOutOfM, then must be unique and ordered. // Otherwise, the format must be one of the known ones. return (isAnyOfFmt(fmt)) + LevelFormat::Batch, LevelFormat::NOutOfM>(fmt)) ? 
(propertyBits == 0) : (isAnyOfFmt(fmt)); @@ -375,6 +391,7 @@ inline std::optional buildLevelType(LevelFormat lf, bool ordered, } inline bool isUndefLT(LevelType lt) { return lt.isa(); } inline bool isDenseLT(LevelType lt) { return lt.isa(); } +inline bool isBatchLT(LevelType lt) { return lt.isa(); } inline bool isCompressedLT(LevelType lt) { return lt.isa(); } diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td index f0b832571e68e..ca98665256be5 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td @@ -141,7 +141,8 @@ def SparseTensorEncodingAttr : SparseTensor_Attr<"SparseTensorEncoding", The supported level-formats are the following: - - **dense** : all entries along this level are stored + - **dense** : all entries along this level are stored and linearized. + - **batch** : all entries along this level are stored but not linearized. - **compressed** : only nonzeros along this level are stored - **loose_compressed** : as compressed, but allows for free space between regions - **singleton** : a variant of the compressed format, where coordinates have no siblings diff --git a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp index 455e90baf0a71..92e5efaa81049 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/Detail/LvlTypeParser.cpp @@ -62,6 +62,8 @@ FailureOr LvlTypeParser::parseLvlType(AsmParser &parser) const { // Set the base bit for properties. 
if (base.compare("dense") == 0) { properties |= static_cast(LevelFormat::Dense); + } else if (base.compare("batch") == 0) { + properties |= static_cast(LevelFormat::Batch); } else if (base.compare("compressed") == 0) { properties |= static_cast(LevelFormat::Compressed); } else if (base.compare("structured") == 0) { diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index af7b85d458774..fd0ed26fbde07 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -690,6 +690,10 @@ LogicalResult SparseTensorEncodingAttr::verify( } } + auto lastBatch = std::find_if(lvlTypes.rbegin(), lvlTypes.rend(), isBatchLT); + if (!std::all_of(lastBatch, lvlTypes.rend(), isBatchLT)) + return emitError() << "Batch lvlType can only be leading levels."; + // SoA property can only be applied on singleton level. auto soaLvls = llvm::make_filter_range(lvlTypes, [](LevelType lt) { return lt.isa(); diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp index 61a3703b73bf0..011d814cd9009 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorLevel.cpp @@ -1278,6 +1278,8 @@ sparse_tensor::makeSparseTensorLevel(OpBuilder &b, Location l, Value t, switch (lt.getLvlFmt()) { case LevelFormat::Dense: return std::make_unique(tid, lvl, sz, stt.hasEncoding()); + case LevelFormat::Batch: + llvm_unreachable("not implemented"); case LevelFormat::Compressed: { Value pos = genToPositions(b, l, t, lvl); Value crd = genToCoordinates(b, l, t, lvl); diff --git a/mlir/test/CAPI/sparse_tensor.c b/mlir/test/CAPI/sparse_tensor.c index a8b9f9048d591..f241e0e5c2fb5 100644 --- a/mlir/test/CAPI/sparse_tensor.c +++ b/mlir/test/CAPI/sparse_tensor.c @@ -39,8 +39,8 @@ static int 
testRoundtripEncoding(MlirContext ctx) { // CHECK: (d0, d1)[s0] -> (s0, d0, d1) mlirAffineMapDump(dimToLvl); // CHECK: level_type: 65536 - // CHECK: level_type: 131072 - // CHECK: level_type: 131072 + // CHECK: level_type: 262144 + // CHECK: level_type: 262144 MlirAffineMap lvlToDim = mlirSparseTensorEncodingAttrGetLvlToDim(originalAttr); int lvlRank = mlirSparseTensorEncodingGetLvlRank(originalAttr); diff --git a/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir b/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir index 9ed3cee259147..8096c010ac935 100644 --- a/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir +++ b/mlir/test/Dialect/SparseTensor/invalid_encoding.mlir @@ -54,6 +54,12 @@ func.func private @tensor_dimlevel_size_mismatch(%arg0: tensor<8xi32, #a>) -> () // ----- +// expected-error@+1 {{Batch lvlType can only be leading levels}} +#a = #sparse_tensor.encoding<{map = (d0, d1, d2) -> (d0 : batch, d1 : compressed, d2: batch)}> +func.func private @non_leading_batch(%arg0: tensor) -> () + +// ----- + // expected-error@+1 {{use of undeclared identifier}} #a = #sparse_tensor.encoding<{map = (d0) -> (d0 : dense, d1 : compressed)}> func.func private @tensor_sizes_mismatch(%arg0: tensor<8xi32, #a>) -> () diff --git a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir index 9d5118ceecc58..66e61afd897dd 100644 --- a/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir +++ b/mlir/test/Dialect/SparseTensor/roundtrip_encoding.mlir @@ -22,6 +22,17 @@ func.func private @sparse_csr(tensor) // ----- +#BCSR = #sparse_tensor.encoding<{ + map = (d0, d1, d2) -> (d0 : batch, d1: dense, d2 : compressed), +}> + +// CHECK: #[[$BCSR:.*]] = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : batch, d1 : dense, d2 : compressed) }> +// CHECK-LABEL: func private @sparse_bcsr( +// CHECK-SAME: tensor) +func.func private @sparse_bcsr(tensor) + +// ----- + #CSR_explicit = #sparse_tensor.encoding<{ map = {l0, l1} (d0 
= l0, d1 = l1) -> (l0 = d0 : dense, l1 = d1 : compressed) }> diff --git a/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir b/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir index d04fbe8ed5c22..6e8a26762d90f 100644 --- a/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir +++ b/mlir/test/Dialect/SparseTensor/sparse_fill_zero.mlir @@ -14,7 +14,7 @@ // CHECK-DAG: %[[VAL_8:.*]] = arith.constant true // CHECK-DAG: %[[VAL_9:.*]] = arith.constant 100 : index // CHECK-DAG: %[[VAL_10:.*]] = arith.constant 300 : index -// CHECK-DAG: %[[VAL_11:.*]] = arith.constant 131072 : i64 +// CHECK-DAG: %[[VAL_11:.*]] = arith.constant 262144 : i64 // CHECK: %[[VAL_12:.*]] = memref.alloca() : memref<2xi64> // CHECK: %[[VAL_13:.*]] = memref.cast %[[VAL_12]] : memref<2xi64> to memref // CHECK: memref.store %[[VAL_11]], %[[VAL_12]]{{\[}}%[[VAL_5]]] : memref<2xi64> diff --git a/mlir/test/python/dialects/sparse_tensor/dialect.py b/mlir/test/python/dialects/sparse_tensor/dialect.py index 2c0603216ef2c..5666d090c3d5e 100644 --- a/mlir/test/python/dialects/sparse_tensor/dialect.py +++ b/mlir/test/python/dialects/sparse_tensor/dialect.py @@ -28,7 +28,7 @@ def testEncodingAttr1D(): # CHECK: equal: True print(f"equal: {casted == parsed}") - # CHECK: lvl_types: [131072] + # CHECK: lvl_types: [262144] print(f"lvl_types: {casted.lvl_types}") # CHECK: dim_to_lvl: (d0) -> (d0) print(f"dim_to_lvl: {casted.dim_to_lvl}") @@ -71,9 +71,9 @@ def testEncodingAttrStructure(): # CHECK: equal: True print(f"equal: {casted == parsed}") - # CHECK: lvl_types: [65536, 65536, 4406637494272] + # CHECK: lvl_types: [65536, 65536, 4406638542848] print(f"lvl_types: {casted.lvl_types}") - # CHECK: lvl_formats_enum: [, , ] + # CHECK: lvl_formats_enum: [, , ] print(f"lvl_formats_enum: {casted.lvl_formats_enum}") # CHECK: structured_n: 2 print(f"structured_n: {casted.structured_n}") @@ -157,7 +157,7 @@ def testEncodingAttr2D(): # CHECK: equal: True print(f"equal: {casted == parsed}") - # CHECK: lvl_types: [65536, 
131072] + # CHECK: lvl_types: [65536, 262144] print(f"lvl_types: {casted.lvl_types}") # CHECK: dim_to_lvl: (d0, d1) -> (d1, d0) print(f"dim_to_lvl: {casted.dim_to_lvl}") From 62d0c01c2c9adce67f2e1adb9feecd7ba1a97814 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 26 Feb 2024 16:15:34 -0800 Subject: [PATCH 393/546] [SelectionDAG] Remove pointer from MMO for VP strided load/store. (#82667) MachineIR alias analysis assumes that only bytes after the pointer will be accessed. This is incorrect if the stride is negative. This is causing miscompiles in our downstream after SLP started making strided loads. Fixes #82657 --- .../SelectionDAG/SelectionDAGBuilder.cpp | 6 ++- .../rvv/strided-vpload-vpstore-output.ll | 46 +++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ee600d389c2cc..ab2f42d2024cc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8030,8 +8030,9 @@ void SelectionDAGBuilder::visitVPStridedLoad( MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); bool AddToChain = !AA || !AA->pointsToConstantMemory(ML); SDValue InChain = AddToChain ? 
DAG.getRoot() : DAG.getEntryNode(); + unsigned AS = PtrOperand->getType()->getPointerAddressSpace(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad, + MachinePointerInfo(AS), MachineMemOperand::MOLoad, MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges); SDValue LD = DAG.getStridedLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], @@ -8052,8 +8053,9 @@ void SelectionDAGBuilder::visitVPStridedStore( if (!Alignment) Alignment = DAG.getEVTAlign(VT.getScalarType()); AAMDNodes AAInfo = VPIntrin.getAAMetadata(); + unsigned AS = PtrOperand->getType()->getPointerAddressSpace(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore, + MachinePointerInfo(AS), MachineMemOperand::MOStore, MemoryLocation::UnknownSize, *Alignment, AAInfo); SDValue ST = DAG.getStridedStoreVP( diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll new file mode 100644 index 0000000000000..a8934bb25571c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=riscv64 -mattr=+v -stop-after=finalize-isel %s -o - | FileCheck %s + +; Test makes sure we don't store a pointer in the MachineMemOperand created for +; these instructions. MachineMemOperand handling can't currently deal with a +; negative stride that would allow memory before the pointer to be read. 
+ +declare @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr, i8, , i32) + +define @strided_vpload_nxv1i8_i8(ptr %ptr, i8 signext %stride, %m, i32 zeroext %evl) { + ; CHECK-LABEL: name: strided_vpload_nxv1i8_i8 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $x10, $x11, $v0, $x12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr = COPY $v0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $x11 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $x10 + ; CHECK-NEXT: $v0 = COPY [[COPY1]] + ; CHECK-NEXT: [[PseudoVLSE8_V_MF8_MASK:%[0-9]+]]:vrnov0 = PseudoVLSE8_V_MF8_MASK $noreg, [[COPY3]], [[COPY2]], $v0, [[COPY]], 3 /* e8 */, 1 /* ta, mu */ :: (load unknown-size, align 1) + ; CHECK-NEXT: $v8 = COPY [[PseudoVLSE8_V_MF8_MASK]] + ; CHECK-NEXT: PseudoRET implicit $v8 + %load = call @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr %ptr, i8 %stride, %m, i32 %evl) + ret %load +} + +declare void @llvm.experimental.vp.strided.store.nxv1i8.p0.i8(, ptr, i8, , i32) + +define void @strided_vpstore_nxv1i8_i8( %val, ptr %ptr, i8 signext %stride, %m, i32 zeroext %evl) { + ; CHECK-LABEL: name: strided_vpstore_nxv1i8_i8 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $v8, $x10, $x11, $v0, $x12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr = COPY $v0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $x11 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $x10 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vr = COPY $v8 + ; CHECK-NEXT: $v0 = COPY [[COPY1]] + ; CHECK-NEXT: PseudoVSSE8_V_MF8_MASK [[COPY4]], [[COPY3]], [[COPY2]], $v0, [[COPY]], 3 /* e8 */ :: (store unknown-size, align 1) + ; CHECK-NEXT: PseudoRET + call void @llvm.experimental.vp.strided.store.nxv1i8.p0.i8( %val, ptr %ptr, i8 %stride, %m, i32 %evl) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} From ded4ea975241539ba8358bc018c506953b0e6813 Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Mon, 26 Feb 2024 19:36:30 -0500 Subject: [PATCH 394/546] [libc][stdfix] Add sqrt for fixed point types. (#83042) --- libc/config/linux/x86_64/entrypoints.txt | 6 + libc/docs/math/stdfix.rst | 2 +- libc/spec/stdc_ext.td | 8 ++ libc/src/__support/fixed_point/CMakeLists.txt | 13 ++ libc/src/__support/fixed_point/sqrt.h | 129 ++++++++++++++++++ libc/src/stdfix/CMakeLists.txt | 15 ++ libc/src/stdfix/sqrtuhk.cpp | 19 +++ libc/src/stdfix/sqrtuhk.h | 20 +++ libc/src/stdfix/sqrtuhr.cpp | 19 +++ libc/src/stdfix/sqrtuhr.h | 20 +++ libc/src/stdfix/sqrtuk.cpp | 19 +++ libc/src/stdfix/sqrtuk.h | 20 +++ libc/src/stdfix/sqrtulr.cpp | 19 +++ libc/src/stdfix/sqrtulr.h | 20 +++ libc/src/stdfix/sqrtur.cpp | 19 +++ libc/src/stdfix/sqrtur.h | 20 +++ libc/test/src/stdfix/CMakeLists.txt | 22 +++ libc/test/src/stdfix/SqrtTest.h | 66 +++++++++ libc/test/src/stdfix/sqrtuhk_test.cpp | 13 ++ libc/test/src/stdfix/sqrtuhr_test.cpp | 13 ++ libc/test/src/stdfix/sqrtuk_test.cpp | 13 ++ libc/test/src/stdfix/sqrtulr_test.cpp | 13 ++ libc/test/src/stdfix/sqrtur_test.cpp | 13 ++ 23 files changed, 520 insertions(+), 1 deletion(-) create mode 100644 libc/src/__support/fixed_point/sqrt.h create mode 100644 libc/src/stdfix/sqrtuhk.cpp create mode 100644 libc/src/stdfix/sqrtuhk.h create mode 100644 libc/src/stdfix/sqrtuhr.cpp create mode 100644 libc/src/stdfix/sqrtuhr.h create mode 100644 libc/src/stdfix/sqrtuk.cpp create mode 100644 libc/src/stdfix/sqrtuk.h create mode 100644 libc/src/stdfix/sqrtulr.cpp create mode 100644 libc/src/stdfix/sqrtulr.h create mode 100644 libc/src/stdfix/sqrtur.cpp create mode 100644 libc/src/stdfix/sqrtur.h create mode 100644 libc/test/src/stdfix/SqrtTest.h create mode 100644 libc/test/src/stdfix/sqrtuhk_test.cpp create mode 100644 libc/test/src/stdfix/sqrtuhr_test.cpp create mode 100644 
libc/test/src/stdfix/sqrtuk_test.cpp create mode 100644 libc/test/src/stdfix/sqrtulr_test.cpp create mode 100644 libc/test/src/stdfix/sqrtur_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index f2a224d45bbae..cd44b5c52b58c 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -473,6 +473,12 @@ if(LIBC_COMPILER_HAS_FIXED_POINT) libc.src.stdfix.roundur libc.src.stdfix.roundulk libc.src.stdfix.roundulr + libc.src.stdfix.sqrtuhk + libc.src.stdfix.sqrtuhr + libc.src.stdfix.sqrtuk + libc.src.stdfix.sqrtur + # libc.src.stdfix.sqrtulk + libc.src.stdfix.sqrtulr ) endif() diff --git a/libc/docs/math/stdfix.rst b/libc/docs/math/stdfix.rst index 080066e53bd2f..79f499e61f121 100644 --- a/libc/docs/math/stdfix.rst +++ b/libc/docs/math/stdfix.rst @@ -78,7 +78,7 @@ Fixed-point Arithmetics +---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+ | round | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | |check| | +---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+ -| sqrt | | | | | | | | | | | | | +| sqrt | |check| | | |check| | | |check| | | |check| | | |check| | | | | +---------------+----------------+-------------+---------------+------------+----------------+-------------+----------------+-------------+---------------+------------+----------------+-------------+ ================== ========= diff --git a/libc/spec/stdc_ext.td b/libc/spec/stdc_ext.td index 6620142146c47..be1e6d4ba2fcd 100644 --- a/libc/spec/stdc_ext.td +++ b/libc/spec/stdc_ext.td @@ -47,6 +47,14 @@ def StdcExt : StandardSpec<"stdc_ext"> { 
GuardedFunctionSpec<"rounduhk", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, GuardedFunctionSpec<"rounduk", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, GuardedFunctionSpec<"roundulk", RetValSpec, [ArgSpec, ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, + + GuardedFunctionSpec<"sqrtuhr", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, + GuardedFunctionSpec<"sqrtur", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, + GuardedFunctionSpec<"sqrtulr", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, + + GuardedFunctionSpec<"sqrtuhk", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, + GuardedFunctionSpec<"sqrtuk", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, + GuardedFunctionSpec<"sqrtulk", RetValSpec, [ArgSpec], "LIBC_COMPILER_HAS_FIXED_POINT">, ] >; diff --git a/libc/src/__support/fixed_point/CMakeLists.txt b/libc/src/__support/fixed_point/CMakeLists.txt index 64f9dacc7ba5f..0ed118f240884 100644 --- a/libc/src/__support/fixed_point/CMakeLists.txt +++ b/libc/src/__support/fixed_point/CMakeLists.txt @@ -21,3 +21,16 @@ add_header_library( libc.src.__support.CPP.bit libc.src.__support.math_extras ) + +add_header_library( + sqrt + HDRS + sqrt.h + DEPENDS + .fx_rep + libc.include.llvm-libc-macros.stdfix_macros + libc.src.__support.macros.attributes + libc.src.__support.macros.optimization + libc.src.__support.CPP.bit + libc.src.__support.CPP.type_traits +) diff --git a/libc/src/__support/fixed_point/sqrt.h b/libc/src/__support/fixed_point/sqrt.h new file mode 100644 index 0000000000000..d8df294b18a1a --- /dev/null +++ b/libc/src/__support/fixed_point/sqrt.h @@ -0,0 +1,129 @@ +//===-- Calculate square root of fixed point numbers. -----*- C++ -*-=========// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H +#define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H + +#include "include/llvm-libc-macros/stdfix-macros.h" +#include "src/__support/CPP/bit.h" +#include "src/__support/CPP/type_traits.h" +#include "src/__support/macros/attributes.h" // LIBC_INLINE +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY + +#include "fx_rep.h" + +#ifdef LIBC_COMPILER_HAS_FIXED_POINT + +namespace LIBC_NAMESPACE::fixed_point { + +namespace internal { + +template struct SqrtConfig; + +template <> struct SqrtConfig { + using Type = unsigned short fract; + static constexpr int EXTRA_STEPS = 0; +}; + +template <> struct SqrtConfig { + using Type = unsigned fract; + static constexpr int EXTRA_STEPS = 1; +}; + +template <> struct SqrtConfig { + using Type = unsigned long fract; + static constexpr int EXTRA_STEPS = 2; +}; + +template <> +struct SqrtConfig : SqrtConfig {}; + +template <> +struct SqrtConfig : SqrtConfig {}; + +// TODO: unsigned long accum type is 64-bit, and will need 64-bit fract type. +// Probably we will use DyadicFloat<64> for intermediate computations instead. 
+ +// Linear approximation for the initial values, with errors bounded by: +// max(1.5 * 2^-11, eps) +// Generated with Sollya: +// > for i from 4 to 15 do { +// P = fpminimax(sqrt(x), 1, [|8, 8|], [i * 2^-4, (i + 1)*2^-4], +// fixed, absolute); +// print("{", coeff(P, 1), "uhr,", coeff(P, 0), "uhr},"); +// }; +static constexpr unsigned short fract SQRT_FIRST_APPROX[12][2] = { + {0x1.e8p-1uhr, 0x1.0cp-2uhr}, {0x1.bap-1uhr, 0x1.28p-2uhr}, + {0x1.94p-1uhr, 0x1.44p-2uhr}, {0x1.74p-1uhr, 0x1.6p-2uhr}, + {0x1.6p-1uhr, 0x1.74p-2uhr}, {0x1.4ep-1uhr, 0x1.88p-2uhr}, + {0x1.3ep-1uhr, 0x1.9cp-2uhr}, {0x1.32p-1uhr, 0x1.acp-2uhr}, + {0x1.22p-1uhr, 0x1.c4p-2uhr}, {0x1.18p-1uhr, 0x1.d4p-2uhr}, + {0x1.08p-1uhr, 0x1.fp-2uhr}, {0x1.04p-1uhr, 0x1.f8p-2uhr}, +}; + +} // namespace internal + +template +LIBC_INLINE constexpr cpp::enable_if_t, T> sqrt(T x) { + using BitType = typename FXRep::StorageType; + BitType x_bit = cpp::bit_cast(x); + + if (LIBC_UNLIKELY(x_bit == 0)) + return FXRep::ZERO(); + + int leading_zeros = cpp::countl_zero(x_bit); + constexpr int STORAGE_LENGTH = sizeof(BitType) * CHAR_BIT; + constexpr int EXP_ADJUSTMENT = STORAGE_LENGTH - FXRep::FRACTION_LEN - 1; + // x_exp is the real exponent of the leading bit of x. + int x_exp = EXP_ADJUSTMENT - leading_zeros; + int shift = EXP_ADJUSTMENT - 1 - (x_exp & (~1)); + // Normalize. + x_bit <<= shift; + using FracType = typename internal::SqrtConfig::Type; + FracType x_frac = cpp::bit_cast(x_bit); + + // Use use Newton method to approximate sqrt(a): + // x_{n + 1} = 1/2 (x_n + a / x_n) + // For the initial values, we choose x_0 + + // Use the leading 4 bits to do look up for sqrt(x). + // After normalization, 0.25 <= x_frac < 1, so the leading 4 bits of x_frac + // are between 0b0100 and 0b1111. Hence the lookup table only needs 12 + // entries, and we can get the index by subtracting the leading 4 bits of + // x_frac by 4 = 0b0100. 
+ int index = (x_bit >> (STORAGE_LENGTH - 4)) - 4; + FracType a = static_cast(internal::SQRT_FIRST_APPROX[index][0]); + FracType b = static_cast(internal::SQRT_FIRST_APPROX[index][1]); + + // Initial approximation step. + // Estimated error bounds: | r - sqrt(x_frac) | < max(1.5 * 2^-11, eps). + FracType r = a * x_frac + b; + + // Further Newton-method iterations for square-root: + // x_{n + 1} = 0.5 * (x_n + a / x_n) + // We distribute and do the multiplication by 0.5 first to avoid overflow. + // TODO: Investigate the performance and accuracy of using division-free + // iterations from: + // Blanchard, J. D. and Chamberland, M., "Newton's Method Without Division", + // The American Mathematical Monthly (2023). + // https://chamberland.math.grinnell.edu/papers/newton.pdf + for (int i = 0; i < internal::SqrtConfig::EXTRA_STEPS; ++i) + r = (r >> 1) + (x_frac >> 1) / r; + + // Re-scaling + r >>= EXP_ADJUSTMENT - (x_exp >> 1); + + // Return result. + return cpp::bit_cast(r); +} + +} // namespace LIBC_NAMESPACE::fixed_point + +#endif // LIBC_COMPILER_HAS_FIXED_POINT + +#endif // LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H diff --git a/libc/src/stdfix/CMakeLists.txt b/libc/src/stdfix/CMakeLists.txt index 6e2ed1bdfeafe..cb2134fe33cf9 100644 --- a/libc/src/stdfix/CMakeLists.txt +++ b/libc/src/stdfix/CMakeLists.txt @@ -17,6 +17,21 @@ foreach(suffix IN ITEMS hr r lr hk k lk) ) endforeach() +foreach(suffix IN ITEMS uhr ur ulr uhk uk) + add_entrypoint_object( + sqrt${suffix} + HDRS + sqrt${suffix}.h + SRCS + sqrt${suffix}.cpp + COMPILE_OPTIONS + -O3 + -ffixed-point + DEPENDS + libc.src.__support.fixed_point.sqrt + ) +endforeach() + foreach(suffix IN ITEMS hr r lr hk k lk uhr ur ulr uhk uk ulk) add_entrypoint_object( round${suffix} diff --git a/libc/src/stdfix/sqrtuhk.cpp b/libc/src/stdfix/sqrtuhk.cpp new file mode 100644 index 0000000000000..e8dc842c8a998 --- /dev/null +++ b/libc/src/stdfix/sqrtuhk.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of sqrtuhk function 
--------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sqrtuhk.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/sqrt.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(unsigned short accum, sqrtuhk, (unsigned short accum x)) { + return fixed_point::sqrt(x); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdfix/sqrtuhk.h b/libc/src/stdfix/sqrtuhk.h new file mode 100644 index 0000000000000..80000a0079696 --- /dev/null +++ b/libc/src/stdfix/sqrtuhk.h @@ -0,0 +1,20 @@ +//===-- Implementation header for sqrtuhk -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_SQRTUHK_H +#define LLVM_LIBC_SRC_STDFIX_SQRTUHK_H + +#include "include/llvm-libc-macros/stdfix-macros.h" + +namespace LIBC_NAMESPACE { + +unsigned short accum sqrtuhk(unsigned short accum x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDFIX_SQRTUHK_H diff --git a/libc/src/stdfix/sqrtuhr.cpp b/libc/src/stdfix/sqrtuhr.cpp new file mode 100644 index 0000000000000..6bba07aa20d59 --- /dev/null +++ b/libc/src/stdfix/sqrtuhr.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of sqrtuhr function --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sqrtuhr.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/sqrt.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(unsigned short fract, sqrtuhr, (unsigned short fract x)) { + return fixed_point::sqrt(x); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdfix/sqrtuhr.h b/libc/src/stdfix/sqrtuhr.h new file mode 100644 index 0000000000000..fd95f0924e8d4 --- /dev/null +++ b/libc/src/stdfix/sqrtuhr.h @@ -0,0 +1,20 @@ +//===-- Implementation header for sqrtuhr -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_SQRTUHR_H +#define LLVM_LIBC_SRC_STDFIX_SQRTUHR_H + +#include "include/llvm-libc-macros/stdfix-macros.h" + +namespace LIBC_NAMESPACE { + +unsigned short fract sqrtuhr(unsigned short fract x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDFIX_SQRTUHR_H diff --git a/libc/src/stdfix/sqrtuk.cpp b/libc/src/stdfix/sqrtuk.cpp new file mode 100644 index 0000000000000..6e5d8118c83b7 --- /dev/null +++ b/libc/src/stdfix/sqrtuk.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of sqrtuk function ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sqrtuk.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/sqrt.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(unsigned accum, sqrtuk, (unsigned accum x)) { + return fixed_point::sqrt(x); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdfix/sqrtuk.h b/libc/src/stdfix/sqrtuk.h new file mode 100644 index 0000000000000..04d0adadde9ad --- /dev/null +++ b/libc/src/stdfix/sqrtuk.h @@ -0,0 +1,20 @@ +//===-- Implementation header for sqrtuk ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_SQRTUK_H +#define LLVM_LIBC_SRC_STDFIX_SQRTUK_H + +#include "include/llvm-libc-macros/stdfix-macros.h" + +namespace LIBC_NAMESPACE { + +unsigned accum sqrtuk(unsigned accum x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDFIX_SQRTUK_H diff --git a/libc/src/stdfix/sqrtulr.cpp b/libc/src/stdfix/sqrtulr.cpp new file mode 100644 index 0000000000000..c9e5cd51f66bc --- /dev/null +++ b/libc/src/stdfix/sqrtulr.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of sqrtulr function -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sqrtulr.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/sqrt.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(unsigned long fract, sqrtulr, (unsigned long fract x)) { + return fixed_point::sqrt(x); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdfix/sqrtulr.h b/libc/src/stdfix/sqrtulr.h new file mode 100644 index 0000000000000..284adaaf35bf5 --- /dev/null +++ b/libc/src/stdfix/sqrtulr.h @@ -0,0 +1,20 @@ +//===-- Implementation header for sqrtulr -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_SQRTULR_H +#define LLVM_LIBC_SRC_STDFIX_SQRTULR_H + +#include "include/llvm-libc-macros/stdfix-macros.h" + +namespace LIBC_NAMESPACE { + +unsigned long fract sqrtulr(unsigned long fract x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDFIX_SQRTULR_H diff --git a/libc/src/stdfix/sqrtur.cpp b/libc/src/stdfix/sqrtur.cpp new file mode 100644 index 0000000000000..ac5be84910849 --- /dev/null +++ b/libc/src/stdfix/sqrtur.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of sqrtur function ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sqrtur.h" +#include "src/__support/common.h" +#include "src/__support/fixed_point/sqrt.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(unsigned fract, sqrtur, (unsigned fract x)) { + return fixed_point::sqrt(x); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdfix/sqrtur.h b/libc/src/stdfix/sqrtur.h new file mode 100644 index 0000000000000..df9dfe5a0bf39 --- /dev/null +++ b/libc/src/stdfix/sqrtur.h @@ -0,0 +1,20 @@ +//===-- Implementation header for sqrtur ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDFIX_SQRTUR_H +#define LLVM_LIBC_SRC_STDFIX_SQRTUR_H + +#include "include/llvm-libc-macros/stdfix-macros.h" + +namespace LIBC_NAMESPACE { + +unsigned fract sqrtur(unsigned fract x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDFIX_SQRTUR_H diff --git a/libc/test/src/stdfix/CMakeLists.txt b/libc/test/src/stdfix/CMakeLists.txt index b6e0256bb6880..4140b5b29f3b3 100644 --- a/libc/test/src/stdfix/CMakeLists.txt +++ b/libc/test/src/stdfix/CMakeLists.txt @@ -22,6 +22,28 @@ foreach(suffix IN ITEMS hr r lr hk k lk) ) endforeach() +foreach(suffix IN ITEMS uhr ur ulr uhk uk) + add_libc_test( + sqrt${suffix}_test + SUITE + libc-stdfix-tests + HDRS + SqrtTest.h + SRCS + sqrt${suffix}_test.cpp + COMPILE_OPTIONS + -O3 + -ffixed-point + DEPENDS + libc.src.stdfix.sqrt${suffix} + libc.src.__support.CPP.bit + libc.src.__support.fixed_point.fx_rep + libc.src.__support.fixed_point.sqrt + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.sqrt + ) +endforeach() + 
foreach(suffix IN ITEMS hr r lr hk k lk uhr ur ulr uhk uk ulk) add_libc_test( round${suffix}_test diff --git a/libc/test/src/stdfix/SqrtTest.h b/libc/test/src/stdfix/SqrtTest.h new file mode 100644 index 0000000000000..628be0deb770f --- /dev/null +++ b/libc/test/src/stdfix/SqrtTest.h @@ -0,0 +1,66 @@ +//===-- Utility class to test fixed-point sqrt ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "test/UnitTest/Test.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/FPUtil/sqrt.h" +#include "src/__support/fixed_point/fx_rep.h" +#include "src/__support/fixed_point/sqrt.h" + +template class SqrtTest : public LIBC_NAMESPACE::testing::Test { + + using FXRep = LIBC_NAMESPACE::fixed_point::FXRep; + static constexpr T zero = FXRep::ZERO(); + static constexpr T min = FXRep::MIN(); + static constexpr T max = FXRep::MAX(); + static constexpr T half = static_cast(0.5); + static constexpr T quarter = static_cast(0.25); + static constexpr T one = + (FXRep::INTEGRAL_LEN > 0) ? 
static_cast(1) : FXRep::MAX(); + static constexpr T eps = FXRep::EPS(); + +public: + typedef T (*SqrtFunc)(T); + + void testSpecialNumbers(SqrtFunc func) { + EXPECT_EQ(zero, func(zero)); + EXPECT_EQ(half, func(quarter)); + + if constexpr (FXRep::INTEGRAL_LEN) { + EXPECT_EQ(one, func(one)); + EXPECT_EQ(static_cast(2.0), func(static_cast(4.0))); + } + + using StorageType = typename FXRep::StorageType; + + constexpr size_t COUNT = 255; + constexpr StorageType STEP = + ~StorageType(0) / static_cast(COUNT); + constexpr double ERR = 3.0 * static_cast(eps); + StorageType x = 0; + for (size_t i = 0; i < COUNT; ++i, x += STEP) { + T v = LIBC_NAMESPACE::cpp::bit_cast(x); + double v_d = static_cast(v); + double errors = LIBC_NAMESPACE::fputil::abs( + static_cast(func(v)) - LIBC_NAMESPACE::fputil::sqrt(v_d)); + if (errors > ERR) { + // Print out the failure input and output. + EXPECT_EQ(v, zero); + EXPECT_EQ(func(v), zero); + } + ASSERT_TRUE(errors <= ERR); + } + } +}; + +#define LIST_SQRT_TESTS(T, func) \ + using LlvmLibcSqrtTest = SqrtTest; \ + TEST_F(LlvmLibcSqrtTest, SpecialNumbers) { testSpecialNumbers(&func); } \ + static_assert(true, "Require semicolon.") diff --git a/libc/test/src/stdfix/sqrtuhk_test.cpp b/libc/test/src/stdfix/sqrtuhk_test.cpp new file mode 100644 index 0000000000000..d6ff5385cab86 --- /dev/null +++ b/libc/test/src/stdfix/sqrtuhk_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for sqrtuhk ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SqrtTest.h" + +#include "src/stdfix/sqrtuhk.h" + +LIST_SQRT_TESTS(unsigned short accum, LIBC_NAMESPACE::sqrtuhk); diff --git a/libc/test/src/stdfix/sqrtuhr_test.cpp b/libc/test/src/stdfix/sqrtuhr_test.cpp new file mode 100644 index 0000000000000..22f00a4231b31 --- /dev/null +++ b/libc/test/src/stdfix/sqrtuhr_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for sqrtuhr ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SqrtTest.h" + +#include "src/stdfix/sqrtuhr.h" + +LIST_SQRT_TESTS(unsigned short fract, LIBC_NAMESPACE::sqrtuhr); diff --git a/libc/test/src/stdfix/sqrtuk_test.cpp b/libc/test/src/stdfix/sqrtuk_test.cpp new file mode 100644 index 0000000000000..5a3105de1e0bf --- /dev/null +++ b/libc/test/src/stdfix/sqrtuk_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for sqrtuk ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SqrtTest.h" + +#include "src/stdfix/sqrtuk.h" + +LIST_SQRT_TESTS(unsigned accum, LIBC_NAMESPACE::sqrtuk); diff --git a/libc/test/src/stdfix/sqrtulr_test.cpp b/libc/test/src/stdfix/sqrtulr_test.cpp new file mode 100644 index 0000000000000..1be4e2b5e0a6f --- /dev/null +++ b/libc/test/src/stdfix/sqrtulr_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for sqrtulr ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SqrtTest.h" + +#include "src/stdfix/sqrtulr.h" + +LIST_SQRT_TESTS(unsigned long fract, LIBC_NAMESPACE::sqrtulr); diff --git a/libc/test/src/stdfix/sqrtur_test.cpp b/libc/test/src/stdfix/sqrtur_test.cpp new file mode 100644 index 0000000000000..12b1c2211db05 --- /dev/null +++ b/libc/test/src/stdfix/sqrtur_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for sqrtur ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SqrtTest.h" + +#include "src/stdfix/sqrtur.h" + +LIST_SQRT_TESTS(unsigned fract, LIBC_NAMESPACE::sqrtur); From 5e4c4365f89b7b31ee3868114dd0f6c5d483e42b Mon Sep 17 00:00:00 2001 From: Egor Zhdan Date: Tue, 27 Feb 2024 00:37:15 +0000 Subject: [PATCH 395/546] [APINotes] Fix a few accidental refactoring artifacts This fixes a few breakages introduced during upstreaming. 
--- clang/lib/APINotes/APINotesReader.cpp | 8 ++++---- clang/lib/APINotes/APINotesWriter.cpp | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/clang/lib/APINotes/APINotesReader.cpp b/clang/lib/APINotes/APINotesReader.cpp index ff9b95d9bf75e..55ea4bae81e6e 100644 --- a/clang/lib/APINotes/APINotesReader.cpp +++ b/clang/lib/APINotes/APINotesReader.cpp @@ -81,9 +81,9 @@ class VersionedTableInfo { auto version = ReadVersionTuple(Data); const auto *DataBefore = Data; (void)DataBefore; + auto UnversionedData = Derived::readUnversioned(Key, Data); assert(Data != DataBefore && "Unversioned data reader didn't move pointer"); - auto UnversionedData = Derived::readUnversioned(Key, Data); Result.push_back({version, UnversionedData}); } return Result; @@ -148,7 +148,7 @@ class IdentifierTableInfo { external_key_type GetExternalKey(internal_key_type Key) { return Key; } hash_value_type ComputeHash(internal_key_type Key) { - return llvm::hash_value(Key); + return llvm::djbHash(Key); } static bool EqualKey(internal_key_type LHS, internal_key_type RHS) { @@ -1797,8 +1797,8 @@ APINotesReader::Create(std::unique_ptr InputBuffer, template APINotesReader::VersionedInfo::VersionedInfo( llvm::VersionTuple Version, - llvm::SmallVector, 1> Results) - : Results(std::move(Results)) { + llvm::SmallVector, 1> R) + : Results(std::move(R)) { assert(!Results.empty()); assert(std::is_sorted( diff --git a/clang/lib/APINotes/APINotesWriter.cpp b/clang/lib/APINotes/APINotesWriter.cpp index 62a2ab1799913..76fd24ccfae98 100644 --- a/clang/lib/APINotes/APINotesWriter.cpp +++ b/clang/lib/APINotes/APINotesWriter.cpp @@ -128,6 +128,7 @@ class APINotesWriter::Implementation { SelectorID getSelector(ObjCSelectorRef SelectorRef) { // Translate the selector reference into a stored selector. 
StoredObjCSelector Selector; + Selector.NumArgs = SelectorRef.NumArgs; Selector.Identifiers.reserve(SelectorRef.Identifiers.size()); for (auto piece : SelectorRef.Identifiers) Selector.Identifiers.push_back(getIdentifier(piece)); From 91791c60bd7d1783d84e2e6ed87e5f957fbaee56 Mon Sep 17 00:00:00 2001 From: vporpo Date: Mon, 26 Feb 2024 16:53:47 -0800 Subject: [PATCH 396/546] [DSE] Test precommit for a bug caused by a read-clobber being skipped. (#83084) --- .../read-clobber-skipped.ll | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 llvm/test/Transforms/DeadStoreElimination/read-clobber-skipped.ll diff --git a/llvm/test/Transforms/DeadStoreElimination/read-clobber-skipped.ll b/llvm/test/Transforms/DeadStoreElimination/read-clobber-skipped.ll new file mode 100644 index 0000000000000..6a824a6afdc27 --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/read-clobber-skipped.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=dse < %s | FileCheck %s +; +; DSE kills both `store i32 44, ptr %struct.byte.4, align 4` and +; `call void @llvm.memset.p0.i64(...)` but the memset should not be killed +; because it has a clobber read: `%ret = load ptr, ptr %struct.byte.8` + +%struct.type = type { ptr, ptr } + +define ptr @foo(ptr noundef %ptr) { +; CHECK-LABEL: define ptr @foo( +; CHECK-SAME: ptr noundef [[PTR:%.*]]) { +; CHECK-NEXT: [[STRUCT_ALLOCA:%.*]] = alloca [[STRUCT_TYPE:%.*]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[STRUCT_BYTE_8:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 8 +; CHECK-NEXT: store i32 43, ptr [[STRUCT_BYTE_8]], align 4 +; CHECK-NEXT: [[RET:%.*]] = load ptr, ptr [[STRUCT_BYTE_8]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR2]] +; CHECK-NEXT: ret ptr [[RET]] +; + 
%struct.alloca = alloca %struct.type, align 8 + call void @llvm.lifetime.start.p0(i64 56, ptr nonnull %struct.alloca) nounwind + %struct.byte.8 = getelementptr inbounds i8, ptr %struct.alloca, i64 8 + ; Set %struct.alloca[8, 16) to 42. + call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %struct.byte.8, i8 42, i64 8, i1 false) + ; Set %struct.alloca[8, 12) to 43. + store i32 43, ptr %struct.byte.8, align 4 + ; Set %struct.alloca[4, 8) to 44. + %struct.byte.4 = getelementptr inbounds i8, ptr %struct.alloca, i64 4 + store i32 44, ptr %struct.byte.4, align 4 + ; Return %struct.alloca[8, 16). + %ret = load ptr, ptr %struct.byte.8 + call void @llvm.lifetime.end.p0(i64 56, ptr nonnull %struct.alloca) nounwind + ret ptr %ret +} + +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #0 +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2 +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2 From b1d2e8510b58893e58558ffdf3f8ba29c1e25e5a Mon Sep 17 00:00:00 2001 From: Vasileios Porpodas Date: Mon, 26 Feb 2024 17:12:23 -0800 Subject: [PATCH 397/546] Revert "[DSE] Test precommit for a bug caused by a read-clobber being skipped. (#83084)" This reverts commit 91791c60bd7d1783d84e2e6ed87e5f957fbaee56. 
--- .../read-clobber-skipped.ll | 39 ------------------- 1 file changed, 39 deletions(-) delete mode 100644 llvm/test/Transforms/DeadStoreElimination/read-clobber-skipped.ll diff --git a/llvm/test/Transforms/DeadStoreElimination/read-clobber-skipped.ll b/llvm/test/Transforms/DeadStoreElimination/read-clobber-skipped.ll deleted file mode 100644 index 6a824a6afdc27..0000000000000 --- a/llvm/test/Transforms/DeadStoreElimination/read-clobber-skipped.ll +++ /dev/null @@ -1,39 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -passes=dse < %s | FileCheck %s -; -; DSE kills both `store i32 44, ptr %struct.byte.4, align 4` and -; `call void @llvm.memset.p0.i64(...)` but the memset should not be killed -; because it has a clobber read: `%ret = load ptr, ptr %struct.byte.8` - -%struct.type = type { ptr, ptr } - -define ptr @foo(ptr noundef %ptr) { -; CHECK-LABEL: define ptr @foo( -; CHECK-SAME: ptr noundef [[PTR:%.*]]) { -; CHECK-NEXT: [[STRUCT_ALLOCA:%.*]] = alloca [[STRUCT_TYPE:%.*]], align 8 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR2:[0-9]+]] -; CHECK-NEXT: [[STRUCT_BYTE_8:%.*]] = getelementptr inbounds i8, ptr [[STRUCT_ALLOCA]], i64 8 -; CHECK-NEXT: store i32 43, ptr [[STRUCT_BYTE_8]], align 4 -; CHECK-NEXT: [[RET:%.*]] = load ptr, ptr [[STRUCT_BYTE_8]], align 8 -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 56, ptr nonnull [[STRUCT_ALLOCA]]) #[[ATTR2]] -; CHECK-NEXT: ret ptr [[RET]] -; - %struct.alloca = alloca %struct.type, align 8 - call void @llvm.lifetime.start.p0(i64 56, ptr nonnull %struct.alloca) nounwind - %struct.byte.8 = getelementptr inbounds i8, ptr %struct.alloca, i64 8 - ; Set %struct.alloca[8, 16) to 42. - call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 %struct.byte.8, i8 42, i64 8, i1 false) - ; Set %struct.alloca[8, 12) to 43. - store i32 43, ptr %struct.byte.8, align 4 - ; Set %struct.alloca[4, 8) to 44. 
- %struct.byte.4 = getelementptr inbounds i8, ptr %struct.alloca, i64 4 - store i32 44, ptr %struct.byte.4, align 4 - ; Return %struct.alloca[8, 16). - %ret = load ptr, ptr %struct.byte.8 - call void @llvm.lifetime.end.p0(i64 56, ptr nonnull %struct.alloca) nounwind - ret ptr %ret -} - -declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #0 -declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2 -declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2 From 3892e8e59ce4b85fc191a273106d7342f74d5d68 Mon Sep 17 00:00:00 2001 From: MalaySanghiIntel <148750629+MalaySanghiIntel@users.noreply.github.com> Date: Tue, 27 Feb 2024 06:55:13 +0530 Subject: [PATCH 398/546] Non functional change. Initialize var EnumVal to 0. (#82987) CodeGenInstruction has a new unsigned member EnumVal. It is not initialized in either the class or the constructor. --- llvm/utils/TableGen/CodeGenInstruction.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/TableGen/CodeGenInstruction.h b/llvm/utils/TableGen/CodeGenInstruction.h index 11a3acd8e7233..963c9f0b25925 100644 --- a/llvm/utils/TableGen/CodeGenInstruction.h +++ b/llvm/utils/TableGen/CodeGenInstruction.h @@ -301,7 +301,7 @@ class CodeGenInstruction { Record *InferredFrom; // The enum value assigned by CodeGenTarget::computeInstrsByEnum. - mutable unsigned EnumVal; + mutable unsigned EnumVal = 0; CodeGenInstruction(Record *R); From 86f6caa562255f81b93e72a501a926b17f5ad244 Mon Sep 17 00:00:00 2001 From: Connor Sughrue <55301806+cpsughrue@users.noreply.github.com> Date: Mon, 26 Feb 2024 21:04:11 -0500 Subject: [PATCH 399/546] [llvm][Support] Add support for executing a detached process (#81708) Adds a new parameter, `bool DetachProcess` with a default option of `false`, to `llvm::sys::ExecuteNoWait`, which, when set to `true`, executes the specified program without a controlling terminal. 
Functionality added so that the module build daemon can be run without a controlling terminal. --- llvm/include/llvm/Support/Program.h | 25 +++++---- llvm/lib/Support/Program.cpp | 9 ++-- llvm/lib/Support/Unix/Program.inc | 18 +++++-- llvm/lib/Support/Windows/Program.inc | 7 ++- llvm/unittests/Support/ProgramTest.cpp | 74 ++++++++++++++++++++++++++ 5 files changed, 112 insertions(+), 21 deletions(-) diff --git a/llvm/include/llvm/Support/Program.h b/llvm/include/llvm/Support/Program.h index 4c1133e44a21c..9df94eb604c7d 100644 --- a/llvm/include/llvm/Support/Program.h +++ b/llvm/include/llvm/Support/Program.h @@ -141,18 +141,21 @@ namespace sys { /// program shall run on. ); - /// Similar to ExecuteAndWait, but returns immediately. - /// @returns The \see ProcessInfo of the newly launched process. + /// Similar to \ref ExecuteAndWait, but returns immediately. + /// \returns The \ref ProcessInfo of the newly launched process. /// \note On Microsoft Windows systems, users will need to either call - /// \see Wait until the process finished execution or win32 CloseHandle() API - /// on ProcessInfo.ProcessHandle to avoid memory leaks. - ProcessInfo ExecuteNoWait(StringRef Program, ArrayRef Args, - std::optional> Env, - ArrayRef> Redirects = {}, - unsigned MemoryLimit = 0, - std::string *ErrMsg = nullptr, - bool *ExecutionFailed = nullptr, - BitVector *AffinityMask = nullptr); + /// \ref Wait until the process has finished executing or win32's CloseHandle + /// API on ProcessInfo.ProcessHandle to avoid memory leaks. + ProcessInfo ExecuteNoWait( + StringRef Program, ArrayRef Args, + std::optional> Env, + ArrayRef> Redirects = {}, + unsigned MemoryLimit = 0, std::string *ErrMsg = nullptr, + bool *ExecutionFailed = nullptr, BitVector *AffinityMask = nullptr, + /// If true the executed program detatches from the controlling + /// terminal. 
I/O streams such as llvm::outs, llvm::errs, and stdin will + /// be closed until redirected to another output location + bool DetachProcess = false); /// Return true if the given arguments fit within system-specific /// argument length limits. diff --git a/llvm/lib/Support/Program.cpp b/llvm/lib/Support/Program.cpp index 1dcd45e2d69e8..181f68cfbb8c3 100644 --- a/llvm/lib/Support/Program.cpp +++ b/llvm/lib/Support/Program.cpp @@ -27,7 +27,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, std::optional> Env, ArrayRef> Redirects, unsigned MemoryLimit, std::string *ErrMsg, - BitVector *AffinityMask); + BitVector *AffinityMask, bool DetachProcess); int sys::ExecuteAndWait(StringRef Program, ArrayRef Args, std::optional> Env, @@ -39,7 +39,7 @@ int sys::ExecuteAndWait(StringRef Program, ArrayRef Args, assert(Redirects.empty() || Redirects.size() == 3); ProcessInfo PI; if (Execute(PI, Program, Args, Env, Redirects, MemoryLimit, ErrMsg, - AffinityMask)) { + AffinityMask, /*DetachProcess=*/false)) { if (ExecutionFailed) *ExecutionFailed = false; ProcessInfo Result = Wait( @@ -58,13 +58,14 @@ ProcessInfo sys::ExecuteNoWait(StringRef Program, ArrayRef Args, std::optional> Env, ArrayRef> Redirects, unsigned MemoryLimit, std::string *ErrMsg, - bool *ExecutionFailed, BitVector *AffinityMask) { + bool *ExecutionFailed, BitVector *AffinityMask, + bool DetachProcess) { assert(Redirects.empty() || Redirects.size() == 3); ProcessInfo PI; if (ExecutionFailed) *ExecutionFailed = false; if (!Execute(PI, Program, Args, Env, Redirects, MemoryLimit, ErrMsg, - AffinityMask)) + AffinityMask, DetachProcess)) if (ExecutionFailed) *ExecutionFailed = true; diff --git a/llvm/lib/Support/Unix/Program.inc b/llvm/lib/Support/Unix/Program.inc index 5d9757bcc51b3..2742734bb11ed 100644 --- a/llvm/lib/Support/Unix/Program.inc +++ b/llvm/lib/Support/Unix/Program.inc @@ -173,10 +173,11 @@ toNullTerminatedCStringArray(ArrayRef Strings, StringSaver &Saver) { } static bool Execute(ProcessInfo &PI, 
StringRef Program, - ArrayRef Args, std::optional> Env, + ArrayRef Args, + std::optional> Env, ArrayRef> Redirects, unsigned MemoryLimit, std::string *ErrMsg, - BitVector *AffinityMask) { + BitVector *AffinityMask, bool DetachProcess) { if (!llvm::sys::fs::exists(Program)) { if (ErrMsg) *ErrMsg = std::string("Executable \"") + Program.str() + @@ -202,7 +203,8 @@ static bool Execute(ProcessInfo &PI, StringRef Program, // If this OS has posix_spawn and there is no memory limit being implied, use // posix_spawn. It is more efficient than fork/exec. #ifdef HAVE_POSIX_SPAWN - if (MemoryLimit == 0) { + // Cannot use posix_spawn if you would like to detach the process + if (MemoryLimit == 0 && !DetachProcess) { posix_spawn_file_actions_t FileActionsStore; posix_spawn_file_actions_t *FileActions = nullptr; @@ -270,7 +272,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, return true; } -#endif +#endif // HAVE_POSIX_SPAWN // Create a child process. int child = fork(); @@ -307,6 +309,14 @@ static bool Execute(ProcessInfo &PI, StringRef Program, } } + if (DetachProcess) { + // Detach from controlling terminal + if (::setsid() == -1) { + MakeErrMsg(ErrMsg, "Could not detach process, ::setsid failed"); + return false; + } + } + // Set memory limits if (MemoryLimit != 0) { SetMemoryLimits(MemoryLimit); diff --git a/llvm/lib/Support/Windows/Program.inc b/llvm/lib/Support/Windows/Program.inc index 0de9d3f756448..d98d55f317a35 100644 --- a/llvm/lib/Support/Windows/Program.inc +++ b/llvm/lib/Support/Windows/Program.inc @@ -172,10 +172,11 @@ static HANDLE RedirectIO(std::optional Path, int fd, } // namespace llvm static bool Execute(ProcessInfo &PI, StringRef Program, - ArrayRef Args, std::optional> Env, + ArrayRef Args, + std::optional> Env, ArrayRef> Redirects, unsigned MemoryLimit, std::string *ErrMsg, - BitVector *AffinityMask) { + BitVector *AffinityMask, bool DetachProcess) { if (!sys::fs::can_execute(Program)) { if (ErrMsg) *ErrMsg = "program not executable"; @@ 
-284,6 +285,8 @@ static bool Execute(ProcessInfo &PI, StringRef Program, unsigned CreateFlags = CREATE_UNICODE_ENVIRONMENT; if (AffinityMask) CreateFlags |= CREATE_SUSPENDED; + if (DetachProcess) + CreateFlags |= DETACHED_PROCESS; std::vector CommandUtf16(Command.size() + 1, 0); std::copy(Command.begin(), Command.end(), CommandUtf16.begin()); diff --git a/llvm/unittests/Support/ProgramTest.cpp b/llvm/unittests/Support/ProgramTest.cpp index 2e2b1958b9ac9..58f98892c9cc7 100644 --- a/llvm/unittests/Support/ProgramTest.cpp +++ b/llvm/unittests/Support/ProgramTest.cpp @@ -260,6 +260,80 @@ TEST_F(ProgramEnvTest, TestExecuteNoWait) { ASSERT_GT(LoopCount, 1u) << "LoopCount should be >1"; } +TEST_F(ProgramEnvTest, TestExecuteNoWaitDetached) { + using namespace llvm::sys; + + if (getenv("LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT_DETACHED")) { + sleep_for(/*seconds=*/5); + +#if _WIN32 + HWND ConsoleWnd = GetConsoleWindow(); + if (ConsoleWnd == NULL) + exit(100); + else + exit(200); +#else + int ParentSID = std::stoi( + std::string(getenv("LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT_DETACHED_SID"))); + + pid_t ChildSID = ::getsid(0); + if (ChildSID == -1) { + llvm::errs() << "Could not get process SID: " << strerror(errno) << '\n'; + exit(1); + } + + char *Detached = getenv("LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT_DETACHED_TRUE"); + if (Detached && (ChildSID != ParentSID)) + exit(100); + if (!Detached && (ChildSID == ParentSID)) + exit(200); +#endif + exit(0); + } + + std::string Executable = + sys::fs::getMainExecutable(TestMainArgv0, &ProgramTestStringArg1); + StringRef argv[] = { + Executable, "--gtest_filter=ProgramEnvTest.TestExecuteNoWaitDetached"}; + addEnvVar("LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT_DETACHED=1"); + +#ifndef _WIN32 + pid_t SID = ::getsid(0); + ASSERT_NE(SID, -1); + std::string SIDEnvVar = + "LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT_DETACHED_SID=" + std::to_string(SID); + addEnvVar(SIDEnvVar); +#endif + + // DetachProcess = true + { + std::string Error; + bool ExecutionFailed; + std::vector 
Env = getEnviron(); + Env.emplace_back("LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT_DETACHED_TRUE=1"); + ProcessInfo PI1 = + ExecuteNoWait(Executable, argv, Env, {}, 0, &Error, &ExecutionFailed, + nullptr, /*DetachProcess=*/true); + ASSERT_FALSE(ExecutionFailed) << Error; + ASSERT_NE(PI1.Pid, ProcessInfo::InvalidPid) << "Invalid process id"; + ProcessInfo WaitResult = Wait(PI1, std::nullopt, &Error); + ASSERT_EQ(WaitResult.ReturnCode, 100); + } + + // DetachProcess = false + { + std::string Error; + bool ExecutionFailed; + ProcessInfo PI2 = + ExecuteNoWait(Executable, argv, getEnviron(), {}, 0, &Error, + &ExecutionFailed, nullptr, /*DetachProcess=*/false); + ASSERT_FALSE(ExecutionFailed) << Error; + ASSERT_NE(PI2.Pid, ProcessInfo::InvalidPid) << "Invalid process id"; + ProcessInfo WaitResult = Wait(PI2, std::nullopt, &Error); + ASSERT_EQ(WaitResult.ReturnCode, 200); + } +} + TEST_F(ProgramEnvTest, TestExecuteAndWaitTimeout) { using namespace llvm::sys; From c11627c2f4d550613a3cb360c89a0cf52d2eb720 Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Mon, 26 Feb 2024 18:15:12 -0800 Subject: [PATCH 400/546] [MLIR][LLVM] Fix memory explosion when converting global variable bodies in ModuleTranslation (#82708) There is memory explosion when converting the body or initializer region of a large global variable, e.g. a constant array. 
For example, when translating a constant array of 100000 strings: llvm.mlir.global internal constant @cats_strings() {addr_space = 0 : i32, alignment = 16 : i64} : !llvm.array<100000 x ptr> { %0 = llvm.mlir.undef : !llvm.array<100000 x ptr> %1 = llvm.mlir.addressof @om_1 : !llvm.ptr> %2 = llvm.getelementptr %1[0, 0] : (!llvm.ptr>) -> !llvm.ptr %3 = llvm.insertvalue %2, %0[0] : !llvm.array<100000 x ptr> %4 = llvm.mlir.addressof @om_2 : !llvm.ptr> %5 = llvm.getelementptr %4[0, 0] : (!llvm.ptr>) -> !llvm.ptr %6 = llvm.insertvalue %5, %3[1] : !llvm.array<100000 x ptr> %7 = llvm.mlir.addressof @om_3 : !llvm.ptr> %8 = llvm.getelementptr %7[0, 0] : (!llvm.ptr>) -> !llvm.ptr %9 = llvm.insertvalue %8, %6[2] : !llvm.array<100000 x ptr> %10 = llvm.mlir.addressof @om_4 : !llvm.ptr> %11 = llvm.getelementptr %10[0, 0] : (!llvm.ptr>) -> !llvm.ptr %12 = llvm.insertvalue %11, %9[3] : !llvm.array<100000 x ptr> ... (ignore the remaining part) } where @om_1, @om_2, ... are string global constants. Each time an operation is converted to LLVM, a new constant is created. When it comes to llvm.insertvalue, a new constant array of 100000 elements is created and the old constant array (input) is not destroyed. This causes memory explosion. We observed that, on a system with 128 GB memory, the translation of 100000 elements got killed due to using up all the memory. On a system with 64 GB, 65536 elements was enough to cause the translation killed. There is a previous patch (https://reviews.llvm.org/D148487) which fix this issue but was reverted for https://github.com/llvm/llvm-project/issues/62802 The old patch checks generated constants and destroyed them if there is no use. But the check of use for the constant is too early, which cause the constant be removed before use. This new patch added a map was added a map to save expected use count for a constant. Then decrease when reach each use. 
And only erase the constant when the use count reach to zero With new patch, the repro in https://github.com/llvm/llvm-project/issues/62802 finished correctly. --- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 71 +++++++++++++++++- .../LLVMIR/erase-dangling-constants.mlir | 72 +++++++++++++++++++ 2 files changed, 141 insertions(+), 2 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/erase-dangling-constants.mlir diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index ee8fffd959c88..7cffd7bd1fa43 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -51,11 +51,15 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Verifier.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include +#define DEBUG_TYPE "llvm-dialect-to-llvm-ir" + using namespace mlir; using namespace mlir::LLVM; using namespace mlir::LLVM::detail; @@ -1042,17 +1046,80 @@ LogicalResult ModuleTranslation::convertGlobals() { for (auto op : getModuleBody(mlirModule).getOps()) { if (Block *initializer = op.getInitializerBlock()) { llvm::IRBuilder<> builder(llvmModule->getContext()); + + int numConstantsHit = 0; + int numConstantsErased = 0; + DenseMap constantAggregateUseMap; + for (auto &op : initializer->without_terminator()) { - if (failed(convertOperation(op, builder)) || - !isa(lookupValue(op.getResult(0)))) + if (failed(convertOperation(op, builder))) + return emitError(op.getLoc(), "fail to convert global initializer"); + auto *cst = dyn_cast(lookupValue(op.getResult(0))); + if (!cst) return emitError(op.getLoc(), "unemittable constant value"); + + // When emitting an LLVM constant, a new constant is created and the old + // constant may become dangling and take space. 
We should remove the + // dangling constants to avoid memory explosion especially for constant + // arrays whose number of elements is large. + // Because multiple operations may refer to the same constant, we need + // to count the number of uses of each constant array and remove it only + // when the count becomes zero. + if (auto *agg = dyn_cast(cst)) { + numConstantsHit++; + Value result = op.getResult(0); + int numUsers = std::distance(result.use_begin(), result.use_end()); + auto [iterator, inserted] = + constantAggregateUseMap.try_emplace(agg, numUsers); + if (!inserted) { + // Key already exists, update the value + iterator->second += numUsers; + } + } + // Scan the operands of the operation to decrement the use count of + // constants. Erase the constant if the use count becomes zero. + for (Value v : op.getOperands()) { + auto cst = dyn_cast(lookupValue(v)); + if (!cst) + continue; + auto iter = constantAggregateUseMap.find(cst); + assert(iter != constantAggregateUseMap.end() && "constant not found"); + iter->second--; + if (iter->second == 0) { + // NOTE: cannot call removeDeadConstantUsers() here because it + // may remove the constant which has uses not be converted yet. + if (cst->user_empty()) { + cst->destroyConstant(); + numConstantsErased++; + } + constantAggregateUseMap.erase(iter); + } + } } + ReturnOp ret = cast(initializer->getTerminator()); llvm::Constant *cst = cast(lookupValue(ret.getOperand(0))); auto *global = cast(lookupGlobal(op)); if (!shouldDropGlobalInitializer(global->getLinkage(), cst)) global->setInitializer(cst); + + // Try to remove the dangling constants again after all operations are + // converted. 
+ for (auto it : constantAggregateUseMap) { + auto cst = it.first; + cst->removeDeadConstantUsers(); + if (cst->user_empty()) { + cst->destroyConstant(); + numConstantsErased++; + } + } + + LLVM_DEBUG(llvm::dbgs() + << "Convert initializer for " << op.getName() << "\n"; + llvm::dbgs() << numConstantsHit << " new constants hit\n"; + llvm::dbgs() + << numConstantsErased << " dangling constants erased\n";); } } diff --git a/mlir/test/Target/LLVMIR/erase-dangling-constants.mlir b/mlir/test/Target/LLVMIR/erase-dangling-constants.mlir new file mode 100644 index 0000000000000..b3b5d540ae88f --- /dev/null +++ b/mlir/test/Target/LLVMIR/erase-dangling-constants.mlir @@ -0,0 +1,72 @@ +// RUN: mlir-translate -mlir-to-llvmir %s -debug-only=llvm-dialect-to-llvm-ir 2>&1 | FileCheck %s + +// CHECK: Convert initializer for dup_const +// CHECK: 6 new constants hit +// CHECK: 3 dangling constants erased +// CHECK: Convert initializer for unique_const +// CHECK: 6 new constants hit +// CHECK: 5 dangling constants erased + + +// CHECK:@dup_const = global { [2 x double], [2 x double], [2 x double] } { [2 x double] [double 3.612250e-02, double 5.119230e-02], [2 x double] [double 3.612250e-02, double 5.119230e-02], [2 x double] [double 3.612250e-02, double 5.119230e-02] } + +llvm.mlir.global @dup_const() : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> { + %c0 = llvm.mlir.constant(3.612250e-02 : f64) : f64 + %c1 = llvm.mlir.constant(5.119230e-02 : f64) : f64 + + %empty0 = llvm.mlir.undef : !llvm.array<2 x f64> + %a00 = llvm.insertvalue %c0, %empty0[0] : !llvm.array<2 x f64> + + %empty1 = llvm.mlir.undef : !llvm.array<2 x f64> + %a10 = llvm.insertvalue %c0, %empty1[0] : !llvm.array<2 x f64> + + %empty2 = llvm.mlir.undef : !llvm.array<2 x f64> + %a20 = llvm.insertvalue %c0, %empty2[0] : !llvm.array<2 x f64> + +// NOTE: a00, a10, a20 are all same ConstantAggregate which not used at this point. +// should not delete it before all of the uses of the ConstantAggregate finished. 
+ + %a01 = llvm.insertvalue %c1, %a00[1] : !llvm.array<2 x f64> + %a11 = llvm.insertvalue %c1, %a10[1] : !llvm.array<2 x f64> + %a21 = llvm.insertvalue %c1, %a20[1] : !llvm.array<2 x f64> + %empty_r = llvm.mlir.undef : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> + %r0 = llvm.insertvalue %a01, %empty_r[0] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> + %r1 = llvm.insertvalue %a11, %r0[1] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> + %r2 = llvm.insertvalue %a21, %r1[2] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> + + llvm.return %r2 : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> + } + +// CHECK:@unique_const = global { [2 x double], [2 x double], [2 x double] } { [2 x double] [double 3.612250e-02, double 5.119230e-02], [2 x double] [double 3.312250e-02, double 5.219230e-02], [2 x double] [double 3.412250e-02, double 5.419230e-02] } + +llvm.mlir.global @unique_const() : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> { + %c0 = llvm.mlir.constant(3.612250e-02 : f64) : f64 + %c1 = llvm.mlir.constant(5.119230e-02 : f64) : f64 + + %c2 = llvm.mlir.constant(3.312250e-02 : f64) : f64 + %c3 = llvm.mlir.constant(5.219230e-02 : f64) : f64 + + %c4 = llvm.mlir.constant(3.412250e-02 : f64) : f64 + %c5 = llvm.mlir.constant(5.419230e-02 : f64) : f64 + + %2 = llvm.mlir.undef : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> + + %3 = llvm.mlir.undef : !llvm.array<2 x f64> + + %4 = llvm.insertvalue %c0, %3[0] : !llvm.array<2 x f64> + %5 = llvm.insertvalue %c1, %4[1] : !llvm.array<2 x f64> + + %6 = llvm.insertvalue %5, %2[0] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> + + %7 = llvm.insertvalue %c2, %3[0] : !llvm.array<2 x f64> + %8 = llvm.insertvalue %c3, %7[1] : !llvm.array<2 x f64> + + %9 = llvm.insertvalue %8, %6[1] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> + + %10 = llvm.insertvalue %c4, %3[0] : !llvm.array<2 x 
f64> + %11 = llvm.insertvalue %c5, %10[1] : !llvm.array<2 x f64> + + %12 = llvm.insertvalue %11, %9[2] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> + + llvm.return %12 : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> +} From b791a51730f145308f3607d0d33038af78138304 Mon Sep 17 00:00:00 2001 From: Visoiu Mistrih Francis <890283+francisvm@users.noreply.github.com> Date: Mon, 26 Feb 2024 18:25:21 -0800 Subject: [PATCH 401/546] [CodeGenSchedule] Don't allow invalid ReadAdvances to be formed (#82685) Forming a `ReadAdvance` with an entry in the `ValidWrites` list that is not used by any instruction results in the entire `ReadAdvance` to be ignored by the scheduler due to an invalid entry. The `SchedRW` collection code only picks up `SchedWrites` that are reachable from `Instructions`, `InstRW`, `ItinRW` and `SchedAlias`, leaving the unreachable ones with an invalid entry (0) in `SubtargetEmitter::GenSchedClassTables` when going through the list of `ReadAdvances` --- .../Target/AArch64/AArch64SchedExynosM4.td | 4 +-- .../Target/AArch64/AArch64SchedExynosM5.td | 4 +-- llvm/test/TableGen/ReadAdvanceInvalidWrite.td | 29 +++++++++++++++++++ .../AArch64/Exynos/float-divide-multiply.s | 12 ++++---- llvm/utils/TableGen/CodeGenSchedule.cpp | 9 ++++++ llvm/utils/TableGen/SubtargetEmitter.cpp | 5 +++- 6 files changed, 50 insertions(+), 13 deletions(-) create mode 100644 llvm/test/TableGen/ReadAdvanceInvalidWrite.td diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td index 5163de280f2e4..b75264602dbc1 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td @@ -309,7 +309,6 @@ def M4WriteFMAC3H : SchedWriteRes<[M4UnitFMACH]> { let Latency = 3; } def M4WriteFMAC3 : SchedWriteRes<[M4UnitFMAC]> { let Latency = 3; } def M4WriteFMAC4 : SchedWriteRes<[M4UnitFMAC]> { let Latency = 4; } def M4WriteFMAC4H : SchedWriteRes<[M4UnitFMACH]> { let 
Latency = 4; } -def M4WriteFMAC5 : SchedWriteRes<[M4UnitFMAC]> { let Latency = 5; } def M4WriteFSQR7H : SchedWriteRes<[M4UnitFSQRH]> { let Latency = 7; let ReleaseAtCycles = [6]; } @@ -495,8 +494,7 @@ def M4WriteMOVI : SchedWriteVariant<[SchedVar // Fast forwarding. def M4ReadAESM1 : SchedReadAdvance<+1, [M4WriteNCRY1]>; def M4ReadFMACM1 : SchedReadAdvance<+1, [M4WriteFMAC4, - M4WriteFMAC4H, - M4WriteFMAC5]>; + M4WriteFMAC4H]>; def M4ReadNMULM1 : SchedReadAdvance<+1, [M4WriteNMUL3]>; def M4ReadNMULP2 : SchedReadAdvance<-2, [M4WriteNMUL3]>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td index 2ccbe1614dcd7..6b5a6da76b3a8 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td @@ -338,7 +338,6 @@ def M5WriteFDIV12 : SchedWriteRes<[M5UnitFDIV]> { let Latency = 12; def M5WriteFMAC3 : SchedWriteRes<[M5UnitFMAC]> { let Latency = 3; } def M5WriteFMAC4 : SchedWriteRes<[M5UnitFMAC]> { let Latency = 4; } -def M5WriteFMAC5 : SchedWriteRes<[M5UnitFMAC]> { let Latency = 5; } def M5WriteFSQR5 : SchedWriteRes<[M5UnitFSQR]> { let Latency = 5; let ReleaseAtCycles = [2]; } @@ -530,8 +529,7 @@ def M5WriteMOVI : SchedWriteVariant<[SchedVar // Fast forwarding. 
def M5ReadFM1 : SchedReadAdvance<+1, [M5WriteF2]>; def M5ReadAESM2 : SchedReadAdvance<+2, [M5WriteNCRY2]>; -def M5ReadFMACM1 : SchedReadAdvance<+1, [M5WriteFMAC4, - M5WriteFMAC5]>; +def M5ReadFMACM1 : SchedReadAdvance<+1, [M5WriteFMAC4]>; def M5ReadNMULM1 : SchedReadAdvance<+1, [M5WriteNMUL3]>; //===----------------------------------------------------------------------===// diff --git a/llvm/test/TableGen/ReadAdvanceInvalidWrite.td b/llvm/test/TableGen/ReadAdvanceInvalidWrite.td new file mode 100644 index 0000000000000..d185cbd56f8e9 --- /dev/null +++ b/llvm/test/TableGen/ReadAdvanceInvalidWrite.td @@ -0,0 +1,29 @@ +// RUN: not llvm-tblgen -gen-subtarget -I %p/../../include %s 2>&1 | FileCheck %s + +// Make sure we don't form ReadAdvances with ValidWrites entries that are not +// associated with any instructions. + +include "llvm/Target/Target.td" + +def TargetX : Target; + +def WriteX : SchedWrite; +def WriteY : SchedWrite; +def ReadX : SchedRead; + +def InstX : Instruction { + let OutOperandList = (outs); + let InOperandList = (ins); + let SchedRW = [WriteX, ReadX]; +} + +def SchedModelX: SchedMachineModel { + let CompleteModel = 0; +} + +let SchedModel = SchedModelX in { + def : ReadAdvance; + // CHECK: error: ReadAdvance referencing a ValidWrite that is not used by any instruction (WriteY) +} + +def ProcessorX: ProcessorModel<"ProcessorX", SchedModelX, []>; diff --git a/llvm/test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s b/llvm/test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s index a24d8a279606c..ecfd019452afc 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s +++ b/llvm/test/tools/llvm-mca/AArch64/Exynos/float-divide-multiply.s @@ -26,11 +26,11 @@ fsqrt d11, d12 # EM3-NEXT: Total uOps: 800 # EM4-NEXT: Instructions: 1200 -# EM4-NEXT: Total Cycles: 575 +# EM4-NEXT: Total Cycles: 572 # EM4-NEXT: Total uOps: 1200 # EM5-NEXT: Instructions: 1200 -# EM5-NEXT: Total Cycles: 433 +# EM5-NEXT: Total Cycles: 434 # EM5-NEXT: 
Total uOps: 1200 # ALL: Dispatch Width: 6 @@ -39,12 +39,12 @@ fsqrt d11, d12 # EM3-NEXT: IPC: 0.18 # EM3-NEXT: Block RThroughput: 45.0 -# EM4-NEXT: uOps Per Cycle: 2.09 -# EM4-NEXT: IPC: 2.09 +# EM4-NEXT: uOps Per Cycle: 2.10 +# EM4-NEXT: IPC: 2.10 # EM4-NEXT: Block RThroughput: 4.0 -# EM5-NEXT: uOps Per Cycle: 2.77 -# EM5-NEXT: IPC: 2.77 +# EM5-NEXT: uOps Per Cycle: 2.76 +# EM5-NEXT: IPC: 2.76 # EM5-NEXT: Block RThroughput: 4.0 # ALL: Instruction Info: diff --git a/llvm/utils/TableGen/CodeGenSchedule.cpp b/llvm/utils/TableGen/CodeGenSchedule.cpp index b4c624703626c..d819016f8b561 100644 --- a/llvm/utils/TableGen/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/CodeGenSchedule.cpp @@ -2190,6 +2190,15 @@ void CodeGenSchedModels::addWriteRes(Record *ProcWriteResDef, unsigned PIdx) { // Add resources for a ReadAdvance to this processor if they don't exist. void CodeGenSchedModels::addReadAdvance(Record *ProcReadAdvanceDef, unsigned PIdx) { + for (const Record *ValidWrite : + ProcReadAdvanceDef->getValueAsListOfDefs("ValidWrites")) + if (getSchedRWIdx(ValidWrite, /*IsRead=*/false) == 0) + PrintFatalError( + ProcReadAdvanceDef->getLoc(), + "ReadAdvance referencing a ValidWrite that is not used by " + "any instruction (" + + ValidWrite->getName() + ")"); + RecVec &RADefs = ProcModels[PIdx].ReadAdvanceDefs; if (is_contained(RADefs, ProcReadAdvanceDef)) return; diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 2707f54eed6e9..d350d7de139f6 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -1262,7 +1262,10 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel, WriteIDs.push_back(0); else { for (Record *VW : ValidWrites) { - WriteIDs.push_back(SchedModels.getSchedRWIdx(VW, /*IsRead=*/false)); + unsigned WriteID = SchedModels.getSchedRWIdx(VW, /*IsRead=*/false); + assert(WriteID != 0 && + "Expected a valid SchedRW in the list of ValidWrites"); + 
WriteIDs.push_back(WriteID); } } llvm::sort(WriteIDs); From 2e564840e08d28fec9035aacc7a2d28106ed8606 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 27 Feb 2024 10:34:04 +0800 Subject: [PATCH 402/546] [RISCV] Use getVectorIdxConstant in RISCVISelLowering.cpp. NFC (#83019) We use getVectorIdxConstant() in some places and getConstant(XLenVT) or getIntPtrConstant() in others, but getVectorIdxTy() == getPointerTy() == XLenVT. This refactors RISCVISelLowering to use the former for nodes that use getVectorIdxTy(), i.e. INSERT_SUBVECTOR, EXTRACT_SUBVECTOR, INSERT_VECTOR_ELT and EXTRACT_VECTOR_ELT, so that we're consistent. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 79 ++++++++++----------- 1 file changed, 38 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 0c98642748d4e..e34750d057301 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2610,7 +2610,7 @@ static SDValue convertToScalableVector(EVT VT, SDValue V, SelectionDAG &DAG, assert(V.getValueType().isFixedLengthVector() && "Expected a fixed length vector operand!"); SDLoc DL(V); - SDValue Zero = DAG.getConstant(0, DL, Subtarget.getXLenVT()); + SDValue Zero = DAG.getVectorIdxConstant(0, DL); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero); } @@ -3472,7 +3472,7 @@ static SDValue lowerBuildVectorViaDominantValues(SDValue Op, SelectionDAG &DAG, continue; if (ValueCounts[V] == 1) { Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, - DAG.getConstant(OpIdx.index(), DL, XLenVT)); + DAG.getVectorIdxConstant(OpIdx.index(), DL)); } else { // Blend in all instances of this value using a VSELECT, using a // mask where each bit signals whether that element is the one @@ -3688,7 +3688,7 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG, SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ViaVecVT, 
DAG.getUNDEF(ViaVecVT), DAG.getConstant(SplatValue, DL, XLenVT), - DAG.getConstant(0, DL, XLenVT)); + DAG.getVectorIdxConstant(0, DL)); if (ViaVecLen != 1) Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::getVectorVT(ViaIntVT, 1), Vec, @@ -4141,9 +4141,9 @@ static SDValue lowerScalarInsert(SDValue Scalar, SDValue VL, MVT VT, } if (ExtractedContainerVT.bitsLE(VT)) return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Passthru, - ExtractedVal, DAG.getConstant(0, DL, XLenVT)); + ExtractedVal, DAG.getVectorIdxConstant(0, DL)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtractedVal, - DAG.getConstant(0, DL, XLenVT)); + DAG.getVectorIdxConstant(0, DL)); } } @@ -5020,12 +5020,12 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, assert(EvenSrc >= 0 && "Undef source?"); EvenV = (EvenSrc / Size) == 0 ? V1 : V2; EvenV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, EvenV, - DAG.getConstant(EvenSrc % Size, DL, XLenVT)); + DAG.getVectorIdxConstant(EvenSrc % Size, DL)); assert(OddSrc >= 0 && "Undef source?"); OddV = (OddSrc / Size) == 0 ? V1 : V2; OddV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, OddV, - DAG.getConstant(OddSrc % Size, DL, XLenVT)); + DAG.getVectorIdxConstant(OddSrc % Size, DL)); return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget); } @@ -6088,7 +6088,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return SDValue(); return DAG.getBitcast(VT, DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, BVT, DAG.getUNDEF(BVT), Op0, - DAG.getConstant(0, DL, XLenVT))); + DAG.getVectorIdxConstant(0, DL))); } return SDValue(); } @@ -6101,7 +6101,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return SDValue(); SDValue BVec = DAG.getBitcast(BVT, Op0); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, BVec, - DAG.getConstant(0, DL, XLenVT)); + DAG.getVectorIdxConstant(0, DL)); } return SDValue(); } @@ -6600,8 +6600,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, // Don't insert undef subvectors. 
if (SubVec.isUndef()) continue; - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Vec, SubVec, - DAG.getIntPtrConstant(OpIdx.index() * NumOpElts, DL)); + Vec = + DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Vec, SubVec, + DAG.getVectorIdxConstant(OpIdx.index() * NumOpElts, DL)); } return Vec; } @@ -8404,7 +8405,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, if (!EltVT.isInteger()) { // Floating-point extracts are handled in TableGen. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec, - DAG.getConstant(0, DL, XLenVT)); + DAG.getVectorIdxConstant(0, DL)); } SDValue Elt0 = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, Vec); @@ -8837,7 +8838,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::riscv_vfmv_f_s: return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), - Op.getOperand(1), DAG.getConstant(0, DL, XLenVT)); + Op.getOperand(1), DAG.getVectorIdxConstant(0, DL)); case Intrinsic::riscv_vmv_v_x: return lowerScalarSplat(Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), Op.getSimpleValueType(), DL, DAG, @@ -9437,15 +9438,15 @@ static SDValue lowerReductionSeq(unsigned RVVOpcode, MVT ResVT, SDValue InitialValue = lowerScalarInsert(StartValue, InnerVL, InnerVT, DL, DAG, Subtarget); if (M1VT != InnerVT) - InitialValue = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, M1VT, - DAG.getUNDEF(M1VT), - InitialValue, DAG.getConstant(0, DL, XLenVT)); + InitialValue = + DAG.getNode(ISD::INSERT_SUBVECTOR, DL, M1VT, DAG.getUNDEF(M1VT), + InitialValue, DAG.getVectorIdxConstant(0, DL)); SDValue PassThru = NonZeroAVL ? 
DAG.getUNDEF(M1VT) : InitialValue; SDValue Policy = DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT); SDValue Ops[] = {PassThru, Vec, InitialValue, Mask, VL, Policy}; SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, Ops); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Reduction, - DAG.getConstant(0, DL, XLenVT)); + DAG.getVectorIdxConstant(0, DL)); } SDValue RISCVTargetLowering::lowerVECREDUCE(SDValue Op, @@ -9490,9 +9491,8 @@ SDValue RISCVTargetLowering::lowerVECREDUCE(SDValue Op, case ISD::UMIN: case ISD::SMAX: case ISD::SMIN: - MVT XLenVT = Subtarget.getXLenVT(); StartV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Vec, - DAG.getConstant(0, DL, XLenVT)); + DAG.getVectorIdxConstant(0, DL)); } return lowerReductionSeq(RVVOpcode, Op.getSimpleValueType(), StartV, Vec, Mask, VL, DL, DAG, Subtarget); @@ -9521,10 +9521,9 @@ getRVVFPReductionOpAndOperands(SDValue Op, SelectionDAG &DAG, EVT EltVT, Op.getOperand(0)); case ISD::VECREDUCE_FMIN: case ISD::VECREDUCE_FMAX: { - MVT XLenVT = Subtarget.getXLenVT(); SDValue Front = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op.getOperand(0), - DAG.getConstant(0, DL, XLenVT)); + DAG.getVectorIdxConstant(0, DL)); unsigned RVVOpc = (Opcode == ISD::VECREDUCE_FMIN) ? 
RISCVISD::VECREDUCE_FMIN_VL : RISCVISD::VECREDUCE_FMAX_VL; @@ -9646,14 +9645,14 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, if (OrigIdx == 0 && Vec.isUndef() && VecVT.isFixedLengthVector()) { SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, DAG.getUNDEF(ContainerVT), SubVec, - DAG.getConstant(0, DL, XLenVT)); + DAG.getVectorIdxConstant(0, DL)); SubVec = convertFromScalableVector(VecVT, SubVec, DAG, Subtarget); return DAG.getBitcast(Op.getValueType(), SubVec); } SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, DAG.getUNDEF(ContainerVT), SubVec, - DAG.getConstant(0, DL, XLenVT)); + DAG.getVectorIdxConstant(0, DL)); SDValue Mask = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget).first; // Set the vector length to only the number of elements we care about. Note @@ -9720,12 +9719,12 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, // Extract a subvector equal to the nearest full vector register type. This // should resolve to a EXTRACT_SUBREG instruction. AlignedExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InterSubVT, Vec, - DAG.getConstant(AlignedIdx, DL, XLenVT)); + DAG.getVectorIdxConstant(AlignedIdx, DL)); } SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InterSubVT, DAG.getUNDEF(InterSubVT), SubVec, - DAG.getConstant(0, DL, XLenVT)); + DAG.getVectorIdxConstant(0, DL)); auto [Mask, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget); @@ -9751,7 +9750,7 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, // This should resolve to an INSERT_SUBREG instruction. if (VecVT.bitsGT(InterSubVT)) SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, Vec, SubVec, - DAG.getConstant(AlignedIdx, DL, XLenVT)); + DAG.getVectorIdxConstant(AlignedIdx, DL)); // We might have bitcast from a mask type: cast back to the original type if // required. 
@@ -9846,7 +9845,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op, DAG.getUNDEF(ContainerVT), Vec, SlidedownAmt, Mask, VL); // Now we can use a cast-like subvector extract to get the result. Slidedown = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Slidedown, - DAG.getConstant(0, DL, XLenVT)); + DAG.getVectorIdxConstant(0, DL)); return DAG.getBitcast(Op.getValueType(), Slidedown); } @@ -9923,7 +9922,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op, // Now the vector is in the right position, extract our final subvector. This // should resolve to a COPY. Slidedown = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Slidedown, - DAG.getConstant(0, DL, XLenVT)); + DAG.getVectorIdxConstant(0, DL)); // We might have bitcast from a mask type: cast back to the original type if // required. @@ -9964,7 +9963,6 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VecVT = Op.getSimpleValueType(); - MVT XLenVT = Subtarget.getXLenVT(); assert(VecVT.isScalableVector() && "vector_interleave on non-scalable vector!"); @@ -10030,9 +10028,9 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op, // Extract the result half of the gather for even and odd SDValue Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, EvenWide, - DAG.getConstant(0, DL, XLenVT)); + DAG.getVectorIdxConstant(0, DL)); SDValue Odd = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, OddWide, - DAG.getConstant(0, DL, XLenVT)); + DAG.getVectorIdxConstant(0, DL)); return DAG.getMergeValues({Even, Odd}, DL); } @@ -10195,10 +10193,10 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op, // FIXME: This is a CONCAT_VECTORS. 
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, DAG.getUNDEF(VecVT), Hi, - DAG.getIntPtrConstant(0, DL)); + DAG.getVectorIdxConstant(0, DL)); return DAG.getNode( ISD::INSERT_SUBVECTOR, DL, VecVT, Res, Lo, - DAG.getIntPtrConstant(LoVT.getVectorMinNumElements(), DL)); + DAG.getVectorIdxConstant(LoVT.getVectorMinNumElements(), DL)); } // Just promote the int type to i16 which will double the LMUL. @@ -10331,9 +10329,9 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op, // If the size less than a byte, we need to pad with zeros to make a byte. if (VT.getVectorElementType() == MVT::i1 && VT.getVectorNumElements() < 8) { VT = MVT::v8i1; - StoreVal = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, - DAG.getConstant(0, DL, VT), StoreVal, - DAG.getIntPtrConstant(0, DL)); + StoreVal = + DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getConstant(0, DL, VT), + StoreVal, DAG.getVectorIdxConstant(0, DL)); } MVT ContainerVT = getContainerForFixedLengthVector(VT); @@ -12109,7 +12107,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, if (isTypeLegal(BVT)) { SDValue BVec = DAG.getBitcast(BVT, Op0); Results.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, BVec, - DAG.getConstant(0, DL, XLenVT))); + DAG.getVectorIdxConstant(0, DL))); } } break; @@ -12598,7 +12596,7 @@ static SDValue combineBinOpToReduce(SDNode *N, SelectionDAG &DAG, if (ScalarVT != ScalarV.getValueType()) NewScalarV = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalarVT, DAG.getUNDEF(ScalarVT), - NewScalarV, DAG.getConstant(0, DL, Subtarget.getXLenVT())); + NewScalarV, DAG.getVectorIdxConstant(0, DL)); SDValue Ops[] = {Reduce.getOperand(0), Reduce.getOperand(1), NewScalarV, Reduce.getOperand(3), @@ -15248,8 +15246,7 @@ static SDValue performINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, if (ConcatVT.getVectorElementType() != InVal.getValueType()) return SDValue(); unsigned ConcatNumElts = ConcatVT.getVectorNumElements(); - SDValue NewIdx = DAG.getConstant(Elt % ConcatNumElts, 
DL, - EltNo.getValueType()); + SDValue NewIdx = DAG.getVectorIdxConstant(Elt % ConcatNumElts, DL); unsigned ConcatOpIdx = Elt / ConcatNumElts; SDValue ConcatOp = InVec.getOperand(ConcatOpIdx); @@ -16449,7 +16446,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, SDValue Result = DAG.getNode(N->getOpcode(), DL, M1VT, M1Passthru, Scalar, VL); Result = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Passthru, Result, - DAG.getConstant(0, DL, XLenVT)); + DAG.getVectorIdxConstant(0, DL)); return Result; } From ef7417fbc597eab1adbad4abc6ecdb2e62319513 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Tue, 27 Feb 2024 10:37:17 +0800 Subject: [PATCH 403/546] [mlir] Fix -Wunused-but-set-variable in ModuleTranslation.cpp (NFC) mlir/lib/Target/LLVMIR/ModuleTranslation.cpp:1050:11: error: variable 'numConstantsHit' set but not used [-Werror,-Wunused-but-set-variable] int numConstantsHit = 0; ^ mlir/lib/Target/LLVMIR/ModuleTranslation.cpp:1051:11: error: variable 'numConstantsErased' set but not used [-Werror,-Wunused-but-set-variable] int numConstantsErased = 0; ^ 2 errors generated. 
--- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index 7cffd7bd1fa43..a11603a44dcd1 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -1047,8 +1047,8 @@ LogicalResult ModuleTranslation::convertGlobals() { if (Block *initializer = op.getInitializerBlock()) { llvm::IRBuilder<> builder(llvmModule->getContext()); - int numConstantsHit = 0; - int numConstantsErased = 0; + [[maybe_unused]] int numConstantsHit = 0; + [[maybe_unused]] int numConstantsErased = 0; DenseMap constantAggregateUseMap; for (auto &op : initializer->without_terminator()) { From f410f74cd5b26319b5796e0404c6a0f3b5cc00a5 Mon Sep 17 00:00:00 2001 From: cpsughrue Date: Mon, 26 Feb 2024 21:40:21 -0500 Subject: [PATCH 404/546] Revert "[llvm][Support] Add support for executing a detached process (#81708)" This reverts commit 86f6caa562255f81b93e72a501a926b17f5ad244. Unit test was failing on a few windows build bots --- llvm/include/llvm/Support/Program.h | 25 ++++----- llvm/lib/Support/Program.cpp | 9 ++-- llvm/lib/Support/Unix/Program.inc | 18 ++----- llvm/lib/Support/Windows/Program.inc | 7 +-- llvm/unittests/Support/ProgramTest.cpp | 74 -------------------------- 5 files changed, 21 insertions(+), 112 deletions(-) diff --git a/llvm/include/llvm/Support/Program.h b/llvm/include/llvm/Support/Program.h index 9df94eb604c7d..4c1133e44a21c 100644 --- a/llvm/include/llvm/Support/Program.h +++ b/llvm/include/llvm/Support/Program.h @@ -141,21 +141,18 @@ namespace sys { /// program shall run on. ); - /// Similar to \ref ExecuteAndWait, but returns immediately. - /// \returns The \ref ProcessInfo of the newly launched process. + /// Similar to ExecuteAndWait, but returns immediately. + /// @returns The \see ProcessInfo of the newly launched process. 
/// \note On Microsoft Windows systems, users will need to either call - /// \ref Wait until the process has finished executing or win32's CloseHandle - /// API on ProcessInfo.ProcessHandle to avoid memory leaks. - ProcessInfo ExecuteNoWait( - StringRef Program, ArrayRef Args, - std::optional> Env, - ArrayRef> Redirects = {}, - unsigned MemoryLimit = 0, std::string *ErrMsg = nullptr, - bool *ExecutionFailed = nullptr, BitVector *AffinityMask = nullptr, - /// If true the executed program detatches from the controlling - /// terminal. I/O streams such as llvm::outs, llvm::errs, and stdin will - /// be closed until redirected to another output location - bool DetachProcess = false); + /// \see Wait until the process finished execution or win32 CloseHandle() API + /// on ProcessInfo.ProcessHandle to avoid memory leaks. + ProcessInfo ExecuteNoWait(StringRef Program, ArrayRef Args, + std::optional> Env, + ArrayRef> Redirects = {}, + unsigned MemoryLimit = 0, + std::string *ErrMsg = nullptr, + bool *ExecutionFailed = nullptr, + BitVector *AffinityMask = nullptr); /// Return true if the given arguments fit within system-specific /// argument length limits. 
diff --git a/llvm/lib/Support/Program.cpp b/llvm/lib/Support/Program.cpp index 181f68cfbb8c3..1dcd45e2d69e8 100644 --- a/llvm/lib/Support/Program.cpp +++ b/llvm/lib/Support/Program.cpp @@ -27,7 +27,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, std::optional> Env, ArrayRef> Redirects, unsigned MemoryLimit, std::string *ErrMsg, - BitVector *AffinityMask, bool DetachProcess); + BitVector *AffinityMask); int sys::ExecuteAndWait(StringRef Program, ArrayRef Args, std::optional> Env, @@ -39,7 +39,7 @@ int sys::ExecuteAndWait(StringRef Program, ArrayRef Args, assert(Redirects.empty() || Redirects.size() == 3); ProcessInfo PI; if (Execute(PI, Program, Args, Env, Redirects, MemoryLimit, ErrMsg, - AffinityMask, /*DetachProcess=*/false)) { + AffinityMask)) { if (ExecutionFailed) *ExecutionFailed = false; ProcessInfo Result = Wait( @@ -58,14 +58,13 @@ ProcessInfo sys::ExecuteNoWait(StringRef Program, ArrayRef Args, std::optional> Env, ArrayRef> Redirects, unsigned MemoryLimit, std::string *ErrMsg, - bool *ExecutionFailed, BitVector *AffinityMask, - bool DetachProcess) { + bool *ExecutionFailed, BitVector *AffinityMask) { assert(Redirects.empty() || Redirects.size() == 3); ProcessInfo PI; if (ExecutionFailed) *ExecutionFailed = false; if (!Execute(PI, Program, Args, Env, Redirects, MemoryLimit, ErrMsg, - AffinityMask, DetachProcess)) + AffinityMask)) if (ExecutionFailed) *ExecutionFailed = true; diff --git a/llvm/lib/Support/Unix/Program.inc b/llvm/lib/Support/Unix/Program.inc index 2742734bb11ed..5d9757bcc51b3 100644 --- a/llvm/lib/Support/Unix/Program.inc +++ b/llvm/lib/Support/Unix/Program.inc @@ -173,11 +173,10 @@ toNullTerminatedCStringArray(ArrayRef Strings, StringSaver &Saver) { } static bool Execute(ProcessInfo &PI, StringRef Program, - ArrayRef Args, - std::optional> Env, + ArrayRef Args, std::optional> Env, ArrayRef> Redirects, unsigned MemoryLimit, std::string *ErrMsg, - BitVector *AffinityMask, bool DetachProcess) { + BitVector *AffinityMask) { if 
(!llvm::sys::fs::exists(Program)) { if (ErrMsg) *ErrMsg = std::string("Executable \"") + Program.str() + @@ -203,8 +202,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, // If this OS has posix_spawn and there is no memory limit being implied, use // posix_spawn. It is more efficient than fork/exec. #ifdef HAVE_POSIX_SPAWN - // Cannot use posix_spawn if you would like to detach the process - if (MemoryLimit == 0 && !DetachProcess) { + if (MemoryLimit == 0) { posix_spawn_file_actions_t FileActionsStore; posix_spawn_file_actions_t *FileActions = nullptr; @@ -272,7 +270,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, return true; } -#endif // HAVE_POSIX_SPAWN +#endif // Create a child process. int child = fork(); @@ -309,14 +307,6 @@ static bool Execute(ProcessInfo &PI, StringRef Program, } } - if (DetachProcess) { - // Detach from controlling terminal - if (::setsid() == -1) { - MakeErrMsg(ErrMsg, "Could not detach process, ::setsid failed"); - return false; - } - } - // Set memory limits if (MemoryLimit != 0) { SetMemoryLimits(MemoryLimit); diff --git a/llvm/lib/Support/Windows/Program.inc b/llvm/lib/Support/Windows/Program.inc index d98d55f317a35..0de9d3f756448 100644 --- a/llvm/lib/Support/Windows/Program.inc +++ b/llvm/lib/Support/Windows/Program.inc @@ -172,11 +172,10 @@ static HANDLE RedirectIO(std::optional Path, int fd, } // namespace llvm static bool Execute(ProcessInfo &PI, StringRef Program, - ArrayRef Args, - std::optional> Env, + ArrayRef Args, std::optional> Env, ArrayRef> Redirects, unsigned MemoryLimit, std::string *ErrMsg, - BitVector *AffinityMask, bool DetachProcess) { + BitVector *AffinityMask) { if (!sys::fs::can_execute(Program)) { if (ErrMsg) *ErrMsg = "program not executable"; @@ -285,8 +284,6 @@ static bool Execute(ProcessInfo &PI, StringRef Program, unsigned CreateFlags = CREATE_UNICODE_ENVIRONMENT; if (AffinityMask) CreateFlags |= CREATE_SUSPENDED; - if (DetachProcess) - CreateFlags |= DETACHED_PROCESS; std::vector 
CommandUtf16(Command.size() + 1, 0); std::copy(Command.begin(), Command.end(), CommandUtf16.begin()); diff --git a/llvm/unittests/Support/ProgramTest.cpp b/llvm/unittests/Support/ProgramTest.cpp index 58f98892c9cc7..2e2b1958b9ac9 100644 --- a/llvm/unittests/Support/ProgramTest.cpp +++ b/llvm/unittests/Support/ProgramTest.cpp @@ -260,80 +260,6 @@ TEST_F(ProgramEnvTest, TestExecuteNoWait) { ASSERT_GT(LoopCount, 1u) << "LoopCount should be >1"; } -TEST_F(ProgramEnvTest, TestExecuteNoWaitDetached) { - using namespace llvm::sys; - - if (getenv("LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT_DETACHED")) { - sleep_for(/*seconds=*/5); - -#if _WIN32 - HWND ConsoleWnd = GetConsoleWindow(); - if (ConsoleWnd == NULL) - exit(100); - else - exit(200); -#else - int ParentSID = std::stoi( - std::string(getenv("LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT_DETACHED_SID"))); - - pid_t ChildSID = ::getsid(0); - if (ChildSID == -1) { - llvm::errs() << "Could not get process SID: " << strerror(errno) << '\n'; - exit(1); - } - - char *Detached = getenv("LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT_DETACHED_TRUE"); - if (Detached && (ChildSID != ParentSID)) - exit(100); - if (!Detached && (ChildSID == ParentSID)) - exit(200); -#endif - exit(0); - } - - std::string Executable = - sys::fs::getMainExecutable(TestMainArgv0, &ProgramTestStringArg1); - StringRef argv[] = { - Executable, "--gtest_filter=ProgramEnvTest.TestExecuteNoWaitDetached"}; - addEnvVar("LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT_DETACHED=1"); - -#ifndef _WIN32 - pid_t SID = ::getsid(0); - ASSERT_NE(SID, -1); - std::string SIDEnvVar = - "LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT_DETACHED_SID=" + std::to_string(SID); - addEnvVar(SIDEnvVar); -#endif - - // DetachProcess = true - { - std::string Error; - bool ExecutionFailed; - std::vector Env = getEnviron(); - Env.emplace_back("LLVM_PROGRAM_TEST_EXECUTE_NO_WAIT_DETACHED_TRUE=1"); - ProcessInfo PI1 = - ExecuteNoWait(Executable, argv, Env, {}, 0, &Error, &ExecutionFailed, - nullptr, /*DetachProcess=*/true); - 
ASSERT_FALSE(ExecutionFailed) << Error; - ASSERT_NE(PI1.Pid, ProcessInfo::InvalidPid) << "Invalid process id"; - ProcessInfo WaitResult = Wait(PI1, std::nullopt, &Error); - ASSERT_EQ(WaitResult.ReturnCode, 100); - } - - // DetachProcess = false - { - std::string Error; - bool ExecutionFailed; - ProcessInfo PI2 = - ExecuteNoWait(Executable, argv, getEnviron(), {}, 0, &Error, - &ExecutionFailed, nullptr, /*DetachProcess=*/false); - ASSERT_FALSE(ExecutionFailed) << Error; - ASSERT_NE(PI2.Pid, ProcessInfo::InvalidPid) << "Invalid process id"; - ProcessInfo WaitResult = Wait(PI2, std::nullopt, &Error); - ASSERT_EQ(WaitResult.ReturnCode, 200); - } -} - TEST_F(ProgramEnvTest, TestExecuteAndWaitTimeout) { using namespace llvm::sys; From b2a4f64e19247d0553d3dc63af62b652664c3cd6 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Mon, 26 Feb 2024 20:46:46 -0800 Subject: [PATCH 405/546] [clang-format][NFC] Skip ObjCHeaderStyleGuesser for empty code (#82957) --- clang/lib/Format/Format.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 794e326fb1c94..13588ff705f54 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -3923,7 +3923,7 @@ FormatStyle::LanguageKind guessLanguage(StringRef FileName, StringRef Code) { auto Extension = llvm::sys::path::extension(FileName); // If there's no file extension (or it's .h), we need to check the contents // of the code to see if it contains Objective-C. - if (Extension.empty() || Extension == ".h") { + if (!Code.empty() && (Extension.empty() || Extension == ".h")) { auto NonEmptyFileName = FileName.empty() ? 
"guess.h" : FileName; Environment Env(Code, NonEmptyFileName, /*Ranges=*/{}); ObjCHeaderStyleGuesser Guesser(Env, getLLVMStyle()); From e7900e695e7dfb36be8651d914a31f42a5d6c634 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 27 Feb 2024 10:04:22 +0530 Subject: [PATCH 406/546] AMDGPU: Regenerate baseline mir tests --- .../buffer-atomic-fadd.f32-no-rtn.ll | 73 ++ .../GlobalISel/buffer-atomic-fadd.f32-rtn.ll | 77 ++ .../buffer-atomic-fadd.v2f16-no-rtn.ll | 8 + .../AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll | 4 + .../global-atomic-fadd.v2f16-no-rtn.ll | 4 + .../GlobalISel/irtranslator-amdgpu_kernel.ll | 59 ++ .../GlobalISel/irtranslator-constrained-fp.ll | 32 +- .../GlobalISel/irtranslator-metadata.ll | 10 +- ...lize-llvm.amdgcn.image.load.2darraymsaa.ll | 2 + .../legalize-llvm.amdgcn.image.load.3d.ll | 2 + .../legalize-llvm.amdgcn.image.sample.d.ll | 8 + ...galize-llvm.amdgcn.image.sample.g16.a16.ll | 6 + .../legalize-llvm.amdgcn.image.sample.g16.ll | 38 + .../llvm.amdgcn.raw.buffer.atomic.cmpswap.ll | 596 ++++++++---- ...lvm.amdgcn.struct.buffer.atomic.cmpswap.ll | 632 ++++++++----- .../regbankselect-amdgcn.image.load.1d.ll | 4 + .../regbankselect-amdgcn.image.sample.1d.ll | 5 + llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir | 66 ++ .../AMDGPU/accvgpr-spill-scc-clobber.mir | 18 + .../CodeGen/AMDGPU/agpr-copy-no-vgprs.mir | 2 + .../CodeGen/AMDGPU/agpr-copy-propagation.mir | 4 + .../AMDGPU/agpr-copy-sgpr-no-vgprs.mir | 2 + .../test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir | 44 +- ...oalescer-verifier-error-empty-subrange.mir | 16 +- .../block-should-not-be-in-alive-blocks.mir | 8 +- .../AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll | 8 + .../AMDGPU/buffer-atomic-fadd.f32-rtn.ll | 8 + .../AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll | 8 + ...coalesce-identity-copies-undef-subregs.mir | 50 +- .../coalesce-into-dead-subreg-copies.mir | 2 +- .../AMDGPU/coalesce-liveout-undef-copy.mir | 8 +- .../AMDGPU/coalescer-remat-dead-use.mir | 2 +- ...er-removepartial-extend-undef-subrange.mir 
| 14 +- .../coalescer-subranges-prune-kill-copy.mir | 6 +- ...escing-subreg-was-undef-but-became-def.mir | 8 +- .../AMDGPU/coalescing_makes_lanes_undef.mir | 10 +- llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir | 26 +- .../CodeGen/AMDGPU/combine-sreg64-inits.mir | 50 +- llvm/test/CodeGen/AMDGPU/commute-vop3.mir | 1 + llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir | 57 ++ .../AMDGPU/couldnt-join-subrange-3.mir | 38 +- .../AMDGPU/dbg-value-ends-sched-region.mir | 16 +- llvm/test/CodeGen/AMDGPU/dead_bundle.mir | 6 +- .../AMDGPU/debug-value-scheduler-crash.mir | 12 +- .../extend-phi-subrange-not-in-parent.mir | 20 +- llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll | 52 +- .../CodeGen/AMDGPU/flat-atomic-fadd.f32.ll | 4 + .../AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll | 4 + .../greedy-alloc-fail-sgpr1024-spill.mir | 6 +- .../greedy-instruction-split-subrange.mir | 28 +- llvm/test/CodeGen/AMDGPU/gws-hazards.mir | 12 + .../AMDGPU/high-bits-zeroed-16-bit-ops.mir | 96 ++ ...ef-subreg-redef-blender-verifier-error.mir | 24 +- .../CodeGen/AMDGPU/loop_header_nopred.mir | 1 + .../lower-control-flow-live-intervals.mir | 6 +- ...wer-control-flow-live-variables-update.mir | 42 +- .../lower-control-flow-other-terminators.mir | 16 +- .../machine-scheduler-sink-trivial-remats.mir | 874 +++++++++--------- .../machine-sink-ignorable-exec-use.mir | 62 +- llvm/test/CodeGen/AMDGPU/merge-s-load.mir | 4 + llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir | 306 ++++++ .../CodeGen/AMDGPU/mode-register-fptrunc.mir | 2 + .../CodeGen/AMDGPU/move-to-valu-lshlrev.mir | 1 + .../AMDGPU/move-to-valu-vimage-vsample.ll | 3 + .../AMDGPU/mubuf-legalize-operands.mir | 6 + .../AMDGPU/neighboring-mfma-padding.mir | 40 + .../CodeGen/AMDGPU/no-remat-indirect-mov.mir | 34 +- ...-masking-pre-ra-update-liveness-wave32.mir | 38 +- ...pt-exec-masking-pre-ra-update-liveness.mir | 20 +- .../optimize-exec-mask-pre-ra-loop-phi.mir | 28 +- .../AMDGPU/optimize-exec-masking-pre-ra.mir | 4 +- .../AMDGPU/partial-forwarding-hazards.mir | 7 
+ .../CodeGen/AMDGPU/pei-build-av-spill.mir | 168 ++++ .../AMDGPU/pei-build-spill-partial-agpr.mir | 7 + llvm/test/CodeGen/AMDGPU/pei-build-spill.mir | 126 +++ .../ran-out-of-sgprs-allocation-failure.mir | 2 +- ...fast-dont-drop-subreg-index-issue61134.mir | 14 +- .../regcoalesce-cannot-join-failures.mir | 36 +- ...keep-valid-lanes-implicit-def-bug39602.mir | 16 +- ...cer-resolve-lane-conflict-by-subranges.mir | 19 +- .../test/CodeGen/AMDGPU/remat-dead-subreg.mir | 16 +- ...ssert-dead-def-subreg-use-other-subreg.mir | 28 +- ...ched-assert-onlydbg-value-empty-region.mir | 34 +- ...dleMoveUp-subreg-def-across-subreg-def.mir | 48 +- .../AMDGPU/schedule-barrier-fpmode.mir | 4 +- llvm/test/CodeGen/AMDGPU/schedule-barrier.mir | 42 +- .../CodeGen/AMDGPU/set-gpr-idx-peephole.mir | 278 +++--- .../AMDGPU/shrink-instructions-flags.mir | 2 +- llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir | 16 + .../CodeGen/AMDGPU/si-fold-aligned-vgprs.mir | 13 + .../CodeGen/AMDGPU/si-lower-control-flow.mir | 10 +- llvm/test/CodeGen/AMDGPU/spill-agpr.mir | 42 + .../AMDGPU/spill-empty-live-interval.mir | 20 +- .../split-liverange-overlapping-copies.mir | 42 +- .../CodeGen/AMDGPU/split-mbb-lis-subrange.mir | 8 +- .../CodeGen/AMDGPU/splitkit-copy-bundle.mir | 207 +++-- .../AMDGPU/splitkit-copy-live-lanes.mir | 802 ++++++++-------- ...ubreg-undef-def-with-other-subreg-defs.mir | 28 +- .../AMDGPU/trans-forwarding-hazards.mir | 8 + .../transform-block-with-return-to-epilog.ll | 222 +++-- llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll | 20 +- .../CodeGen/AMDGPU/twoaddr-regsequence.mir | 6 +- .../undef-subreg-use-after-coalesce.mir | 18 +- .../CodeGen/AMDGPU/valu-mask-write-hazard.mir | 30 + llvm/test/CodeGen/AMDGPU/vgpr-remat.mir | 10 +- .../CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir | 39 + llvm/test/CodeGen/AMDGPU/vopd-combine.mir | 30 + llvm/test/CodeGen/AMDGPU/waitcnt-vinterp.mir | 18 +- llvm/test/CodeGen/AMDGPU/wqm-terminators.mir | 6 +- 109 files changed, 4054 insertions(+), 2171 deletions(-) diff 
--git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll index 684fd8a86a694..0816eae28f614 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-no-rtn.ll @@ -19,6 +19,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, <4 x i32 ; GFX908_GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offset_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 @@ -32,6 +33,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, <4 x i32 ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: buffer_atomic_fadd_f32_offset_no_rtn ; GFX12: bb.1 (%ir-block.0): ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 @@ -64,6 +66,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_offen_no_rtn(float %val, <4 x i32> ; GFX908_GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, 
$vgpr1 @@ -78,6 +81,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_offen_no_rtn(float %val, <4 x i32> ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: buffer_atomic_fadd_f32_offen_no_rtn ; GFX12: bb.1 (%ir-block.0): ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -111,6 +115,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_idxen_no_rtn(float %val, <4 x i32> ; GFX908_GFX11-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_idxen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -125,6 +130,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_idxen_no_rtn(float %val, <4 x i32> ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: buffer_atomic_fadd_f32_idxen_no_rtn ; GFX12: bb.1 (%ir-block.0): ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -160,6 +166,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_bothen_no_rtn(float %val, <4 x i32 ; GFX908_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], 
[[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_bothen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 @@ -176,6 +183,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_bothen_no_rtn(float %val, <4 x i32 ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: buffer_atomic_fadd_f32_bothen_no_rtn ; GFX12: bb.1 (%ir-block.0): ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 @@ -210,6 +218,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offset_no_rtn(float %val, ptr ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 @@ -223,6 +232,20 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offset_no_rtn(float %val, ptr ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET 
[[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX12-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret void } @@ -242,6 +265,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offen_no_rtn(float %val, ptr a ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -256,6 +280,21 @@ define amdgpu_ps void 
@buffer_ptr_atomic_fadd_f32_offen_no_rtn(float %val, ptr a ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX12-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } @@ -275,6 +314,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_idxen_no_rtn(float %val, ptr a ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile 
dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -289,6 +329,21 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_idxen_no_rtn(float %val, ptr a ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: BUFFER_ATOMIC_ADD_F32_VBUFFER_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX12-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret void } @@ -310,6 +365,7 @@ define amdgpu_ps void 
@buffer_ptr_atomic_fadd_f32_bothen_no_rtn(float %val, ptr ; GFX908_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 @@ -326,6 +382,23 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_bothen_no_rtn(float %val, ptr ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX90A_GFX940-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], 
%subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX12-NEXT: BUFFER_ATOMIC_ADD_F32_VBUFFER_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX12-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll index 42d4d21f31081..c0b84c914ce5c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.f32-rtn.ll @@ -19,6 +19,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32> ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: buffer_atomic_fadd_f32_offset_rtn ; GFX11: bb.1 (%ir-block.0): ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 @@ -33,6 +34,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32> ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX12-LABEL: name: buffer_atomic_fadd_f32_offset_rtn ; GFX12: bb.1 (%ir-block.0): ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 
@@ -67,6 +69,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> i ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: buffer_atomic_fadd_f32_offen_rtn ; GFX11: bb.1 (%ir-block.0): ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -82,6 +85,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> i ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX12-LABEL: name: buffer_atomic_fadd_f32_offen_rtn ; GFX12: bb.1 (%ir-block.0): ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -117,6 +121,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x i32> i ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: buffer_atomic_fadd_f32_idxen_rtn ; GFX11: bb.1 (%ir-block.0): ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -132,6 +137,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x 
i32> i ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX12-LABEL: name: buffer_atomic_fadd_f32_idxen_rtn ; GFX12: bb.1 (%ir-block.0): ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -169,6 +175,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32> ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: buffer_atomic_fadd_f32_bothen_rtn ; GFX11: bb.1 (%ir-block.0): ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 @@ -186,6 +193,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32> ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX12-LABEL: name: buffer_atomic_fadd_f32_bothen_rtn ; GFX12: bb.1 (%ir-block.0): ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 @@ -222,6 +230,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr ad ; GFX90A_GFX940-NEXT: 
[[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn ; GFX11: bb.1 (%ir-block.0): ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 @@ -236,6 +245,21 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr ad ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFSET_RTN [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX12-NEXT: $vgpr0 = COPY 
[[BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFSET_RTN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 0, i32 %soffset, i32 0) ret float %ret } @@ -256,6 +280,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr add ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn ; GFX11: bb.1 (%ir-block.0): ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -271,6 +296,22 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr add ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: 
[[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_VBUFFER_OFFEN_RTN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0) ret float %ret } @@ -291,6 +332,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr add ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn ; GFX11: bb.1 (%ir-block.0): ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -306,6 +348,22 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr add ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 + 
; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_F32_VBUFFER_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_VBUFFER_IDXEN_RTN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_VBUFFER_IDXEN_RTN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0) ret float %ret } @@ -328,6 +386,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_bothen_rtn(float %val, ptr ad ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn ; GFX11: bb.1 (%ir-block.0): ; GFX11-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 @@ -345,6 +404,24 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_bothen_rtn(float %val, ptr ad ; GFX11-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = 
BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; + ; GFX12-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 + ; GFX12-NEXT: [[BUFFER_ATOMIC_ADD_F32_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_VBUFFER_BOTHEN_RTN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) + ; GFX12-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_VBUFFER_BOTHEN_RTN]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret float %ret } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll index 
1ba27c72803d3..9514bea86e4d1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/buffer-atomic-fadd.v2f16-no-rtn.ll @@ -17,6 +17,7 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %val, < ; GFX908-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 @@ -49,6 +50,7 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val, <4 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -82,6 +84,7 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val, <4 ; GFX908-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_idxen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -117,6 +120,7 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %val, < ; GFX908-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (<2 x s16>), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_bothen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 @@ -151,6 +155,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %va ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY]], [[REG_SEQUENCE]], [[COPY5]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0 @@ -183,6 +188,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -216,6 +222,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val 
; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1 @@ -251,6 +258,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %va ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY6]], %subreg.sub1 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY]], [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY7]], 0, 2, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll index b34c52425758f..aa9ebb9226cdd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll @@ -14,6 +14,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) ; GFX940-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic ; GFX11: bb.1 (%ir-block.0): ; 
GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -40,6 +41,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic ; GFX11: bb.1 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -66,6 +68,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %da ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) ; GFX940-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw ; GFX11: bb.1 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -92,6 +95,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw ; GFX11: bb.1 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll index 3dea9777cfda8..c71beff4dc5c1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.v2f16-no-rtn.ll @@ -14,6 +14,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1 ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -40,6 +41,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrs ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -66,6 +68,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrsp ; GFX908-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (volatile dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -92,6 +95,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR [[V_MOV_B32_e32_]], [[COPY2]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile 
dereferenceable load store (<2 x s16>) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic ; GFX90A_GFX940: bb.1 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll index 6b2e6d8dfdb39..f2fe815a71202 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll @@ -17,6 +17,7 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw ; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8) ; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -51,6 +52,7 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe ; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8) ; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: i8_zext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -85,6 +87,7 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe ; HSA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s8) ; HSA-VI-NEXT: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: i8_sext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -119,6 +122,7 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou ; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16) ; 
HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: i16_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -153,6 +157,7 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer ; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16) ; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: i16_zext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -187,6 +192,7 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig ; HSA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s16) ; HSA-VI-NEXT: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: i16_sext_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -220,6 +226,7 @@ define amdgpu_kernel void @i32_arg(ptr addrspace(1) nocapture %out, i32 %in) nou ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32), align 8, addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: i32_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -252,6 +259,7 @@ define amdgpu_kernel void @f32_arg(ptr addrspace(1) nocapture %out, float %in) n ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32), align 8, addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: f32_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: 
$sgpr0_sgpr1 @@ -284,6 +292,7 @@ define amdgpu_kernel void @v2i8_arg(ptr addrspace(1) %out, <2 x i8> %in) { ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s8>), align 8, addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<2 x s8>), [[LOAD]](p1) :: (store (<2 x s8>) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v2i8_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -316,6 +325,7 @@ define amdgpu_kernel void @v2i16_arg(ptr addrspace(1) %out, <2 x i16> %in) { ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s16>), align 8, addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<2 x s16>), [[LOAD]](p1) :: (store (<2 x s16>) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v2i16_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -348,6 +358,7 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>), addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store (<2 x s32>) into %ir.out, align 4, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v2i32_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -380,6 +391,7 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<2 x s32>), addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store (<2 x s32>) into %ir.out, align 4, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v2f32_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -412,6 +424,7 @@ 
define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s8>), align 8, addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<3 x s8>), [[LOAD]](p1) :: (store (<3 x s8>) into %ir.out, align 4, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v3i8_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -444,6 +457,7 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s16>), align 8, addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<3 x s16>), [[LOAD]](p1) :: (store (<3 x s16>) into %ir.out, align 4, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v3i16_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -476,6 +490,7 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32> ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s32>), align 16, addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store (<3 x s32>) into %ir.out, align 4, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v3i32_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -508,6 +523,7 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<3 x s32>), align 16, addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store (<3 x s32>) into %ir.out, align 4, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v3f32_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ 
-540,6 +556,7 @@ define amdgpu_kernel void @v4i8_arg(ptr addrspace(1) %out, <4 x i8> %in) { ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s8>), align 8, addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<4 x s8>), [[LOAD]](p1) :: (store (<4 x s8>) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v4i8_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -572,6 +589,7 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s16>), addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<4 x s16>), [[LOAD]](p1) :: (store (<4 x s16>) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v4i16_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -604,6 +622,7 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32> ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s32>), addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store (<4 x s32>) into %ir.out, align 4, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v4i32_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -636,6 +655,7 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<4 x s32>), addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store (<4 x s32>) into %ir.out, align 4, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v4f32_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -668,6 +688,7 @@ define amdgpu_kernel void 
@v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s8>), addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<8 x s8>), [[LOAD]](p1) :: (store (<8 x s8>) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v8i8_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -700,6 +721,7 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s16>), addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<8 x s16>), [[LOAD]](p1) :: (store (<8 x s16>) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v8i16_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -732,6 +754,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s32>), align 16, addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store (<8 x s32>) into %ir.out, align 4, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v8i32_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -764,6 +787,7 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<8 x s32>), align 16, addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store (<8 x s32>) into %ir.out, align 4, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v8f32_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -796,6 +820,7 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, 
<16 x i8> %in) { ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s8>), addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<16 x s8>), [[LOAD]](p1) :: (store (<16 x s8>) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v16i8_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -828,6 +853,7 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s16>), align 16, addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<16 x s16>), [[LOAD]](p1) :: (store (<16 x s16>) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v16i16_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -860,6 +886,7 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32 ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s32>), align 16, addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store (<16 x s32>) into %ir.out, align 4, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v16i32_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -892,6 +919,7 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (<16 x s32>), align 16, addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store (<16 x s32>) into %ir.out, align 4, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v16f32_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -924,6 +952,7 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) 
%out, i64 %a) nounwin ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64), addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: kernel_arg_i64 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -955,6 +984,7 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) { ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s64), addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: f64_kernel_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -987,6 +1017,7 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind { ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s1), align 8, addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD1]](s1), [[LOAD]](p1) :: (store (s1) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: i1_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1019,6 +1050,7 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s1) ; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_zext_i32 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1053,6 +1085,7 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD1]](s1) ; HSA-VI-NEXT: G_STORE [[ZEXT]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, 
addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_zext_i64 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1087,6 +1120,7 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin ; HSA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s1) ; HSA-VI-NEXT: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_sext_i32 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1121,6 +1155,7 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin ; HSA-VI-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD1]](s1) ; HSA-VI-NEXT: G_STORE [[SEXT]](s64), [[LOAD]](p1) :: (store (s64) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: i1_arg_sext_i64 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1154,6 +1189,7 @@ define amdgpu_kernel void @empty_struct_arg({} %arg0, i32 %arg1) nounwind { ; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; HSA-VI-NEXT: G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: empty_struct_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1181,6 +1217,7 @@ define amdgpu_kernel void @empty_array_arg([0 x i8] %arg0, i32 %arg1) nounwind { ; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; HSA-VI-NEXT: G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: empty_array_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1232,6 +1269,7 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8 %pad, ; HSA-VI-NEXT: G_STORE [[LOAD3]](s32), 
[[C5]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, addrspace 1) ; HSA-VI-NEXT: G_STORE [[LOAD4]](s64), [[C5]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: struct_argument_alignment ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1299,6 +1337,7 @@ define amdgpu_kernel void @pointer_in_struct_argument({ptr addrspace(3), ptr add ; HSA-VI-NEXT: G_STORE [[LOAD3]](p3), [[C5]](p1) :: (volatile store (p3) into `ptr addrspace(1) null`, addrspace 1) ; HSA-VI-NEXT: G_STORE [[LOAD4]](p1234), [[C5]](p1) :: (volatile store (p1234) into `ptr addrspace(1) null`, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: pointer_in_struct_argument ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1364,6 +1403,7 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, ; HSA-VI-NEXT: G_STORE [[LOAD2]](s32), [[C4]](p1) :: (volatile store (s32) into `ptr addrspace(1) null`, addrspace 1) ; HSA-VI-NEXT: G_STORE [[LOAD3]](s64), [[C4]](p1) :: (volatile store (s64) into `ptr addrspace(1) null`, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: packed_struct_argument_alignment ; LEGACY-MESA-VI: bb.1 (%ir-block.1): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1405,6 +1445,7 @@ define amdgpu_kernel void @unused_i32_arg(ptr addrspace(1) nocapture %out, i32 % ; HSA-VI-NEXT: {{ $}} ; HSA-VI-NEXT: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5 ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: unused_i32_arg ; LEGACY-MESA-VI: bb.1.entry: ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1431,6 +1472,7 @@ define amdgpu_kernel void @byref_constant_i8_arg(ptr addrspace(1) nocapture %out ; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8) ; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: 
S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1466,6 +1508,7 @@ define amdgpu_kernel void @byref_constant_i16_arg(ptr addrspace(1) nocapture %ou ; HSA-VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16) ; HSA-VI-NEXT: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i16_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1504,6 +1547,7 @@ define amdgpu_kernel void @byref_constant_i32_arg(ptr addrspace(1) nocapture %ou ; HSA-VI-NEXT: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1545,6 +1589,7 @@ define amdgpu_kernel void @byref_constant_v4i32_arg(ptr addrspace(1) nocapture % ; HSA-VI-NEXT: G_STORE [[LOAD2]](<4 x s32>), [[LOAD]](p1) :: (volatile store (<4 x s32>) into %ir.out, align 4, addrspace 1) ; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_v4i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1586,6 +1631,7 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu ; HSA-VI-NEXT: G_STORE [[LOAD2]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: byref_align_constant_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; 
LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1627,6 +1673,7 @@ define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(ptr addrspace ; HSA-VI-NEXT: G_STORE [[LOAD2]](<16 x s32>), [[LOAD]](p1) :: (volatile store (<16 x s32>) into %ir.out, align 4, addrspace 1) ; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: byref_natural_align_constant_v16i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.1): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1666,6 +1713,7 @@ define amdgpu_kernel void @byref_global_i32_arg(ptr addrspace(1) nocapture %out, ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p1) :: (dereferenceable "amdgpu-noclobber" load (s32) from %ir.in.byref, addrspace 1) ; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: byref_global_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1700,6 +1748,7 @@ define amdgpu_kernel void @byref_flat_i32_arg(ptr addrspace(1) nocapture %out, p ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p0) :: (dereferenceable load (s32) from %ir.in.byref) ; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: byref_flat_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1734,6 +1783,7 @@ define amdgpu_kernel void @byref_constant_32bit_i32_arg(ptr addrspace(1) nocaptu ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p6) :: (dereferenceable invariant load (s32) from %ir.in.byref, addrspace 6) ; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: byref_constant_32bit_i32_arg ; 
LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1768,6 +1818,7 @@ define amdgpu_kernel void @byref_unknown_as_i32_arg(ptr addrspace(1) nocapture % ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p999) :: (dereferenceable load (s32) from %ir.in.byref, addrspace 999) ; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: byref_unknown_as_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1803,6 +1854,7 @@ define amdgpu_kernel void @byref_local_i32_arg(ptr addrspace(1) nocapture %out, ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p3) :: (dereferenceable load (s32) from %ir.in.byref, addrspace 3) ; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: byref_local_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1844,6 +1896,7 @@ define amdgpu_kernel void @multi_byref_constant_i32_arg(ptr addrspace(1) nocaptu ; HSA-VI-NEXT: G_STORE [[LOAD3]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (volatile store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: multi_byref_constant_i32_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1885,6 +1938,7 @@ define amdgpu_kernel void @byref_constant_i32_arg_offset0(ptr addrspace(4) byref ; HSA-VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32) from %ir.in.byref, addrspace 4) ; HSA-VI-NEXT: G_STORE [[LOAD]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: 
byref_constant_i32_arg_offset0 ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1913,6 +1967,7 @@ define amdgpu_kernel void @p3i8_arg(ptr addrspace(3) %arg) nounwind { ; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 9 ; HSA-VI-NEXT: G_STORE [[C1]](s8), [[LOAD]](p3) :: (store (s8) into %ir.arg, align 4, addrspace 3) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: p3i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1938,6 +1993,7 @@ define amdgpu_kernel void @p1i8_arg(ptr addrspace(1) %arg) nounwind { ; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 0 ; HSA-VI-NEXT: G_STORE [[C]](s8), [[C1]](p3) :: (store (s8) into `ptr addrspace(3) null`, addrspace 3) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: p1i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1963,6 +2019,7 @@ define amdgpu_kernel void @v2p1i8_arg(<2 x ptr addrspace(1)> %arg) nounwind { ; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; HSA-VI-NEXT: G_STORE [[LOAD]](<2 x p1>), [[DEF]](p1) :: (store (<2 x p1>) into `ptr addrspace(1) undef`, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v2p1i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -1990,6 +2047,7 @@ define amdgpu_kernel void @v2p3i8_arg(<2 x ptr addrspace(3)> %arg) nounwind { ; HSA-VI-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; HSA-VI-NEXT: G_STORE [[LOAD]](<2 x p3>), [[DEF]](p1) :: (store (<2 x p3>) into `ptr addrspace(1) undef`, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v2p3i8_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 @@ -2023,6 +2081,7 @@ define amdgpu_kernel void @v2p1i8_in_struct_arg({ <2 x ptr addrspace(1)>, <2 x p ; HSA-VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[DEF]], [[C2]](s64) ; HSA-VI-NEXT: G_STORE [[LOAD1]](<2 x p3>), 
[[PTR_ADD2]](p1) :: (store (<2 x p3>) into `ptr addrspace(1) undef` + 16, align 16, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 + ; ; LEGACY-MESA-VI-LABEL: name: v2p1i8_in_struct_arg ; LEGACY-MESA-VI: bb.1 (%ir-block.0): ; LEGACY-MESA-VI-NEXT: liveins: $sgpr0_sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll index 27eb0b7682ec6..3a31ab4ab9d0a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll @@ -36,8 +36,8 @@ define float @v_constained_fadd_f32_fpexcept_ignore(float %x, float %y) #0 { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK-NEXT: %2:_(s32) = nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]] - ; CHECK-NEXT: $vgpr0 = COPY %2(s32) + ; CHECK-NEXT: [[STRICT_FADD:%[0-9]+]]:_(s32) = nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY [[STRICT_FADD]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret float %val @@ -50,8 +50,8 @@ define float @v_constained_fadd_f32_fpexcept_ignore_flags(float %x, float %y) #0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK-NEXT: %2:_(s32) = nsz nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]] - ; CHECK-NEXT: $vgpr0 = COPY %2(s32) + ; CHECK-NEXT: [[STRICT_FADD:%[0-9]+]]:_(s32) = nsz nofpexcept G_STRICT_FADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY [[STRICT_FADD]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %val = call nsz float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret float %val @@ -102,8 +102,8 @@ define <2 x float> 
@v_constained_fadd_v2f32_fpexcept_ignore(<2 x float> %x, <2 x ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32) - ; CHECK-NEXT: %6:_(<2 x s32>) = nofpexcept G_STRICT_FADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES %6(<2 x s32>) + ; CHECK-NEXT: [[STRICT_FADD:%[0-9]+]]:_(<2 x s32>) = nofpexcept G_STRICT_FADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[STRICT_FADD]](<2 x s32>) ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 @@ -138,8 +138,8 @@ define float @v_constained_fsub_f32_fpexcept_ignore_flags(float %x, float %y) #0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK-NEXT: %2:_(s32) = nsz nofpexcept G_STRICT_FSUB [[COPY]], [[COPY1]] - ; CHECK-NEXT: $vgpr0 = COPY %2(s32) + ; CHECK-NEXT: [[STRICT_FSUB:%[0-9]+]]:_(s32) = nsz nofpexcept G_STRICT_FSUB [[COPY]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY [[STRICT_FSUB]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %val = call nsz float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret float %val @@ -152,8 +152,8 @@ define float @v_constained_fmul_f32_fpexcept_ignore_flags(float %x, float %y) #0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK-NEXT: %2:_(s32) = nsz nofpexcept G_STRICT_FMUL [[COPY]], [[COPY1]] - ; CHECK-NEXT: $vgpr0 = COPY %2(s32) + ; CHECK-NEXT: [[STRICT_FMUL:%[0-9]+]]:_(s32) = nsz nofpexcept G_STRICT_FMUL [[COPY]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY [[STRICT_FMUL]](s32) ; 
CHECK-NEXT: SI_RETURN implicit $vgpr0 %val = call nsz float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret float %val @@ -166,8 +166,8 @@ define float @v_constained_fdiv_f32_fpexcept_ignore_flags(float %x, float %y) #0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK-NEXT: %2:_(s32) = nsz nofpexcept G_STRICT_FDIV [[COPY]], [[COPY1]] - ; CHECK-NEXT: $vgpr0 = COPY %2(s32) + ; CHECK-NEXT: [[STRICT_FDIV:%[0-9]+]]:_(s32) = nsz nofpexcept G_STRICT_FDIV [[COPY]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY [[STRICT_FDIV]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %val = call nsz float @llvm.experimental.constrained.fdiv.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret float %val @@ -180,8 +180,8 @@ define float @v_constained_frem_f32_fpexcept_ignore_flags(float %x, float %y) #0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; CHECK-NEXT: %2:_(s32) = nsz nofpexcept G_STRICT_FREM [[COPY]], [[COPY1]] - ; CHECK-NEXT: $vgpr0 = COPY %2(s32) + ; CHECK-NEXT: [[STRICT_FREM:%[0-9]+]]:_(s32) = nsz nofpexcept G_STRICT_FREM [[COPY]], [[COPY1]] + ; CHECK-NEXT: $vgpr0 = COPY [[STRICT_FREM]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %val = call nsz float @llvm.experimental.constrained.frem.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret float %val @@ -195,8 +195,8 @@ define float @v_constained_fma_f32_fpexcept_ignore_flags(float %x, float %y, flo ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; CHECK-NEXT: %3:_(s32) = nsz nofpexcept G_STRICT_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; CHECK-NEXT: $vgpr0 = COPY %3(s32) + ; CHECK-NEXT: [[STRICT_FMA:%[0-9]+]]:_(s32) = 
nsz nofpexcept G_STRICT_FMA [[COPY]], [[COPY1]], [[COPY2]] + ; CHECK-NEXT: $vgpr0 = COPY [[STRICT_FMA]](s32) ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %val = call nsz float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %z, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret float %val diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll index 75337b76994e6..101bb6c0ed123 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-metadata.ll @@ -5,12 +5,12 @@ define i32 @reloc_constant() { ; CHECK-LABEL: name: reloc_constant ; CHECK: bb.1 (%ir-block.0): - ; CHECK: [[INT0:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), !0 + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), !0 ; We cannot have any specific metadata check here as ConstantAsMetadata is printed as - ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), <0x{{[0-9a-f]+}}> - ; CHECK: [[SUM:%[0-9]+]]:_(s32) = G_ADD [[INT0]], [[INT1]] - ; CHECK: $vgpr0 = COPY [[SUM]](s32) - ; CHECK: SI_RETURN implicit $vgpr0 + ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.reloc.constant), <0x{{[0-9a-f]+}}> + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[INT]], [[INT1]] + ; CHECK-NEXT: $vgpr0 = COPY [[ADD]](s32) + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 %val0 = call i32 @llvm.amdgcn.reloc.constant(metadata !0) %val1 = call i32 @llvm.amdgcn.reloc.constant(metadata i32 4) %res = add i32 %val0, %val1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll index 12aa8de2baf43..4d36e0f797016 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.2darraymsaa.ll @@ -30,6 +30,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3 ; GFX6-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX6-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX10NSA-LABEL: name: load_2darraymsaa ; GFX10NSA: bb.1 (%ir-block.0): ; GFX10NSA-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -88,6 +89,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad ; GFX6-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX6-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX10NSA-LABEL: name: load_2darraymsaa_tfe ; GFX10NSA: bb.1 (%ir-block.0): ; GFX10NSA-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $vgpr0, $vgpr1, $vgpr2, $vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll index f15307563f7b8..2c155b72c649f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.load.3d.ll @@ -25,6 +25,7 @@ define amdgpu_ps float @image_load_3d_f32(<8 x i32> inreg %rsrc, i32 %s, i32 %t, ; GFX6-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.load.3d), 1, [[BUILD_VECTOR1]](<3 x s32>), $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable load (s32), addrspace 8) ; GFX6-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX10NSA-LABEL: name: image_load_3d_f32 ; GFX10NSA: bb.1 (%ir-block.0): ; GFX10NSA-NEXT: liveins: 
$sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 @@ -72,6 +73,7 @@ define amdgpu_ps float @image_load_3d_tfe_f32(<8 x i32> inreg %rsrc, i32 %s, i32 ; GFX6-NEXT: G_STORE [[UV1]](s32), [[DEF]](p1) :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1) ; GFX6-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX10NSA-LABEL: name: image_load_3d_tfe_f32 ; GFX10NSA: bb.1 (%ir-block.0): ; GFX10NSA-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll index 0d7d3abd918ce..241170b94318a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.d.ll @@ -39,6 +39,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_d_3d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 @@ -74,6 +75,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_d_3d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, 
$vgpr6, $vgpr7, $vgpr8 @@ -151,6 +153,7 @@ define amdgpu_ps <4 x float> @sample_c_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_c_d_3d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 @@ -187,6 +190,7 @@ define amdgpu_ps <4 x float> @sample_c_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_c_d_3d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 @@ -266,6 +270,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_3d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_c_d_cl_3d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 @@ -303,6 +308,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_3d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: 
sample_c_d_cl_3d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 @@ -384,6 +390,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_o_3d(<8 x i32> inreg %rsrc, <4 x i32 ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_c_d_cl_o_3d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 @@ -422,6 +429,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_o_3d(<8 x i32> inreg %rsrc, <4 x i32 ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_c_d_cl_o_3d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll index 288c46f5f0f2b..f05b258c974d1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.a16.ll @@ -39,6 +39,7 @@ define amdgpu_ps <4 x float> @sample_d_1d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3 ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: 
SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_d_1d_g16_a16 ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 @@ -74,6 +75,7 @@ define amdgpu_ps <4 x float> @sample_d_1d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3 ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_d_1d_g16_a16 ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 @@ -155,6 +157,7 @@ define amdgpu_ps <4 x float> @sample_d_2d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3 ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_d_2d_g16_a16 ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 @@ -195,6 +198,7 @@ define amdgpu_ps <4 x float> @sample_d_2d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3 ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_d_2d_g16_a16 ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 @@ -292,6 +296,7 @@ define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3 ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; 
GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_d_3d_g16_a16 ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 @@ -343,6 +348,7 @@ define amdgpu_ps <4 x float> @sample_d_3d_g16_a16(<8 x i32> inreg %rsrc, <4 x i3 ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_d_3d_g16_a16 ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll index b36b35937cf8b..cc2a8ba9c4d5d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.sample.g16.ll @@ -38,6 +38,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_d_1d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 @@ -72,6 +73,7 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) 
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_d_1d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 @@ -151,6 +153,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_d_2d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 @@ -190,6 +193,7 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_d_2d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 @@ -284,6 +288,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_d_3d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 @@ -333,6 +338,7 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX11-NEXT: 
$vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_d_3d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 @@ -424,6 +430,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_c_d_1d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -460,6 +467,7 @@ define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_c_d_1d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -543,6 +551,7 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_c_d_2d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 @@ -584,6 +593,7 @@ define amdgpu_ps <4 x 
float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_c_d_2d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 @@ -668,6 +678,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_d_cl_1d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -704,6 +715,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_d_cl_1d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -787,6 +799,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_d_cl_2d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, 
$vgpr4, $vgpr5, $vgpr6 @@ -828,6 +841,7 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_d_cl_2d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 @@ -914,6 +928,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_c_d_cl_1d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 @@ -952,6 +967,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_c_d_cl_1d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 @@ -1041,6 +1057,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_c_d_cl_2d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, 
$sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 @@ -1085,6 +1102,7 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_c_d_cl_2d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 @@ -1169,6 +1187,7 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_cd_1d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 @@ -1203,6 +1222,7 @@ define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_cd_1d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2 @@ -1282,6 +1302,7 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_cd_2d ; GFX11: bb.1.main_body: ; 
GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 @@ -1321,6 +1342,7 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_cd_2d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 @@ -1402,6 +1424,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_c_cd_1d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -1438,6 +1461,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_c_cd_1d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -1521,6 +1545,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; 
GFX11-LABEL: name: sample_c_cd_2d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 @@ -1562,6 +1587,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_c_cd_2d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 @@ -1646,6 +1672,7 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_cd_cl_1d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -1682,6 +1709,7 @@ define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_cd_cl_1d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3 @@ -1765,6 +1793,7 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG 
implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_cd_cl_2d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 @@ -1806,6 +1835,7 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_cd_cl_2d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 @@ -1892,6 +1922,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_c_cd_cl_1d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 @@ -1930,6 +1961,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_c_cd_cl_1d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 @@ -2019,6 +2051,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: $vgpr2 = 
COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX11-LABEL: name: sample_c_cd_cl_2d ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 @@ -2063,6 +2096,7 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX11-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX12-LABEL: name: sample_c_cd_cl_2d ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 @@ -2155,6 +2189,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[CONCAT_VECTORS]](<14 x s16>), $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (s32), addrspace 8) ; GFX10-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: sample_c_d_o_2darray_V1 ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 @@ -2197,6 +2232,7 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX11-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_INTRIN_IMAGE_LOAD 
intrinsic(@llvm.amdgcn.image.sample.c.d.o.2darray), 4, [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>), [[BUILD_VECTOR3]](<2 x s16>), [[CONCAT_VECTORS]](<6 x s16>), $noreg, $noreg, $noreg, $noreg, [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 2 :: (dereferenceable load (s32), addrspace 8) ; GFX11-NEXT: $vgpr0 = COPY [[AMDGPU_INTRIN_IMAGE_LOAD]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX12-LABEL: name: sample_c_d_o_2darray_V1 ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 @@ -2289,6 +2325,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; ; GFX11-LABEL: name: sample_c_d_o_2darray_V2 ; GFX11: bb.1.main_body: ; GFX11-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 @@ -2333,6 +2370,7 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX11-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX11-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1 + ; ; GFX12-LABEL: name: sample_c_d_o_2darray_V2 ; GFX12: bb.1.main_body: ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll index 7bde96d0be9b6..c2799e5836a97 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll @@ -367,33 +367,61 @@ define amdgpu_ps float @raw_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sgpr_ ; Natural mapping define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, i64 %cmp, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; CHECK-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], 
[[COPY9]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec - ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec - ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX8-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; GFX8: bb.1 (%ir-block.0): + ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE 
[[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1 + ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0 + ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1 + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], 
%subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1 + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i64 %ret to double ret double %cast @@ -401,97 +429,184 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr ; Natural mapping define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, i64 %cmp, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { - ; CHECK-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; CHECK-NEXT: {{ $}} 
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_OFFEN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; CHECK-NEXT: S_ENDPGM 0 + ; GFX8-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; GFX8: bb.1 (%ir-block.0): + ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], 
%subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_OFFEN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX8-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], 
%subreg.sub3 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX12-NEXT: S_ENDPGM 0 %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } ; All operands need regbank legalization define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i64 inreg %val, i64 inreg %cmp, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { - ; CHECK-LABEL: name: raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], 
%subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec - ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1 - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3 - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec - ; CHECK-NEXT: 
[[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 - ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1 - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: - ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0 - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec - ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec - ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]] - ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX8-LABEL: name: 
raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; GFX8: bb.1 (%ir-block.0): + ; GFX8-NEXT: successors: %bb.2(0x80000000) + ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.2: + ; GFX8-NEXT: successors: %bb.3(0x80000000) + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 
[[COPY7]], implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 + ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 + ; GFX8-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1 + ; GFX8-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3 + ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec + ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec + ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec + ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 + ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX8-NEXT: 
[[COPY17:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1 + ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.4: + ; GFX8-NEXT: successors: %bb.5(0x80000000) + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.5: + ; GFX8-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0 + ; GFX8-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1 + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec + ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]] + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec + ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]] + ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = 
REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1 + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3 + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 + ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1 + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub0 + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY17]].sub1 + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec + ; GFX12-NEXT: $sgpr1 = COPY 
[[V_READFIRSTLANE_B32_6]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i64 %ret to double ret double %cast @@ -499,96 +614,183 @@ define amdgpu_ps double @raw_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr ; All operands need regbank legalization define amdgpu_ps void @raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i64 inreg %val, i64 inreg %cmp, <4 x i32> %rsrc, i32 inreg %voffset, i32 %soffset) { - ; CHECK-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: 
[[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec - ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1 - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3 - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec - ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - 
; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 - ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: - ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: S_ENDPGM 0 + ; GFX8-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; GFX8: bb.1 (%ir-block.0): + ; GFX8-NEXT: successors: %bb.2(0x80000000) + ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + 
; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.2: + ; GFX8-NEXT: successors: %bb.3(0x80000000) + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 + ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 + ; GFX8-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1 + ; GFX8-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3 + ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec + ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec + ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX8-NEXT: 
[[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec + ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 + ; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.4: + ; GFX8-NEXT: successors: %bb.5(0x80000000) + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.5: + ; GFX8-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = 
COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1 + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3 + ; GFX12-NEXT: 
[[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY15]], [[COPY13]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY16]], [[COPY14]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY9]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY10]], %subreg.sub0_sub1, [[COPY11]], %subreg.sub2_sub3 + ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN [[REG_SEQUENCE4]], [[COPY12]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: S_ENDPGM 0 %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void } define amdgpu_ps double 
@raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095(i64 %val, i64 %cmp, <4 x i32> inreg %rsrc, i32 %voffset.base, i32 inreg %soffset) { - ; CHECK-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095 - ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0 - ; 
CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec - ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec - ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX8-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095 + ; GFX8: bb.1 (%ir-block.0): + ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 4095, 1, implicit 
$exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_OFFEN_RTN]].sub0_sub1 + ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0 + ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1 + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX12-LABEL: name: raw_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset__voffset_add4095 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE 
[[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN [[REG_SEQUENCE3]], [[COPY8]], [[REG_SEQUENCE2]], [[COPY9]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_OFFEN_RTN]].sub0_sub1 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub0 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY10]].sub1 + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %voffset = add i32 %voffset.base, 4095 %ret = call i64 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i64 %ret to double diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll index 70d22df868be4..f9f70ecadfe60 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll @@ -390,35 +390,65 @@ define amdgpu_ps float @struct_buffer_atomic_cmpswap_i32__vgpr_val__vgpr_cmp__sg ; Natural mapping define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, i64 %cmp, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; CHECK-LABEL: name: 
struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 - ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = 
COPY [[COPY11]].sub1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec - ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec - ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX8-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; GFX8: bb.1 (%ir-block.0): + ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX8-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 + ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX8-NEXT: 
[[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1 + ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0 + ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1 + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; 
GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1 + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i64 %ret to double ret double %cast @@ -426,102 +456,194 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__s ; Natural mapping define amdgpu_ps void @struct_buffer_atomic_cmpswap_noret_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset(i64 %val, i64 %cmp, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { - ; CHECK-LABEL: name: struct_buffer_atomic_cmpswap_noret_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset - ; 
CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 - ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; CHECK-NEXT: S_ENDPGM 0 + ; GFX8-LABEL: name: struct_buffer_atomic_cmpswap_noret_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; GFX8: bb.1 (%ir-block.0): + ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; 
GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX8-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 + ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX8-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_noret_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; 
GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX12-NEXT: S_ENDPGM 0 %ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } ; All operands need legalization define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i64 inreg %val, i64 inreg %cmp, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { - ; CHECK-LABEL: name: struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 
= COPY $sgpr2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec - ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, 
[[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1 - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3 - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec - ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1 - ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 - ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:vreg_64 = 
COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1 - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: - ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0 - ; CHECK-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec - ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec - ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]] - ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX8-LABEL: name: struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; GFX8: bb.1 (%ir-block.0): + ; GFX8-NEXT: successors: %bb.2(0x80000000) + ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE 
[[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7 + ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.2: + ; GFX8-NEXT: successors: %bb.3(0x80000000) + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX8-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 + ; GFX8-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 + ; GFX8-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1 + ; GFX8-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3 + ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec + ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec + ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], 
implicit-def dead $scc + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec + ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1 + ; GFX8-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 + ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX8-NEXT: [[COPY19:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1 + ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.4: + ; GFX8-NEXT: successors: %bb.5(0x80000000) + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.5: + ; GFX8-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0 + ; GFX8-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1 + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec + ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]] + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY21]], implicit $exec + ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]] + ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1 + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3 + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: 
{{ $}} + ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 + ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX12-NEXT: [[COPY19:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1 + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub0 + ; GFX12-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY19]].sub1 + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_5]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec + ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_6]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i64 %ret to double ret double %cast @@ -529,101 +651,193 @@ define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__sgpr_val__sgpr_cmp__v ; All operands need legalization define amdgpu_ps void 
@struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset(i64 inreg %val, i64 inreg %cmp, <4 x i32> %rsrc, i32 inreg %vindex, i32 inreg %voffset, i32 %soffset) { - ; CHECK-LABEL: name: struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset - ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] - ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: successors: %bb.3(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: 
[[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec - ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1 - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3 - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec - ; CHECK-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec - ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: 
[[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1 - ; CHECK-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 - ; CHECK-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.4: - ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: S_ENDPGM 0 + ; GFX8-LABEL: name: struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; GFX8: bb.1 (%ir-block.0): + ; GFX8-NEXT: successors: %bb.2(0x80000000) + ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; 
GFX8-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX8-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7 + ; GFX8-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX8-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX8-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.2: + ; GFX8-NEXT: successors: %bb.3(0x80000000) + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX8-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 + ; GFX8-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub2_sub3 + ; GFX8-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1 + ; GFX8-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3 + ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec + ; GFX8-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec + ; GFX8-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 
[[COPY10]], implicit $exec + ; GFX8-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec + ; GFX8-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[S_AND_B64_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GFX8-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.3: + ; GFX8-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1 + ; GFX8-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 + ; GFX8-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX8-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; GFX8-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.4: + ; GFX8-NEXT: successors: %bb.5(0x80000000) + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: $exec = S_MOV_B64_term [[S_MOV_B64_]] + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: bb.5: + ; GFX8-NEXT: S_ENDPGM 0 + ; + ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i64_noret__sgpr_val__sgpr_cmp__vgpr_rsrc__sgpr_voffset__vgpr_soffset + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: successors: %bb.2(0x80000000) + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX12-NEXT: 
[[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr7 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.2: + ; GFX12-NEXT: successors: %bb.3(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub0_sub1 + ; GFX12-NEXT: [[COPY16:%[0-9]+]]:vreg_64 = COPY 
[[REG_SEQUENCE2]].sub2_sub3 + ; GFX12-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1 + ; GFX12-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3 + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX12-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY10]], implicit $exec + ; GFX12-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[S_AND_B32_]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; GFX12-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_1]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.3: + ; GFX12-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY13]], %subreg.sub0, [[COPY14]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY11]], %subreg.sub0_sub1, [[COPY12]], %subreg.sub2_sub3 + ; GFX12-NEXT: BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX12-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; GFX12-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.4: + ; GFX12-NEXT: successors: %bb.5(0x80000000) + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: 
$exec_lo = S_MOV_B32_term [[S_MOV_B32_]] + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: bb.5: + ; GFX12-NEXT: S_ENDPGM 0 %ret = call i64 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void } define amdgpu_ps double @struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095(i64 %val, i64 %cmp, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset.base, i32 inreg %soffset) { - ; CHECK-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095 - ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 - ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5 - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; CHECK-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 - ; CHECK-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE 
[[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 - ; CHECK-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1 - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0 - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1 - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec - ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec - ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] - ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; GFX8-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095 + ; GFX8: bb.1 (%ir-block.0): + ; GFX8-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX8-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX8-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX8-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE 
[[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX8-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX8-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX8-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX8-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 + ; GFX8-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX8-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX8-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_BOTHEN_RTN]].sub0_sub1 + ; GFX8-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0 + ; GFX8-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1 + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX8-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX8-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX8-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ; + ; GFX12-LABEL: name: struct_buffer_atomic_cmpswap_i64__vgpr_val__vgpr_cmp__sgpr_rsrc__vgpr_voffset__sgpr_soffset_voffset_add4095 + ; GFX12: bb.1 (%ir-block.0): + ; GFX12-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; 
GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY9]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GFX12-NEXT: [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN:%[0-9]+]]:vreg_128 = BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN [[REG_SEQUENCE4]], [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY10]], 4095, 1, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 8) + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vreg_64 = COPY [[BUFFER_ATOMIC_CMPSWAP_X2_VBUFFER_BOTHEN_RTN]].sub0_sub1 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub0 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY11]].sub1 + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; GFX12-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX12-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX12-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %voffset = add i32 %voffset.base, 4095 %ret = call i64 
@llvm.amdgcn.struct.buffer.atomic.cmpswap.i64(i64 %val, i64 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) %cast = bitcast i64 %ret to double diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll index 8c4ce1caa8d8d..61263e0efa2ea 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll @@ -23,6 +23,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 ; FAST-NEXT: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 + ; ; GREEDY-LABEL: name: load_1d_vgpr_vaddr__sgpr_srsrc ; GREEDY: bb.1 (%ir-block.0): ; GREEDY-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0 @@ -69,6 +70,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 ; FAST-NEXT: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 + ; ; GREEDY-LABEL: name: load_1d_sgpr_vaddr__sgpr_srsrc ; GREEDY: bb.1 (%ir-block.0): ; GREEDY-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10 @@ -157,6 +159,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; FAST-NEXT: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY9]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 + ; ; GREEDY-LABEL: name: load_1d_vgpr_vaddr__vgpr_srsrc ; GREEDY: bb.1 (%ir-block.0): ; GREEDY-NEXT: successors: 
%bb.2(0x80000000) @@ -287,6 +290,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; FAST-NEXT: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[COPY10]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 + ; ; GREEDY-LABEL: name: load_1d_sgpr_vaddr__vgpr_srsrc ; GREEDY: bb.1 (%ir-block.0): ; GREEDY-NEXT: successors: %bb.2(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll index 9833cd1318e5f..d6a7ae8d867fe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll @@ -27,6 +27,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 + ; ; GREEDY-LABEL: name: sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp ; GREEDY: bb.1 (%ir-block.0): ; GREEDY-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $vgpr0 @@ -81,6 +82,7 @@ define amdgpu_ps void @sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre ; FAST-NEXT: [[AMDGPU_INTRIN_IMAGE_LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0, 0 :: (dereferenceable load (<4 x s32>), addrspace 8) 
; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 + ; ; GREEDY-LABEL: name: sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp ; GREEDY: bb.1 (%ir-block.0): ; GREEDY-NEXT: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14 @@ -177,6 +179,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; FAST-NEXT: bb.5: ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 + ; ; GREEDY-LABEL: name: sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp ; GREEDY: bb.1 (%ir-block.0): ; GREEDY-NEXT: successors: %bb.2(0x80000000) @@ -306,6 +309,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; FAST-NEXT: bb.5: ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 + ; ; GREEDY-LABEL: name: sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp ; GREEDY: bb.1 (%ir-block.0): ; GREEDY-NEXT: successors: %bb.2(0x80000000) @@ -447,6 +451,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; FAST-NEXT: bb.5: ; FAST-NEXT: G_STORE [[AMDGPU_INTRIN_IMAGE_LOAD]](<4 x s32>), [[DEF]](p1) :: (store (<4 x s32>) into `ptr addrspace(1) undef`, addrspace 1) ; FAST-NEXT: S_ENDPGM 0 + ; ; GREEDY-LABEL: name: sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp ; GREEDY: bb.1 (%ir-block.0): ; GREEDY-NEXT: successors: %bb.2(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir index 3176feaf67d88..63b8cb6ffcaae 100644 --- a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir @@ -57,11 +57,13 @@ body: | ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $vgpr0 = 
V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0 + ; ; GFX90A-LABEL: name: a_to_v ; GFX90A: liveins: $agpr0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0 + ; ; GFX940-LABEL: name: a_to_v ; GFX940: liveins: $agpr0 ; GFX940-NEXT: {{ $}} @@ -84,12 +86,14 @@ body: | ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1 ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1 + ; ; GFX90A-LABEL: name: a2_to_v2 ; GFX90A: liveins: $agpr0_agpr1 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1 ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1 + ; ; GFX940-LABEL: name: a2_to_v2 ; GFX940: liveins: $agpr0_agpr1 ; GFX940-NEXT: {{ $}} @@ -114,6 +118,7 @@ body: | ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 + ; ; GFX90A-LABEL: name: a3_to_v3 ; GFX90A: liveins: $agpr0_agpr1_agpr2 ; GFX90A-NEXT: {{ $}} @@ -121,6 +126,7 @@ body: | ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 + ; ; GFX940-LABEL: name: a3_to_v3 ; GFX940: liveins: $agpr0_agpr1_agpr2 ; GFX940-NEXT: {{ $}} @@ -146,6 
+152,7 @@ body: | ; GFX908-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; GFX908-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; ; GFX90A-LABEL: name: a4_to_v4 ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX90A-NEXT: {{ $}} @@ -154,6 +161,7 @@ body: | ; GFX90A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; GFX90A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 + ; ; GFX940-LABEL: name: a4_to_v4 ; GFX940: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX940-NEXT: {{ $}} @@ -185,6 +193,7 @@ body: | ; GFX908-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GFX908-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; ; GFX90A-LABEL: name: a8_to_v8 ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GFX90A-NEXT: {{ $}} @@ -197,6 +206,7 @@ body: | ; GFX90A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GFX90A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; ; GFX940-LABEL: name: a8_to_v8 ; GFX940: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GFX940-NEXT: {{ $}} @@ -239,6 +249,7 @@ body: | ; GFX908-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX908-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; ; GFX90A-LABEL: name: a16_to_v16 ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX90A-NEXT: {{ $}} @@ -259,6 +270,7 @@ body: | ; GFX90A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX90A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; ; GFX940-LABEL: name: a16_to_v16 ; GFX940: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX940-NEXT: {{ $}} @@ -294,11 +306,13 @@ body: | ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0 + ; ; GFX90A-LABEL: name: v_to_a ; GFX90A: liveins: $vgpr0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0 + ; ; GFX940-LABEL: name: v_to_a ; GFX940: liveins: $vgpr0 ; GFX940-NEXT: {{ $}} @@ -320,12 +334,14 @@ body: | ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit 
$vgpr0_vgpr1 ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 + ; ; GFX90A-LABEL: name: v2_to_a2 ; GFX90A: liveins: $vgpr0_vgpr1 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1 ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 + ; ; GFX940-LABEL: name: v2_to_a2 ; GFX940: liveins: $vgpr0_vgpr1 ; GFX940-NEXT: {{ $}} @@ -349,6 +365,7 @@ body: | ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 + ; ; GFX90A-LABEL: name: v3_to_a3 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: {{ $}} @@ -356,6 +373,7 @@ body: | ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 + ; ; GFX940-LABEL: name: v3_to_a3 ; GFX940: liveins: $vgpr0_vgpr1_vgpr2 ; GFX940-NEXT: {{ $}} @@ -381,6 +399,7 @@ body: | ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 + ; ; GFX90A-LABEL: name: v4_to_a4 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: {{ $}} @@ -389,6 +408,7 @@ body: | ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 + ; ; GFX940-LABEL: name: v4_to_a4 ; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX940-NEXT: {{ $}} @@ -419,6 +439,7 @@ body: | ; GFX908-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; ; GFX90A-LABEL: name: v8_to_a8 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: {{ $}} @@ -431,6 +452,7 @@ body: | ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; ; GFX940-LABEL: name: v8_to_a8 ; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX940-NEXT: {{ $}} @@ -473,6 +495,7 @@ body: | ; GFX908-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; ; GFX90A-LABEL: name: v16_to_a16 ; GFX90A: liveins: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX90A-NEXT: {{ $}} @@ -493,6 +516,7 @@ body: | ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; ; GFX940-LABEL: name: v16_to_a16 ; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX940-NEXT: {{ $}} @@ -529,11 +553,13 @@ body: | ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr0, implicit $exec ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0 + ; ; GFX90A-LABEL: name: s_to_a ; GFX90A: liveins: $sgpr0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $sgpr0, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0 + ; ; GFX940-LABEL: name: s_to_a ; GFX940: liveins: $sgpr0 ; GFX940-NEXT: {{ $}} @@ -557,12 +583,14 @@ body: | ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1 ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 + ; ; GFX90A-LABEL: name: s2_to_a2 ; GFX90A: liveins: $sgpr0_sgpr1 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $sgpr0_sgpr1 ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1, implicit 
$exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1 + ; ; GFX940-LABEL: name: s2_to_a2 ; GFX940: liveins: $sgpr0_sgpr1 ; GFX940-NEXT: {{ $}} @@ -589,6 +617,7 @@ body: | ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2 ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 + ; ; GFX90A-LABEL: name: s3_to_a3 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2 ; GFX90A-NEXT: {{ $}} @@ -596,6 +625,7 @@ body: | ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 + ; ; GFX940-LABEL: name: s3_to_a3 ; GFX940: liveins: $sgpr0_sgpr1_sgpr2 ; GFX940-NEXT: {{ $}} @@ -625,6 +655,7 @@ body: | ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 + ; ; GFX90A-LABEL: name: s4_to_a4 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -633,6 +664,7 @@ body: | ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 + ; ; GFX940-LABEL: name: s4_to_a4 ; GFX940: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX940-NEXT: {{ $}} @@ -667,6 +699,7 @@ body: | ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX908-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; 
GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; ; GFX90A-LABEL: name: s6_to_a6 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX90A-NEXT: {{ $}} @@ -677,6 +710,7 @@ body: | ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; ; GFX940-LABEL: name: s6_to_a6 ; GFX940: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX940-NEXT: {{ $}} @@ -717,6 +751,7 @@ body: | ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX908-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; ; GFX90A-LABEL: name: s8_to_a8 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX90A-NEXT: {{ $}} @@ -729,6 +764,7 @@ body: | ; GFX90A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX90A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; ; GFX940-LABEL: name: s8_to_a8 ; GFX940: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX940-NEXT: {{ $}} @@ -787,6 +823,7 @@ body: | ; GFX908-NEXT: $vgpr255 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX908-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; ; GFX90A-LABEL: name: s16_to_a16 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: {{ $}} @@ -807,6 +844,7 @@ body: | ; GFX90A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; ; GFX940-LABEL: name: s16_to_a16 ; GFX940: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX940-NEXT: {{ $}} @@ -841,10 +879,12 @@ body: | ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0 + ; ; GFX90A-LABEL: name: a_to_a ; GFX90A: $agpr1 = IMPLICIT_DEF ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0 + ; ; GFX940-LABEL: name: a_to_a ; GFX940: $agpr1 = IMPLICIT_DEF ; GFX940-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec @@ -869,6 +909,7 @@ body: | ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 + ; ; GFX90A-LABEL: name: a2_to_a2_kill ; GFX90A: liveins: $agpr0_agpr1 ; GFX90A-NEXT: {{ $}} @@ -876,6 
+917,7 @@ body: | ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1, implicit $exec ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 + ; ; GFX940-LABEL: name: a2_to_a2_kill ; GFX940: liveins: $agpr0_agpr1 ; GFX940-NEXT: {{ $}} @@ -905,6 +947,7 @@ body: | ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr3_agpr4 ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit killed $agpr1_agpr2 ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec + ; ; GFX90A-LABEL: name: a2_to_a2_implicit_defs ; GFX90A: liveins: $agpr0_agpr1 ; GFX90A-NEXT: {{ $}} @@ -914,6 +957,7 @@ body: | ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec, implicit-def $agpr1_agpr2 ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec, implicit-def $agpr3_agpr4, implicit $agpr1_agpr2 ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit killed $agpr1_agpr2, implicit $exec + ; ; GFX940-LABEL: name: a2_to_a2_implicit_defs ; GFX940: liveins: $agpr0_agpr1 ; GFX940-NEXT: {{ $}} @@ -946,6 +990,7 @@ body: | ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6 ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 + ; ; GFX90A-LABEL: name: a3_to_a3_nonoverlap_kill ; GFX90A: liveins: $agpr4_agpr5_agpr6 ; GFX90A-NEXT: {{ $}} @@ -953,6 +998,7 @@ body: | ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6 ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6 ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 + ; ; GFX940-LABEL: name: a3_to_a3_nonoverlap_kill ; 
GFX940: liveins: $agpr4_agpr5_agpr6 ; GFX940-NEXT: {{ $}} @@ -981,6 +1027,7 @@ body: | ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 + ; ; GFX90A-LABEL: name: a3_to_a3_overlap_kill ; GFX90A: liveins: $agpr1_agpr2_agpr3 ; GFX90A-NEXT: {{ $}} @@ -989,6 +1036,7 @@ body: | ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3 ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 + ; ; GFX940-LABEL: name: a3_to_a3_overlap_kill ; GFX940: liveins: $agpr1_agpr2_agpr3 ; GFX940-NEXT: {{ $}} @@ -1018,6 +1066,7 @@ body: | ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 + ; ; GFX90A-LABEL: name: a4_to_a4 ; GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3 @@ -1025,6 +1074,7 @@ body: | ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 + ; ; GFX940-LABEL: name: a4_to_a4 ; GFX940: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF ; GFX940-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr3, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3 @@ -1055,6 +1105,7 @@ body: | ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; GFX908-NEXT: $agpr2 = 
V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 + ; ; GFX90A-LABEL: name: a4_to_a4_overlap ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX90A-NEXT: {{ $}} @@ -1063,6 +1114,7 @@ body: | ; GFX90A-NEXT: $agpr3 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; GFX90A-NEXT: $agpr2 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 + ; ; GFX940-LABEL: name: a4_to_a4_overlap ; GFX940: liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX940-NEXT: {{ $}} @@ -1099,6 +1151,7 @@ body: | ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GFX908-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; ; GFX90A-LABEL: name: a8_to_a8 ; GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF ; GFX90A-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 $agpr7, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 @@ -1110,6 +1163,7 @@ body: | ; GFX90A-NEXT: $agpr9 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GFX90A-NEXT: $agpr8 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; ; GFX940-LABEL: name: a8_to_a8 ; GFX940: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF ; GFX940-NEXT: $agpr15 = V_ACCVGPR_MOV_B32 
$agpr7, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 @@ -1167,6 +1221,7 @@ body: | ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX908-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; ; GFX90A-LABEL: name: a16_to_a16 ; GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF ; GFX90A-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 $agpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 @@ -1186,6 +1241,7 @@ body: | ; GFX90A-NEXT: $agpr17 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX90A-NEXT: $agpr16 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; ; GFX940-LABEL: name: a16_to_a16 ; GFX940: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF ; GFX940-NEXT: $agpr31 = V_ACCVGPR_MOV_B32 $agpr15, implicit $exec, implicit-def 
$agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 @@ -1226,12 +1282,14 @@ body: | ; GFX908-NEXT: $vgpr255 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr255, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0 + ; ; GFX90A-LABEL: name: a_to_a_spill ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, 
$vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $agpr1 = IMPLICIT_DEF ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_MOV_B32 killed $agpr1, implicit $exec, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr0 + ; ; GFX940-LABEL: name: a_to_a_spill ; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253, $vgpr254 ; GFX940-NEXT: {{ $}} @@ -1263,6 +1321,7 @@ body: | ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; ; GFX90A-LABEL: name: copy_sgpr_to_agpr_tuple ; GFX90A: liveins: $agpr0, $sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -1272,6 +1331,7 @@ body: | ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; ; GFX940-LABEL: name: copy_sgpr_to_agpr_tuple ; GFX940: liveins: $agpr0, $sgpr2_sgpr3 ; GFX940-NEXT: {{ $}} @@ -1305,6 +1365,7 @@ body: | ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 + ; ; 
GFX90A-LABEL: name: copy_sgpr_to_agpr_tuple_kill ; GFX90A: liveins: $agpr0, $sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -1314,6 +1375,7 @@ body: | ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 + ; ; GFX940-LABEL: name: copy_sgpr_to_agpr_tuple_kill ; GFX940: liveins: $agpr0, $sgpr2_sgpr3 ; GFX940-NEXT: {{ $}} @@ -1348,6 +1410,7 @@ body: | ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 + ; ; GFX90A-LABEL: name: copy_agpr_to_agpr_tuple ; GFX90A: liveins: $agpr0, $agpr2_agpr3 ; GFX90A-NEXT: {{ $}} @@ -1357,6 +1420,7 @@ body: | ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 + ; ; GFX940-LABEL: name: copy_agpr_to_agpr_tuple ; GFX940: liveins: $agpr0, $agpr2_agpr3 ; GFX940-NEXT: {{ $}} @@ -1391,6 +1455,7 @@ body: | ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 ; GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 + ; ; GFX90A-LABEL: name: copy_agpr_to_agpr_tuple_kill ; GFX90A: liveins: $agpr0, $agpr2_agpr3 ; GFX90A-NEXT: {{ $}} @@ -1400,6 +1465,7 @@ body: | ; GFX90A-NEXT: $agpr5 = V_ACCVGPR_MOV_B32 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; 
GFX90A-NEXT: $agpr4 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 + ; ; GFX940-LABEL: name: copy_agpr_to_agpr_tuple_kill ; GFX940: liveins: $agpr0, $agpr2_agpr3 ; GFX940-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir index 53540e4a00049..9794130d2b000 100644 --- a/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir +++ b/llvm/test/CodeGen/AMDGPU/accvgpr-spill-scc-clobber.mir @@ -39,6 +39,7 @@ body: | ; GFX908-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX90A-LABEL: name: agpr32_restore_clobber_scc ; GFX90A: bb.0: ; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -509,6 +510,7 @@ body: | ; GFX90A-NEXT: $agpr33 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX90A-NEXT: $agpr32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX908-FLATSCR-LABEL: name: agpr32_restore_clobber_scc ; GFX908-FLATSCR: bb.0: ; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -532,6 +534,7 @@ body: | ; GFX908-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX90A-FLATSCR-LABEL: name: agpr32_restore_clobber_scc ; GFX90A-FLATSCR: bb.0: ; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -1056,6 +1059,7 @@ body: | ; GFX908-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX90A-LABEL: name: agpr64_restore_clobber_scc ; GFX90A: bb.0: ; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -1527,6 +1531,7 @@ body: | ; GFX90A-NEXT: $agpr33 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX90A-NEXT: $agpr32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec ; 
GFX90A-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX908-FLATSCR-LABEL: name: agpr64_restore_clobber_scc ; GFX908-FLATSCR: bb.0: ; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -1552,6 +1557,7 @@ body: | ; GFX908-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX90A-FLATSCR-LABEL: name: agpr64_restore_clobber_scc ; GFX90A-FLATSCR: bb.0: ; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -2078,6 +2084,7 @@ body: | ; GFX908-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX90A-LABEL: name: agpr96_restore_clobber_scc ; GFX90A: bb.0: ; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -2550,6 +2557,7 @@ body: | ; GFX90A-NEXT: $agpr33 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX90A-NEXT: $agpr32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX908-FLATSCR-LABEL: name: agpr96_restore_clobber_scc ; GFX908-FLATSCR: bb.0: ; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -2577,6 +2585,7 @@ body: | ; GFX908-FLATSCR-NEXT: liveins: $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX90A-FLATSCR-LABEL: name: agpr96_restore_clobber_scc ; GFX90A-FLATSCR: bb.0: ; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -3099,6 +3108,7 @@ body: | ; GFX908-NEXT: liveins: $agpr0, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX90A-LABEL: name: agpr32_save_clobber_scc ; GFX90A: bb.0: ; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -3569,6 +3579,7 @@ body: | ; GFX90A-NEXT: $agpr33 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX90A-NEXT: $agpr32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec ; 
GFX90A-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX908-FLATSCR-LABEL: name: agpr32_save_clobber_scc ; GFX908-FLATSCR: bb.0: ; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -3592,6 +3603,7 @@ body: | ; GFX908-FLATSCR-NEXT: liveins: $agpr0, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX90A-FLATSCR-LABEL: name: agpr32_save_clobber_scc ; GFX90A-FLATSCR: bb.0: ; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -4115,6 +4127,7 @@ body: | ; GFX908-NEXT: liveins: $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX90A-LABEL: name: agpr64_save_clobber_scc ; GFX90A: bb.0: ; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -4586,6 +4599,7 @@ body: | ; GFX90A-NEXT: $agpr33 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX90A-NEXT: $agpr32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec ; GFX90A-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX908-FLATSCR-LABEL: name: agpr64_save_clobber_scc ; GFX908-FLATSCR: bb.0: ; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -4611,6 +4625,7 @@ body: | ; GFX908-FLATSCR-NEXT: liveins: $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, 
$vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX90A-FLATSCR-LABEL: name: agpr64_save_clobber_scc ; GFX90A-FLATSCR: bb.0: ; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -5135,6 +5150,7 @@ body: | ; GFX908-NEXT: liveins: $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX90A-LABEL: name: agpr96_save_clobber_scc ; GFX90A: bb.0: ; GFX90A-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -5607,6 +5623,7 @@ body: | ; GFX90A-NEXT: $agpr33 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX90A-NEXT: $agpr32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec ; 
GFX90A-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX908-FLATSCR-LABEL: name: agpr96_save_clobber_scc ; GFX908-FLATSCR: bb.0: ; GFX908-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -5634,6 +5651,7 @@ body: | ; GFX908-FLATSCR-NEXT: liveins: $agpr0_agpr1, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, 
$vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239 ; GFX908-FLATSCR-NEXT: {{ $}} ; GFX908-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX90A-FLATSCR-LABEL: name: agpr96_save_clobber_scc ; GFX90A-FLATSCR: bb.0: ; GFX90A-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir index ec8533170ebfe..950382758ffbc 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-vgprs.mir @@ -15,6 +15,7 @@ body: | ; GFX908-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit 
$vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr1 + ; ; GFX90A-LABEL: name: no_free_vgprs_for_copy_a32_to_a32 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, 
$vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $agpr0 ; GFX90A-NEXT: {{ $}} @@ -38,6 +39,7 @@ body: | ; GFX908-NEXT: $vgpr63 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 ; GFX908-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3 + ; ; GFX90A-LABEL: name: no_free_vgprs_for_copy_a64_to_a64 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $agpr0_agpr1 ; GFX90A-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-propagation.mir b/llvm/test/CodeGen/AMDGPU/agpr-copy-propagation.mir index 7eb3862764a4e..a42cf43fe56fd 100644 --- 
a/llvm/test/CodeGen/AMDGPU/agpr-copy-propagation.mir +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-propagation.mir @@ -16,6 +16,7 @@ body: | ; GFX908-NEXT: renamable $agpr2 = COPY $agpr0, implicit $exec ; GFX908-NEXT: renamable $agpr3 = COPY $agpr0, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 + ; ; GFX90A-LABEL: name: propagate_agpr ; GFX90A: liveins: $agpr0 ; GFX90A-NEXT: {{ $}} @@ -42,6 +43,7 @@ body: | ; GFX908-NEXT: renamable $agpr1 = COPY renamable $vgpr0, implicit $exec ; GFX908-NEXT: renamable $agpr2 = COPY renamable $vgpr0, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0, implicit $agpr1, implicit $agpr2 + ; ; GFX90A-LABEL: name: do_not_propagate_agpr_to_agpr ; GFX90A: liveins: $agpr0 ; GFX90A-NEXT: {{ $}} @@ -68,6 +70,7 @@ body: | ; GFX908-NEXT: renamable $agpr1 = COPY $vgpr0, implicit $exec ; GFX908-NEXT: renamable $agpr2 = COPY $vgpr0, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2 + ; ; GFX90A-LABEL: name: propagate_vgpr_to_agpr ; GFX90A: liveins: $vgpr0 ; GFX90A-NEXT: {{ $}} @@ -94,6 +97,7 @@ body: | ; GFX908-NEXT: renamable $vgpr1 = COPY $agpr0, implicit $exec ; GFX908-NEXT: renamable $vgpr2 = COPY $agpr0, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + ; ; GFX90A-LABEL: name: propagate_agpr_to_vgpr ; GFX90A: liveins: $agpr0 ; GFX90A-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir b/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir index dc54aff6bd63a..a9d31c1c45b0e 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-sgpr-no-vgprs.mir @@ -15,6 +15,7 @@ body: | ; GFX908-NEXT: $vgpr63 = V_MOV_B32_e32 $sgpr8, implicit $exec ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit 
$vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr1 + ; ; GFX90A-LABEL: name: no_free_vgprs_for_copy_s32_to_a32 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, 
$vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $sgpr8 ; GFX90A-NEXT: {{ $}} @@ -39,6 +40,7 @@ body: | ; GFX908-NEXT: $vgpr63 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9 ; GFX908-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr63, implicit $exec ; GFX908-NEXT: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, implicit $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, implicit $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, implicit $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, implicit $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, implicit $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, implicit $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, implicit $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, implicit 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, implicit $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, implicit $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, implicit $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, implicit $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, implicit $agpr2_agpr3 + ; ; GFX90A-LABEL: name: no_free_vgprs_for_copy_s64_to_a64 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, 
$vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $sgpr8_sgpr9 ; GFX90A-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir b/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir index f3f986a176d67..ffa9e643409d3 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/agpr-to-agpr-copy.mir @@ -15,13 +15,13 @@ body: | ; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1123418112, implicit $exec - ; GFX908-NEXT: undef %4.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 [[V_MOV_B32_e32_1]], implicit $exec - ; GFX908-NEXT: %4.sub1:areg_128 = COPY [[V_MOV_B32_e32_1]] - ; GFX908-NEXT: %4.sub2:areg_128 = COPY [[V_MOV_B32_e32_1]] - ; GFX908-NEXT: %4.sub3:areg_128 = COPY [[V_MOV_B32_e32_1]] + ; GFX908-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 [[V_MOV_B32_e32_1]], implicit $exec + ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub1:areg_128 = COPY [[V_MOV_B32_e32_1]] + ; GFX908-NEXT: 
[[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub2:areg_128 = COPY [[V_MOV_B32_e32_1]] + ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub3:areg_128 = COPY [[V_MOV_B32_e32_1]] ; GFX908-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec ; GFX908-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec - ; GFX908-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_3]], [[V_MOV_B32_e32_2]], %4, 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_3]], [[V_MOV_B32_e32_2]], [[V_ACCVGPR_WRITE_B32_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_F32_4X4X1F32_e64_]] ; GFX908-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 @@ -53,13 +53,13 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX908-NEXT: undef %3.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 1073741824, implicit $exec - ; GFX908-NEXT: %3.sub1:areg_128 = COPY %3.sub0 - ; GFX908-NEXT: %3.sub2:areg_128 = COPY %3.sub0 - ; GFX908-NEXT: %3.sub3:areg_128 = COPY %3.sub0 + ; GFX908-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 1073741824, implicit $exec + ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub1:areg_128 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub2:areg_128 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub3:areg_128 = COPY 
[[V_ACCVGPR_WRITE_B32_e64_]].sub0 ; GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec ; GFX908-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec - ; GFX908-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], %3, 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-NEXT: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], [[V_ACCVGPR_WRITE_B32_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_F32_4X4X1F32_e64_]] ; GFX908-NEXT: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 @@ -88,11 +88,11 @@ body: | ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec - ; GFX908-NEXT: undef %1.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec - ; GFX908-NEXT: %1.sub1:areg_128 = COPY [[COPY]].sub0 - ; GFX908-NEXT: %1.sub2:areg_128 = COPY [[COPY]].sub0 - ; GFX908-NEXT: %1.sub3:areg_128 = COPY [[COPY]].sub0 - ; GFX908-NEXT: S_ENDPGM 0, implicit [[COPY]], implicit %1 + ; GFX908-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec + ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0 + ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0 + ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0 + ; GFX908-NEXT: S_ENDPGM 0, implicit [[COPY]], implicit [[V_ACCVGPR_WRITE_B32_e64_]] %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec undef %1.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 %0.sub0, implicit $exec %1.sub1:areg_128 = COPY 
%1.sub0:areg_128 @@ -111,10 +111,10 @@ body: | ; GFX908: liveins: $vgpr0_vgpr1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec - ; GFX908-NEXT: undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec - ; GFX908-NEXT: %1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub1, implicit $exec - ; GFX908-NEXT: [[COPY1:%[0-9]+]]:areg_64 = COPY %1 - ; GFX908-NEXT: S_ENDPGM 0, implicit [[COPY]], implicit %1, implicit [[COPY1]] + ; GFX908-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec + ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub1, implicit $exec + ; GFX908-NEXT: [[COPY1:%[0-9]+]]:areg_64 = COPY [[V_ACCVGPR_WRITE_B32_e64_]] + ; GFX908-NEXT: S_ENDPGM 0, implicit [[COPY]], implicit [[V_ACCVGPR_WRITE_B32_e64_]], implicit [[COPY1]] %0:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub0, implicit $exec %1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub1, implicit $exec @@ -132,10 +132,10 @@ body: | ; GFX908: liveins: $vgpr0_vgpr1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec - ; GFX908-NEXT: undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec - ; GFX908-NEXT: %1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub1, implicit $exec + ; GFX908-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec + ; GFX908-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub1, implicit $exec ; GFX908-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY [[COPY]].sub1 - ; GFX908-NEXT: S_ENDPGM 0, implicit [[COPY]], implicit %1, implicit [[COPY1]] + ; GFX908-NEXT: S_ENDPGM 0, implicit [[COPY]], implicit [[V_ACCVGPR_WRITE_B32_e64_]], implicit [[COPY1]] %0:vreg_64 = COPY $vgpr0_vgpr1, 
implicit $exec undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub0, implicit $exec %1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub1, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/blender-coalescer-verifier-error-empty-subrange.mir b/llvm/test/CodeGen/AMDGPU/blender-coalescer-verifier-error-empty-subrange.mir index 3b1951e287d36..007c4c094029c 100644 --- a/llvm/test/CodeGen/AMDGPU/blender-coalescer-verifier-error-empty-subrange.mir +++ b/llvm/test/CodeGen/AMDGPU/blender-coalescer-verifier-error-empty-subrange.mir @@ -20,18 +20,18 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def %0 - ; CHECK-NEXT: undef %1.sub0:sgpr_128 = S_MOV_B32 0 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0 ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub0:sgpr_128 = IMPLICIT_DEF - ; CHECK-NEXT: undef %1.sub0:sgpr_128 = IMPLICIT_DEF + ; CHECK-NEXT: undef [[DEF:%[0-9]+]].sub0:sgpr_128 = IMPLICIT_DEF + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = IMPLICIT_DEF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: S_NOP 0, implicit %0 - ; CHECK-NEXT: S_NOP 0, implicit %1.sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[DEF]] + ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub0 ; CHECK-NEXT: S_ENDPGM 0 bb.0: S_CBRANCH_SCC0 %bb.2, implicit undef $scc @@ -73,12 +73,12 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub0:sgpr_128 = S_MOV_B32 123 - ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 123 + ; CHECK-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 123 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 123 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: S_NOP 0, implicit %0 ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_1]] + ; CHECK-NEXT: S_NOP 0, implicit 
[[S_MOV_B32_]] ; CHECK-NEXT: S_ENDPGM 0 bb.0: S_CBRANCH_SCC0 %bb.2, implicit undef $scc diff --git a/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir b/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir index 69cfbeeb9a491..6483ff28c0de0 100644 --- a/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir +++ b/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir @@ -34,8 +34,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]], 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4) ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %15, 0, implicit $exec - ; CHECK-NEXT: %7:vgpr_32, dead %8:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %7, %subreg.sub1 + ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1 ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8), addrspace 1) @@ -55,7 +55,7 @@ body: | ; CHECK-NEXT: successors: %bb.6(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec - ; 
CHECK-NEXT: dead %13:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; CHECK-NEXT: dead [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; CHECK-NEXT: S_BRANCH %bb.6 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: @@ -71,7 +71,7 @@ body: | ; CHECK-NEXT: bb.6: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.7: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll index 590d73b310a29..41eb2b7bb2748 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-no-rtn.ll @@ -18,6 +18,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_offset_no_rtn(float %val, <4 x i32 ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offset_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 @@ -50,6 +51,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_offen_no_rtn(float %val, <4 x i32> ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec 
:: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_offen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -83,6 +85,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_idxen_no_rtn(float %val, <4 x i32> ; GFX908_GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_idxen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -118,6 +121,7 @@ define amdgpu_ps void @buffer_atomic_fadd_f32_bothen_no_rtn(float %val, <4 x i32 ; GFX908_GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_f32_bothen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 @@ -158,6 +162,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offset_no_rtn(float %val, ptr ; GFX908_GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET 
[[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 @@ -202,6 +207,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_offen_no_rtn(float %val, ptr a ; GFX908_GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -247,6 +253,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_idxen_no_rtn(float %val, ptr a ; GFX908_GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -294,6 +301,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_f32_bothen_no_rtn(float %val, ptr ; GFX908_GFX11-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 
= REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX908_GFX11-NEXT: BUFFER_ATOMIC_ADD_F32_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908_GFX11-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll index dc164191b2169..f964da2ddf402 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.f32-rtn.ll @@ -18,6 +18,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offset_rtn(float %val, <4 x i32> ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX11-LABEL: name: buffer_atomic_fadd_f32_offset_rtn ; GFX11: bb.0 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 @@ -52,6 +53,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_offen_rtn(float %val, <4 x i32> i ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX11-LABEL: name: buffer_atomic_fadd_f32_offen_rtn ; GFX11: bb.0 
(%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -87,6 +89,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_idxen_rtn(float %val, <4 x i32> i ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX11-LABEL: name: buffer_atomic_fadd_f32_idxen_rtn ; GFX11: bb.0 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -124,6 +127,7 @@ define amdgpu_ps float @buffer_atomic_fadd_f32_bothen_rtn(float %val, <4 x i32> ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX11-LABEL: name: buffer_atomic_fadd_f32_bothen_rtn ; GFX11: bb.0 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 @@ -166,6 +170,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offset_rtn(float %val, ptr ad ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFSET_RTN [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFSET_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_offset_rtn ; GFX11: bb.0 (%ir-block.0): ; GFX11-NEXT: liveins: 
$vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 @@ -212,6 +217,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_offen_rtn(float %val, ptr add ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_OFFEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_OFFEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_offen_rtn ; GFX11: bb.0 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -259,6 +265,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_idxen_rtn(float %val, ptr add ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_IDXEN_RTN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_IDXEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_idxen_rtn ; GFX11: bb.0 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -308,6 +315,7 @@ define amdgpu_ps float @buffer_ptr_atomic_fadd_f32_bothen_rtn(float %val, ptr ad ; GFX90A_GFX940-NEXT: [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN:%[0-9]+]]:vgpr_32 = BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 1, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX90A_GFX940-NEXT: $vgpr0 = COPY [[BUFFER_ATOMIC_ADD_F32_BOTHEN_RTN]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX11-LABEL: name: buffer_ptr_atomic_fadd_f32_bothen_rtn ; GFX11: bb.0 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, 
$sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll index ea98b61a3b7db..0b62977613f1d 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-atomic-fadd.v2f16-no-rtn.ll @@ -17,6 +17,7 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %val, < ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY1]], %subreg.sub3 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offset_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 @@ -49,6 +50,7 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val, <4 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_offen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -82,6 +84,7 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val, <4 ; GFX908-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; GFX908-NEXT: 
BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_idxen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -117,6 +120,7 @@ define amdgpu_ps void @buffer_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %val, < ; GFX908-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_atomic_fadd_v2f16_bothen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 @@ -157,6 +161,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_offset_no_rtn(<2 x half> %va ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY6]], %subreg.sub3 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFSET [[COPY5]], killed [[REG_SEQUENCE2]], [[COPY]], 4095, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offset_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 @@ -201,6 +206,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_offen_no_rtn(<2 x half> %val ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed 
[[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_OFFEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_offen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -246,6 +252,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn(<2 x half> %val ; GFX908-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY10]], %subreg.sub0, killed [[COPY9]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_IDXEN [[COPY6]], [[COPY1]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_idxen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $sgpr4 @@ -293,6 +300,7 @@ define amdgpu_ps void @buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn(<2 x half> %va ; GFX908-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX908-NEXT: BUFFER_ATOMIC_PK_ADD_F16_BOTHEN [[COPY7]], killed [[REG_SEQUENCE3]], killed [[REG_SEQUENCE2]], [[COPY]], 0, 2, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.rsrc, align 1, addrspace 8) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: buffer_ptr_atomic_fadd_v2f16_bothen_no_rtn ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr1, $vgpr2, $sgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-identity-copies-undef-subregs.mir 
b/llvm/test/CodeGen/AMDGPU/coalesce-identity-copies-undef-subregs.mir index 23590cad83271..8e0c544a3a570 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-identity-copies-undef-subregs.mir +++ b/llvm/test/CodeGen/AMDGPU/coalesce-identity-copies-undef-subregs.mir @@ -18,7 +18,7 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -29,7 +29,7 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 %0.sub1, implicit $mode, implicit $exec + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 [[COPY]].sub1, implicit $mode, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: liveins: $vgpr0 @@ -59,20 +59,20 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit undef %0.sub0 - ; CHECK-NEXT: S_NOP 0, implicit undef %0.sub0 + ; CHECK-NEXT: S_NOP 0, implicit undef [[COPY]].sub0 + ; CHECK-NEXT: S_NOP 0, implicit undef [[COPY]].sub0 ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 %0.sub1, implicit $mode, implicit $exec + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 [[COPY]].sub1, implicit $mode, implicit $exec ; 
CHECK-NEXT: S_BRANCH %bb.1 bb.0: liveins: $vgpr0 @@ -102,7 +102,7 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -113,7 +113,7 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub1:vreg_64 = nofpexcept V_MUL_F32_e32 0, %0.sub1, implicit $mode, implicit $exec + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = nofpexcept V_MUL_F32_e32 0, [[COPY]].sub1, implicit $mode, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: liveins: $vgpr0 @@ -143,7 +143,7 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -154,7 +154,7 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %0.sub1:vreg_64 = nofpexcept V_MUL_F32_e32 0, %0.sub1, implicit $mode, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = nofpexcept V_MUL_F32_e32 0, [[COPY]].sub1, implicit $mode, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: liveins: $vgpr0 @@ -185,7 +185,7 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) @@ -195,12 +195,12 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; 
CHECK-NEXT: S_NOP 0, implicit undef %0.sub0 - ; CHECK-NEXT: undef %0.sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 %0.sub1, implicit $mode, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit undef [[COPY]].sub0 + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 [[COPY]].sub1, implicit $mode, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: S_NOP 0, implicit undef %0.sub0 + ; CHECK-NEXT: S_NOP 0, implicit undef [[COPY]].sub0 bb.0: liveins: $vgpr0 @@ -229,7 +229,7 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) @@ -237,7 +237,7 @@ body: | ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: S_NOP 0, implicit undef %0.sub0 + ; CHECK-NEXT: S_NOP 0, implicit undef [[COPY]].sub0 bb.0: liveins: $vgpr0 @@ -261,7 +261,7 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) @@ -269,7 +269,7 @@ body: | ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: S_NOP 0, implicit %0.sub1 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]].sub1 bb.0: liveins: $vgpr0 @@ -295,7 +295,7 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead undef %2.sub1:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: dead undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: 
successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -336,7 +336,7 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead undef %0.sub1:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: dead undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -376,7 +376,7 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead undef %0.sub1:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: dead undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -417,7 +417,7 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -428,7 +428,7 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 %0.sub1, implicit $mode, implicit $exec + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 [[COPY]].sub1, implicit $mode, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: liveins: $vgpr0 @@ -458,7 +458,7 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub1:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -469,7 +469,7 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %0.sub1:vreg_64 = 
nofpexcept V_CEIL_F32_e32 %0.sub1, implicit $mode, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 [[COPY]].sub1, implicit $mode, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: liveins: $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-into-dead-subreg-copies.mir b/llvm/test/CodeGen/AMDGPU/coalesce-into-dead-subreg-copies.mir index 45ccb4b8866e8..6f1e888661601 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-into-dead-subreg-copies.mir +++ b/llvm/test/CodeGen/AMDGPU/coalesce-into-dead-subreg-copies.mir @@ -18,7 +18,7 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM undef %1:sgpr_64, 24, 0 :: (dereferenceable invariant load (s64), addrspace 4) + ; CHECK-NEXT: dead [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM undef %1:sgpr_64, 24, 0 :: (dereferenceable invariant load (s64), addrspace 4) ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-liveout-undef-copy.mir b/llvm/test/CodeGen/AMDGPU/coalesce-liveout-undef-copy.mir index 1f235ebccfa33..b477ec85d3a12 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-liveout-undef-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/coalesce-liveout-undef-copy.mir @@ -23,19 +23,19 @@ body: | ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead %2:vreg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: [[COPY]].sub0:vreg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: dead [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub0:vreg_128_align2 = IMPLICIT_DEF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead %4:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V4 [[COPY]], undef %5:sgpr_32, 11, implicit-def $m0, implicit $m0, implicit $exec + ; CHECK-NEXT: dead 
[[V_INDIRECT_REG_READ_GPR_IDX_B32_V4_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V4 [[COPY]], undef %5:sgpr_32, 11, implicit-def $m0, implicit $m0, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-remat-dead-use.mir b/llvm/test/CodeGen/AMDGPU/coalescer-remat-dead-use.mir index f4b8f12ad246d..35f0399b78aa1 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer-remat-dead-use.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescer-remat-dead-use.mir @@ -89,7 +89,7 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GCN-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[COPY]], implicit $exec ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 [[V_ADD_U32_e32_]], implicit $exec - ; GCN-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; GCN-NEXT: SI_RETURN_TO_EPILOG $vgpr0, implicit [[V_ADD_U32_e32_]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = V_ADD_U32_e32 1, %0, implicit $exec %2:vgpr_32 = V_MOV_B32_e32 %1, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-removepartial-extend-undef-subrange.mir b/llvm/test/CodeGen/AMDGPU/coalescer-removepartial-extend-undef-subrange.mir index b6a1719b3c701..e9a6e89245e9d 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer-removepartial-extend-undef-subrange.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescer-removepartial-extend-undef-subrange.mir @@ -22,29 +22,29 @@ body: | ; CHECK-NEXT: liveins: $sgpr2, $sgpr3, $vgpr3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr2 - ; CHECK-NEXT: undef %1.sub0:vreg_64 = COPY [[COPY]] - ; CHECK-NEXT: undef %2.sub0:vreg_64 = COPY [[COPY]] + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]] + ; 
CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]] ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit undef $vcc ; CHECK-NEXT: S_BRANCH %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit %2.sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]].sub0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.2(0x7c000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY %2 - ; CHECK-NEXT: %1.sub0:vreg_64 = COPY [[COPY1]].sub0 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY %1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY3]].sub0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY1]] ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit undef $exec ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY3]] ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-subranges-prune-kill-copy.mir b/llvm/test/CodeGen/AMDGPU/coalescer-subranges-prune-kill-copy.mir index b28e9f86a43d1..2fb3467da9a5a 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer-subranges-prune-kill-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescer-subranges-prune-kill-copy.mir @@ -12,8 +12,8 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: undef %1.sub0:vreg_128 = IMPLICIT_DEF - ; GCN-NEXT: %1.sub1:vreg_128 = IMPLICIT_DEF + ; GCN-NEXT: undef [[DEF:%[0-9]+]].sub0:vreg_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF:%[0-9]+]].sub1:vreg_128 = IMPLICIT_DEF ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: @@ -22,7 +22,7 @@ body: | ; GCN-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF ; 
GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: - ; GCN-NEXT: [[DEF]].sub2:vreg_128 = COPY undef %3:sreg_32 + ; GCN-NEXT: [[DEF:%[0-9]+]].sub2:vreg_128 = COPY undef %3:sreg_32 ; GCN-NEXT: S_ENDPGM 0, implicit [[DEF]] bb.0: undef %0.sub0:vreg_128 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/coalescing-subreg-was-undef-but-became-def.mir b/llvm/test/CodeGen/AMDGPU/coalescing-subreg-was-undef-but-became-def.mir index de37345a87fba..b988aec3971ee 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescing-subreg-was-undef-but-became-def.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescing-subreg-was-undef-but-became-def.mir @@ -16,17 +16,17 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %4.sub2:sgpr_128 = S_MOV_B32 0 - ; CHECK-NEXT: dead undef %7.sub0:sgpr_128 = S_MOV_B32 0 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 0 + ; CHECK-NEXT: dead undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0 ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit undef $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %4.sub0:sgpr_128 = S_MOV_B32 -1 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 -1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: S_NOP 0, implicit %4 + ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]] bb.0: successors: %bb.1, %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/coalescing_makes_lanes_undef.mir b/llvm/test/CodeGen/AMDGPU/coalescing_makes_lanes_undef.mir index 0bfdbf2629a1b..cc839ff966abf 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescing_makes_lanes_undef.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescing_makes_lanes_undef.mir @@ -19,18 +19,18 @@ body: | ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub0:sgpr_64 = S_MOV_B32 1 - ; CHECK-NEXT: %0.sub1:sgpr_64 = S_MOV_B32 2 + ; CHECK-NEXT: undef 
[[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2 ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub0:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = IMPLICIT_DEF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: S_NOP 0, implicit %0.sub0 - ; CHECK-NEXT: S_NOP 0, implicit %0 + ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]] bb.0: successors: %bb.1, %bb.2 S_CBRANCH_SCC0 %bb.2, implicit undef $scc diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir b/llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir index bbc585d01142e..d62a63286b3bf 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf2.mir @@ -37,15 +37,15 @@ body: | ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: undef %5.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) - ; GCN-NEXT: undef %6.sub0:vreg_64 = V_LSHLREV_B32_e32 2, [[COPY1]], implicit $exec - ; GCN-NEXT: %6.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec - ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %5.sub1 - ; GCN-NEXT: undef %8.sub0:vreg_64, %9:sreg_64_xexec = V_ADD_CO_U32_e64 %5.sub0, %6.sub0, 0, implicit $exec - ; GCN-NEXT: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec - ; GCN-NEXT: %5.sub3:sgpr_128 = S_MOV_B32 61440 - ; GCN-NEXT: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; GCN-NEXT: undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) + ; 
GCN-NEXT: undef [[V_LSHLREV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_LSHLREV_B32_e32 2, [[COPY1]], implicit $exec + ; GCN-NEXT: [[V_LSHLREV_B32_e32_:%[0-9]+]].sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GCN-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[V_LSHLREV_B32_e32_]].sub0, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]].sub1:vreg_64, dead [[V_ADDC_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 61440 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 [[V_LSHLREV_B32_e32_]].sub1, [[V_LSHLREV_B32_e32_]], [[S_LOAD_DWORDX2_IMM]], 0, 0, 0, 0, implicit $exec :: (store (s32), addrspace 1) ; GCN-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc @@ -56,17 +56,17 @@ body: | ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: %5.sub0:sgpr_128 = COPY %5.sub2 - ; GCN-NEXT: %5.sub1:sgpr_128 = COPY %5.sub2 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub2 + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub1:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub2 ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, implicit $exec :: (store (s32), addrspace 1) + ; GCN-NEXT: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], [[V_ADD_CO_U32_e64_]], [[S_LOAD_DWORDX2_IMM]], 0, 4, 0, 0, implicit $exec :: 
(store (s32), addrspace 1) ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; GCN-NEXT: dead %16:sreg_64 = SI_CALL [[DEF]], @func, csr_amdgpu + ; GCN-NEXT: dead [[SI_CALL:%[0-9]+]]:sreg_64 = SI_CALL [[DEF]], @func, csr_amdgpu ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc diff --git a/llvm/test/CodeGen/AMDGPU/combine-sreg64-inits.mir b/llvm/test/CodeGen/AMDGPU/combine-sreg64-inits.mir index 9ca28f9f230de..57afb456d6037 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-sreg64-inits.mir +++ b/llvm/test/CodeGen/AMDGPU/combine-sreg64-inits.mir @@ -7,7 +7,7 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: combine_sreg64_inits - ; GCN: dead %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 8589934593 + ; GCN: dead [[S_MOV_B:%[0-9]+]]:sgpr_64 = S_MOV_B64_IMM_PSEUDO 8589934593 ; GCN-NEXT: S_NOP 0 undef %0.sub0:sgpr_64 = S_MOV_B32 1 S_NOP 0 @@ -19,7 +19,7 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: combine_sreg64_inits_swap - ; GCN: dead %0:sgpr_64 = S_MOV_B64_IMM_PSEUDO 8589934593 + ; GCN: dead [[S_MOV_B:%[0-9]+]]:sgpr_64 = S_MOV_B64_IMM_PSEUDO 8589934593 ; GCN-NEXT: S_NOP 0 undef %0.sub1:sgpr_64 = S_MOV_B32 2 S_NOP 0 @@ -32,9 +32,9 @@ body: | bb.0: ; GCN-LABEL: name: sreg64_subreg_copy_0 ; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: undef %1.sub0:sgpr_64 = COPY [[DEF]] - ; GCN-NEXT: %1.sub0:sgpr_64 = S_MOV_B32 1 - ; GCN-NEXT: dead %1.sub1:sgpr_64 = S_MOV_B32 2 + ; GCN-NEXT: undef [[COPY:%[0-9]+]].sub0:sgpr_64 = COPY [[DEF]] + ; GCN-NEXT: [[COPY:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1 + ; GCN-NEXT: dead [[COPY:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2 %0:sgpr_32 = IMPLICIT_DEF undef %1.sub0:sgpr_64 = COPY %0:sgpr_32 %1.sub0:sgpr_64 = S_MOV_B32 1 @@ -47,9 +47,9 @@ body: | bb.0: ; GCN-LABEL: name: sreg64_subreg_copy_1 ; GCN: 
[[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: undef %1.sub0:sgpr_64 = S_MOV_B32 1 - ; GCN-NEXT: %1.sub1:sgpr_64 = COPY [[DEF]] - ; GCN-NEXT: dead %1.sub1:sgpr_64 = S_MOV_B32 2 + ; GCN-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1 + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[DEF]] + ; GCN-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2 %0:sgpr_32 = IMPLICIT_DEF undef %1.sub0:sgpr_64 = S_MOV_B32 1 %1.sub1:sgpr_64 = COPY %0:sgpr_32 @@ -62,9 +62,9 @@ body: | bb.0: ; GCN-LABEL: name: sreg64_subreg_copy_2 ; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: undef %1.sub0:sgpr_64 = S_MOV_B32 1 - ; GCN-NEXT: %1.sub1:sgpr_64 = S_MOV_B32 2 - ; GCN-NEXT: dead %1.sub0:sgpr_64 = COPY [[DEF]] + ; GCN-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1 + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2 + ; GCN-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = COPY [[DEF]] %0:sgpr_32 = IMPLICIT_DEF undef %1.sub0:sgpr_64 = S_MOV_B32 1 %1.sub1:sgpr_64 = S_MOV_B32 2 @@ -78,10 +78,10 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: undef %0.sub0:sgpr_64 = S_MOV_B32 1 + ; GCN-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: dead %0.sub1:sgpr_64 = S_MOV_B32 2 + ; GCN-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2 bb.0: undef %0.sub0:sgpr_64 = S_MOV_B32 1 @@ -94,9 +94,9 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: sreg64_inits_two_defs_sub1 - ; GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1 - ; GCN-NEXT: %0.sub1:sgpr_64 = S_MOV_B32 2 - ; GCN-NEXT: dead %0.sub1:sgpr_64 = S_MOV_B32 3 + ; GCN: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1 + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2 + ; GCN-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 3 undef %0.sub0:sgpr_64 = S_MOV_B32 1 %0.sub1:sgpr_64 = S_MOV_B32 2 %0.sub1:sgpr_64 = S_MOV_B32 3 @@ 
-107,9 +107,9 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: sreg64_inits_two_defs_sub0 - ; GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1 - ; GCN-NEXT: %0.sub1:sgpr_64 = S_MOV_B32 2 - ; GCN-NEXT: dead %0.sub0:sgpr_64 = S_MOV_B32 3 + ; GCN: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1 + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2 + ; GCN-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 3 undef %0.sub0:sgpr_64 = S_MOV_B32 1 %0.sub1:sgpr_64 = S_MOV_B32 2 %0.sub0:sgpr_64 = S_MOV_B32 3 @@ -120,8 +120,8 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: sreg64_inits_full_def - ; GCN: dead undef %1.sub0:sgpr_64 = S_MOV_B32 1 - ; GCN-NEXT: dead %0:sgpr_64 = S_MOV_B64 3 + ; GCN: dead undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1 + ; GCN-NEXT: dead [[S_MOV_B64_:%[0-9]+]]:sgpr_64 = S_MOV_B64 3 undef %0.sub0:sgpr_64 = S_MOV_B32 1 %0:sgpr_64 = S_MOV_B64 3 ... @@ -131,8 +131,8 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: sreg64_inits_imp_use - ; GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1, implicit $m0 - ; GCN-NEXT: dead %0.sub1:sgpr_64 = S_MOV_B32 2 + ; GCN: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1, implicit $m0 + ; GCN-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2 undef %0.sub0:sgpr_64 = S_MOV_B32 1, implicit $m0 %0.sub1:sgpr_64 = S_MOV_B32 2 ... @@ -142,8 +142,8 @@ tracksRegLiveness: true body: | bb.0: ; GCN-LABEL: name: sreg64_inits_imp_def - ; GCN: undef %0.sub0:sgpr_64 = S_MOV_B32 1, implicit-def $scc - ; GCN-NEXT: dead %0.sub1:sgpr_64 = S_MOV_B32 2 + ; GCN: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1, implicit-def $scc + ; GCN-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = S_MOV_B32 2 undef %0.sub0:sgpr_64 = S_MOV_B32 1, implicit-def $scc %0.sub1:sgpr_64 = S_MOV_B32 2 ... 
diff --git a/llvm/test/CodeGen/AMDGPU/commute-vop3.mir b/llvm/test/CodeGen/AMDGPU/commute-vop3.mir index 64a75e5f3a90f..9a8805effb5bd 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-vop3.mir +++ b/llvm/test/CodeGen/AMDGPU/commute-vop3.mir @@ -24,6 +24,7 @@ body: | ; GFX9-NEXT: [[V_XAD_U32_e64_:%[0-9]+]]:vgpr_32 = V_XAD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec ; GFX9-NEXT: [[V_SUB_I32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_I32_e64 [[COPY]], [[COPY1]], 0, implicit $exec ; GFX9-NEXT: [[V_SUB_I32_e64_1:%[0-9]+]]:vgpr_32 = V_SUB_I32_e64 [[COPY1]], [[COPY]], 0, implicit $exec + ; ; GFX10-LABEL: name: commute_vop3 ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir index ac595abf9269b..7c21b3e085804 100644 --- a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir +++ b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir @@ -16,14 +16,17 @@ body: | ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; ; GFX90A-LABEL: name: copy_v64_to_v64 ; GFX90A: liveins: $vgpr2_vgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; ; GFX940-LABEL: name: copy_v64_to_v64 ; GFX940: liveins: $vgpr2_vgpr3 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec + ; ; GFX10-LABEL: name: copy_v64_to_v64 ; GFX10: liveins: $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -43,14 +46,17 @@ body: | ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3 ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec + 
; ; GFX90A-LABEL: name: copy_s64_to_v64 ; GFX90A: liveins: $sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr2_sgpr3, 12, $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec + ; ; GFX940-LABEL: name: copy_s64_to_v64 ; GFX940: liveins: $sgpr2_sgpr3 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $sgpr2_sgpr3, implicit $exec, implicit $exec + ; ; GFX10-LABEL: name: copy_s64_to_v64 ; GFX10: liveins: $sgpr2_sgpr3 ; GFX10-NEXT: {{ $}} @@ -70,16 +76,19 @@ body: | ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 ; GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; ; GFX90A-LABEL: name: copy_a64_to_v64 ; GFX90A: liveins: $agpr2_agpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 ; GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; ; GFX940-LABEL: name: copy_a64_to_v64 ; GFX940: liveins: $agpr2_agpr3 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 ; GFX940-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; ; GFX10-LABEL: name: copy_a64_to_v64 ; GFX10: liveins: $agpr2_agpr3 ; GFX10-NEXT: {{ $}} @@ -101,16 +110,19 @@ body: | ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; ; GFX90A-LABEL: name: copy_v128_to_v128_fwd ; GFX90A: liveins: 
$vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr4_vgpr5, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; ; GFX940-LABEL: name: copy_v128_to_v128_fwd ; GFX940: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX940-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr4_vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; ; GFX10-LABEL: name: copy_v128_to_v128_fwd ; GFX10: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} @@ -134,16 +146,19 @@ body: | ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; ; GFX90A-LABEL: name: copy_v128_to_v128_back ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr4_vgpr5 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; ; GFX940-LABEL: name: copy_v128_to_v128_back ; GFX940: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $vgpr2_vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX940-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $vgpr0_vgpr1, 
implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; ; GFX10-LABEL: name: copy_v128_to_v128_back ; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -166,18 +181,21 @@ body: | ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; ; GFX90A-LABEL: name: copy_v96_to_v96 ; GFX90A: liveins: $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; ; GFX940-LABEL: name: copy_v96_to_v96 ; GFX940: liveins: $vgpr4_vgpr5_vgpr6 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; ; GFX10-LABEL: name: copy_v96_to_v96 ; GFX10: liveins: $vgpr4_vgpr5_vgpr6 ; GFX10-NEXT: {{ $}} @@ -198,14 +216,17 @@ body: | ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; ; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX90A: liveins: $vgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, 
implicit $exec + ; ; GFX940-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX940: liveins: $vgpr3 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec + ; ; GFX10-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX10: liveins: $vgpr3 ; GFX10-NEXT: {{ $}} @@ -225,14 +246,17 @@ body: | ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; ; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX90A: liveins: $vgpr2 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; ; GFX940-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX940: liveins: $vgpr2 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 killed $vgpr2_vgpr3, implicit $exec, implicit $exec + ; ; GFX10-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX10: liveins: $vgpr2 ; GFX10-NEXT: {{ $}} @@ -254,16 +278,19 @@ body: | ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; ; GFX90A-LABEL: name: copy_s128_to_v128_killed ; GFX90A: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr4_sgpr5, 12, $sgpr4_sgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-NEXT: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $sgpr6_sgpr7, 12, $sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; ; GFX940-LABEL: name: copy_s128_to_v128_killed ; GFX940: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 
; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr4_sgpr5, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX940-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 $sgpr6_sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; ; GFX10-LABEL: name: copy_s128_to_v128_killed ; GFX10: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX10-NEXT: {{ $}} @@ -285,16 +312,19 @@ body: | ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; ; GFX90A-LABEL: name: copy_v64_to_v64_unaligned ; GFX90A: liveins: $vgpr2_vgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; ; GFX940-LABEL: name: copy_v64_to_v64_unaligned ; GFX940: liveins: $vgpr2_vgpr3 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; ; GFX10-LABEL: name: copy_v64_to_v64_unaligned ; GFX10: liveins: $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -314,16 +344,19 @@ body: | ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; ; GFX90A-LABEL: name: copy_v64_unaligned_to_v64 ; GFX90A: liveins: $vgpr3_vgpr4 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; ; 
GFX940-LABEL: name: copy_v64_unaligned_to_v64 ; GFX940: liveins: $vgpr3_vgpr4 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; ; GFX10-LABEL: name: copy_v64_unaligned_to_v64 ; GFX10: liveins: $vgpr3_vgpr4 ; GFX10-NEXT: {{ $}} @@ -345,6 +378,7 @@ body: | ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; ; GFX90A-LABEL: name: copy_v128_to_v128_unaligned ; GFX90A: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A-NEXT: {{ $}} @@ -352,6 +386,7 @@ body: | ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; ; GFX940-LABEL: name: copy_v128_to_v128_unaligned ; GFX940: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX940-NEXT: {{ $}} @@ -359,6 +394,7 @@ body: | ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX940-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; ; GFX10-LABEL: name: copy_v128_to_v128_unaligned ; GFX10: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX10-NEXT: {{ $}} @@ -382,6 +418,7 @@ body: | ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, 
implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; ; GFX90A-LABEL: name: copy_v128_unaligned_to_v128 ; GFX90A: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX90A-NEXT: {{ $}} @@ -389,6 +426,7 @@ body: | ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; ; GFX940-LABEL: name: copy_v128_unaligned_to_v128 ; GFX940: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX940-NEXT: {{ $}} @@ -396,6 +434,7 @@ body: | ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; ; GFX10-LABEL: name: copy_v128_unaligned_to_v128 ; GFX10: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX10-NEXT: {{ $}} @@ -417,16 +456,19 @@ body: | ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; ; GFX90A-LABEL: name: copy_s64_to_v64_unaligned ; GFX90A: liveins: $sgpr8_sgpr9 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; ; GFX940-LABEL: name: copy_s64_to_v64_unaligned ; GFX940: liveins: $sgpr8_sgpr9 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def 
$vgpr1_vgpr2, implicit $sgpr8_sgpr9 ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; ; GFX10-LABEL: name: copy_s64_to_v64_unaligned ; GFX10: liveins: $sgpr8_sgpr9 ; GFX10-NEXT: {{ $}} @@ -448,6 +490,7 @@ body: | ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX908-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; ; GFX90A-LABEL: name: copy_s128_to_v128_unaligned ; GFX90A: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: {{ $}} @@ -455,6 +498,7 @@ body: | ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX90A-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; ; GFX940-LABEL: name: copy_s128_to_v128_unaligned ; GFX940: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX940-NEXT: {{ $}} @@ -462,6 +506,7 @@ body: | ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX940-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; ; GFX10-LABEL: name: copy_s128_to_v128_unaligned ; GFX10: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX10-NEXT: {{ $}} @@ -484,18 +529,21 @@ body: | ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, 
implicit $exec + ; ; GFX90A-LABEL: name: copy_v96_to_v96_unaligned ; GFX90A: liveins: $vgpr8_vgpr9_vgpr10 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; ; GFX940-LABEL: name: copy_v96_to_v96_unaligned ; GFX940: liveins: $vgpr8_vgpr9_vgpr10 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 ; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; ; GFX10-LABEL: name: copy_v96_to_v96_unaligned ; GFX10: liveins: $vgpr8_vgpr9_vgpr10 ; GFX10-NEXT: {{ $}} @@ -517,18 +565,21 @@ body: | ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; ; GFX90A-LABEL: name: copy_v96_unaligned_to_v96 ; GFX90A: liveins: $vgpr7_vgpr8_vgpr9 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; ; GFX940-LABEL: name: copy_v96_unaligned_to_v96 ; GFX940: liveins: $vgpr7_vgpr8_vgpr9 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; ; GFX10-LABEL: name: copy_v96_unaligned_to_v96 ; GFX10: liveins: $vgpr7_vgpr8_vgpr9 ; GFX10-NEXT: {{ $}} @@ -550,18 +601,21 @@ body: | ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; ; GFX90A-LABEL: name: copy_s96_to_v96 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; ; GFX940-LABEL: name: copy_s96_to_v96 ; GFX940: liveins: $sgpr0_sgpr1_sgpr2 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; ; GFX10-LABEL: name: copy_s96_to_v96 ; GFX10: liveins: $sgpr0_sgpr1_sgpr2 ; GFX10-NEXT: {{ $}} @@ -583,18 +637,21 @@ body: | ; GFX908-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 ; GFX908-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed 
$sgpr0_sgpr1_sgpr2, implicit $exec + ; ; GFX90A-LABEL: name: copy_s96_to_v96_unaligned ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 ; GFX90A-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; ; GFX940-LABEL: name: copy_s96_to_v96_unaligned ; GFX940: liveins: $sgpr0_sgpr1_sgpr2 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 ; GFX940-NEXT: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 ; GFX940-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; ; GFX10-LABEL: name: copy_s96_to_v96_unaligned ; GFX10: liveins: $sgpr0_sgpr1_sgpr2 ; GFX10-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir b/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir index 4d06f4a19597f..004abb4bb0ccd 100644 --- a/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir +++ b/llvm/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir @@ -18,20 +18,20 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: %3:vgpr_32 = nofpexcept V_TRUNC_F32_e32 undef %4:vgpr_32, implicit $mode, implicit $exec - ; GCN-NEXT: %5:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 %3, implicit $mode, implicit $exec - ; GCN-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 4, %5, implicit $exec - ; GCN-NEXT: undef %11.sub0:vreg_128 = V_MUL_LO_I32_e64 [[V_LSHRREV_B32_e32_]], 3, implicit $exec - ; GCN-NEXT: %11.sub3:vreg_128 = COPY %11.sub0 + ; GCN-NEXT: [[V_TRUNC_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_TRUNC_F32_e32 undef %4:vgpr_32, implicit $mode, implicit $exec + ; GCN-NEXT: 
[[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_TRUNC_F32_e32_]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 4, [[V_CVT_U32_F32_e32_]], implicit $exec + ; GCN-NEXT: undef [[V_MUL_LO_I32_e64_:%[0-9]+]].sub0:vreg_128 = V_MUL_LO_I32_e64 [[V_LSHRREV_B32_e32_]], 3, implicit $exec + ; GCN-NEXT: [[V_MUL_LO_I32_e64_:%[0-9]+]].sub3:vreg_128 = COPY [[V_MUL_LO_I32_e64_]].sub0 ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %11 - ; GCN-NEXT: %11.sub3:vreg_128 = V_ADD_U32_e32 target-flags(amdgpu-rel32-lo) 1, [[COPY]].sub3, implicit $exec - ; GCN-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xm0 = S_ADD_I32 [[S_ADD_I32_]], 1, implicit-def dead $scc - ; GCN-NEXT: S_CMP_LT_U32 [[S_ADD_I32_]], 3, implicit-def $scc + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY [[V_MUL_LO_I32_e64_]] + ; GCN-NEXT: [[V_MUL_LO_I32_e64_:%[0-9]+]].sub3:vreg_128 = V_ADD_U32_e32 target-flags(amdgpu-rel32-lo) 1, [[COPY]].sub3, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_ADD_I32 [[S_MOV_B32_]], 1, implicit-def dead $scc + ; GCN-NEXT: S_CMP_LT_U32 [[S_MOV_B32_]], 3, implicit-def $scc ; GCN-NEXT: S_CBRANCH_SCC1 %bb.1, implicit killed $scc ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} @@ -44,10 +44,10 @@ body: | ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: dead %16:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]].sub3, undef %17:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from constant-pool, align 1, addrspace 4) - ; GCN-NEXT: dead %18:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; GCN-NEXT: dead [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]].sub3, undef %17:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 
(s128) from constant-pool, align 1, addrspace 4) + ; GCN-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, -1, implicit-def dead $scc - ; GCN-NEXT: dead %20:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: dead [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.4(0x7c000000), %bb.6(0x04000000) @@ -57,13 +57,13 @@ body: | ; GCN-NEXT: S_BRANCH %bb.6 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: - ; GCN-NEXT: %21:vgpr_32 = nofpexcept V_MUL_F32_e32 target-flags(amdgpu-gotprel) 0, %11.sub0, implicit $mode, implicit $exec - ; GCN-NEXT: %22:vgpr_32 = nofpexcept V_MIN_F32_e32 1106771968, %21, implicit $mode, implicit $exec - ; GCN-NEXT: %23:vgpr_32 = nnan arcp contract reassoc nofpexcept V_MAD_F32_e64 0, %22, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %24:vgpr_32 = nnan arcp contract reassoc nofpexcept V_MAD_F32_e64 0, %23, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %25:vgpr_32 = nofpexcept V_MAD_F32_e64 0, %24, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %26:vgpr_32 = nofpexcept V_CVT_PKRTZ_F16_F32_e64 0, %25, 0, undef %27:vgpr_32, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: EXP_DONE 0, %26, undef %28:vgpr_32, undef %29:vgpr_32, undef %30:vgpr_32, -1, -1, 15, implicit $exec + ; GCN-NEXT: [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 target-flags(amdgpu-gotprel) 0, [[V_MUL_LO_I32_e64_]].sub0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MIN_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MIN_F32_e32 1106771968, [[V_MUL_F32_e32_]], implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MAD_F32_e64_:%[0-9]+]]:vgpr_32 = nnan arcp contract reassoc nofpexcept V_MAD_F32_e64 0, [[V_MIN_F32_e32_]], 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MAD_F32_e64_1:%[0-9]+]]:vgpr_32 = nnan arcp contract reassoc nofpexcept 
V_MAD_F32_e64 0, [[V_MAD_F32_e64_]], 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MAD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAD_F32_e64_1]], 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CVT_PKRTZ_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_PKRTZ_F16_F32_e64 0, [[V_MAD_F32_e64_2]], 0, undef %27:vgpr_32, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: EXP_DONE 0, [[V_CVT_PKRTZ_F16_F32_e64_]], undef %28:vgpr_32, undef %29:vgpr_32, undef %30:vgpr_32, -1, -1, 15, implicit $exec ; GCN-NEXT: S_ENDPGM 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.6: diff --git a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir index d284813c36843..00eb2b7e1aa8d 100644 --- a/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir +++ b/llvm/test/CodeGen/AMDGPU/dbg-value-ends-sched-region.mir @@ -57,37 +57,37 @@ body: | ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: undef %11.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %17.sub0:vreg_64, %18:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF4]].sub0, [[DEF6]].sub0, 0, implicit $exec - ; CHECK-NEXT: dead undef %17.sub1:vreg_64, dead %19:sreg_64_xexec = V_ADDC_U32_e64 [[DEF4]].sub1, [[DEF6]].sub1, %18, 0, implicit $exec - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF1]], 0, 0, implicit $exec :: (load (s64), addrspace 1) + ; CHECK-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[DEF4]].sub0, [[DEF6]].sub0, 0, implicit $exec + ; CHECK-NEXT: dead undef 
[[V_ADD_CO_U32_e64_:%[0-9]+]].sub1:vreg_64, dead [[V_ADDC_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[DEF4]].sub1, [[DEF6]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF1]], 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF3]] ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub1 - ; CHECK-NEXT: dead [[COPY6:%[0-9]+]]:vgpr_32 = COPY %11.sub0 + ; CHECK-NEXT: dead [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_MOV_B32_e32_]].sub0 ; CHECK-NEXT: dead [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 4, [[DEF7]], implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF8]], 288, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef [[DEF5]].sub1:vreg_64 = COPY [[COPY5]] + ; CHECK-NEXT: undef [[DEF5:%[0-9]+]].sub1:vreg_64 = COPY [[COPY5]] ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead [[COPY8:%[0-9]+]]:sreg_64 = COPY $exec + ; CHECK-NEXT: dead [[COPY7:%[0-9]+]]:sreg_64 = COPY $exec ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[COPY1]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: DBG_VALUE [[GLOBAL_LOAD_DWORDX4_]], $noreg, <0x{{[0-9a-f]+}}>, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !DILocation(line: 0, scope: <0x{{[0-9a-f]+}}>) ; 
CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/dead_bundle.mir b/llvm/test/CodeGen/AMDGPU/dead_bundle.mir index 19e50203c191b..dd9d6a1c788e1 100644 --- a/llvm/test/CodeGen/AMDGPU/dead_bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/dead_bundle.mir @@ -24,9 +24,9 @@ body: | ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_BUFFER_LOAD_DWORDX8_IMM undef renamable $sgpr0_sgpr1_sgpr2_sgpr3, 416, 0 :: (dereferenceable invariant load (s256), align 4) ; CHECK-NEXT: renamable $sgpr3 = COPY killed renamable $sgpr7 ; CHECK-NEXT: renamable $sgpr5 = COPY renamable $sgpr9 - ; CHECK-NEXT: dead undef %4.sub0:vreg_64 = COPY renamable $sgpr3 - ; CHECK-NEXT: dead undef %7.sub1:vreg_64 = COPY killed renamable $sgpr5 - ; CHECK-NEXT: dead [[IMAGE_SAMPLE_V1_V2_gfx11_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx11 undef %4, undef renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + ; CHECK-NEXT: dead undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY renamable $sgpr3 + ; CHECK-NEXT: dead undef [[COPY1:%[0-9]+]].sub1:vreg_64 = COPY killed renamable $sgpr5 + ; CHECK-NEXT: dead [[IMAGE_SAMPLE_V1_V2_gfx11_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx11 undef [[COPY]], undef renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) ; CHECK-NEXT: S_ENDPGM 0 undef %8.sub3:sgpr_128 = IMPLICIT_DEF undef %8.sub1:sgpr_128 = COPY undef $sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir index 66276c756db42..cdd4c72f3717f 100644 --- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir +++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-crash.mir @@ -61,8 +61,8 @@ body: | ; CHECK-NEXT: [[DEF13:%[0-9]+]]:vgpr_32 = 
IMPLICIT_DEF ; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_]], implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MUL_F32_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[DEF7]], [[DEF7]], implicit $mode, implicit $exec - ; CHECK-NEXT: dead %23:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MUL_F32_e32_4]], [[DEF13]], implicit $mode, implicit $exec - ; CHECK-NEXT: dead [[V_MOV_B32_e32_1]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[V_ADD_F32_e32_]], [[COPY]], [[V_MOV_B32_e32_1]], implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MUL_F32_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e32 [[V_MUL_F32_e32_4]], [[DEF13]], implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[V_ADD_F32_e32_]], [[COPY]], [[V_MOV_B32_e32_1]], implicit $mode, implicit $exec ; CHECK-NEXT: [[DEF14:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; CHECK-NEXT: $sgpr4 = IMPLICIT_DEF ; CHECK-NEXT: $vgpr0 = COPY [[DEF11]] @@ -73,10 +73,10 @@ body: | ; CHECK-NEXT: $vgpr2 = COPY [[V_MUL_F32_e32_3]] ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL [[DEF14]], @foo, csr_amdgpu, implicit undef $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $sgpr4, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit-def $vgpr0 ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e32 [[V_MUL_F32_e32_]], [[DEF8]], implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MAC_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF12]], [[DEF9]], [[V_MAC_F32_e32_]], implicit $mode, implicit $exec - ; CHECK-NEXT: dead %26:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead %27:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead %28:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_MAC_F32_e32_]], 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit 
$mode, implicit $exec + ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAC_F32_e32 [[DEF12]], [[DEF9]], [[V_ADD_F32_e32_1]], implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MAD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF4]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MAD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF5]], 0, [[DEF2]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead [[V_MAD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_MAD_F32_e64 0, [[V_ADD_F32_e32_1]], 0, [[DEF6]], 0, [[DEF3]], 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF]], [[DEF10]], 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir b/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir index 41b85ce5c6e02..760ae6032230f 100644 --- a/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir +++ b/llvm/test/CodeGen/AMDGPU/extend-phi-subrange-not-in-parent.mir @@ -29,33 +29,33 @@ body: | ; CHECK-NEXT: dead [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: S_NOP 0, implicit [[DEF1]] ; CHECK-NEXT: S_NOP 0, implicit [[DEF1]] - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %6.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024_align2 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 { - ; CHECK-NEXT: internal 
%6.sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16:av_1024_align2 = COPY [[COPY]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16 - ; CHECK-NEXT: internal %6.sub29_sub30_sub31:av_1024_align2 = COPY [[COPY]].sub29_sub30_sub31 + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024_align2 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 { + ; CHECK-NEXT: internal [[COPY1]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16:av_1024_align2 = COPY [[COPY]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16 + ; CHECK-NEXT: internal [[COPY1]].sub29_sub30_sub31:av_1024_align2 = COPY [[COPY]].sub29_sub30_sub31 ; CHECK-NEXT: } - ; CHECK-NEXT: %6.sub0:av_1024_align2 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit %6.sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub0:av_1024_align2 = IMPLICIT_DEF + ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]].sub0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit %6 + ; CHECK-NEXT: S_NOP 0, 
implicit [[COPY1]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:av_1024_align2 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_1024_align2 = IMPLICIT_DEF ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: undef %4.sub0:vreg_1024_align2 = COPY [[DEF]] - ; CHECK-NEXT: S_NOP 0, implicit %4 + ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:vreg_1024_align2 = COPY [[DEF]] + ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]] bb.0: %0:vgpr_32 = IMPLICIT_DEF %1:vreg_1024_align2 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll index cf610f9436acd..2d6ae31f8e585 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll @@ -11,24 +11,24 @@ define float @fdiv_f32(float %a, float %b) #0 { ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: %4:vgpr_32, %5:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %6:vgpr_32, %7:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %8:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_DIV_SCALE_F32_e64_:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_1:%[0-9]+]]:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_DIV_SCALE_F32_e64_2:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_3:%[0-9]+]]:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, 
[[V_DIV_SCALE_F32_e64_2]], 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode - ; GCN-NEXT: %12:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %8, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %13:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %12, 0, %8, 0, %8, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %14:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %4, 0, %13, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %15:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %14, 0, %4, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %16:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %15, 0, %13, 0, %14, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %17:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %16, 0, %4, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_FMA_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 1, [[V_DIV_SCALE_F32_e64_2]], 0, [[V_RCP_F32_e64_]], 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_FMA_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed [[V_FMA_F32_e64_]], 0, [[V_RCP_F32_e64_]], 0, [[V_RCP_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[V_DIV_SCALE_F32_e64_]], 0, [[V_FMA_F32_e64_1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_FMA_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 1, [[V_DIV_SCALE_F32_e64_2]], 0, [[V_MUL_F32_e64_]], 0, [[V_DIV_SCALE_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_FMA_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed [[V_FMA_F32_e64_2]], 0, [[V_FMA_F32_e64_1]], 0, [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: 
[[V_FMA_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_F32_e64 1, [[V_DIV_SCALE_F32_e64_2]], 0, [[V_FMA_F32_e64_3]], 0, [[V_DIV_SCALE_F32_e64_]], 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode - ; GCN-NEXT: $vcc = COPY %5 - ; GCN-NEXT: %18:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %17, 0, %13, 0, %16, 0, 0, implicit $mode, implicit $vcc, implicit $exec - ; GCN-NEXT: %19:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %18, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr0 = COPY %19 + ; GCN-NEXT: $vcc = COPY [[V_DIV_SCALE_F32_e64_1]] + ; GCN-NEXT: [[V_DIV_FMAS_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed [[V_FMA_F32_e64_4]], 0, [[V_FMA_F32_e64_1]], 0, [[V_FMA_F32_e64_3]], 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; GCN-NEXT: [[V_DIV_FIXUP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed [[V_DIV_FMAS_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr0 = COPY [[V_DIV_FIXUP_F32_e64_]] ; GCN-NEXT: SI_RETURN implicit $vgpr0 entry: %fdiv = fdiv float %a, %b @@ -42,24 +42,24 @@ define float @fdiv_nnan_f32(float %a, float %b) #0 { ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: %4:vgpr_32, %5:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %6:vgpr_32, %7:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %8:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_DIV_SCALE_F32_e64_:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_1:%[0-9]+]]:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit 
$mode, implicit $exec + ; GCN-NEXT: [[V_DIV_SCALE_F32_e64_2:%[0-9]+]]:vgpr_32, [[V_DIV_SCALE_F32_e64_3:%[0-9]+]]:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, [[V_DIV_SCALE_F32_e64_2]], 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode - ; GCN-NEXT: %12:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %8, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %13:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %12, 0, %8, 0, %8, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %14:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %4, 0, %13, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %15:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %14, 0, %4, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %16:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %15, 0, %13, 0, %14, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %17:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %16, 0, %4, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_FMA_F32_e64_:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, [[V_DIV_SCALE_F32_e64_2]], 0, [[V_RCP_F32_e64_]], 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_FMA_F32_e64_1:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed [[V_FMA_F32_e64_]], 0, [[V_RCP_F32_e64_]], 0, [[V_RCP_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, [[V_DIV_SCALE_F32_e64_]], 0, [[V_FMA_F32_e64_1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: 
[[V_FMA_F32_e64_2:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, [[V_DIV_SCALE_F32_e64_2]], 0, [[V_MUL_F32_e64_]], 0, [[V_DIV_SCALE_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_FMA_F32_e64_3:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed [[V_FMA_F32_e64_2]], 0, [[V_FMA_F32_e64_1]], 0, [[V_MUL_F32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_FMA_F32_e64_4:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, [[V_DIV_SCALE_F32_e64_2]], 0, [[V_FMA_F32_e64_3]], 0, [[V_DIV_SCALE_F32_e64_]], 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode - ; GCN-NEXT: $vcc = COPY %5 - ; GCN-NEXT: %18:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %17, 0, %13, 0, %16, 0, 0, implicit $mode, implicit $vcc, implicit $exec - ; GCN-NEXT: %19:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %18, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr0 = COPY %19 + ; GCN-NEXT: $vcc = COPY [[V_DIV_SCALE_F32_e64_1]] + ; GCN-NEXT: [[V_DIV_FMAS_F32_e64_:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed [[V_FMA_F32_e64_4]], 0, [[V_FMA_F32_e64_1]], 0, [[V_FMA_F32_e64_3]], 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; GCN-NEXT: [[V_DIV_FIXUP_F32_e64_:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed [[V_DIV_FMAS_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr0 = COPY [[V_DIV_FIXUP_F32_e64_]] ; GCN-NEXT: SI_RETURN implicit $vgpr0 entry: %fdiv = fdiv nnan float %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll index 42274e5420d07..ce4beb8789dc3 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll @@ -14,6 +14,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da ; 
GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) ; GFX940-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic ; GFX11: bb.0 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -42,6 +43,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic ; GFX11: bb.0 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -70,6 +72,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %da ; GFX940-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) ; GFX940-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw ; GFX11: bb.0 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -98,6 +101,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw ; GFX11: bb.0 (%ir-block.0): ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll index 08c58f21e76ab..666971618a5c2 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.v2f16-no-rtn.ll @@ -15,6 +15,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_intrinsic(ptr addrspace(1 ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_intrinsic ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -42,6 +43,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic(ptr addrs ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_intrinsic ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 @@ -69,6 +71,7 @@ define amdgpu_ps void @global_atomic_fadd_v2f16_no_rtn_flat_intrinsic(ptr addrsp ; GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_no_rtn_flat_intrinsic ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -96,6 +99,7 @@ define amdgpu_ps void 
@global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic(ptr ; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX908-NEXT: GLOBAL_ATOMIC_PK_ADD_F16_SADDR killed [[V_MOV_B32_e32_]], [[COPY]], killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) ; GFX908-NEXT: S_ENDPGM 0 + ; ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_v2f16_saddr_no_rtn_flat_intrinsic ; GFX90A_GFX940: bb.0 (%ir-block.0): ; GFX90A_GFX940-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir index f59c42283e981..bdd89a9077900 100644 --- a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir @@ -85,7 +85,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 ; CHECK-NEXT: renamable $sgpr6 = S_LSHL_B32 renamable $sgpr67, 1, implicit-def dead $scc - ; CHECK-NEXT: dead [[V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32_:%[0-9]+]]:vreg_1024 = V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 [[V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32_]], 0, killed $sgpr6, 3, implicit-def $m0, implicit $m0, implicit $exec + ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vreg_1024 = V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 [[COPY]], 0, killed $sgpr6, 3, implicit-def $m0, implicit $m0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.1(0x40000000) @@ -114,7 +114,7 @@ body: | ; CHECK-NEXT: renamable $sgpr87 = COPY renamable $sgpr44 ; CHECK-NEXT: renamable $sgpr88 = COPY renamable $sgpr44 ; CHECK-NEXT: renamable $sgpr89 = COPY renamable $sgpr44 - ; CHECK-NEXT: 
dead [[COPY:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, implicit $exec + ; CHECK-NEXT: dead [[COPY1:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99, implicit $exec ; CHECK-NEXT: $exec = S_XOR_B64_term $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.1 @@ -125,7 +125,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr6_sgpr7, implicit-def $scc ; CHECK-NEXT: dead renamable $sgpr4 = S_LSHL_B32 killed renamable $sgpr66, 1, implicit-def dead $scc - ; CHECK-NEXT: dead [[COPY1:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 + ; CHECK-NEXT: dead [[COPY2:%[0-9]+]]:vreg_1024 = COPY renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir b/llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir index edc04f2ef39ee..742498cdd8bd1 100644 --- a/llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir +++ b/llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir @@ -23,12 +23,12 @@ body: | ; CHECK-NEXT: 
[[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %3:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR1]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1) - ; CHECK-NEXT: undef %9.sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1 - ; CHECK-NEXT: S_NOP 0, implicit %9.sub1 + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]].sub1 ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: S_NOP 0, implicit [[SI_SPILL_V64_RESTORE]].sub0 - ; CHECK-NEXT: undef %7.sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR2]].sub1 - ; CHECK-NEXT: S_NOP 0, implicit %7.sub1 + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR2]].sub1 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]].sub1 ; CHECK-NEXT: S_ENDPGM 0 %1:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %4:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1) %2:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1) @@ -66,18 +66,18 @@ body: | ; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR2]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: S_NOP 0, implicit-def [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0 ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: undef %13.sub0:vreg_64 = COPY 
[[SI_SPILL_V64_RESTORE]].sub0 - ; CHECK-NEXT: S_NOP 0, implicit-def %13.sub1 - ; CHECK-NEXT: undef %15.sub0:vreg_64 = COPY %13.sub0 + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY [[SI_SPILL_V64_RESTORE]].sub0 + ; CHECK-NEXT: S_NOP 0, implicit-def [[COPY]].sub1 + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE1:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: undef %7.sub1:vreg_64 = COPY [[SI_SPILL_V64_RESTORE1]].sub1 - ; CHECK-NEXT: S_NOP 0, implicit-def %7.sub0 - ; CHECK-NEXT: undef %9.sub1:vreg_64 = COPY %7.sub1 + ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_64 = COPY [[SI_SPILL_V64_RESTORE1]].sub1 + ; CHECK-NEXT: S_NOP 0, implicit-def [[COPY2]].sub0 + ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:vreg_64 = COPY [[COPY2]].sub1 ; CHECK-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1 - ; CHECK-NEXT: undef %14.sub0:vreg_64 = COPY %15.sub0 - ; CHECK-NEXT: S_NOP 0, implicit %14.sub0 - ; CHECK-NEXT: undef %8.sub1:vreg_64 = COPY %9.sub1 - ; CHECK-NEXT: S_NOP 0, implicit %8.sub1 + ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub0:vreg_64 = COPY [[COPY1]].sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY4]].sub0 + ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub1:vreg_64 = COPY [[COPY3]].sub1 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY5]].sub1 ; CHECK-NEXT: S_ENDPGM 0 %1:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %4:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1) %2:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/gws-hazards.mir b/llvm/test/CodeGen/AMDGPU/gws-hazards.mir index 50cd4ba09184d..1eeb0f453bb15 100644 --- a/llvm/test/CodeGen/AMDGPU/gws-hazards.mir +++ b/llvm/test/CodeGen/AMDGPU/gws-hazards.mir @@ -19,22 +19,26 @@ body: | ; GFX9-NEXT: $m0 = S_MOV_B32 -1 ; 
GFX9-NEXT: S_NOP 0 ; GFX9-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec + ; ; VI-LABEL: name: m0_gws_init0 ; VI: liveins: $vgpr0 ; VI-NEXT: {{ $}} ; VI-NEXT: $m0 = S_MOV_B32 -1 ; VI-NEXT: S_NOP 0 ; VI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec + ; ; CI-LABEL: name: m0_gws_init0 ; CI: liveins: $vgpr0 ; CI-NEXT: {{ $}} ; CI-NEXT: $m0 = S_MOV_B32 -1 ; CI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec + ; ; SI-LABEL: name: m0_gws_init0 ; SI: liveins: $vgpr0 ; SI-NEXT: {{ $}} ; SI-NEXT: $m0 = S_MOV_B32 -1 ; SI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec + ; ; GFX10-LABEL: name: m0_gws_init0 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -56,19 +60,23 @@ body: | ; GFX9-NEXT: $m0 = S_MOV_B32 -1 ; GFX9-NEXT: S_NOP 0 ; GFX9-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec + ; ; VI-LABEL: name: m0_gws_init1 ; VI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; VI-NEXT: $m0 = S_MOV_B32 -1 ; VI-NEXT: S_NOP 0 ; VI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec + ; ; CI-LABEL: name: m0_gws_init1 ; CI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; CI-NEXT: $m0 = S_MOV_B32 -1 ; CI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec + ; ; SI-LABEL: name: m0_gws_init1 ; SI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; SI-NEXT: $m0 = S_MOV_B32 -1 ; SI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec + ; ; GFX10-LABEL: name: m0_gws_init1 ; GFX10: $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GFX10-NEXT: $m0 = S_MOV_B32 -1 @@ -96,6 +104,7 @@ body: | ; GFX9-NEXT: $m0 = S_MOV_B32 $sgpr0 ; GFX9-NEXT: S_NOP 0 ; GFX9-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec + ; ; VI-LABEL: name: m0_gws_readlane ; VI: liveins: $vgpr0, $vgpr1 ; VI-NEXT: {{ $}} @@ -103,18 +112,21 @@ body: | ; VI-NEXT: $m0 = S_MOV_B32 $sgpr0 ; VI-NEXT: S_NOP 0 ; VI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec + ; ; CI-LABEL: name: m0_gws_readlane ; CI: liveins: $vgpr0, $vgpr1 ; CI-NEXT: {{ $}} ; CI-NEXT: $sgpr0 = 
V_READFIRSTLANE_B32 $vgpr1, implicit $exec ; CI-NEXT: $m0 = S_MOV_B32 $sgpr0 ; CI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec + ; ; SI-LABEL: name: m0_gws_readlane ; SI: liveins: $vgpr0, $vgpr1 ; SI-NEXT: {{ $}} ; SI-NEXT: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec ; SI-NEXT: $m0 = S_MOV_B32 $sgpr0 ; SI-NEXT: DS_GWS_INIT $vgpr0, 0, implicit $m0, implicit $exec + ; ; GFX10-LABEL: name: m0_gws_readlane ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/high-bits-zeroed-16-bit-ops.mir b/llvm/test/CodeGen/AMDGPU/high-bits-zeroed-16-bit-ops.mir index 6556f9b46707d..5596eceb95d05 100644 --- a/llvm/test/CodeGen/AMDGPU/high-bits-zeroed-16-bit-ops.mir +++ b/llvm/test/CodeGen/AMDGPU/high-bits-zeroed-16-bit-ops.mir @@ -20,6 +20,7 @@ body: | ; GFX8-NEXT: $vgpr0 = COPY %op ; GFX8-NEXT: $vgpr1 = COPY %op ; GFX8-NEXT: $vgpr2 = COPY %op + ; ; GFX9-LABEL: name: v_cvt_f16_f32_altmask ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -30,6 +31,7 @@ body: | ; GFX9-NEXT: $vgpr0 = COPY %op ; GFX9-NEXT: $vgpr1 = COPY %op ; GFX9-NEXT: $vgpr2 = COPY %op + ; ; GFX10-LABEL: name: v_cvt_f16_f32_altmask ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -69,6 +71,7 @@ body: | ; GFX8-NEXT: %mask:sreg_32 = S_MOV_B32 65534 ; GFX8-NEXT: %and:vgpr_32 = V_AND_B32_e64 %mask, %op, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %and + ; ; GFX9-LABEL: name: wrong_mask_value ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -77,6 +80,7 @@ body: | ; GFX9-NEXT: %mask:sreg_32 = S_MOV_B32 65534 ; GFX9-NEXT: %and:vgpr_32 = V_AND_B32_e64 %mask, %op, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %and + ; ; GFX10-LABEL: name: wrong_mask_value ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -106,6 +110,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_cvt_f16_f32 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} 
@@ -114,6 +119,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_cvt_f16_f32 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -148,6 +154,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_U16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_cvt_f16_u16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -156,6 +163,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_U16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_cvt_f16_u16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -190,6 +198,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_I16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_cvt_f16_i16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -198,6 +207,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_CVT_F16_I16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_cvt_f16_i16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -232,6 +242,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_RCP_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_rcp_f16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -240,6 +251,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_RCP_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_rcp_f16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -273,6 
+285,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_RSQ_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_rsq_f16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -281,6 +294,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_RSQ_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_rsq_f16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -314,6 +328,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_SQRT_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_sqrt_f16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -322,6 +337,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_SQRT_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_sqrt_f16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -356,6 +372,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_LOG_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_log_f16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -364,6 +381,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_LOG_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_log_f16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -398,6 +416,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_EXP_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_exp_f16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -406,6 +425,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = 
nofpexcept V_EXP_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_exp_f16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -440,6 +460,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_SIN_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_sin_f16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -448,6 +469,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_SIN_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_sin_f16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -482,6 +504,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_COS_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_cos_f16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -490,6 +513,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_COS_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_cos_f16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -524,6 +548,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_FLOOR_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_floor_f16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -532,6 +557,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_FLOOR_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_floor_f16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -566,6 +592,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_CEIL_F16_e32 [[COPY]], implicit 
$mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_ceil_f16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -574,6 +601,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_CEIL_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_ceil_f16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -608,6 +636,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_TRUNC_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_trunc_f16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -616,6 +645,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_TRUNC_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_trunc_f16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -650,6 +680,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_RNDNE_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_rndne_f16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -658,6 +689,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_RNDNE_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_rndne_f16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -692,6 +724,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_FRACT_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_fract_f16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -700,6 +733,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_FRACT_F16_e32 [[COPY]], implicit $mode, implicit $exec ; 
GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_fract_f16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -734,6 +768,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_FREXP_MANT_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_frexp_mant_f16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -742,6 +777,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_FREXP_MANT_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_frexp_mant_f16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -776,6 +812,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_FREXP_EXP_I16_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_frexp_exp_f16 ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} @@ -784,6 +821,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_FREXP_EXP_I16_F16_e32 [[COPY]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_frexp_exp_f16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -819,6 +857,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_LDEXP_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_ldexp_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -828,6 +867,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_LDEXP_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_ldexp_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -865,6 +905,7 @@ body: | ; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept 
V_LSHLREV_B16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX9-LABEL: name: v_lshlrev_b16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -874,6 +915,7 @@ body: | ; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_LSHLREV_B16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX10-LABEL: name: v_lshlrev_b16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -911,6 +953,7 @@ body: | ; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_LSHRREV_B16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX9-LABEL: name: v_lshrrev_b16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -920,6 +963,7 @@ body: | ; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_LSHRREV_B16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX10-LABEL: name: v_lshrrev_b16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -957,6 +1001,7 @@ body: | ; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_ASHRREV_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX9-LABEL: name: v_ashrrev_i16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -966,6 +1011,7 @@ body: | ; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_ASHRREV_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX10-LABEL: name: v_ashrrev_i16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1003,6 +1049,7 @@ body: | ; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_ADD_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX9-LABEL: name: 
v_add_u16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1012,6 +1059,7 @@ body: | ; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_ADD_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX10-LABEL: name: v_add_u16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1049,6 +1097,7 @@ body: | ; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_SUB_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX9-LABEL: name: v_sub_u16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1058,6 +1107,7 @@ body: | ; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_SUB_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX10-LABEL: name: v_sub_u16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1095,6 +1145,7 @@ body: | ; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_SUBREV_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX9-LABEL: name: v_subrev_u16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1104,6 +1155,7 @@ body: | ; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_SUBREV_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX10-LABEL: name: v_subrev_u16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1141,6 +1193,7 @@ body: | ; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MUL_LO_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX9-LABEL: name: v_mul_lo_u16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1150,6 +1203,7 @@ body: | ; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MUL_LO_U16_e32 [[COPY]], [[COPY1]], implicit $mode, 
implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX10-LABEL: name: v_mul_lo_u16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1187,6 +1241,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_add_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1196,6 +1251,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_add_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1233,6 +1289,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_sub_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1242,6 +1299,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_sub_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1279,6 +1337,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_subrev_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1288,6 +1347,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_subrev_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1325,6 +1385,7 @@ 
body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_mul_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1334,6 +1395,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_mul_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1371,6 +1433,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_max_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1380,6 +1443,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_max_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1417,6 +1481,7 @@ body: | ; GFX8-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX9-LABEL: name: v_min_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1426,6 +1491,7 @@ body: | ; GFX9-NEXT: %op_vop1:vgpr_32 = nofpexcept V_ADD_F16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop1 + ; ; GFX10-LABEL: name: v_min_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1463,6 +1529,7 @@ body: | ; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MAX_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX9-LABEL: 
name: v_max_u16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1472,6 +1539,7 @@ body: | ; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MAX_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX10-LABEL: name: v_max_u16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1508,6 +1576,7 @@ body: | ; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MIN_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX9-LABEL: name: v_min_u16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1517,6 +1586,7 @@ body: | ; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MIN_U16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX10-LABEL: name: v_min_u16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1554,6 +1624,7 @@ body: | ; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MAX_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX9-LABEL: name: v_max_i16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1563,6 +1634,7 @@ body: | ; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MAX_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX10-LABEL: name: v_max_i16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1599,6 +1671,7 @@ body: | ; GFX8-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MIN_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 ; GFX8-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX9-LABEL: name: v_min_i16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1608,6 +1681,7 @@ body: | ; GFX9-NEXT: %op_vop2:vgpr_32 = nofpexcept V_MIN_I16_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; 
GFX9-NEXT: $vgpr0 = COPY %op_vop3 ; GFX9-NEXT: $vgpr1 = COPY %op_vop2 + ; ; GFX10-LABEL: name: v_min_i16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1644,6 +1718,7 @@ body: | ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_MAD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op + ; ; GFX9-LABEL: name: v_mad_f16 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -1652,6 +1727,7 @@ body: | ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_MAD_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op + ; ; GFX10-LABEL: name: v_mad_f16 ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -1684,6 +1760,7 @@ body: | ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_FMA_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op + ; ; GFX9-LABEL: name: v_fma_f16 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -1692,6 +1769,7 @@ body: | ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_FMA_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op + ; ; GFX10-LABEL: name: v_fma_f16 ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -1724,6 +1802,7 @@ body: | ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_DIV_FIXUP_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op + ; ; GFX9-LABEL: name: v_div_fixup_f16 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -1732,6 +1811,7 @@ body: | ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: %op:vgpr_32 = nofpexcept 
V_DIV_FIXUP_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %op + ; ; GFX10-LABEL: name: v_div_fixup_f16 ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -1763,6 +1843,7 @@ body: | ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_MADAK_F16 [[COPY]], [[COPY1]], 1234, implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op + ; ; GFX9-LABEL: name: v_madak_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1771,6 +1852,7 @@ body: | ; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_MADAK_F16 [[COPY]], [[COPY1]], 1234, implicit $mode, implicit $exec ; GFX9-NEXT: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %and + ; ; GFX10-LABEL: name: v_madak_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1800,6 +1882,7 @@ body: | ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_MADMK_F16 [[COPY]], 1234, [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op + ; ; GFX9-LABEL: name: v_madmk_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1808,6 +1891,7 @@ body: | ; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_MADMK_F16 [[COPY]], 1234, [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %and + ; ; GFX10-LABEL: name: v_madmk_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1837,6 +1921,7 @@ body: | ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_FMAAK_F16 [[COPY]], [[COPY1]], 1234, implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op + ; ; GFX9-LABEL: name: v_fmaak_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1845,6 +1930,7 @@ body: | ; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_FMAAK_F16 [[COPY]], [[COPY1]], 1234, implicit $mode, implicit $exec ; GFX9-NEXT: %and:vgpr_32 = 
V_AND_B32_e32 65535, %op, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %and + ; ; GFX10-LABEL: name: v_fmaak_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1874,6 +1960,7 @@ body: | ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_FMAMK_F16 [[COPY]], 1234, [[COPY1]], implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op + ; ; GFX9-LABEL: name: v_fmamk_f16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9-NEXT: {{ $}} @@ -1882,6 +1969,7 @@ body: | ; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_FMAMK_F16 [[COPY]], 1234, [[COPY1]], implicit $mode, implicit $exec ; GFX9-NEXT: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %and + ; ; GFX10-LABEL: name: v_fmamk_f16 ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -1914,6 +2002,7 @@ body: | ; GFX8-NEXT: %op_vop3:vgpr_32 = nofpexcept V_MAC_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop2 ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 + ; ; GFX9-LABEL: name: v_mac_f16 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -1926,6 +2015,7 @@ body: | ; GFX9-NEXT: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %and_vop2 ; GFX9-NEXT: $vgpr0 = COPY %and_vop3 + ; ; GFX10-LABEL: name: v_mac_f16 ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -1966,6 +2056,7 @@ body: | ; GFX8-NEXT: %op_vop3:vgpr_32 = nofpexcept V_FMAC_F16_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %op_vop2 ; GFX8-NEXT: $vgpr0 = COPY %op_vop3 + ; ; GFX9-LABEL: name: v_fmac_f16 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-NEXT: {{ $}} @@ -1978,6 +2069,7 @@ body: | ; GFX9-NEXT: %and_vop3:vgpr_32 = V_AND_B32_e32 65535, %op_vop3, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %and_vop2 ; GFX9-NEXT: $vgpr0 = COPY %and_vop3 + ; ; GFX10-LABEL: name: v_fmac_f16 ; GFX10: liveins: $vgpr0, 
$vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -2018,6 +2110,7 @@ body: | ; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_MAD_MIXLO_F16 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec ; GFX8-NEXT: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %and + ; ; GFX9-LABEL: name: no_fold_v_mad_mixlo_f16 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX9-NEXT: {{ $}} @@ -2028,6 +2121,7 @@ body: | ; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_MAD_MIXLO_F16 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %and + ; ; GFX10-LABEL: name: no_fold_v_mad_mixlo_f16 ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -2064,6 +2158,7 @@ body: | ; GFX8-NEXT: %op:vgpr_32 = nofpexcept V_MAD_MIXHI_F16 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec ; GFX8-NEXT: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY %and + ; ; GFX9-LABEL: name: no_fold_v_mad_mixhi_f16 ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX9-NEXT: {{ $}} @@ -2074,6 +2169,7 @@ body: | ; GFX9-NEXT: %op:vgpr_32 = nofpexcept V_MAD_MIXHI_F16 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: %and:vgpr_32 = V_AND_B32_e32 65535, %op, implicit $exec ; GFX9-NEXT: $vgpr0 = COPY %and + ; ; GFX10-LABEL: name: no_fold_v_mad_mixhi_f16 ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/liveout-implicit-def-subreg-redef-blender-verifier-error.mir b/llvm/test/CodeGen/AMDGPU/liveout-implicit-def-subreg-redef-blender-verifier-error.mir index c39fa08194f98..9eff5ac8a2a31 100644 --- a/llvm/test/CodeGen/AMDGPU/liveout-implicit-def-subreg-redef-blender-verifier-error.mir +++ 
b/llvm/test/CodeGen/AMDGPU/liveout-implicit-def-subreg-redef-blender-verifier-error.mir @@ -22,17 +22,17 @@ body: | ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub0:sgpr_128 = S_MOV_B32 0 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0 ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub0:sgpr_128 = IMPLICIT_DEF + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = IMPLICIT_DEF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: S_NOP 0, implicit %0 - ; CHECK-NEXT: S_NOP 0, implicit %0.sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]] + ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub0 ; CHECK-NEXT: S_ENDPGM 0 bb.0: S_CBRANCH_SCC0 %bb.2, implicit undef $scc @@ -68,17 +68,17 @@ body: | ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub0:sgpr_128 = S_MOV_B32 0 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 0 ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub0:sgpr_128 = S_MOV_B32 9 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: S_NOP 0, implicit %0 - ; CHECK-NEXT: S_NOP 0, implicit %0.sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]] + ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub0 ; CHECK-NEXT: S_ENDPGM 0 bb.0: S_CBRANCH_SCC0 %bb.2, implicit undef $scc @@ -116,17 +116,17 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def undef %0.sub1_sub2_sub3 - ; CHECK-NEXT: %0.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec ; 
CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub0:vreg_128 = V_MOV_B32_e32 9, implicit $exec + ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128 = V_MOV_B32_e32 9, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: - ; CHECK-NEXT: S_NOP 0, implicit %0 - ; CHECK-NEXT: S_NOP 0, implicit %0.sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]] + ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub0 ; CHECK-NEXT: S_ENDPGM 0 bb.0: S_CBRANCH_SCC0 %bb.2, implicit undef $scc diff --git a/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir b/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir index 0417710297021..efa24a9bee7de 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir +++ b/llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir @@ -43,6 +43,7 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.7: ; GFX10-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: loop_header_nopred ; GFX11: bb.0: ; GFX11-NEXT: successors: %bb.2(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir index 1679773c945b8..9eeec4fa3a93d 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir +++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir @@ -93,7 +93,7 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY1]] ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 $exec_lo, [[V_CMP_GT_I32_e64_]], implicit-def $scc ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[COPY2]], implicit-def $scc - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_OR_B32_]] ; CHECK-NEXT: $exec_lo = S_ANDN2_B32_term $exec_lo, [[S_OR_B32_]], implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 @@ -246,8 +246,8 @@ body: | ; 
CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 1, [[S_FF1_I32_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[COPY8]], [[S_LSHL_B32_]], implicit-def dead $scc ; CHECK-NEXT: S_CMP_LG_U32 [[S_ANDN2_B32_]], 0, implicit-def $scc - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_ADD_I32_]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_ADD_I32_]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_ANDN2_B32_]] ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.6, implicit killed $scc ; CHECK-NEXT: S_BRANCH %bb.7 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir index f04f66cfbba1b..02e3d7e81fd40 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir +++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir @@ -40,21 +40,21 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]] ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e32 1, killed [[COPY5]], implicit $exec - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY66:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]] - ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %10:vreg_64, [[COPY66]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[COPY66]] - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] + ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %10:vreg_64, 
[[COPY6]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY8]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY8]], implicit-def dead $scc ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; CHECK-NEXT: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_1]], implicit $exec + ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_1]], implicit $exec ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 bb.0: @@ -127,21 +127,21 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e32 1, killed [[COPY4]], implicit $exec - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY77:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] - ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %11:vreg_64, [[COPY77]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY77]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]] + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] + ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %11:vreg_64, [[COPY7]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) + ; 
CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]] ; CHECK-NEXT: [[COPY9:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY9]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY9]], implicit-def dead $scc ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; CHECK-NEXT: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_1]], implicit $exec + ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_1]], implicit $exec ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.3 bb.0: @@ -216,21 +216,21 @@ body: | ; CHECK-NEXT: S_NOP 0, implicit killed [[S_MOV_B64_]] ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY1]] ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = nsw V_ADD_U32_e32 1, killed [[COPY5]], implicit $exec - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[V_ADD_U32_e32_]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY66:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]] - ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %11:vreg_64, [[COPY66]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[COPY66]] - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]] + ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %11:vreg_64, [[COPY6]], 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) + ; CHECK-NEXT: 
[[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]] + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[COPY7]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY8]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY8]], implicit-def dead $scc ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; CHECK-NEXT: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_1]], implicit $exec + ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term killed [[S_XOR_B64_1]], implicit $exec ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir index 0c00c9af25a79..914cc8ae8844c 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir +++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir @@ -222,25 +222,25 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term1]] - ; CHECK-NEXT: dead %7:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, implicit $exec :: (volatile load (s32), addrspace 1) - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY3]] + ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD undef %8:vreg_64, 0, 0, implicit $exec :: (volatile load (s32), addrspace 1) + ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = COPY [[COPY3]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_64_xexec = COPY [[COPY4]] - ; CHECK-NEXT: $exec = S_OR_B64_term $exec, killed 
[[COPY5]], implicit-def $scc + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[S_MOV_B64_term]] + ; CHECK-NEXT: $exec = S_OR_B64_term $exec, killed [[COPY4]], implicit-def $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_SLEEP 1 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec - ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY6]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY6]], implicit-def dead $scc + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec + ; CHECK-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY5]], [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_1]], [[COPY5]], implicit-def dead $scc ; CHECK-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] + ; CHECK-NEXT: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec ; CHECK-NEXT: [[S_MOV_B64_term1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec - ; CHECK-NEXT: [[S_MOV_B64_term2:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_1]], implicit $exec ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir index fc3ed8703cec4..018da7f81e3d4 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir @@ -744,8 +744,8 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept 
V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec - ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0 - ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -766,8 +766,8 @@ body: | ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc - ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc - ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc + ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc ; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: @@ -898,13 +898,13 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: undef %23.sub0:vreg_64 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode + ; GFX908-NEXT: undef [[V_CVT_I32_F64_e32_23:%[0-9]+]].sub0:vreg_64 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; 
GFX908-NEXT: {{ $}} - ; GFX908-NEXT: %23.sub1:vreg_64 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit %23 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]].sub1:vreg_64 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -1003,15 +1003,15 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: undef %21.sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode + ; GFX908-NEXT: undef [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: %21.sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: %21.sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit %21 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, 
implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_21]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -1091,10 +1091,10 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) - ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec - ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0 @@ -1122,87 +1122,87 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: 
[[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode - ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 - ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 - ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 - ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 3 - ; GFX908-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 4 - ; GFX908-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 5 - ; GFX908-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 6 - ; GFX908-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 7 - ; GFX908-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 8 - ; GFX908-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 9 - ; GFX908-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 10 - ; GFX908-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 11 - ; GFX908-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 12 - ; GFX908-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 13 - ; GFX908-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 14 - ; GFX908-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 15 - ; GFX908-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 16 - ; GFX908-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 17 - ; GFX908-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 18 - ; GFX908-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 19 - ; GFX908-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 20 - ; GFX908-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 21 - ; GFX908-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 22 - ; GFX908-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 23 - ; GFX908-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 24 - ; GFX908-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 25 - ; GFX908-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 26 - ; GFX908-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 27 - ; GFX908-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 28 - ; GFX908-NEXT: 
[[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 29 - ; GFX908-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 30 - ; GFX908-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 31 - ; GFX908-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 32 - ; GFX908-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 33 - ; GFX908-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 34 - ; GFX908-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 35 - ; GFX908-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 36 - ; GFX908-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 37 - ; GFX908-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 38 - ; GFX908-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 39 - ; GFX908-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 40 - ; GFX908-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 41 - ; GFX908-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 42 - ; GFX908-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 43 - ; GFX908-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 44 - ; GFX908-NEXT: [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 45 - ; GFX908-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 46 - ; GFX908-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 47 - ; GFX908-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 48 - ; GFX908-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 49 - ; GFX908-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 50 - ; GFX908-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 51 - ; GFX908-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 52 - ; GFX908-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 53 - ; GFX908-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 54 - ; GFX908-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 55 - ; GFX908-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 56 - ; GFX908-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 57 - ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 58 - ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 59 - 
; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 60 - ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 61 - ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 62 - ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 63 - ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 64 - ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 65 - ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 66 - ; GFX908-NEXT: [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 67 - ; GFX908-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 68 - ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 - ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 - ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 - ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 - ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 - ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 - ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 75 - ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 76 - ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 77 - ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 78 - ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 79 - ; GFX908-NEXT: [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 80 + ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 + ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 + ; GFX908-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 3 + ; GFX908-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 4 + ; GFX908-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 5 + ; GFX908-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 6 + ; GFX908-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 7 + ; GFX908-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 8 + ; GFX908-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 9 + ; 
GFX908-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 10 + ; GFX908-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 11 + ; GFX908-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 12 + ; GFX908-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 13 + ; GFX908-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 14 + ; GFX908-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 15 + ; GFX908-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 16 + ; GFX908-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 17 + ; GFX908-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 18 + ; GFX908-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 19 + ; GFX908-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 20 + ; GFX908-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 21 + ; GFX908-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 22 + ; GFX908-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 23 + ; GFX908-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 24 + ; GFX908-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 25 + ; GFX908-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 26 + ; GFX908-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 27 + ; GFX908-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 28 + ; GFX908-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 29 + ; GFX908-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 30 + ; GFX908-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 31 + ; GFX908-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 32 + ; GFX908-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 33 + ; GFX908-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 34 + ; GFX908-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 35 + ; GFX908-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 36 + ; GFX908-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 37 + ; GFX908-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 38 + ; GFX908-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 39 + ; GFX908-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = 
S_MOV_B32 40 + ; GFX908-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 41 + ; GFX908-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 42 + ; GFX908-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 43 + ; GFX908-NEXT: [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 44 + ; GFX908-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 45 + ; GFX908-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 46 + ; GFX908-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 47 + ; GFX908-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 48 + ; GFX908-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 49 + ; GFX908-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 50 + ; GFX908-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 51 + ; GFX908-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 52 + ; GFX908-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 53 + ; GFX908-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 54 + ; GFX908-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 55 + ; GFX908-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 56 + ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 57 + ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 58 + ; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 59 + ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 60 + ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 61 + ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 62 + ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 63 + ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 64 + ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 65 + ; GFX908-NEXT: [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 66 + ; GFX908-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 67 + ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 68 + ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 + ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 + ; GFX908-NEXT: 
[[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 + ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 + ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 + ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 + ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 75 + ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 76 + ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 77 + ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 78 + ; GFX908-NEXT: [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 79 + ; GFX908-NEXT: [[S_MOV_B32_81:%[0-9]+]]:sgpr_32 = S_MOV_B32 80 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -1224,8 +1224,8 @@ body: | ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_27]] ; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc - ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc - ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc + ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc ; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: @@ -1248,47 +1248,47 @@ body: | ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_25]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_4]], implicit [[S_MOV_B32_5]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_6]], implicit [[S_MOV_B32_7]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_8]], 
implicit [[S_MOV_B32_9]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_10]], implicit [[S_MOV_B32_11]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_12]], implicit [[S_MOV_B32_13]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_14]], implicit [[S_MOV_B32_15]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_16]], implicit [[S_MOV_B32_17]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_18]], implicit [[S_MOV_B32_19]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_20]], implicit [[S_MOV_B32_21]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_22]], implicit [[S_MOV_B32_23]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_24]], implicit [[S_MOV_B32_25]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_26]], implicit [[S_MOV_B32_27]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_28]], implicit [[S_MOV_B32_29]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_30]], implicit [[S_MOV_B32_31]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_32]], implicit [[S_MOV_B32_33]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_34]], implicit [[S_MOV_B32_35]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_36]], implicit [[S_MOV_B32_37]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_38]], implicit [[S_MOV_B32_39]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_40]], implicit [[S_MOV_B32_41]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_42]], implicit [[S_MOV_B32_43]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_44]], implicit [[S_MOV_B32_45]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_46]], implicit [[S_MOV_B32_47]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_48]], implicit [[S_MOV_B32_49]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_50]], implicit [[S_MOV_B32_51]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_52]], implicit [[S_MOV_B32_53]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_54]], implicit [[S_MOV_B32_55]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_56]], implicit [[S_MOV_B32_57]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_58]], implicit [[S_MOV_B32_59]] 
- ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_60]], implicit [[S_MOV_B32_61]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_62]], implicit [[S_MOV_B32_63]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_64]], implicit [[S_MOV_B32_65]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_66]], implicit [[S_MOV_B32_67]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_68]], implicit [[S_MOV_B32_69]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_70]], implicit [[S_MOV_B32_71]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_72]], implicit [[S_MOV_B32_73]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_74]], implicit [[S_MOV_B32_75]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_76]], implicit [[S_MOV_B32_77]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_78]], implicit [[S_MOV_B32_79]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_80]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_5]], implicit [[S_MOV_B32_6]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_7]], implicit [[S_MOV_B32_8]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_9]], implicit [[S_MOV_B32_10]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_11]], implicit [[S_MOV_B32_12]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_13]], implicit [[S_MOV_B32_14]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_15]], implicit [[S_MOV_B32_16]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_17]], implicit [[S_MOV_B32_18]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_19]], implicit [[S_MOV_B32_20]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_21]], implicit [[S_MOV_B32_22]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_23]], implicit [[S_MOV_B32_24]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_25]], implicit [[S_MOV_B32_26]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_27]], implicit [[S_MOV_B32_28]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_29]], 
implicit [[S_MOV_B32_30]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_31]], implicit [[S_MOV_B32_32]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_33]], implicit [[S_MOV_B32_34]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_35]], implicit [[S_MOV_B32_36]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_37]], implicit [[S_MOV_B32_38]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_39]], implicit [[S_MOV_B32_40]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_41]], implicit [[S_MOV_B32_42]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_43]], implicit [[S_MOV_B32_44]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_45]], implicit [[S_MOV_B32_46]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_47]], implicit [[S_MOV_B32_48]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_49]], implicit [[S_MOV_B32_50]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_51]], implicit [[S_MOV_B32_52]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_53]], implicit [[S_MOV_B32_54]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_55]], implicit [[S_MOV_B32_56]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_57]], implicit [[S_MOV_B32_58]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_59]], implicit [[S_MOV_B32_60]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_61]], implicit [[S_MOV_B32_62]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_63]], implicit [[S_MOV_B32_64]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_65]], implicit [[S_MOV_B32_66]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_67]], implicit [[S_MOV_B32_68]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_69]], implicit [[S_MOV_B32_70]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_71]], implicit [[S_MOV_B32_72]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_73]], implicit [[S_MOV_B32_74]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_75]], implicit [[S_MOV_B32_76]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_77]], implicit [[S_MOV_B32_78]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_79]], implicit 
[[S_MOV_B32_80]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_81]] ; GFX908-NEXT: S_ENDPGM 0 bb.0: liveins: $vgpr0, $sgpr0_sgpr1 @@ -1620,10 +1620,10 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) - ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec - ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0 @@ -1647,87 +1647,87 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode - ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 - ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 - ; 
GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 - ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 3 - ; GFX908-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 4 - ; GFX908-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 5 - ; GFX908-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 6 - ; GFX908-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 7 - ; GFX908-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 8 - ; GFX908-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 9 - ; GFX908-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 10 - ; GFX908-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 11 - ; GFX908-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 12 - ; GFX908-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 13 - ; GFX908-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 14 - ; GFX908-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 15 - ; GFX908-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 16 - ; GFX908-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 17 - ; GFX908-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 18 - ; GFX908-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 19 - ; GFX908-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 20 - ; GFX908-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 21 - ; GFX908-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 22 - ; GFX908-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 23 - ; GFX908-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 24 - ; GFX908-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 25 - ; GFX908-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 26 - ; GFX908-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 27 - ; GFX908-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 28 - ; GFX908-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 29 - ; GFX908-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 30 - ; GFX908-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 31 - ; GFX908-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 32 - ; 
GFX908-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 33 - ; GFX908-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 34 - ; GFX908-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 35 - ; GFX908-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 36 - ; GFX908-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 37 - ; GFX908-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 38 - ; GFX908-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 39 - ; GFX908-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 40 - ; GFX908-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 41 - ; GFX908-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 42 - ; GFX908-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 43 - ; GFX908-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 44 - ; GFX908-NEXT: [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 45 - ; GFX908-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 46 - ; GFX908-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 47 - ; GFX908-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 48 - ; GFX908-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 49 - ; GFX908-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 50 - ; GFX908-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 51 - ; GFX908-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 52 - ; GFX908-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 53 - ; GFX908-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 54 - ; GFX908-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 55 - ; GFX908-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 56 - ; GFX908-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 57 - ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 58 - ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 59 - ; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 60 - ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 61 - ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 62 - ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = 
S_MOV_B32 63 - ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 64 - ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 65 - ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 66 - ; GFX908-NEXT: [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 67 - ; GFX908-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 68 - ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 - ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 - ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 - ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 - ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 - ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 - ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 75 - ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 76 - ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 77 - ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 78 - ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 79 - ; GFX908-NEXT: [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 80 + ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 + ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 + ; GFX908-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 3 + ; GFX908-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 4 + ; GFX908-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 5 + ; GFX908-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 6 + ; GFX908-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 7 + ; GFX908-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 8 + ; GFX908-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 9 + ; GFX908-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 10 + ; GFX908-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 11 + ; GFX908-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 12 + ; GFX908-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = 
S_MOV_B32 13 + ; GFX908-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 14 + ; GFX908-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 15 + ; GFX908-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 16 + ; GFX908-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 17 + ; GFX908-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 18 + ; GFX908-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 19 + ; GFX908-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 20 + ; GFX908-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 21 + ; GFX908-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 22 + ; GFX908-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 23 + ; GFX908-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 24 + ; GFX908-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 25 + ; GFX908-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 26 + ; GFX908-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 27 + ; GFX908-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 28 + ; GFX908-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 29 + ; GFX908-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 30 + ; GFX908-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 31 + ; GFX908-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 32 + ; GFX908-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 33 + ; GFX908-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 34 + ; GFX908-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 35 + ; GFX908-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 36 + ; GFX908-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 37 + ; GFX908-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 38 + ; GFX908-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 39 + ; GFX908-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 40 + ; GFX908-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 41 + ; GFX908-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 42 + ; GFX908-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 43 + ; GFX908-NEXT: 
[[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 44 + ; GFX908-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 45 + ; GFX908-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 46 + ; GFX908-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 47 + ; GFX908-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 48 + ; GFX908-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 49 + ; GFX908-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 50 + ; GFX908-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 51 + ; GFX908-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 52 + ; GFX908-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 53 + ; GFX908-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 54 + ; GFX908-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 55 + ; GFX908-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 56 + ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 57 + ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 58 + ; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 59 + ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 60 + ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 61 + ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 62 + ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 63 + ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 64 + ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 65 + ; GFX908-NEXT: [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 66 + ; GFX908-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 67 + ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 68 + ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 + ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 + ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 + ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 + ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 + ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 + 
; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 75 + ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 76 + ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 77 + ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 78 + ; GFX908-NEXT: [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 79 + ; GFX908-NEXT: [[S_MOV_B32_81:%[0-9]+]]:sgpr_32 = S_MOV_B32 80 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -1749,8 +1749,8 @@ body: | ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc - ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc - ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc + ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc ; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: @@ -1771,47 +1771,47 @@ body: | ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_9]], implicit [[V_CVT_I32_F64_e32_19]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_4]], implicit [[S_MOV_B32_5]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_6]], implicit [[S_MOV_B32_7]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_8]], implicit [[S_MOV_B32_9]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_10]], implicit [[S_MOV_B32_11]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_12]], implicit [[S_MOV_B32_13]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_14]], implicit 
[[S_MOV_B32_15]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_16]], implicit [[S_MOV_B32_17]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_18]], implicit [[S_MOV_B32_19]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_20]], implicit [[S_MOV_B32_21]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_22]], implicit [[S_MOV_B32_23]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_24]], implicit [[S_MOV_B32_25]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_26]], implicit [[S_MOV_B32_27]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_28]], implicit [[S_MOV_B32_29]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_30]], implicit [[S_MOV_B32_31]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_32]], implicit [[S_MOV_B32_33]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_34]], implicit [[S_MOV_B32_35]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_36]], implicit [[S_MOV_B32_37]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_38]], implicit [[S_MOV_B32_39]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_40]], implicit [[S_MOV_B32_41]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_42]], implicit [[S_MOV_B32_43]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_44]], implicit [[S_MOV_B32_45]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_46]], implicit [[S_MOV_B32_47]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_48]], implicit [[S_MOV_B32_49]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_50]], implicit [[S_MOV_B32_51]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_52]], implicit [[S_MOV_B32_53]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_54]], implicit [[S_MOV_B32_55]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_56]], implicit [[S_MOV_B32_57]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_58]], implicit [[S_MOV_B32_59]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_60]], implicit [[S_MOV_B32_61]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_62]], implicit [[S_MOV_B32_63]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_64]], implicit [[S_MOV_B32_65]] - ; 
GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_66]], implicit [[S_MOV_B32_67]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_68]], implicit [[S_MOV_B32_69]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_70]], implicit [[S_MOV_B32_71]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_72]], implicit [[S_MOV_B32_73]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_74]], implicit [[S_MOV_B32_75]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_76]], implicit [[S_MOV_B32_77]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_78]], implicit [[S_MOV_B32_79]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_80]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_5]], implicit [[S_MOV_B32_6]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_7]], implicit [[S_MOV_B32_8]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_9]], implicit [[S_MOV_B32_10]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_11]], implicit [[S_MOV_B32_12]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_13]], implicit [[S_MOV_B32_14]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_15]], implicit [[S_MOV_B32_16]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_17]], implicit [[S_MOV_B32_18]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_19]], implicit [[S_MOV_B32_20]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_21]], implicit [[S_MOV_B32_22]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_23]], implicit [[S_MOV_B32_24]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_25]], implicit [[S_MOV_B32_26]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_27]], implicit [[S_MOV_B32_28]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_29]], implicit [[S_MOV_B32_30]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_31]], implicit [[S_MOV_B32_32]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_33]], implicit [[S_MOV_B32_34]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_35]], implicit 
[[S_MOV_B32_36]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_37]], implicit [[S_MOV_B32_38]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_39]], implicit [[S_MOV_B32_40]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_41]], implicit [[S_MOV_B32_42]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_43]], implicit [[S_MOV_B32_44]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_45]], implicit [[S_MOV_B32_46]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_47]], implicit [[S_MOV_B32_48]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_49]], implicit [[S_MOV_B32_50]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_51]], implicit [[S_MOV_B32_52]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_53]], implicit [[S_MOV_B32_54]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_55]], implicit [[S_MOV_B32_56]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_57]], implicit [[S_MOV_B32_58]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_59]], implicit [[S_MOV_B32_60]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_61]], implicit [[S_MOV_B32_62]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_63]], implicit [[S_MOV_B32_64]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_65]], implicit [[S_MOV_B32_66]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_67]], implicit [[S_MOV_B32_68]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_69]], implicit [[S_MOV_B32_70]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_71]], implicit [[S_MOV_B32_72]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_73]], implicit [[S_MOV_B32_74]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_75]], implicit [[S_MOV_B32_76]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_77]], implicit [[S_MOV_B32_78]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_79]], implicit [[S_MOV_B32_80]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_81]] ; GFX908-NEXT: S_ENDPGM 0 bb.0: liveins: $vgpr0, $sgpr0_sgpr1 @@ -2026,10 +2026,10 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; GFX908-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) - ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec - ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0 @@ -2053,138 +2053,138 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode - ; GFX908-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 - ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 - ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 - ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 3 - ; GFX908-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 4 - ; GFX908-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 5 
- ; GFX908-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 6 - ; GFX908-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 7 - ; GFX908-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 8 - ; GFX908-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 9 - ; GFX908-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 10 - ; GFX908-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 11 - ; GFX908-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 12 - ; GFX908-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 13 - ; GFX908-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 14 - ; GFX908-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 15 - ; GFX908-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 16 - ; GFX908-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 17 - ; GFX908-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 18 - ; GFX908-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 19 - ; GFX908-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 20 - ; GFX908-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 21 - ; GFX908-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 22 - ; GFX908-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 23 - ; GFX908-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 24 - ; GFX908-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 25 - ; GFX908-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 26 - ; GFX908-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 27 - ; GFX908-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 28 - ; GFX908-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 29 - ; GFX908-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 30 - ; GFX908-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 31 - ; GFX908-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 32 - ; GFX908-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 33 - ; GFX908-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 34 - ; GFX908-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 35 - ; GFX908-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = 
S_MOV_B32 36 - ; GFX908-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 37 - ; GFX908-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 38 - ; GFX908-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 39 - ; GFX908-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 40 - ; GFX908-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 41 - ; GFX908-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 42 - ; GFX908-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 43 - ; GFX908-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 44 - ; GFX908-NEXT: [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 45 - ; GFX908-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 46 - ; GFX908-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 47 - ; GFX908-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 48 - ; GFX908-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 49 - ; GFX908-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 50 - ; GFX908-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 51 - ; GFX908-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 52 - ; GFX908-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 53 - ; GFX908-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 54 - ; GFX908-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 55 - ; GFX908-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 56 - ; GFX908-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 57 - ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 58 - ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 59 - ; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 60 - ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 61 - ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 62 - ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 63 - ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 64 - ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 65 - ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 66 - ; GFX908-NEXT: 
[[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 67 - ; GFX908-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 68 - ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 - ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 - ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 - ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 - ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 - ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 74 - ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = S_MOV_B32 75 - ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 76 - ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 77 - ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 78 - ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 79 - ; GFX908-NEXT: [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 80 - ; GFX908-NEXT: [[S_MOV_B32_81:%[0-9]+]]:sgpr_32 = S_MOV_B32 81 - ; GFX908-NEXT: [[S_MOV_B32_82:%[0-9]+]]:sgpr_32 = S_MOV_B32 82 - ; GFX908-NEXT: [[S_MOV_B32_83:%[0-9]+]]:sgpr_32 = S_MOV_B32 83 - ; GFX908-NEXT: [[S_MOV_B32_84:%[0-9]+]]:sgpr_32 = S_MOV_B32 84 + ; GFX908-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 + ; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1 + ; GFX908-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 2 + ; GFX908-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 3 + ; GFX908-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 4 + ; GFX908-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 5 + ; GFX908-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sgpr_32 = S_MOV_B32 6 + ; GFX908-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sgpr_32 = S_MOV_B32 7 + ; GFX908-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sgpr_32 = S_MOV_B32 8 + ; GFX908-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sgpr_32 = S_MOV_B32 9 + ; GFX908-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sgpr_32 = S_MOV_B32 10 + ; GFX908-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sgpr_32 = S_MOV_B32 11 + ; GFX908-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sgpr_32 = S_MOV_B32 12 + ; GFX908-NEXT: 
[[S_MOV_B32_14:%[0-9]+]]:sgpr_32 = S_MOV_B32 13 + ; GFX908-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sgpr_32 = S_MOV_B32 14 + ; GFX908-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sgpr_32 = S_MOV_B32 15 + ; GFX908-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sgpr_32 = S_MOV_B32 16 + ; GFX908-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sgpr_32 = S_MOV_B32 17 + ; GFX908-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sgpr_32 = S_MOV_B32 18 + ; GFX908-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sgpr_32 = S_MOV_B32 19 + ; GFX908-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sgpr_32 = S_MOV_B32 20 + ; GFX908-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sgpr_32 = S_MOV_B32 21 + ; GFX908-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sgpr_32 = S_MOV_B32 22 + ; GFX908-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sgpr_32 = S_MOV_B32 23 + ; GFX908-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sgpr_32 = S_MOV_B32 24 + ; GFX908-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sgpr_32 = S_MOV_B32 25 + ; GFX908-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sgpr_32 = S_MOV_B32 26 + ; GFX908-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sgpr_32 = S_MOV_B32 27 + ; GFX908-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sgpr_32 = S_MOV_B32 28 + ; GFX908-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sgpr_32 = S_MOV_B32 29 + ; GFX908-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sgpr_32 = S_MOV_B32 30 + ; GFX908-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sgpr_32 = S_MOV_B32 31 + ; GFX908-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sgpr_32 = S_MOV_B32 32 + ; GFX908-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sgpr_32 = S_MOV_B32 33 + ; GFX908-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sgpr_32 = S_MOV_B32 34 + ; GFX908-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sgpr_32 = S_MOV_B32 35 + ; GFX908-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sgpr_32 = S_MOV_B32 36 + ; GFX908-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sgpr_32 = S_MOV_B32 37 + ; GFX908-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sgpr_32 = S_MOV_B32 38 + ; GFX908-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sgpr_32 = S_MOV_B32 39 + ; GFX908-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sgpr_32 = S_MOV_B32 40 + ; GFX908-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sgpr_32 = S_MOV_B32 41 + ; GFX908-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sgpr_32 = S_MOV_B32 42 + ; GFX908-NEXT: [[S_MOV_B32_44:%[0-9]+]]:sgpr_32 = S_MOV_B32 43 + 
; GFX908-NEXT: [[S_MOV_B32_45:%[0-9]+]]:sgpr_32 = S_MOV_B32 44 + ; GFX908-NEXT: [[S_MOV_B32_46:%[0-9]+]]:sgpr_32 = S_MOV_B32 45 + ; GFX908-NEXT: [[S_MOV_B32_47:%[0-9]+]]:sgpr_32 = S_MOV_B32 46 + ; GFX908-NEXT: [[S_MOV_B32_48:%[0-9]+]]:sgpr_32 = S_MOV_B32 47 + ; GFX908-NEXT: [[S_MOV_B32_49:%[0-9]+]]:sgpr_32 = S_MOV_B32 48 + ; GFX908-NEXT: [[S_MOV_B32_50:%[0-9]+]]:sgpr_32 = S_MOV_B32 49 + ; GFX908-NEXT: [[S_MOV_B32_51:%[0-9]+]]:sgpr_32 = S_MOV_B32 50 + ; GFX908-NEXT: [[S_MOV_B32_52:%[0-9]+]]:sgpr_32 = S_MOV_B32 51 + ; GFX908-NEXT: [[S_MOV_B32_53:%[0-9]+]]:sgpr_32 = S_MOV_B32 52 + ; GFX908-NEXT: [[S_MOV_B32_54:%[0-9]+]]:sgpr_32 = S_MOV_B32 53 + ; GFX908-NEXT: [[S_MOV_B32_55:%[0-9]+]]:sgpr_32 = S_MOV_B32 54 + ; GFX908-NEXT: [[S_MOV_B32_56:%[0-9]+]]:sgpr_32 = S_MOV_B32 55 + ; GFX908-NEXT: [[S_MOV_B32_57:%[0-9]+]]:sgpr_32 = S_MOV_B32 56 + ; GFX908-NEXT: [[S_MOV_B32_58:%[0-9]+]]:sgpr_32 = S_MOV_B32 57 + ; GFX908-NEXT: [[S_MOV_B32_59:%[0-9]+]]:sgpr_32 = S_MOV_B32 58 + ; GFX908-NEXT: [[S_MOV_B32_60:%[0-9]+]]:sgpr_32 = S_MOV_B32 59 + ; GFX908-NEXT: [[S_MOV_B32_61:%[0-9]+]]:sgpr_32 = S_MOV_B32 60 + ; GFX908-NEXT: [[S_MOV_B32_62:%[0-9]+]]:sgpr_32 = S_MOV_B32 61 + ; GFX908-NEXT: [[S_MOV_B32_63:%[0-9]+]]:sgpr_32 = S_MOV_B32 62 + ; GFX908-NEXT: [[S_MOV_B32_64:%[0-9]+]]:sgpr_32 = S_MOV_B32 63 + ; GFX908-NEXT: [[S_MOV_B32_65:%[0-9]+]]:sgpr_32 = S_MOV_B32 64 + ; GFX908-NEXT: [[S_MOV_B32_66:%[0-9]+]]:sgpr_32 = S_MOV_B32 65 + ; GFX908-NEXT: [[S_MOV_B32_67:%[0-9]+]]:sgpr_32 = S_MOV_B32 66 + ; GFX908-NEXT: [[S_MOV_B32_68:%[0-9]+]]:sgpr_32 = S_MOV_B32 67 + ; GFX908-NEXT: [[S_MOV_B32_69:%[0-9]+]]:sgpr_32 = S_MOV_B32 68 + ; GFX908-NEXT: [[S_MOV_B32_70:%[0-9]+]]:sgpr_32 = S_MOV_B32 69 + ; GFX908-NEXT: [[S_MOV_B32_71:%[0-9]+]]:sgpr_32 = S_MOV_B32 70 + ; GFX908-NEXT: [[S_MOV_B32_72:%[0-9]+]]:sgpr_32 = S_MOV_B32 71 + ; GFX908-NEXT: [[S_MOV_B32_73:%[0-9]+]]:sgpr_32 = S_MOV_B32 72 + ; GFX908-NEXT: [[S_MOV_B32_74:%[0-9]+]]:sgpr_32 = S_MOV_B32 73 + ; GFX908-NEXT: [[S_MOV_B32_75:%[0-9]+]]:sgpr_32 = 
S_MOV_B32 74 + ; GFX908-NEXT: [[S_MOV_B32_76:%[0-9]+]]:sgpr_32 = S_MOV_B32 75 + ; GFX908-NEXT: [[S_MOV_B32_77:%[0-9]+]]:sgpr_32 = S_MOV_B32 76 + ; GFX908-NEXT: [[S_MOV_B32_78:%[0-9]+]]:sgpr_32 = S_MOV_B32 77 + ; GFX908-NEXT: [[S_MOV_B32_79:%[0-9]+]]:sgpr_32 = S_MOV_B32 78 + ; GFX908-NEXT: [[S_MOV_B32_80:%[0-9]+]]:sgpr_32 = S_MOV_B32 79 + ; GFX908-NEXT: [[S_MOV_B32_81:%[0-9]+]]:sgpr_32 = S_MOV_B32 80 + ; GFX908-NEXT: [[S_MOV_B32_82:%[0-9]+]]:sgpr_32 = S_MOV_B32 81 + ; GFX908-NEXT: [[S_MOV_B32_83:%[0-9]+]]:sgpr_32 = S_MOV_B32 82 + ; GFX908-NEXT: [[S_MOV_B32_84:%[0-9]+]]:sgpr_32 = S_MOV_B32 83 + ; GFX908-NEXT: [[S_MOV_B32_85:%[0-9]+]]:sgpr_32 = S_MOV_B32 84 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_4]], implicit [[S_MOV_B32_5]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_6]], implicit [[S_MOV_B32_7]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_8]], implicit [[S_MOV_B32_9]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_10]], implicit [[S_MOV_B32_11]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_12]], implicit [[S_MOV_B32_13]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_14]], implicit [[S_MOV_B32_15]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_16]], implicit [[S_MOV_B32_17]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_18]], implicit [[S_MOV_B32_19]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_20]], implicit [[S_MOV_B32_21]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_22]], implicit [[S_MOV_B32_23]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_24]], implicit [[S_MOV_B32_25]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_26]], implicit [[S_MOV_B32_27]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_28]], implicit [[S_MOV_B32_29]] - ; GFX908-NEXT: S_NOP 0, 
implicit [[S_MOV_B32_30]], implicit [[S_MOV_B32_31]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_32]], implicit [[S_MOV_B32_33]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_34]], implicit [[S_MOV_B32_35]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_36]], implicit [[S_MOV_B32_37]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_38]], implicit [[S_MOV_B32_39]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_40]], implicit [[S_MOV_B32_41]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_42]], implicit [[S_MOV_B32_43]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_44]], implicit [[S_MOV_B32_45]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_46]], implicit [[S_MOV_B32_47]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_48]], implicit [[S_MOV_B32_49]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_50]], implicit [[S_MOV_B32_51]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_52]], implicit [[S_MOV_B32_53]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_54]], implicit [[S_MOV_B32_55]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_56]], implicit [[S_MOV_B32_57]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_58]], implicit [[S_MOV_B32_59]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_60]], implicit [[S_MOV_B32_61]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_62]], implicit [[S_MOV_B32_63]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_64]], implicit [[S_MOV_B32_65]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_66]], implicit [[S_MOV_B32_67]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_68]], implicit [[S_MOV_B32_69]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_70]], implicit [[S_MOV_B32_71]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_72]], implicit [[S_MOV_B32_73]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_74]], implicit [[S_MOV_B32_75]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_76]], implicit [[S_MOV_B32_77]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_78]], implicit [[S_MOV_B32_79]] - ; GFX908-NEXT: S_NOP 0, implicit 
[[S_MOV_B32_80]], implicit [[S_MOV_B32_81]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_82]], implicit [[S_MOV_B32_83]] - ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_84]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_5]], implicit [[S_MOV_B32_6]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_7]], implicit [[S_MOV_B32_8]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_9]], implicit [[S_MOV_B32_10]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_11]], implicit [[S_MOV_B32_12]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_13]], implicit [[S_MOV_B32_14]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_15]], implicit [[S_MOV_B32_16]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_17]], implicit [[S_MOV_B32_18]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_19]], implicit [[S_MOV_B32_20]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_21]], implicit [[S_MOV_B32_22]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_23]], implicit [[S_MOV_B32_24]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_25]], implicit [[S_MOV_B32_26]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_27]], implicit [[S_MOV_B32_28]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_29]], implicit [[S_MOV_B32_30]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_31]], implicit [[S_MOV_B32_32]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_33]], implicit [[S_MOV_B32_34]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_35]], implicit [[S_MOV_B32_36]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_37]], implicit [[S_MOV_B32_38]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_39]], implicit [[S_MOV_B32_40]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_41]], implicit [[S_MOV_B32_42]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_43]], implicit [[S_MOV_B32_44]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_45]], implicit [[S_MOV_B32_46]] + ; GFX908-NEXT: 
S_NOP 0, implicit [[S_MOV_B32_47]], implicit [[S_MOV_B32_48]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_49]], implicit [[S_MOV_B32_50]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_51]], implicit [[S_MOV_B32_52]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_53]], implicit [[S_MOV_B32_54]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_55]], implicit [[S_MOV_B32_56]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_57]], implicit [[S_MOV_B32_58]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_59]], implicit [[S_MOV_B32_60]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_61]], implicit [[S_MOV_B32_62]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_63]], implicit [[S_MOV_B32_64]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_65]], implicit [[S_MOV_B32_66]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_67]], implicit [[S_MOV_B32_68]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_69]], implicit [[S_MOV_B32_70]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_71]], implicit [[S_MOV_B32_72]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_73]], implicit [[S_MOV_B32_74]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_75]], implicit [[S_MOV_B32_76]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_77]], implicit [[S_MOV_B32_78]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_79]], implicit [[S_MOV_B32_80]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_81]], implicit [[S_MOV_B32_82]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_83]], implicit [[S_MOV_B32_84]] + ; GFX908-NEXT: S_NOP 0, implicit [[S_MOV_B32_85]] ; GFX908-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GFX908-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY2]], [[V_CMP_GT_U32_e64_]], implicit-def dead $scc ; GFX908-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] @@ -2202,8 +2202,8 @@ body: | ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc - ; GFX908-NEXT: undef %4.sub0:sreg_64 = 
S_ADD_I32 %4.sub0, -1, implicit-def dead $scc - ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc + ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc ; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: @@ -2599,15 +2599,15 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: undef %21.sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode - ; GFX908-NEXT: %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: undef [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: %21.sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: %21.sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit %21 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, 
implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_21]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -2823,8 +2823,8 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec - ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0 - ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -2846,14 +2846,14 @@ body: | ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc - ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc - ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc + ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc ; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: ; GFX908-NEXT: successors: %bb.1(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, 
implicit $mode ; GFX908-NEXT: S_BRANCH %bb.1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.5: @@ -2989,8 +2989,8 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode - ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0 - ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -3012,8 +3012,8 @@ body: | ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc - ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc - ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc + ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc ; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: @@ -3166,8 +3166,8 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0 - ; 
GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0 - ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -3190,8 +3190,8 @@ body: | ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc - ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc - ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc + ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc ; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: @@ -3356,8 +3356,8 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_34:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0 - ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) @@ -3380,8 +3380,8 @@ body: | ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) ; GFX908-NEXT: {{ $}} ; 
GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc - ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc - ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc + ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc ; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: @@ -3864,10 +3864,10 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) - ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec - ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0 @@ -3952,8 +3952,8 @@ body: | ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc - ; 
GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc - ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc + ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc ; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: @@ -4146,10 +4146,10 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) - ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec - ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0 @@ -4254,8 +4254,8 @@ body: | ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc - ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead 
$scc - ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc + ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc ; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: @@ -4488,10 +4488,10 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; GFX908-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; GFX908-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) - ; GFX908-NEXT: undef %4.sub1:sreg_64 = S_MOV_B32 0 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[COPY1]](s32), implicit $exec - ; GFX908-NEXT: undef %4.sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0 @@ -4640,8 +4640,8 @@ body: | ; GFX908-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc - ; GFX908-NEXT: undef %4.sub0:sreg_64 = S_ADD_I32 %4.sub0, -1, implicit-def dead $scc - ; GFX908-NEXT: S_CMP_LG_U32 %4.sub0, 0, implicit-def $scc + ; 
GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_ADD_I32 [[S_MOV_B32_]].sub0, -1, implicit-def dead $scc + ; GFX908-NEXT: S_CMP_LG_U32 [[S_MOV_B32_]].sub0, 0, implicit-def $scc ; GFX908-NEXT: S_CBRANCH_SCC0 %bb.5, implicit killed $scc ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.4: @@ -4979,15 +4979,15 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: undef %21.sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode - ; GFX908-NEXT: %21.sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode + ; GFX908-NEXT: undef [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub0:vreg_128 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub1:vreg_128 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) ; GFX908-NEXT: {{ $}} - ; GFX908-NEXT: %21.sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: %21.sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: S_NOP 0, implicit %21 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub2:vreg_128 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]].sub3:vreg_128 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0 + ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_21]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: 
; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] @@ -5617,7 +5617,7 @@ body: | ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0 ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0 - ; GFX908-NEXT: undef %23.sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec + ; GFX908-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.1: ; GFX908-NEXT: successors: %bb.2(0x80000000) @@ -5626,7 +5626,7 @@ body: | ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]] ; GFX908-NEXT: {{ $}} ; GFX908-NEXT: bb.2: - ; GFX908-NEXT: S_NOP 0, implicit %23.sub1 + ; GFX908-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1 ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]] ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]] diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir index 3bd113988af63..efa21052e3ae2 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir @@ -32,12 +32,12 @@ body: | ; GFX9-NEXT: bb.1: ; GFX9-NEXT: successors: %bb.2(0x80000000) ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: %9:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GFX9-NEXT: %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, [[COPY]], 0, [[COPY1]], 0, 
0, implicit $mode, implicit $exec - ; GFX9-NEXT: %12:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX9-NEXT: %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %9, %10, implicit $mode, implicit $exec - ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.2: ; GFX9-NEXT: successors: %bb.3(0x80000000) @@ -104,11 +104,11 @@ body: | ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]] ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]] ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX9-NEXT: %9:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, 
[[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GFX9-NEXT: %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX9-NEXT: %12:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX9-NEXT: %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec @@ -118,8 +118,8 @@ body: | ; GFX9-NEXT: bb.1: ; GFX9-NEXT: successors: %bb.2(0x80000000) ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %9, %10, implicit $mode, implicit $exec - ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], 
[[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.2: ; GFX9-NEXT: successors: %bb.3(0x80000000) @@ -129,7 +129,7 @@ body: | ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.3: - ; GFX9-NEXT: [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %13, %10, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_ADD_F32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_3]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec ; GFX9-NEXT: S_ENDPGM 0, implicit [[PHI]], implicit [[PHI1]] bb.0: liveins: $vgpr0, $vgpr1, $vgpr2 @@ -189,11 +189,11 @@ body: | ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]] ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_1]] ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX9-NEXT: %9:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GFX9-NEXT: %10:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %9, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX9-NEXT: %12:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit 
$mode, implicit $exec - ; GFX9-NEXT: %13:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, %12, 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_2:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_3:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[V_FMAC_F32_e64_2]], 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec @@ -203,8 +203,8 @@ body: | ; GFX9-NEXT: bb.1: ; GFX9-NEXT: successors: %bb.2(0x80000000) ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %9, %10, implicit $mode, implicit $exec - ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 %12, %13, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_]], [[V_FMAC_F32_e64_1]], implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 [[V_FMAC_F32_e64_2]], [[V_FMAC_F32_e64_3]], implicit $mode, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.2: ; GFX9-NEXT: successors: %bb.3(0x80000000) @@ -268,8 +268,8 @@ body: | ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]] ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX9-NEXT: %5:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GFX9-NEXT: early-clobber %6:vgpr_32 = STRICT_WWM %5, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 
= contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: early-clobber %6:vgpr_32 = STRICT_WWM [[V_FMAC_F32_e64_]], implicit $exec ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY3]](s32), [[S_MOV_B32_]], implicit $exec @@ -282,7 +282,7 @@ body: | ; GFX9-NEXT: bb.2: ; GFX9-NEXT: successors: %bb.3(0x80000000) ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: S_NOP 0, implicit %5 + ; GFX9-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]] ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.3: @@ -353,9 +353,9 @@ body: | ; GFX9-NEXT: successors: %bb.4(0x40000000), %bb.6(0x40000000) ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GFX9-NEXT: %8:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GFX9-NEXT: S_NOP 0, implicit %6, implicit %8 + ; GFX9-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]] ; GFX9-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; GFX9-NEXT: {{ $}} @@ -461,15 +461,15 @@ body: | ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = 
S_MOV_B64 0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]] ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX9-NEXT: %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX9-NEXT: %8:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: bb.1: ; GFX9-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: S_NOP 0, implicit %6, implicit %8 + ; GFX9-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]] ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec @@ -591,9 +591,9 @@ body: | ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; GFX9-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B64_]] ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY3]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX9-NEXT: %6:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec 
+ ; GFX9-NEXT: [[V_FMAC_F32_e64_:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD killed [[COPY2]], 0, 0, implicit $exec :: (load (s32), addrspace 1) - ; GFX9-NEXT: %8:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX9-NEXT: [[V_FMAC_F32_e64_1:%[0-9]+]]:vgpr_32 = contract nofpexcept V_FMAC_F32_e64 0, [[GLOBAL_LOAD_DWORD1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec ; GFX9-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; GFX9-NEXT: S_BRANCH %bb.1 ; GFX9-NEXT: {{ $}} @@ -608,7 +608,7 @@ body: | ; GFX9-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) ; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2_vgpr3, $vcc ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: S_NOP 0, implicit %6, implicit %8 + ; GFX9-NEXT: S_NOP 0, implicit [[V_FMAC_F32_e64_]], implicit [[V_FMAC_F32_e64_1]] ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX9-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir index 027b94e2f8203..b08da2e1848ff 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir @@ -52,6 +52,7 @@ body: | ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32)) + ; ; GFX12-LABEL: name: merge_s_load_x1_x1_x1 ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF ; GFX12-NEXT: 
[[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 4) @@ -78,6 +79,7 @@ body: | ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1 ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0 ; GFX11-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY1]].sub1 + ; ; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1 ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF ; GFX12-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) @@ -115,6 +117,7 @@ body: | ; GFX11-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY8]].sub1 ; GFX11-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY9]].sub0 ; GFX11-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY9]].sub1 + ; ; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1 ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) @@ -151,6 +154,7 @@ body: | ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64)) ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32)) + ; ; GFX12-LABEL: name: merge_s_load_x2_x1 ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8) diff --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir index 7dc78101f44b7..c86b5adec372d 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir @@ -32,6 +32,7 @@ body: | ; GFX9-NEXT: 
[[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_xyz_x ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -40,6 +41,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_xyz_x ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -70,6 +72,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1 ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub2_sub3 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -78,6 +81,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], 
%subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -108,6 +112,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0 ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub1_sub2 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xy ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -116,6 +121,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xy ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -146,6 +152,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = 
TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_x ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -154,6 +161,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_x ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -185,6 +193,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -193,6 +202,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: 
[[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -223,6 +233,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_format_32_32_32_32 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -231,6 +242,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 126, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_format_32_32_32_32 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -273,6 +285,7 @@ body: | ; GFX9-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 ; GFX9-NEXT: 
[[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_float_32 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -288,6 +301,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_float_32 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -344,6 +358,7 @@ body: | ; GFX9-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_sint_32 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -359,6 +374,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET 
[[REG_SEQUENCE]], 0, 44, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_sint_32 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -415,6 +431,7 @@ body: | ; GFX9-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 ; GFX9-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 ; GFX9-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_uint_32 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -430,6 +447,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 68, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_uint_32 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -480,6 +498,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, 
implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_not_merged_data_format_mismatch ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -495,6 +514,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_not_merged_data_format_mismatch ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -545,6 +565,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_not_merged_num_format_mismatch ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -560,6 +581,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 
116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_not_merged_num_format_mismatch ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -611,6 +633,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2_sub3 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_store_x_xyz ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -626,6 +649,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_store_x_xyz ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -675,6 +699,7 @@ body: | ; GFX9-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY]], %subreg.sub3 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_store_xyz_x ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -690,6 +715,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 16, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_store_xyz_x ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -740,6 +766,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GFX9-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_store_xy_xy ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -756,6 +783,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; 
GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_store_xy_xy ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -807,6 +835,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, %10:vreg_64, %subreg.sub1_sub2 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_store_x_xy ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -822,6 +851,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact %10:vreg_64, [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_store_x_xy ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -872,6 +902,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 ; GFX9-NEXT: 
TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_store_xy_x ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -887,6 +918,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_store_xy_x ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -935,6 +967,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_store_x_x ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -949,6 +982,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, 
implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_store_x_x ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -995,6 +1029,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_store_x_x_format_32_32_32_32 ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -1009,6 +1044,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 126, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_store_x_x_format_32_32_32_32 ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -1067,6 +1103,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX9-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 ; 
GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 125, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_store_float32 ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -1093,6 +1130,7 @@ body: | ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_store_float32 ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX11-NEXT: {{ $}} @@ -1175,6 +1213,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX9-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 93, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_store_sint32 ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -1201,6 +1240,7 @@ body: | ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact 
[[COPY1]], [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_store_sint32 ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX11-NEXT: {{ $}} @@ -1283,6 +1323,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX9-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 77, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_store_uint32 ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -1309,6 +1350,7 @@ body: | ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 68, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_store_uint32 ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX11-NEXT: {{ $}} @@ -1391,6 +1433,7 @@ body: | ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: 
(dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_store_not_merged_data_format_mismatch ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -1417,6 +1460,7 @@ body: | ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 84, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_store_not_merged_data_format_mismatch ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX11-NEXT: {{ $}} @@ -1499,6 +1543,7 @@ body: | ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_store_not_merged_num_format_mismatch ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, 
$vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -1525,6 +1570,7 @@ body: | ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 114, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 116, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_store_not_merged_num_format_mismatch ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX11-NEXT: {{ $}} @@ -1588,6 +1634,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_0 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1596,6 +1643,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: 
[[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_0 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1625,6 +1673,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_1 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1633,6 +1682,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_1 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1664,6 +1714,7 @@ body: | ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 ; GFX9-NEXT: 
[[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_merge_across_swizzle ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1673,6 +1724,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_merge_across_swizzle ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1706,6 +1758,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 6, 116, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_merge_across_swizzled_store ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 
@@ -1716,6 +1769,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 6, 116, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_merge_across_swizzled_store ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1751,6 +1805,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub0 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub1 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_idxen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1760,6 +1815,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_idxen ; 
GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1793,6 +1849,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 125, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub0 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub1_sub2 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xy_idxen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1802,6 +1859,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xy_idxen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1835,6 +1893,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0_sub1 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub2_sub3 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: 
[[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1844,6 +1903,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1877,6 +1937,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub1_sub2_sub3 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xyz_idxen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1886,6 +1947,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, 
addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xyz_idxen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1919,6 +1981,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_bothen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1928,6 +1991,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_bothen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1961,6 +2025,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 125, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub1_sub2 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xy_bothen ; GFX10: 
[[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -1970,6 +2035,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xy_bothen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2003,6 +2069,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0_sub1 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2_sub3 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2012,6 +2079,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 123, 0, 0, 
implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2045,6 +2113,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1_sub2_sub3 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xyz_bothen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2054,6 +2123,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xyz_bothen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2087,6 +2157,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed 
[[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_idxen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2096,6 +2167,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_idxen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2129,6 +2201,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 125, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub1_sub2 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xy_idxen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2138,6 +2211,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 
1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xy_idxen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2171,6 +2245,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0_sub1 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub2_sub3 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2180,6 +2255,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2213,6 +2289,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 126, 0, 
0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub1_sub2_sub3 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xyz_idxen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2222,6 +2299,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xyz_idxen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2257,6 +2335,7 @@ body: | ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub2 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_x_idxen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2267,6 +2346,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = 
TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_x_idxen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2303,6 +2383,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_x_idxen_exact_swizzled_0 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2313,6 +2394,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_x_idxen_exact_swizzled_0 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = 
COPY $sgpr1 @@ -2348,6 +2430,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_bothen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2357,6 +2440,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_bothen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2390,6 +2474,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 125, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub1_sub2 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xy_bothen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: 
[[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2399,6 +2484,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xy_bothen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2433,6 +2519,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0_sub1 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub2_sub3 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2442,6 +2529,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], 
[[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2476,6 +2564,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 126, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub1_sub2_sub3 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xyz_bothen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2485,6 +2574,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 125, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xyz_bothen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2520,6 +2610,7 @@ body: | ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub2 ; GFX9-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 ; GFX9-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 + ; ; GFX10-LABEL: name: 
gfx9_tbuffer_load_x_x_x_bothen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2530,6 +2621,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_x_bothen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2566,6 +2658,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 123, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1 + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_x_x_x_bothen_exact_swizzled_0 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2576,6 +2669,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 116, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], 
[[REG_SEQUENCE]], 0, 8, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_x_x_x_bothen_exact_swizzled_0 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2611,6 +2705,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_vaddr ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2621,6 +2716,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: 
gfx9_tbuffer_load_xy_xy_bothen_exact_diff_vaddr ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2657,6 +2753,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_srsrc ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2668,6 +2765,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_bothen_exact_diff_srsrc ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2705,6 +2803,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: 
[[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_vaddr ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2715,6 +2814,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_vaddr ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2751,6 +2851,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 123, 0, 0, implicit $exec :: 
(dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_srsrc ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2762,6 +2863,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 123, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx9_tbuffer_load_xy_xy_idxen_exact_diff_srsrc ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2801,6 +2903,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2810,6 +2913,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, 
addrspace 4) ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub1_sub2_sub3 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xyz ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2839,6 +2943,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_xyz_x ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2848,6 +2953,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1_sub2 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub3 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_xyz_x ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2877,6 +2983,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, 
implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2886,6 +2993,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0_sub1 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub2_sub3 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2915,6 +3023,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2924,6 +3033,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) ; GFX10-NEXT: 
[[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub1_sub2 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xy ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2953,6 +3063,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_x ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2962,6 +3073,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub0_sub1 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_x ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -2991,6 +3103,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, 
addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -3000,6 +3113,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -3031,6 +3145,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 75, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_format_32_32_32_32 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -3040,6 +3155,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_format_32_32_32_32 ; GFX11: 
[[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -3076,6 +3192,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_float_32 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -3097,6 +3214,7 @@ body: | ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_float_32 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -3153,6 +3271,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_sint_32 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = 
COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -3174,6 +3293,7 @@ body: | ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_sint_32 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -3230,6 +3350,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_uint_32 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -3251,6 +3372,7 @@ body: | ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_uint_32 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -3307,6 +3429,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET 
[[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_data_format_mismatch ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -3322,6 +3445,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_not_merged_data_format_mismatch ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -3372,6 +3496,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_num_format_mismatch ; GFX10: 
[[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -3387,6 +3512,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_not_merged_num_format_mismatch ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -3438,6 +3564,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_store_x_xyz ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -3453,6 +3580,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2_sub3 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable 
store (s128), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_store_x_xyz ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -3502,6 +3630,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_store_xyz_x ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -3517,6 +3646,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY]], %subreg.sub3 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_store_xyz_x ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -3567,6 +3697,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; ; GFX10-LABEL: name: 
gfx10_tbuffer_store_xy_xy ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -3583,6 +3714,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GFX10-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_store_xy_xy ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -3634,6 +3766,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact %10:vreg_64, [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_store_x_xy ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -3649,6 +3782,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, %10:vreg_64, %subreg.sub1_sub2 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_store_x_xy ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -3699,6 +3833,7 @@ body: | ; 
GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_store_xy_x ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -3714,6 +3849,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_store_xy_x ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -3762,6 +3898,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_store_x_x ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ 
-3776,6 +3913,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_store_x_x ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -3822,6 +3960,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 75, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_store_x_x_format_32_32_32_32 ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -3836,6 +3975,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_store_x_x_format_32_32_32_32 ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -3894,6 +4034,7 @@ body: | ; GFX9-NEXT: 
TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_store_float32 ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -3920,6 +4061,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_store_float32 ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX11-NEXT: {{ $}} @@ -4002,6 +4144,7 @@ body: | ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_store_sint32 ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, 
$vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -4028,6 +4171,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 73, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_store_sint32 ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX11-NEXT: {{ $}} @@ -4110,6 +4254,7 @@ body: | ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_store_uint32 ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -4136,6 +4281,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 72, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_store_uint32 ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, 
$vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX11-NEXT: {{ $}} @@ -4218,6 +4364,7 @@ body: | ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_store_not_merged_data_format_mismatch ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -4244,6 +4391,7 @@ body: | ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_store_not_merged_data_format_mismatch ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX11-NEXT: {{ $}} @@ -4326,6 +4474,7 @@ body: | ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], 
[[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_store_not_merged_num_format_mismatch ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -4352,6 +4501,7 @@ body: | ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_store_not_merged_num_format_mismatch ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX11-NEXT: {{ $}} @@ -4415,6 +4565,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_0 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4423,6 +4574,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], 
%subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_0 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4452,6 +4604,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_1 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4460,6 +4613,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_1 ; GFX11: 
[[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4490,6 +4644,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_merge_across_swizzle ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4500,6 +4655,7 @@ body: | ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_merge_across_swizzle ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4533,6 +4689,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit 
$exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_idxen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4543,6 +4700,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub1 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_idxen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4576,6 +4734,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy_idxen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4586,6 +4745,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 74, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN]].sub1_sub2 + ; ; GFX11-LABEL: name: 
gfx10_tbuffer_load_x_xy_idxen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4618,6 +4778,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4628,6 +4789,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0_sub1 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub2_sub3 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4660,6 +4822,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], 
[[REG_SEQUENCE]], 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz_idxen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4670,6 +4833,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN]].sub1_sub2_sub3 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xyz_idxen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4703,6 +4867,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_x_idxen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4715,6 +4880,7 @@ body: | ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub2 ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 + ; ; GFX11-LABEL: name: 
gfx10_tbuffer_load_x_x_x_idxen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4752,6 +4918,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_x_idxen_exact_swizzled_0 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4763,6 +4930,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_x_idxen_exact_swizzled_0 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4798,6 +4966,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, 
addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_bothen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4808,6 +4977,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_bothen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4841,6 +5011,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy_bothen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4851,6 +5022,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 74, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 
= COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN]].sub1_sub2 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xy_bothen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4883,6 +5055,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4893,6 +5066,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0_sub1 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub2_sub3 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4925,6 +5099,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, 
implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz_bothen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4935,6 +5110,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN]].sub1_sub2_sub3 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xyz_bothen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4967,6 +5143,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_idxen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -4977,6 +5154,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 
0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_idxen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5010,6 +5188,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy_idxen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5020,6 +5199,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 74, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub1_sub2 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xy_idxen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5052,6 +5232,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, 
[[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5062,6 +5243,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0_sub1 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub2_sub3 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5094,6 +5276,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz_idxen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; 
GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5104,6 +5287,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_IDXEN_exact]].sub1_sub2_sub3 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xyz_idxen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5136,6 +5320,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_bothen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5146,6 +5331,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_bothen_exact ; GFX11: 
[[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5179,6 +5365,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xy_bothen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5189,6 +5376,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 74, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub1_sub2 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xy_bothen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5222,6 +5410,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = 
TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5232,6 +5421,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0_sub1 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub2_sub3 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5265,6 +5455,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 74, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz_bothen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5275,6 +5466,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 77, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; 
GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_BOTHEN_exact]].sub1_sub2_sub3 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_xyz_bothen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5308,6 +5500,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_x_bothen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5320,6 +5513,7 @@ body: | ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub2 ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_x_bothen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5357,6 +5551,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = 
TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_x_x_x_bothen_exact_swizzled_0 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5368,6 +5563,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1 + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_x_x_x_bothen_exact_swizzled_0 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5404,6 +5600,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_vaddr ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5414,6 +5611,7 @@ 
body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_vaddr ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5450,6 +5648,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_srsrc ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5461,6 +5660,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), 
align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_bothen_exact_diff_srsrc ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5498,6 +5698,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_vaddr ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5508,6 +5709,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_vaddr ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: 
[[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5544,6 +5746,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_srsrc ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5555,6 +5758,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 64, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx10_tbuffer_load_xy_xy_idxen_exact_diff_srsrc ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5595,6 +5799,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: 
(dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xyz ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5603,6 +5808,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5633,6 +5839,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_xyz_x ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5641,6 +5848,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, 
[[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_xyz_x ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5671,6 +5879,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5679,6 +5888,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; ; GFX11-LABEL: name: 
gfx11_tbuffer_load_xy_xy ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5709,6 +5919,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xy ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5717,6 +5928,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5747,6 +5959,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; 
GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_x ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5755,6 +5968,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_x ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5785,6 +5999,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5794,6 +6009,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: 
(dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5824,6 +6040,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 63, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_format_32_32_32_32 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5833,6 +6050,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_OFFSET:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_format_32_32_32_32 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5870,6 +6088,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 
= TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_float_32 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5891,6 +6110,7 @@ body: | ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_float_32 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5947,6 +6167,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_sint_32 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -5968,6 +6189,7 @@ body: | ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_sint_32 ; 
GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -6024,6 +6246,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_uint_32 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -6045,6 +6268,7 @@ body: | ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET]].sub2 ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 ; GFX10-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_uint_32 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -6101,6 +6325,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_not_merged_data_format_mismatch ; 
GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -6116,6 +6341,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_not_merged_data_format_mismatch ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -6166,6 +6392,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_not_merged_num_format_mismatch ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -6181,6 +6408,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET6:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET7:%[0-9]+]]:vgpr_32 
= TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET8:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_not_merged_num_format_mismatch ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -6232,6 +6460,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_store_x_xyz ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -6247,6 +6476,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_store_x_xyz ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -6296,6 +6526,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 ; 
GFX9-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_store_xyz_x ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -6311,6 +6542,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 16, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_store_xyz_x ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -6361,6 +6593,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_store_xy_xy ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -6377,6 +6610,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], 
[[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_store_xy_xy ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -6428,6 +6662,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact %10:vreg_64, [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_store_x_xy ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -6443,6 +6678,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact %10:vreg_64, [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_store_x_xy ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -6493,6 +6729,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) ; 
GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_store_xy_x ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -6508,6 +6745,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_store_xy_x ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -6556,6 +6794,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_store_x_x ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -6570,6 +6809,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE 
[[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 64, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_store_x_x ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -6616,6 +6856,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 8, 63, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_store_x_x_format_32_32_32_32 ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -6630,6 +6871,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 63, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_store_x_x_format_32_32_32_32 ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -6688,6 +6930,7 @@ body: | ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 
40, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_store_float32 ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -6714,6 +6957,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 74, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_store_float32 ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX11-NEXT: {{ $}} @@ -6796,6 +7040,7 @@ body: | ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_store_sint32 ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -6822,6 +7067,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX10-NEXT: 
[[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 73, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_store_sint32 ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX11-NEXT: {{ $}} @@ -6904,6 +7150,7 @@ body: | ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 36, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 44, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_store_uint32 ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -6930,6 +7177,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX10-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 ; GFX10-NEXT: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 72, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_store_uint32 ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX11-NEXT: {{ $}} @@ -7012,6 +7260,7 @@ body: | ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 30, 22, 0, 0, implicit 
$exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_store_not_merged_data_format_mismatch ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -7038,6 +7287,7 @@ body: | ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 21, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_store_not_merged_data_format_mismatch ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX11-NEXT: {{ $}} @@ -7120,6 +7370,7 @@ body: | ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX9-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_store_not_merged_num_format_mismatch ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2, 
$sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -7146,6 +7397,7 @@ body: | ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY2]], [[REG_SEQUENCE]], 0, 30, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY1]], [[REG_SEQUENCE]], 0, 40, 13, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) ; GFX10-NEXT: TBUFFER_STORE_FORMAT_X_OFFSET_exact [[COPY]], [[REG_SEQUENCE]], 0, 32, 22, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_store_not_merged_num_format_mismatch ; GFX11: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX11-NEXT: {{ $}} @@ -7209,6 +7461,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_not_merged_swizzled_0 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7217,6 +7470,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: 
[[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_not_merged_swizzled_0 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7246,6 +7500,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_not_merged_swizzled_1 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7254,6 +7509,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_not_merged_swizzled_1 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7284,6 +7540,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 
0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_merge_across_swizzle ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7294,6 +7551,7 @@ body: | ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub0 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_OFFSET]].sub1 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 12, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_merge_across_swizzle ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7327,6 +7585,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_idxen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7337,6 +7596,7 @@ 
body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN]].sub1 + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_idxen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7370,6 +7630,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xy_idxen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7379,6 +7640,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy_idxen ; GFX11: 
[[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7412,6 +7674,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7421,6 +7684,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7454,6 +7718,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), 
align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xyz_idxen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7463,6 +7728,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz_idxen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7496,6 +7762,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_bothen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7506,6 +7773,7 @@ body: | ; GFX10-NEXT: 
[[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN]].sub1 + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_bothen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7539,6 +7807,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xy_bothen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7548,6 +7817,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy_bothen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = 
COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7581,6 +7851,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7590,6 +7861,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7623,6 +7895,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 
4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xyz_bothen ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7632,6 +7905,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz_bothen ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7665,6 +7939,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_idxen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7675,6 +7950,7 @@ body: | ; GFX10-NEXT: 
[[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1 + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_idxen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7708,6 +7984,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xy_idxen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7717,6 +7994,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: 
gfx11_tbuffer_load_x_xy_idxen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7750,6 +8028,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7759,6 +8038,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7792,6 +8072,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = 
TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xyz_idxen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7801,6 +8082,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz_idxen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7835,6 +8117,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, 
addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7847,6 +8130,7 @@ body: | ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_IDXEN_exact]].sub2 ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7884,6 +8168,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_IDXEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_exact_swizzled_0 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7895,6 +8180,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact]].sub1 + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_idxen_exact_swizzled_0 ; GFX11: 
[[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7930,6 +8216,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_bothen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7940,6 +8227,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1 + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_bothen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7973,6 +8261,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = 
TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xy_bothen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -7982,6 +8271,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xy_bothen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -8016,6 +8306,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -8025,6 +8316,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -8059,6 +8351,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_xyz_bothen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -8068,6 +8361,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact:%[0-9]+]]:vreg_96 
= TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 60, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_xyz_bothen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -8102,6 +8396,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_vaddr ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -8112,6 +8407,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_vaddr ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -8149,6 +8445,7 @@ body: | ; GFX9-NEXT: 
[[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_srsrc ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -8160,6 +8457,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_bothen_exact_diff_srsrc ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -8197,6 +8495,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; 
GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_vaddr ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -8207,6 +8506,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_vaddr ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -8243,6 +8543,7 @@ body: | ; GFX9-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_srsrc ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = 
COPY $sgpr1 @@ -8254,6 +8555,7 @@ body: | ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_IDXEN_exact1:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 50, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_xy_xy_idxen_exact_diff_srsrc ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -8291,6 +8593,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_x_bothen_exact ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -8303,6 +8606,7 @@ body: | ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZ_BOTHEN_exact]].sub2 ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 ; GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 + ; ; GFX11-LABEL: name: 
gfx11_tbuffer_load_x_x_x_bothen_exact ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -8340,6 +8644,7 @@ body: | ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 22, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact1:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_X_BOTHEN_exact2:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 12, 22, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; ; GFX10-LABEL: name: gfx11_tbuffer_load_x_x_x_bothen_exact_swizzled_0 ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 @@ -8351,6 +8656,7 @@ body: | ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact:%[0-9]+]]:vreg_64 = TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 64, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub0 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[TBUFFER_LOAD_FORMAT_XY_BOTHEN_exact]].sub1 + ; ; GFX11-LABEL: name: gfx11_tbuffer_load_x_x_x_bothen_exact_swizzled_0 ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.mir b/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.mir index d967d76008c99..fda33f52bf06d 100644 --- a/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.mir +++ b/llvm/test/CodeGen/AMDGPU/mode-register-fptrunc.mir @@ -16,6 +16,7 @@ body: | ; CHECK-NEXT: S_SETREG_IMM32_B32 1, 129, implicit-def $mode, implicit $mode 
; CHECK-NEXT: $vgpr1 = V_CVT_F16_F32_e32 $vgpr0, implicit $mode, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: ftrunc_upward ; GFX11: liveins: $sgpr0 ; GFX11-NEXT: {{ $}} @@ -40,6 +41,7 @@ body: | ; CHECK-NEXT: S_SETREG_IMM32_B32 1, 193, implicit-def $mode, implicit $mode ; CHECK-NEXT: $vgpr0 = V_CVT_F16_F32_e32 $vgpr1, implicit $mode, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: ftrunc_downward ; GFX11: liveins: $sgpr0 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-lshlrev.mir b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshlrev.mir index 5c45cca40b0ee..d19318ceb55c6 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-lshlrev.mir +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-lshlrev.mir @@ -14,6 +14,7 @@ body: | ; GFX8-NEXT: [[DEF3:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; GFX8-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GFX8-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 [[DEF4]], [[V_LSHL_ADD_U64_e64_]], implicit $exec + ; ; GFX12-LABEL: name: lshlrev_b64 ; GFX12: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GFX12-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll index 5385d63ece451..003c3ea7fce10 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-vimage-vsample.ll @@ -67,6 +67,7 @@ define amdgpu_ps float @vimage_move_to_valu(<8 x i32> %rsrc) { ; GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_1]] ; GFX11-NEXT: $vgpr0 = COPY [[IMAGE_LOAD_V1_V2_gfx11_]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX12-LABEL: name: vimage_move_to_valu ; GFX12: bb.0.bb: ; GFX12-NEXT: successors: %bb.1(0x80000000) @@ -200,6 +201,7 @@ define amdgpu_ps float @vsample_move_to_valu_rsrc(<8 x i32> %rsrc, <4 x i32> inr ; GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; GFX11-NEXT: $vgpr0 = COPY [[IMAGE_SAMPLE_V1_V1_gfx11_]] ; GFX11-NEXT: 
SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX12-LABEL: name: vsample_move_to_valu_rsrc ; GFX12: bb.0.main_body: ; GFX12-NEXT: successors: %bb.1(0x80000000) @@ -324,6 +326,7 @@ define amdgpu_ps float @vsample_move_to_valu_samp(<8 x i32> inreg %rsrc, <4 x i3 ; GFX11-NEXT: $exec_lo = S_MOV_B32 [[S_MOV_B32_]] ; GFX11-NEXT: $vgpr0 = COPY [[IMAGE_SAMPLE_V1_V1_gfx11_]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + ; ; GFX12-LABEL: name: vsample_move_to_valu_samp ; GFX12: bb.0.main_body: ; GFX12-NEXT: successors: %bb.1(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir index c059b6daf0d5d..ece2e1b653d34 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir @@ -70,6 +70,7 @@ body: | ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_IDXEN]] ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; ; W32-LABEL: name: idxen ; W32: successors: %bb.1(0x80000000) ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 @@ -184,6 +185,7 @@ body: | ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFEN]] ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; ; W32-LABEL: name: offen ; W32: successors: %bb.1(0x80000000) ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 @@ -298,6 +300,7 @@ body: | ; W64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_BOTHEN]] ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; ; W32-LABEL: name: bothen ; W32: successors: %bb.1(0x80000000) ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 @@ -394,6 +397,7 @@ body: | ; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]] ; ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 
+ ; ; W32-LABEL: name: addr64 ; W32: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 ; W32-NEXT: {{ $}} @@ -471,6 +475,7 @@ body: | ; ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_ADDR64_]] ; ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; ; W64-NO-ADDR64-LABEL: name: offset ; W64-NO-ADDR64: successors: %bb.1(0x80000000) ; W64-NO-ADDR64-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 @@ -515,6 +520,7 @@ body: | ; W64-NO-ADDR64-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] ; W64-NO-ADDR64-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_FORMAT_X_OFFSET]] ; W64-NO-ADDR64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; ; W32-LABEL: name: offset ; W32: successors: %bb.1(0x80000000) ; W32-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $sgpr30_sgpr31 diff --git a/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir b/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir index 091cf2ba44b53..3de258bb52a55 100644 --- a/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir +++ b/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir @@ -12,17 +12,21 @@ body: | ; gfx908-DEFAULT-LABEL: name: mfma_padding_2_pass ; gfx908-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD25-LABEL: name: mfma_padding_2_pass ; gfx908-PAD25: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD25-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD50-LABEL: name: mfma_padding_2_pass ; gfx908-PAD50: $agpr0_agpr1_agpr2_agpr3 = 
V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD50-NEXT: S_NOP 0 ; gfx908-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD75-LABEL: name: mfma_padding_2_pass ; gfx908-PAD75: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD75-NEXT: S_NOP 0 ; gfx908-PAD75-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD100-LABEL: name: mfma_padding_2_pass ; gfx908-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD100-NEXT: S_NOP 1 @@ -39,18 +43,22 @@ body: | ; gfx908-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec ; gfx908-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD25-LABEL: name: mfma_padding_2_pass_1_intervening_valu ; gfx908-PAD25: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD25-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD25-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD50-LABEL: name: mfma_padding_2_pass_1_intervening_valu ; gfx908-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD50-NEXT: $vgpr2 = 
V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD75-LABEL: name: mfma_padding_2_pass_1_intervening_valu ; gfx908-PAD75: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD75-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD75-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD100-LABEL: name: mfma_padding_2_pass_1_intervening_valu ; gfx908-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec @@ -69,20 +77,24 @@ body: | ; gfx908-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-DEFAULT-NEXT: DBG_VALUE ; gfx908-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD25-LABEL: name: mfma_padding_2_pass_dbg ; gfx908-PAD25: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD25-NEXT: DBG_VALUE ; gfx908-PAD25-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD50-LABEL: name: mfma_padding_2_pass_dbg ; gfx908-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD50-NEXT: DBG_VALUE ; gfx908-PAD50-NEXT: S_NOP 0 ; gfx908-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = 
V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD75-LABEL: name: mfma_padding_2_pass_dbg ; gfx908-PAD75: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD75-NEXT: DBG_VALUE ; gfx908-PAD75-NEXT: S_NOP 0 ; gfx908-PAD75-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD100-LABEL: name: mfma_padding_2_pass_dbg ; gfx908-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD100-NEXT: DBG_VALUE @@ -100,18 +112,22 @@ body: | ; gfx908-DEFAULT-LABEL: name: mfma_padding_8_pass ; gfx908-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD25-LABEL: name: mfma_padding_8_pass ; gfx908-PAD25: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD25-NEXT: S_NOP 1 ; gfx908-PAD25-NEXT: early-clobber 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD50-LABEL: name: mfma_padding_8_pass ; gfx908-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD50-NEXT: S_NOP 3 ; gfx908-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD75-LABEL: name: mfma_padding_8_pass ; gfx908-PAD75: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD75-NEXT: S_NOP 5 ; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD100-LABEL: name: mfma_padding_8_pass ; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD100-NEXT: S_NOP 7 @@ -129,23 +145,27 @@ body: | ; gfx908-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec ; gfx908-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec ; gfx908-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD25-LABEL: name: mfma_padding_8_pass_2_intervening_valu ; gfx908-PAD25: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD25-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD25-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD25-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD50-LABEL: name: mfma_padding_8_pass_2_intervening_valu ; gfx908-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 
1, implicit $exec ; gfx908-PAD50-NEXT: S_NOP 1 ; gfx908-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD75-LABEL: name: mfma_padding_8_pass_2_intervening_valu ; gfx908-PAD75: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD75-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD75-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD75-NEXT: S_NOP 3 ; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD100-LABEL: name: mfma_padding_8_pass_2_intervening_valu ; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec @@ -165,19 +185,23 @@ body: | ; gfx908-DEFAULT-LABEL: name: mfma_padding_16_pass ; gfx908-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD25-LABEL: name: mfma_padding_16_pass ; gfx908-PAD25: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD25-NEXT: S_NOP 3 ; gfx908-PAD25-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD50-LABEL: name: mfma_padding_16_pass ; gfx908-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD50-NEXT: S_NOP 7 ; gfx908-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD75-LABEL: name: mfma_padding_16_pass ; gfx908-PAD75: 
early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD75-NEXT: S_NOP 7 ; gfx908-PAD75-NEXT: S_NOP 3 ; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD100-LABEL: name: mfma_padding_16_pass ; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD100-NEXT: S_NOP 7 @@ -198,6 +222,7 @@ body: | ; gfx908-DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec ; gfx908-DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec ; gfx908-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD25-LABEL: name: mfma_padding_16_pass_4_intervening_valu ; gfx908-PAD25: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD25-NEXT: $vgpr2 
= V_MOV_B32_e32 1, implicit $exec @@ -205,6 +230,7 @@ body: | ; gfx908-PAD25-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD25-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD25-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD50-LABEL: name: mfma_padding_16_pass_4_intervening_valu ; gfx908-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec @@ -213,6 +239,7 @@ body: | ; gfx908-PAD50-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD50-NEXT: S_NOP 3 ; gfx908-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD75-LABEL: name: mfma_padding_16_pass_4_intervening_valu ; gfx908-PAD75: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD75-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec @@ -221,6 +248,7 @@ body: | ; gfx908-PAD75-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD75-NEXT: S_NOP 7 ; 
gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD100-LABEL: name: mfma_padding_16_pass_4_intervening_valu ; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec @@ -261,6 +289,7 @@ body: | ; gfx908-DEFAULT-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec ; gfx908-DEFAULT-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec ; gfx908-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD25-LABEL: name: mfma_padding_16_pass_16_intervening_valu ; gfx908-PAD25: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD25-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec @@ -280,6 +309,7 @@ body: | ; gfx908-PAD25-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD25-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD25-NEXT: early-clobber 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD50-LABEL: name: mfma_padding_16_pass_16_intervening_valu ; gfx908-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec @@ -299,6 +329,7 @@ body: | ; gfx908-PAD50-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD50-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD75-LABEL: name: mfma_padding_16_pass_16_intervening_valu ; gfx908-PAD75: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD75-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec @@ -318,6 +349,7 @@ body: | ; gfx908-PAD75-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD75-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = 
V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD100-LABEL: name: mfma_padding_16_pass_16_intervening_valu ; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec @@ -366,15 +398,19 @@ body: | ; gfx908-DEFAULT-LABEL: name: mfma_padding_16_pass_occ_1 ; gfx908-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD25-LABEL: name: mfma_padding_16_pass_occ_1 ; gfx908-PAD25: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD25-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD50-LABEL: name: mfma_padding_16_pass_occ_1 ; gfx908-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD75-LABEL: name: mfma_padding_16_pass_occ_1 ; gfx908-PAD75: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD100-LABEL: name: mfma_padding_16_pass_occ_1 ; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD100-NEXT: early-clobber 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec @@ -400,6 +436,7 @@ body: | ; gfx908-DEFAULT-NEXT: bb.2: ; gfx908-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec ; gfx908-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD25-LABEL: name: mfma_padding_16_pass_2_preds ; gfx908-PAD25: bb.0: ; gfx908-PAD25-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -416,6 +453,7 @@ body: | ; gfx908-PAD25-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD25-NEXT: S_NOP 1 ; gfx908-PAD25-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD50-LABEL: name: mfma_padding_16_pass_2_preds ; gfx908-PAD50: bb.0: ; gfx908-PAD50-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -432,6 +470,7 @@ body: | ; gfx908-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD50-NEXT: S_NOP 5 ; gfx908-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD75-LABEL: name: mfma_padding_16_pass_2_preds ; 
gfx908-PAD75: bb.0: ; gfx908-PAD75-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -449,6 +488,7 @@ body: | ; gfx908-PAD75-NEXT: S_NOP 7 ; gfx908-PAD75-NEXT: S_NOP 1 ; gfx908-PAD75-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; ; gfx908-PAD100-LABEL: name: mfma_padding_16_pass_2_preds ; gfx908-PAD100: bb.0: ; gfx908-PAD100-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) diff --git a/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir b/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir index 64bba249db074..b501fd037574b 100644 --- a/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir +++ b/llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir @@ -34,22 +34,22 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr16 - ; GFX9-NEXT: undef %18.sub15:vreg_512 = COPY $vgpr15 - ; GFX9-NEXT: %18.sub14:vreg_512 = COPY $vgpr14 - ; GFX9-NEXT: %18.sub13:vreg_512 = COPY $vgpr13 - ; GFX9-NEXT: %18.sub12:vreg_512 = COPY $vgpr12 - ; GFX9-NEXT: %18.sub11:vreg_512 = COPY $vgpr11 - ; GFX9-NEXT: %18.sub10:vreg_512 = COPY $vgpr10 - ; GFX9-NEXT: %18.sub9:vreg_512 = COPY $vgpr9 - ; GFX9-NEXT: %18.sub8:vreg_512 = COPY $vgpr8 - ; GFX9-NEXT: %18.sub7:vreg_512 = COPY $vgpr7 - ; GFX9-NEXT: %18.sub6:vreg_512 = COPY $vgpr6 - ; GFX9-NEXT: %18.sub5:vreg_512 = COPY $vgpr5 - ; GFX9-NEXT: %18.sub4:vreg_512 = COPY $vgpr4 - ; GFX9-NEXT: %18.sub3:vreg_512 = COPY $vgpr3 - ; GFX9-NEXT: %18.sub2:vreg_512 = COPY $vgpr2 - ; GFX9-NEXT: %18.sub1:vreg_512 = COPY $vgpr1 - ; GFX9-NEXT: %18.sub0:vreg_512 = COPY $vgpr0 + ; GFX9-NEXT: undef [[COPY2:%[0-9]+]].sub15:vreg_512 = COPY $vgpr15 + ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub14:vreg_512 = COPY $vgpr14 + ; 
GFX9-NEXT: [[COPY2:%[0-9]+]].sub13:vreg_512 = COPY $vgpr13 + ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub12:vreg_512 = COPY $vgpr12 + ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub11:vreg_512 = COPY $vgpr11 + ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub10:vreg_512 = COPY $vgpr10 + ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub9:vreg_512 = COPY $vgpr9 + ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub8:vreg_512 = COPY $vgpr8 + ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub7:vreg_512 = COPY $vgpr7 + ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub6:vreg_512 = COPY $vgpr6 + ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub5:vreg_512 = COPY $vgpr5 + ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub4:vreg_512 = COPY $vgpr4 + ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_512 = COPY $vgpr3 + ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_512 = COPY $vgpr2 + ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub1:vreg_512 = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_512 = COPY $vgpr0 ; GFX9-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 15, [[COPY1]], implicit $exec ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; GFX9-NEXT: {{ $}} @@ -60,7 +60,7 @@ body: | ; GFX9-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[V_AND_B32_e32_]], implicit $exec ; GFX9-NEXT: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: S_SET_GPR_IDX_ON [[V_READFIRSTLANE_B32_]], 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef %18.sub0, implicit $exec, implicit %18, implicit $m0 + ; GFX9-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 undef [[COPY2]].sub0, implicit $exec, implicit [[COPY2]], implicit $m0 ; GFX9-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ; GFX9-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def dead $scc ; GFX9-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec diff --git 
a/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness-wave32.mir b/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness-wave32.mir index be1e36c05a30d..564c018e9a954 100644 --- a/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness-wave32.mir +++ b/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness-wave32.mir @@ -19,13 +19,13 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (s128), align 8, addrspace 1) - ; CHECK-NEXT: undef %2.sub1:sgpr_128 = S_MOV_B32 -1 - ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, undef %2.sub0, implicit-def dead $scc - ; CHECK-NEXT: %2.sub1:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub0 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 -1 + ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, undef [[S_MOV_B32_]].sub0, implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub0 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: S_NOP 0, implicit %2.sub1 + ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub1 bb.0: liveins: $sgpr0_sgpr1 %0:sgpr_64 = COPY $sgpr0_sgpr1 @@ -52,9 +52,9 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (s128), align 8, addrspace 1) - ; CHECK-NEXT: undef %2.sub0:sreg_64_xexec = S_MOV_B32 -1 - ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %2.sub0, implicit-def dead $scc - ; CHECK-NEXT: dead %2.sub1:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64_xexec = S_MOV_B32 -1 + ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, [[S_MOV_B32_]].sub0, implicit-def dead $scc + ; CHECK-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64_xexec 
= COPY [[S_LOAD_DWORDX4_IMM]].sub0 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -83,13 +83,13 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (s128), align 8, addrspace 1) - ; CHECK-NEXT: undef %2.sub0:sreg_64_xexec = S_MOV_B32 -1 - ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %2.sub0, implicit-def dead $scc - ; CHECK-NEXT: %2.sub1:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64_xexec = S_MOV_B32 -1 + ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, [[S_MOV_B32_]].sub0, implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: S_NOP 0, implicit %2.sub1 + ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub1 bb.0: liveins: $sgpr0_sgpr1 %0:sgpr_64 = COPY $sgpr0_sgpr1 @@ -116,13 +116,13 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (s128), align 8, addrspace 1) - ; CHECK-NEXT: undef %2.sub0:sreg_64_xexec = S_MOV_B32 -1 - ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %2.sub0, implicit-def dead $scc - ; CHECK-NEXT: %2.sub1:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64_xexec = S_MOV_B32 -1 + ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, [[S_MOV_B32_]].sub0, implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: S_NOP 0, implicit %2 + ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]] bb.0: liveins: 
$sgpr0_sgpr1 %0:sgpr_64 = COPY $sgpr0_sgpr1 @@ -149,13 +149,13 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (s128), align 8, addrspace 1) - ; CHECK-NEXT: undef %2.sub0:sreg_64_xexec = S_MOV_B32 -1 - ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, %2.sub0, implicit-def dead $scc - ; CHECK-NEXT: %2.sub1:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64_xexec = S_MOV_B32 -1 + ; CHECK-NEXT: $vcc_lo = S_ANDN2_B32 $exec_lo, [[S_MOV_B32_]].sub0, implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: S_NOP 0, implicit %2.sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub0 bb.0: liveins: $sgpr0_sgpr1 %0:sgpr_64 = COPY $sgpr0_sgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness.mir b/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness.mir index 897370b3d9b50..f224c91bad8cc 100644 --- a/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness.mir +++ b/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness.mir @@ -272,7 +272,7 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[DEF]], implicit $exec + ; CHECK-NEXT: dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[DEF]], implicit $exec ; CHECK-NEXT: $vcc = S_ANDN2_B64 $exec, [[DEF]], implicit-def $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.4, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.3 @@ -280,7 +280,7 @@ body: | ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit undef %1 + ; 
CHECK-NEXT: S_NOP 0, implicit undef [[V_CNDMASK_B32_e64_]] ; CHECK-NEXT: S_BRANCH %bb.4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: @@ -314,7 +314,7 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: dead %0:sreg_64_xexec = S_MOV_B64 0 + ; CHECK-NEXT: dead [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -326,7 +326,7 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vcc = S_ANDN2_B64 $exec, undef %0, implicit-def $scc + ; CHECK-NEXT: $vcc = S_ANDN2_B64 $exec, undef [[S_MOV_B64_]], implicit-def $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.4, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} @@ -374,7 +374,7 @@ body: | ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_]], -1, implicit-def $scc + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = S_ADD_I32 [[DEF]], -1, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.4, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -440,7 +440,7 @@ body: | ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.4(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_]], -1, implicit-def $scc + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = S_ADD_I32 [[DEF]], -1, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.5, implicit $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: @@ -645,9 +645,9 @@ body: | ; CHECK-NEXT: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr14, $sgpr15, $sgpr16 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK-NEXT: undef %1.sub1:vreg_64 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[DEF]], implicit $exec - ; 
CHECK-NEXT: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 0, %1.sub1, implicit $exec - ; CHECK-NEXT: %1.sub0:vreg_64 = V_MOV_B32_e32 123, implicit $exec + ; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub1:vreg_64 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[DEF]], implicit $exec + ; CHECK-NEXT: [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_GT_I32_e64 0, [[V_CNDMASK_B32_e64_]].sub1, implicit $exec + ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 123, implicit $exec ; CHECK-NEXT: $exec = S_MOV_B64_term [[V_CMP_GT_I32_e64_]] ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} @@ -660,7 +660,7 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit %1.sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[V_CNDMASK_B32_e64_]].sub0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir b/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir index 84f9871527da0..c4d0583a3317a 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-exec-mask-pre-ra-loop-phi.mir @@ -12,7 +12,7 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 - ; CHECK-NEXT: undef %1.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -23,10 +23,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_MOV_B64_]], implicit $exec ; CHECK-NEXT: V_CMP_NE_U32_e32 1, [[V_CNDMASK_B32_e64_]], implicit-def $vcc, implicit $exec - ; CHECK-NEXT: %1.sub1:vreg_64 = COPY %1.sub0 - ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %3:vgpr_32, %1, 0, 0, implicit $exec 
:: (store (s64), addrspace 3) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]].sub0 + ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %3:vgpr_32, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3) ; CHECK-NEXT: ATOMIC_FENCE 4, 2 - ; CHECK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 + ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 ; CHECK-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 @@ -65,7 +65,7 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 - ; CHECK-NEXT: undef %1.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -74,11 +74,11 @@ body: | ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %1.sub1:vreg_64 = COPY %1.sub0 - ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %3:vgpr_32, %1, 0, 0, implicit $exec :: (store (s64), addrspace 3) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]].sub0 + ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %3:vgpr_32, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3) ; CHECK-NEXT: ATOMIC_FENCE 4, 2 ; CHECK-NEXT: $vcc = S_ANDN2_B64 $exec, [[S_MOV_B64_]], implicit-def dead $scc - ; CHECK-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 + ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 bb.0: @@ -115,7 +115,7 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 -1 - ; CHECK-NEXT: undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: 
undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -127,8 +127,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr4_sgpr5, implicit $exec ; CHECK-NEXT: V_CMP_NE_U32_e32 1, [[V_CNDMASK_B32_e64_]], implicit-def $vcc, implicit $exec - ; CHECK-NEXT: %0.sub1:vreg_64 = COPY %0.sub0 - ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %2:vgpr_32, %0, 0, 0, implicit $exec :: (store (s64), addrspace 3) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]].sub0 + ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %2:vgpr_32, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3) ; CHECK-NEXT: ATOMIC_FENCE 4, 2 ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 0 ; CHECK-NEXT: $vcc = S_AND_B64 $exec, $vcc, implicit-def dead $scc @@ -169,7 +169,7 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 -1 - ; CHECK-NEXT: undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -179,8 +179,8 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %0.sub1:vreg_64 = COPY %0.sub0 - ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %2:vgpr_32, %0, 0, 0, implicit $exec :: (store (s64), addrspace 3) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]].sub0 + ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %2:vgpr_32, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3) ; CHECK-NEXT: ATOMIC_FENCE 4, 2 ; CHECK-NEXT: $vcc = S_ANDN2_B64 $exec, $sgpr4_sgpr5, implicit-def dead $scc ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 0 diff --git 
a/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir b/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir index 6170fe63f765d..6be3e592eee45 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir @@ -139,8 +139,8 @@ body: | ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: dead undef %0.sub0:sgpr_256 = COPY $exec - ; GCN-NEXT: dead %1:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, undef %2:sreg_64_xexec, implicit $exec + ; GCN-NEXT: dead undef [[COPY:%[0-9]+]].sub0:sgpr_256 = COPY $exec + ; GCN-NEXT: dead [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, undef %2:sreg_64_xexec, implicit $exec ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/partial-forwarding-hazards.mir b/llvm/test/CodeGen/AMDGPU/partial-forwarding-hazards.mir index d9333edb03a9f..5659e70cb7db7 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-forwarding-hazards.mir +++ b/llvm/test/CodeGen/AMDGPU/partial-forwarding-hazards.mir @@ -13,6 +13,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 4095 ; GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: partial_forwarding_1_hazard ; GFX12: $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GFX12-NEXT: $exec = S_MOV_B64 -1 @@ -48,6 +49,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 4095 ; GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: partial_forwarding_2_hazard ; GFX12: $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GFX12-NEXT: $sgpr0 = S_MOV_B32 0 @@ -100,6 +102,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 4095 ; GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: partial_forwarding_3_hazard ; GFX12: $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; 
GFX12-NEXT: $vgpr10 = V_MOV_B32_e32 0, implicit $exec @@ -235,6 +238,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 4095 ; GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: partial_forwarding_4_hazard ; GFX12: $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GFX12-NEXT: $exec = S_MOV_B64 -1 @@ -308,6 +312,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 4095 ; GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: partial_forwarding_5_hazard ; GFX12: $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GFX12-NEXT: $vgpr10 = V_MOV_B32_e32 0, implicit $exec @@ -394,6 +399,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 4095 ; GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: partial_forwarding_branching_1a ; GFX12: bb.0: ; GFX12-NEXT: successors: %bb.2(0x80000000) @@ -471,6 +477,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 4095 ; GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: partial_forwarding_branching_1b ; GFX12: bb.0: ; GFX12-NEXT: successors: %bb.2(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/pei-build-av-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-build-av-spill.mir index a9580da5af1da..9ce2464e0968a 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-build-av-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-build-av-spill.mir @@ -24,6 +24,7 @@ body: | ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_v1 ; MUBUF-V2A: liveins: $agpr0 ; MUBUF-V2A-NEXT: {{ $}} @@ 
-31,11 +32,13 @@ body: | ; MUBUF-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; MUBUF-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_v1 ; FLATSCR: $vgpr0 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_v1 ; FLATSCR-V2A: liveins: $agpr0 ; FLATSCR-V2A-NEXT: {{ $}} @@ -43,11 +46,13 @@ body: | ; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_v1 ; MUBUF-GFX90A: $vgpr0 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v1 ; MUBUF-GFX90A-V2A: liveins: $agpr0 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -55,11 +60,13 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_v1 ; FLATSCR-GFX90A: $vgpr0 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) ; FLATSCR-GFX90A-NEXT: 
$vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v1 ; FLATSCR-GFX90A-V2A: liveins: $agpr0 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -92,6 +99,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0 + 4, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_v2 ; MUBUF-V2A: liveins: $agpr0, $agpr1 ; MUBUF-V2A-NEXT: {{ $}} @@ -101,11 +109,13 @@ body: | ; MUBUF-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1 ; MUBUF-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_v2 ; FLATSCR: $vgpr0_vgpr1 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_v2 ; FLATSCR-V2A: liveins: $agpr0, $agpr1 ; FLATSCR-V2A-NEXT: {{ $}} @@ -115,6 +125,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1 ; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_v2 ; MUBUF-GFX90A: $vgpr0_vgpr1 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: 
BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) @@ -122,6 +133,7 @@ body: | ; MUBUF-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0 + 4, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v2 ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -131,11 +143,13 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1 ; MUBUF-GFX90A-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_v2 ; FLATSCR-GFX90A: $vgpr0_vgpr1 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v2 ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -172,6 +186,7 @@ body: | ; MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5) ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load (s32) from %stack.0 + 8, 
addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_v3 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2 ; MUBUF-V2A-NEXT: {{ $}} @@ -183,11 +198,13 @@ body: | ; MUBUF-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_v3 ; FLATSCR: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s96) from %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_v3 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2 ; FLATSCR-V2A-NEXT: {{ $}} @@ -199,6 +216,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 ; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_v3 ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 :: (store (s32) into %stack.0, addrspace 5) @@ -208,6 +226,7 @@ body: | ; MUBUF-GFX90A-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load (s32) from %stack.0 + 8, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; 
MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v3 ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -219,11 +238,13 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_v3 ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s96) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v3 ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -264,6 +285,7 @@ body: | ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 8, addrspace 5) ; MUBUF-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s32) from %stack.0 + 12, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_v4 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3 ; MUBUF-V2A-NEXT: {{ $}} @@ -277,11 +299,13 @@ body: | ; MUBUF-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_v4 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit 
$flat_scr :: (store (s128) into %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_v4 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3 ; FLATSCR-V2A-NEXT: {{ $}} @@ -295,6 +319,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_v4 ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) @@ -306,6 +331,7 @@ body: | ; MUBUF-GFX90A-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 8, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s32) from %stack.0 + 12, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v4 ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -319,11 +345,13 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_v4 ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: 
SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v4 ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -368,6 +396,7 @@ body: | ; MUBUF-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 12, addrspace 5) ; MUBUF-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s32) from %stack.0 + 16, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_v5 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4 ; MUBUF-V2A-NEXT: {{ $}} @@ -383,6 +412,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_v5 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -390,6 +420,7 @@ body: | ; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, 
implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s32) from %stack.0 + 16, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_v5 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4 ; FLATSCR-V2A-NEXT: {{ $}} @@ -405,6 +436,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_v5 ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store (s32) into %stack.0, addrspace 5) @@ -418,6 +450,7 @@ body: | ; MUBUF-GFX90A-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 12, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s32) from %stack.0 + 16, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v5 ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -433,6 +466,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_v5 ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, 
$sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -440,6 +474,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s32) from %stack.0 + 16, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v5 ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -488,6 +523,7 @@ body: | ; MUBUF-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 16, addrspace 5) ; MUBUF-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s32) from %stack.0 + 20, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_v6 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5 ; MUBUF-V2A-NEXT: {{ $}} @@ -505,6 +541,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_v6 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store (s128) into 
%stack.0, align 4, addrspace 5) @@ -512,6 +549,7 @@ body: | ; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr4_vgpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s64) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_v6 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5 ; FLATSCR-V2A-NEXT: {{ $}} @@ -529,6 +567,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_v6 ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store (s32) into %stack.0, addrspace 5) @@ -544,6 +583,7 @@ body: | ; MUBUF-GFX90A-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 16, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s32) from %stack.0 + 20, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v6 ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -561,6 +601,7 @@ body: | ; 
MUBUF-GFX90A-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_v6 ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -568,6 +609,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr4_vgpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s64) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v6 ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -620,6 +662,7 @@ body: | ; MUBUF-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 20, addrspace 5) ; MUBUF-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (load (s32) from %stack.0 + 24, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_v7 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6 ; MUBUF-V2A-NEXT: {{ $}} @@ -639,6 +682,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit 
$exec ; MUBUF-V2A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_v7 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -646,6 +690,7 @@ body: | ; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr4_vgpr5_vgpr6 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (load (s96) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_v7 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6 ; FLATSCR-V2A-NEXT: {{ $}} @@ -665,6 +710,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 ; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_v7 ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (store (s32) into %stack.0, addrspace 5) @@ -682,6 +728,7 @@ body: | ; MUBUF-GFX90A-NEXT: 
$vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 20, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (load (s32) from %stack.0 + 24, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v7 ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -701,6 +748,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_v7 ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -708,6 +756,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr4_vgpr5_vgpr6 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6 :: (load (s96) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v7 ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -764,6 +813,7 @@ 
body: | ; MUBUF-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 24, addrspace 5) ; MUBUF-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s32) from %stack.0 + 28, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_v8 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7 ; MUBUF-V2A-NEXT: {{ $}} @@ -785,6 +835,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_v8 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -792,6 +843,7 @@ body: | ; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s128) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_v8 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7 ; FLATSCR-V2A-NEXT: {{ $}} @@ -813,6 +865,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr5 = 
V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_v8 ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store (s32) into %stack.0, addrspace 5) @@ -832,6 +885,7 @@ body: | ; MUBUF-GFX90A-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 24, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s32) from %stack.0 + 28, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v8 ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -853,6 +907,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_v8 ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store 
(s128) into %stack.0, align 4, addrspace 5) @@ -860,6 +915,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s128) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v8 ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -934,6 +990,7 @@ body: | ; MUBUF-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 56, addrspace 5) ; MUBUF-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load (s32) from %stack.0 + 60, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_v16 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15 ; MUBUF-V2A-NEXT: {{ $}} @@ -971,6 +1028,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_v16 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF ; 
FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -982,6 +1040,7 @@ body: | ; FLATSCR-NEXT: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 32, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load (s128) from %stack.0 + 48, align 4, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_v16 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15 ; FLATSCR-V2A-NEXT: {{ $}} @@ -1019,6 +1078,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; FLATSCR-V2A-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_v16 ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store (s32) into %stack.0, addrspace 5) @@ -1054,6 +1114,7 @@ body: | ; MUBUF-GFX90A-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 56, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load (s32) from %stack.0 + 60, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v16 ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -1091,6 +1152,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_v16 ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -1102,6 +1164,7 @@ body: | ; 
FLATSCR-GFX90A-NEXT: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 32, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load (s128) from %stack.0 + 48, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v16 ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -1224,6 +1287,7 @@ body: | ; MUBUF-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 120, addrspace 5) ; MUBUF-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load (s32) from %stack.0 + 124, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_v32 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31 ; MUBUF-V2A-NEXT: {{ $}} @@ -1293,6 +1357,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_v32 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -1312,6 +1377,7 @@ body: | ; FLATSCR-NEXT: $vgpr24_vgpr25_vgpr26_vgpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 96, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr28_vgpr29_vgpr30_vgpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load (s128) from %stack.0 + 112, align 4, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_v32 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, 
$agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31 ; FLATSCR-V2A-NEXT: {{ $}} @@ -1381,6 +1447,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr29 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; FLATSCR-V2A-NEXT: $vgpr28 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_v32 ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store (s32) into %stack.0, addrspace 5) @@ -1448,6 +1515,7 @@ body: | ; MUBUF-GFX90A-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 120, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr31 
= BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load (s32) from %stack.0 + 124, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_v32 ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -1517,6 +1585,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_v32 ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -1536,6 +1605,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $vgpr24_vgpr25_vgpr26_vgpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 96, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr28_vgpr29_vgpr30_vgpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load (s128) from %stack.0 + 112, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_v32 ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -1630,6 +1700,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; MUBUF-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_a1 ; MUBUF-V2A: liveins: $vgpr0 ; MUBUF-V2A-NEXT: {{ $}} @@ -1637,6 +1708,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; MUBUF-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_a1 ; FLATSCR: $agpr0 
= IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec @@ -1644,6 +1716,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) ; FLATSCR-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_a1 ; FLATSCR-V2A: liveins: $vgpr0 ; FLATSCR-V2A-NEXT: {{ $}} @@ -1651,11 +1724,13 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_a1 ; MUBUF-GFX90A: $agpr0 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a1 ; MUBUF-GFX90A-V2A: liveins: $vgpr0 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -1663,11 +1738,13 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_a1 ; FLATSCR-GFX90A: $agpr0 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a1 ; 
FLATSCR-GFX90A-V2A: liveins: $vgpr0 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -1704,6 +1781,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5) ; MUBUF-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_a2 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1 ; MUBUF-V2A-NEXT: {{ $}} @@ -1713,6 +1791,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1 ; MUBUF-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_a2 ; FLATSCR: $agpr0_agpr1 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1 @@ -1724,6 +1803,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 4, addrspace 5) ; FLATSCR-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_a2 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1 ; FLATSCR-V2A-NEXT: {{ $}} @@ -1733,6 +1813,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1 ; FLATSCR-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_a2 ; MUBUF-GFX90A: $agpr0_agpr1 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1, implicit $agpr0_agpr1 :: (store (s32) into %stack.0, addrspace 5) @@ -1740,6 +1821,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr0 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1 :: (load (s32) from %stack.0, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit-def $agpr0_agpr1 :: (load (s32) from %stack.0 + 4, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a2 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -1749,11 +1831,13 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1 ; MUBUF-GFX90A-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_a2 ; FLATSCR-GFX90A: $agpr0_agpr1 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $agpr0_agpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr0_agpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a2 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -1796,6 +1880,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 8, addrspace 5) ; MUBUF-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_a3 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2 ; MUBUF-V2A-NEXT: {{ $}} @@ -1807,6 +1892,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit 
$exec, implicit $agpr0_agpr1_agpr2 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_a3 ; FLATSCR: $agpr0_agpr1_agpr2 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 @@ -1822,6 +1908,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 8, addrspace 5) ; FLATSCR-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_a3 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2 ; FLATSCR-V2A-NEXT: {{ $}} @@ -1833,6 +1920,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_a3 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2 :: (store (s32) into %stack.0, addrspace 5) @@ -1842,6 +1930,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 :: (load (s32) from %stack.0 + 8, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a3 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -1853,11 +1942,13 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 
$vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_a3 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $agpr0_agpr1_agpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s96) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a3 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -1906,6 +1997,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 12, addrspace 5) ; MUBUF-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_a4 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; MUBUF-V2A-NEXT: {{ $}} @@ -1919,6 +2011,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_a4 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 @@ -1938,6 +2031,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 12, addrspace 5) ; FLATSCR-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_a4 
; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; FLATSCR-V2A-NEXT: {{ $}} @@ -1951,6 +2045,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_a4 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 :: (store (s32) into %stack.0, addrspace 5) @@ -1962,6 +2057,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 8, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load (s32) from %stack.0 + 12, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a4 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -1975,11 +2071,13 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_a4 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0, align 4, addrspace 5) ; 
FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a4 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -2034,6 +2132,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 16, addrspace 5) ; MUBUF-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_a5 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 ; MUBUF-V2A-NEXT: {{ $}} @@ -2049,6 +2148,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_a5 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 @@ -2072,6 +2172,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 16, addrspace 5) ; FLATSCR-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_a5 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 ; FLATSCR-V2A-NEXT: {{ $}} @@ -2087,6 +2188,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_a5 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET 
killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store (s32) into %stack.0, addrspace 5) @@ -2100,6 +2202,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 12, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load (s32) from %stack.0 + 16, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a5 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -2115,6 +2218,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_a5 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -2122,6 +2226,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load (s32) from %stack.0 + 16, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a5 ; 
FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -2182,6 +2287,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 20, addrspace 5) ; MUBUF-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_a6 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; MUBUF-V2A-NEXT: {{ $}} @@ -2199,6 +2305,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_a6 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 @@ -2226,6 +2333,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 20, addrspace 5) ; FLATSCR-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_a6 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; FLATSCR-V2A-NEXT: {{ $}} @@ -2243,6 +2351,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_a6 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store (s32) into %stack.0, addrspace 5) @@ -2258,6 +2367,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 16, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load (s32) from %stack.0 + 20, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a6 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -2275,6 +2385,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_a6 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -2282,6 +2393,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr4_agpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load (s64) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 
0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a6 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -2348,6 +2460,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 24, addrspace 5) ; MUBUF-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_a7 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 ; MUBUF-V2A-NEXT: {{ $}} @@ -2367,6 +2480,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_a7 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 @@ -2398,6 +2512,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 24, addrspace 5) ; FLATSCR-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_a7 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 ; FLATSCR-V2A-NEXT: {{ $}} @@ -2417,6 +2532,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_a7 ; MUBUF-GFX90A: 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 :: (store (s32) into %stack.0, addrspace 5) @@ -2434,6 +2550,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 20, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 :: (load (s32) from %stack.0 + 24, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a7 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -2453,6 +2570,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_a7 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -2460,6 +2578,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr4_agpr5_agpr6 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 
16, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6 :: (load (s96) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a7 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -2532,6 +2651,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 28, addrspace 5) ; MUBUF-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_a8 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; MUBUF-V2A-NEXT: {{ $}} @@ -2553,6 +2673,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_a8 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 @@ -2588,6 +2709,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 28, addrspace 5) ; FLATSCR-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_a8 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; FLATSCR-V2A-NEXT: {{ $}} @@ -2609,6 +2731,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, 
implicit $exec ; FLATSCR-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_a8 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store (s32) into %stack.0, addrspace 5) @@ -2628,6 +2751,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 24, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load (s32) from %stack.0 + 28, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a8 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -2649,6 +2773,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_a8 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -2656,6 +2781,7 @@ body: | ; 
FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load (s128) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a8 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -2734,6 +2860,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 32, addrspace 5) ; MUBUF-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_a9 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; MUBUF-V2A-NEXT: {{ $}} @@ -2757,6 +2884,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_a9 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 @@ -2796,6 +2924,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 32, addrspace 5) ; FLATSCR-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 
killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_a9 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; FLATSCR-V2A-NEXT: {{ $}} @@ -2819,6 +2948,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_a9 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 :: (store (s32) into %stack.0, addrspace 5) @@ -2840,6 +2970,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 28, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 :: (load (s32) from %stack.0 + 32, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a9 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -2863,6 +2994,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_a9 ; 
FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -2872,6 +3004,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr8 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 :: (load (s32) from %stack.0 + 32, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a9 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -2956,6 +3089,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 36, addrspace 5) ; MUBUF-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_a10 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 ; MUBUF-V2A-NEXT: {{ $}} @@ -2981,6 +3115,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_a10 ; FLATSCR: 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 @@ -3024,6 +3159,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 36, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 36, addrspace 5) ; FLATSCR-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_a10 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 ; FLATSCR-V2A-NEXT: {{ $}} @@ -3049,6 +3185,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_a10 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 :: (store (s32) into %stack.0, addrspace 5) @@ -3072,6 +3209,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 32, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 :: (load (s32) from %stack.0 + 36, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: 
name: test_spill_av_a10 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -3097,6 +3235,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_a10 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -3106,6 +3245,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr8_agpr9 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 :: (load (s64) from %stack.0 + 32, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a10 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -3196,6 +3336,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 40, addrspace 5) ; MUBUF-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 ; 
MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_a11 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 ; MUBUF-V2A-NEXT: {{ $}} @@ -3223,6 +3364,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_a11 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 @@ -3270,6 +3412,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 40, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 40, addrspace 5) ; FLATSCR-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_a11 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 ; FLATSCR-V2A-NEXT: {{ $}} @@ -3297,6 +3440,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_a11 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 :: (store (s32) into %stack.0, addrspace 5) @@ -3322,6 +3466,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 36, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 :: (load (s32) from %stack.0 + 40, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a11 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -3349,6 +3494,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_a11 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -3358,6 +3504,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr8_agpr9_agpr10 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 :: (load (s96) from %stack.0 + 32, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a11 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -3454,6 +3601,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 44, addrspace 5) ; MUBUF-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_a12 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 ; MUBUF-V2A-NEXT: {{ $}} @@ -3483,6 +3631,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_a12 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 @@ -3534,6 +3683,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 44, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 44, addrspace 5) ; FLATSCR-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_a12 ; FLATSCR-V2A: liveins: $vgpr0, 
$vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 ; FLATSCR-V2A-NEXT: {{ $}} @@ -3563,6 +3713,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_a12 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 :: (store (s32) into %stack.0, addrspace 5) @@ -3590,6 +3741,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 40, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 :: (load (s32) from %stack.0 + 44, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a12 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -3619,6 +3771,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_a12 ; FLATSCR-GFX90A: 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -3628,6 +3781,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr8_agpr9_agpr10_agpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 :: (load (s128) from %stack.0 + 32, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a12 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -3742,6 +3896,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 60, addrspace 5) ; MUBUF-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_a16 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 ; MUBUF-V2A-NEXT: {{ $}} @@ -3779,6 +3934,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, 
implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_av_a16 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 @@ -3846,6 +4002,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 60, addrspace 5) ; FLATSCR-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_a16 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 ; FLATSCR-V2A-NEXT: {{ $}} @@ -3883,6 +4040,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_a16 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store (s32) into %stack.0, addrspace 5) @@ -3918,6 +4076,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 56, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load (s32) from %stack.0 + 60, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a16 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -3955,6 +4114,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_a16 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -3966,6 +4126,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $agpr8_agpr9_agpr10_agpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, implicit $exec, 
implicit $flat_scr :: (load (s128) from %stack.0 + 32, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr12_agpr13_agpr14_agpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load (s128) from %stack.0 + 48, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_av_a16 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -4152,6 +4313,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 124, addrspace 5) ; MUBUF-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_av_a32 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 ; MUBUF-V2A-NEXT: {{ $}} @@ -4221,6 +4383,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + 
; ; FLATSCR-LABEL: name: test_spill_av_a32 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 @@ -4352,6 +4515,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 124, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 124, addrspace 5) ; FLATSCR-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_av_a32 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 ; FLATSCR-V2A-NEXT: {{ $}} @@ -4421,6 +4585,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_av_a32 ; MUBUF-GFX90A: 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store (s32) into %stack.0, addrspace 5) @@ -4488,6 +4653,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 120, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load (s32) from %stack.0 + 124, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_av_a32 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -4557,6 +4723,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; 
MUBUF-GFX90A-V2A-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_av_a32 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -4576,6 +4743,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $agpr24_agpr25_agpr26_agpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 96, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr28_agpr29_agpr30_agpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load (s128) from %stack.0 + 112, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: 
test_spill_av_a32 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir b/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir index a3d78f342c253..8eddc9a5afd50 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-build-spill-partial-agpr.mir @@ -65,6 +65,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; MUBUF-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_v2_partial_agpr ; FLATSCR-V2A: liveins: $agpr0 ; FLATSCR-V2A-NEXT: {{ $}} @@ -103,6 +104,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5) ; MUBUF-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_v3_partial_agpr ; FLATSCR-V2A: liveins: $agpr0 ; FLATSCR-V2A-NEXT: {{ $}} @@ -143,6 +145,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_v4_partial_agpr ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2 ; FLATSCR-V2A-NEXT: {{ $}} @@ -189,6 +192,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr1, 
implicit $exec ; MUBUF-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_v5_partial_agpr ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2 ; FLATSCR-V2A-NEXT: {{ $}} @@ -237,6 +241,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_v6_partial_agpr ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4 ; FLATSCR-V2A-NEXT: {{ $}} @@ -293,6 +298,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_v8_partial_agpr ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3 ; FLATSCR-V2A-NEXT: {{ $}} @@ -363,6 +369,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_v16_partial_agpr ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4 ; FLATSCR-V2A-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/pei-build-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-build-spill.mir index 9024a395b6cf0..d74b6f239b354 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-build-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-build-spill.mir @@ -24,6 +24,7 @@ body: | ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; 
MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_v1 ; MUBUF-V2A: liveins: $agpr0 ; MUBUF-V2A-NEXT: {{ $}} @@ -31,11 +32,13 @@ body: | ; MUBUF-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; MUBUF-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_v1 ; FLATSCR: $vgpr0 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_v1 ; FLATSCR-V2A: liveins: $agpr0 ; FLATSCR-V2A-NEXT: {{ $}} @@ -43,11 +46,13 @@ body: | ; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_v1 ; MUBUF-GFX90A: $vgpr0 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v1 ; MUBUF-GFX90A-V2A: liveins: $agpr0 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -55,11 +60,13 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; 
FLATSCR-GFX90A-LABEL: name: test_spill_v1 ; FLATSCR-GFX90A: $vgpr0 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v1 ; FLATSCR-GFX90A-V2A: liveins: $agpr0 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -92,6 +99,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0 + 4, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_v2 ; MUBUF-V2A: liveins: $agpr0, $agpr1 ; MUBUF-V2A-NEXT: {{ $}} @@ -101,11 +109,13 @@ body: | ; MUBUF-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1 ; MUBUF-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_v2 ; FLATSCR: $vgpr0_vgpr1 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_v2 ; FLATSCR-V2A: liveins: $agpr0, $agpr1 ; FLATSCR-V2A-NEXT: {{ $}} @@ -115,6 +125,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1 ; 
FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_v2 ; MUBUF-GFX90A: $vgpr0_vgpr1 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) @@ -122,6 +133,7 @@ body: | ; MUBUF-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0 + 4, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v2 ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -131,11 +143,13 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit-def $vgpr0_vgpr1 ; MUBUF-GFX90A-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_v2 ; FLATSCR-GFX90A: $vgpr0_vgpr1 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v2 ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -172,6 +186,7 @@ body: | ; MUBUF-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit 
$exec :: (load (s32) from %stack.0 + 4, addrspace 5) ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load (s32) from %stack.0 + 8, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_v3 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2 ; MUBUF-V2A-NEXT: {{ $}} @@ -183,11 +198,13 @@ body: | ; MUBUF-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_v3 ; FLATSCR: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s96) from %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_v3 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2 ; FLATSCR-V2A-NEXT: {{ $}} @@ -199,6 +216,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 ; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_v3 ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr0_vgpr1_vgpr2 :: (store (s32) into %stack.0, addrspace 5) @@ -208,6 +226,7 @@ body: | ; MUBUF-GFX90A-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr2 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2 :: (load (s32) from %stack.0 + 8, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v3 ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -219,11 +238,13 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_v3 ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr0_vgpr1_vgpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s96) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v3 ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -264,6 +285,7 @@ body: | ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 8, addrspace 5) ; MUBUF-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s32) from %stack.0 + 12, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_v4 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3 ; MUBUF-V2A-NEXT: {{ $}} @@ -277,11 +299,13 @@ body: | ; MUBUF-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: 
name: test_spill_v4 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_v4 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3 ; FLATSCR-V2A-NEXT: {{ $}} @@ -295,6 +319,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_v4 ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) @@ -306,6 +331,7 @@ body: | ; MUBUF-GFX90A-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 8, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 :: (load (s32) from %stack.0 + 12, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v4 ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -319,11 +345,13 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; 
MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_v4 ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v4 ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -368,6 +396,7 @@ body: | ; MUBUF-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 12, addrspace 5) ; MUBUF-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s32) from %stack.0 + 16, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_v5 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4 ; MUBUF-V2A-NEXT: {{ $}} @@ -383,6 +412,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_v5 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -390,6 +420,7 @@ body: | ; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s32) from %stack.0 + 16, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_v5 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4 ; FLATSCR-V2A-NEXT: {{ $}} @@ -405,6 +436,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_v5 ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store (s32) into %stack.0, addrspace 5) @@ -418,6 +450,7 @@ body: | ; MUBUF-GFX90A-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 12, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s32) from %stack.0 + 16, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v5 ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -433,6 +466,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_v5 ; 
FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -440,6 +474,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 :: (load (s32) from %stack.0 + 16, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v5 ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -488,6 +523,7 @@ body: | ; MUBUF-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 16, addrspace 5) ; MUBUF-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s32) from %stack.0 + 20, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_v6 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5 ; MUBUF-V2A-NEXT: {{ $}} @@ -505,6 +541,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_v6 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, 
implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -512,6 +549,7 @@ body: | ; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr4_vgpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s64) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_v6 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5 ; FLATSCR-V2A-NEXT: {{ $}} @@ -529,6 +567,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_v6 ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store (s32) into %stack.0, addrspace 5) @@ -544,6 +583,7 @@ body: | ; MUBUF-GFX90A-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 16, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s32) from %stack.0 + 20, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v6 ; MUBUF-GFX90A-V2A: 
liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -561,6 +601,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_v6 ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -568,6 +609,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr4_vgpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 :: (load (s64) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v6 ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -622,6 +664,7 @@ body: | ; MUBUF-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 24, addrspace 5) ; MUBUF-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s32) from %stack.0 + 28, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_v8 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7 
; MUBUF-V2A-NEXT: {{ $}} @@ -643,6 +686,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_v8 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -650,6 +694,7 @@ body: | ; FLATSCR-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s128) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_v8 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7 ; FLATSCR-V2A-NEXT: {{ $}} @@ -671,6 +716,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; FLATSCR-V2A-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_v8 ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store (s32) into %stack.0, addrspace 5) @@ -690,6 +736,7 @@ body: | ; MUBUF-GFX90A-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 24, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s32) from %stack.0 + 28, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v8 ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -711,6 +758,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_v8 ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -718,6 +766,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: (load (s128) from %stack.0 + 16, 
align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v8 ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -792,6 +841,7 @@ body: | ; MUBUF-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 56, addrspace 5) ; MUBUF-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load (s32) from %stack.0 + 60, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_v16 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15 ; MUBUF-V2A-NEXT: {{ $}} @@ -829,6 +879,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_v16 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -840,6 +891,7 @@ body: | ; FLATSCR-NEXT: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: 
(load (s128) from %stack.0 + 32, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load (s128) from %stack.0 + 48, align 4, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_v16 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15 ; FLATSCR-V2A-NEXT: {{ $}} @@ -877,6 +929,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; FLATSCR-V2A-NEXT: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_v16 ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store (s32) into %stack.0, addrspace 5) @@ -912,6 +965,7 @@ body: | ; MUBUF-GFX90A-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 56, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load (s32) from %stack.0 + 60, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v16 ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -949,6 +1003,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_v16 ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -960,6 +1015,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $vgpr8_vgpr9_vgpr10_vgpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 32, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr12_vgpr13_vgpr14_vgpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 :: (load (s128) from %stack.0 + 48, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v16 ; 
FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -1082,6 +1138,7 @@ body: | ; MUBUF-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 120, addrspace 5) ; MUBUF-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load (s32) from %stack.0 + 124, addrspace 5) ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_v32 ; MUBUF-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31 ; MUBUF-V2A-NEXT: {{ $}} @@ -1151,6 +1208,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-V2A-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_v32 ; FLATSCR: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF ; FLATSCR-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit 
$flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -1170,6 +1228,7 @@ body: | ; FLATSCR-NEXT: $vgpr24_vgpr25_vgpr26_vgpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 96, align 4, addrspace 5) ; FLATSCR-NEXT: $vgpr28_vgpr29_vgpr30_vgpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load (s128) from %stack.0 + 112, align 4, addrspace 5) ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_v32 ; FLATSCR-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31 ; FLATSCR-V2A-NEXT: {{ $}} @@ -1239,6 +1298,7 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr29 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; FLATSCR-V2A-NEXT: $vgpr28 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_v32 ; MUBUF-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store (s32) into %stack.0, addrspace 5) @@ -1306,6 +1366,7 @@ body: | ; MUBUF-GFX90A-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 120, addrspace 5) ; MUBUF-GFX90A-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load (s32) from %stack.0 + 124, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_v32 ; MUBUF-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, 
$agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -1375,6 +1436,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr30 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $vgpr31 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_v32 ; FLATSCR-GFX90A: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -1394,6 +1456,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $vgpr24_vgpr25_vgpr26_vgpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 96, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $vgpr28_vgpr29_vgpr30_vgpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, implicit $exec, implicit $flat_scr, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 :: (load (s128) from %stack.0 + 112, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_v32 ; FLATSCR-GFX90A-V2A: liveins: $agpr0, $agpr1, $agpr2, $agpr3, $agpr4, $agpr5, $agpr6, $agpr7, $agpr8, $agpr9, $agpr10, $agpr11, $agpr12, $agpr13, $agpr14, $agpr15, $agpr16, $agpr17, $agpr18, $agpr19, $agpr20, $agpr21, $agpr22, $agpr23, $agpr24, $agpr25, $agpr26, $agpr27, $agpr28, $agpr29, $agpr30, $agpr31 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -1488,6 +1551,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; MUBUF-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_a1 ; MUBUF-V2A: liveins: $vgpr0 ; MUBUF-V2A-NEXT: {{ $}} @@ -1495,6 +1559,7 @@ body: | ; MUBUF-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; MUBUF-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_a1 ; FLATSCR: $agpr0 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec @@ -1502,6 +1567,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) ; FLATSCR-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_a1 ; FLATSCR-V2A: liveins: $vgpr0 ; FLATSCR-V2A-NEXT: {{ $}} @@ -1509,11 +1575,13 @@ body: | ; FLATSCR-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 
$vgpr0, implicit $exec ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_a1 ; MUBUF-GFX90A: $agpr0 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a1 ; MUBUF-GFX90A-V2A: liveins: $vgpr0 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -1521,11 +1589,13 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_a1 ; FLATSCR-GFX90A: $agpr0 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORD_SADDR killed $agpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a1 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -1562,6 +1632,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5) ; MUBUF-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_a2 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1 ; MUBUF-V2A-NEXT: {{ $}} @@ -1571,6 +1642,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1 ; MUBUF-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 
$vgpr0, implicit $exec, implicit $agpr0_agpr1 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_a2 ; FLATSCR: $agpr0_agpr1 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1 @@ -1582,6 +1654,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 4, addrspace 5) ; FLATSCR-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_a2 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1 ; FLATSCR-V2A-NEXT: {{ $}} @@ -1591,6 +1664,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1 ; FLATSCR-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_a2 ; MUBUF-GFX90A: $agpr0_agpr1 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1, implicit $agpr0_agpr1 :: (store (s32) into %stack.0, addrspace 5) @@ -1598,6 +1672,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1 :: (load (s32) from %stack.0, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit-def $agpr0_agpr1 :: (load (s32) from %stack.0 + 4, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a2 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -1607,11 +1682,13 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1 ; MUBUF-GFX90A-V2A-NEXT: $agpr1 = 
V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_a2 ; FLATSCR-GFX90A: $agpr0_agpr1 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $agpr0_agpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr0_agpr1 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a2 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -1654,6 +1731,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 8, addrspace 5) ; MUBUF-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_a3 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2 ; MUBUF-V2A-NEXT: {{ $}} @@ -1665,6 +1743,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_a3 ; FLATSCR: $agpr0_agpr1_agpr2 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 @@ -1680,6 +1759,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 8, addrspace 5) ; FLATSCR-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_a3 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2 ; 
FLATSCR-V2A-NEXT: {{ $}} @@ -1691,6 +1771,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_a3 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $agpr0_agpr1_agpr2 :: (store (s32) into %stack.0, addrspace 5) @@ -1700,6 +1781,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 4, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 :: (load (s32) from %stack.0 + 8, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a3 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -1711,11 +1793,13 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_a3 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $agpr0_agpr1_agpr2, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2 = SCRATCH_LOAD_DWORDX3_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s96) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a3 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2 ; 
FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -1764,6 +1848,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 12, addrspace 5) ; MUBUF-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_a4 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; MUBUF-V2A-NEXT: {{ $}} @@ -1777,6 +1862,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_a4 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 @@ -1796,6 +1882,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 12, addrspace 5) ; FLATSCR-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_a4 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; FLATSCR-V2A-NEXT: {{ $}} @@ -1809,6 +1896,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_a4 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $agpr0_agpr1_agpr2_agpr3 :: (store (s32) into %stack.0, addrspace 5) @@ -1820,6 +1908,7 @@ body: 
| ; MUBUF-GFX90A-NEXT: $agpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 8, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 :: (load (s32) from %stack.0 + 12, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a4 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -1833,11 +1922,13 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_a4 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s128) into %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a4 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -1892,6 +1983,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 16, addrspace 5) ; MUBUF-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_a5 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 ; MUBUF-V2A-NEXT: {{ $}} @@ -1907,6 +1999,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr3 = 
V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_a5 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 @@ -1930,6 +2023,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 16, addrspace 5) ; FLATSCR-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_a5 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 ; FLATSCR-V2A-NEXT: {{ $}} @@ -1945,6 +2039,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_a5 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store (s32) into %stack.0, addrspace 5) @@ -1958,6 +2053,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 12, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load (s32) from %stack.0 + 16, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a5 ; MUBUF-GFX90A-V2A: 
liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -1973,6 +2069,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_a5 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -1980,6 +2077,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4 :: (load (s32) from %stack.0 + 16, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a5 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -2040,6 +2138,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 20, addrspace 5) ; MUBUF-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_a6 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; MUBUF-V2A-NEXT: {{ $}} @@ -2057,6 +2156,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr5 = 
V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_a6 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 @@ -2084,6 +2184,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 20, addrspace 5) ; FLATSCR-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_a6 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; FLATSCR-V2A-NEXT: {{ $}} @@ -2101,6 +2202,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_a6 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store (s32) into %stack.0, addrspace 5) @@ -2116,6 +2218,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 16, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load (s32) from %stack.0 + 20, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a6 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, 
$vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -2133,6 +2236,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_a6 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -2140,6 +2244,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr4_agpr5 = SCRATCH_LOAD_DWORDX2_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 :: (load (s64) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a6 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -2210,6 +2315,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 28, addrspace 5) ; MUBUF-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_a8 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; MUBUF-V2A-NEXT: {{ $}} @@ -2231,6 +2337,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr6 = 
V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_a8 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 @@ -2266,6 +2373,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 28, addrspace 5) ; FLATSCR-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_a8 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; FLATSCR-V2A-NEXT: {{ $}} @@ -2287,6 +2395,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_a8 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store (s32) into %stack.0, addrspace 5) @@ -2306,6 +2415,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 24, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec, implicit-def 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load (s32) from %stack.0 + 28, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a8 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -2327,6 +2437,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_a8 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -2334,6 +2445,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $agpr0_agpr1_agpr2_agpr3 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load (s128) from %stack.0, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr4_agpr5_agpr6_agpr7 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 :: (load (s128) from %stack.0 + 16, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a8 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -2440,6 +2552,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 60, addrspace 5) ; MUBUF-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 
killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_a16 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 ; MUBUF-V2A-NEXT: {{ $}} @@ -2477,6 +2590,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_a16 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 @@ -2544,6 +2658,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 60, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 60, addrspace 5) ; FLATSCR-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_a16 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 ; FLATSCR-V2A-NEXT: {{ $}} @@ -2581,6 +2696,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_a16 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store (s32) into %stack.0, addrspace 5) @@ -2616,6 +2732,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 56, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load (s32) from %stack.0 + 60, addrspace 5) ; MUBUF-GFX90A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a16 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -2653,6 +2770,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_a16 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF ; 
FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -2664,6 +2782,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $agpr8_agpr9_agpr10_agpr11 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 32, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 32, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr12_agpr13_agpr14_agpr15 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 48, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 :: (load (s128) from %stack.0 + 48, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a16 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} @@ -2850,6 +2969,7 @@ body: | ; MUBUF-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 124, addrspace 5) ; MUBUF-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; MUBUF-NEXT: S_ENDPGM 0 + ; ; MUBUF-V2A-LABEL: name: test_spill_a32 ; MUBUF-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, 
$vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 ; MUBUF-V2A-NEXT: {{ $}} @@ -2919,6 +3039,7 @@ body: | ; MUBUF-V2A-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-V2A-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; MUBUF-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-LABEL: name: test_spill_a32 ; FLATSCR: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF ; FLATSCR-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 @@ -3050,6 +3171,7 @@ body: | ; FLATSCR-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 124, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0 + 124, addrspace 5) ; FLATSCR-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; FLATSCR-NEXT: S_ENDPGM 0 + ; ; FLATSCR-V2A-LABEL: name: test_spill_a32 ; FLATSCR-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, 
$vgpr31 ; FLATSCR-V2A-NEXT: {{ $}} @@ -3119,6 +3241,7 @@ body: | ; FLATSCR-V2A-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; FLATSCR-V2A-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; FLATSCR-V2A-NEXT: S_ENDPGM 0 + ; ; MUBUF-GFX90A-LABEL: name: test_spill_a32 ; MUBUF-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF ; MUBUF-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $agpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store (s32) into %stack.0, addrspace 5) @@ -3186,6 +3309,7 @@ body: | ; MUBUF-GFX90A-NEXT: $agpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 120, 0, 0, implicit $exec :: (load (s32) from %stack.0 + 120, addrspace 5) ; MUBUF-GFX90A-NEXT: $agpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 124, 0, 0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load (s32) from %stack.0 + 124, addrspace 5) ; MUBUF-GFX90A-NEXT: 
S_ENDPGM 0 + ; ; MUBUF-GFX90A-V2A-LABEL: name: test_spill_a32 ; MUBUF-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 ; MUBUF-GFX90A-V2A-NEXT: {{ $}} @@ -3255,6 +3379,7 @@ body: | ; MUBUF-GFX90A-V2A-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; MUBUF-GFX90A-V2A-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; MUBUF-GFX90A-V2A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-LABEL: name: test_spill_a32 ; FLATSCR-GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = IMPLICIT_DEF ; FLATSCR-GFX90A-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $agpr0_agpr1_agpr2_agpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -3274,6 +3399,7 @@ body: | ; FLATSCR-GFX90A-NEXT: $agpr24_agpr25_agpr26_agpr27 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 96, 0, implicit $exec, implicit $flat_scr :: (load (s128) from %stack.0 + 96, align 
4, addrspace 5) ; FLATSCR-GFX90A-NEXT: $agpr28_agpr29_agpr30_agpr31 = SCRATCH_LOAD_DWORDX4_SADDR $sgpr32, 112, 0, implicit $exec, implicit $flat_scr, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 :: (load (s128) from %stack.0 + 112, align 4, addrspace 5) ; FLATSCR-GFX90A-NEXT: S_ENDPGM 0 + ; ; FLATSCR-GFX90A-V2A-LABEL: name: test_spill_a32 ; FLATSCR-GFX90A-V2A: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 ; FLATSCR-GFX90A-V2A-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir index e72ed2ba99e1a..2ccc24152a9f0 100644 --- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir +++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir @@ -96,7 +96,7 @@ body: | ; CHECK-NEXT: renamable $sgpr93 = COPY renamable $sgpr60 ; CHECK-NEXT: renamable $sgpr94 = COPY renamable $sgpr60 ; CHECK-NEXT: renamable $sgpr95 = COPY renamable $sgpr60 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable 
$sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $exec ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.11, implicit $exec ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-fast-dont-drop-subreg-index-issue61134.mir b/llvm/test/CodeGen/AMDGPU/regalloc-fast-dont-drop-subreg-index-issue61134.mir index 86f6bfbe16c63..fe2b4824f35a2 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-fast-dont-drop-subreg-index-issue61134.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-fast-dont-drop-subreg-index-issue61134.mir @@ -19,14 +19,14 @@ body: | ; CHECK-LABEL: name: func ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub0:vreg_64 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 - ; CHECK-NEXT: undef %2.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: undef %0.sub0:vreg_64, renamable $sgpr0_sgpr1 = V_ADD_CO_U32_e64 1, %0.sub0, 0, implicit $exec - ; CHECK-NEXT: undef %2.sub1:vreg_64, dead renamable $sgpr0_sgpr1 = V_ADDC_U32_e64 0, %2.sub1, killed $sgpr0_sgpr1, 0, implicit $exec - ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: undef [[COPY]].sub0:vreg_64, renamable $sgpr0_sgpr1 = V_ADD_CO_U32_e64 1, [[COPY]].sub0, 0, implicit $exec + ; CHECK-NEXT: undef [[V_MOV_B32_e32_]].sub1:vreg_64, dead renamable $sgpr0_sgpr1 = V_ADDC_U32_e64 0, [[V_MOV_B32_e32_]].sub1, killed $sgpr0_sgpr1, 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY1]], 0, 0, implicit 
$exec ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit %0, implicit %2, implicit $vgpr0 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY]], implicit [[V_MOV_B32_e32_]], implicit $vgpr0 undef %0.sub0:vreg_64 = COPY $vgpr0 %2:vreg_64 = COPY $vgpr1_vgpr2 undef %1.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/regcoalesce-cannot-join-failures.mir b/llvm/test/CodeGen/AMDGPU/regcoalesce-cannot-join-failures.mir index 9fec776df13f8..6c556433088c5 100644 --- a/llvm/test/CodeGen/AMDGPU/regcoalesce-cannot-join-failures.mir +++ b/llvm/test/CodeGen/AMDGPU/regcoalesce-cannot-join-failures.mir @@ -9,16 +9,16 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub0:sreg_64_xexec = IMPLICIT_DEF + ; CHECK-NEXT: undef [[DEF:%[0-9]+]].sub0:sreg_64_xexec = IMPLICIT_DEF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %0.sub1:sreg_64_xexec = COPY %0.sub0 + ; CHECK-NEXT: [[DEF:%[0-9]+]].sub1:sreg_64_xexec = COPY [[DEF]].sub0 ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: S_ENDPGM 0, implicit %0 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[DEF]] bb.0: successors: %bb.1 @@ -42,9 +42,9 @@ tracksRegLiveness: true body: | bb.0: ; CHECK-LABEL: name: couldnt_join_subrange_no_implicit_def_inst - ; CHECK: undef %0.sub0:sreg_64 = S_MOV_B32 0 - ; CHECK-NEXT: %0.sub1:sreg_64 = COPY %0.sub0 - ; CHECK-NEXT: S_ENDPGM 0, implicit %0.sub1 + ; CHECK: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_MOV_B32 0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = COPY [[S_MOV_B32_]].sub0 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]].sub1 undef %0.sub0:sreg_64 = S_MOV_B32 0 %1:sreg_64 = COPY %0:sreg_64 %0.sub1:sreg_64 = COPY %0.sub0:sreg_64 @@ -59,12 +59,12 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; 
CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub1:sreg_64 = S_MOV_B32 -1 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 -1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: %0.sub0:sreg_64 = S_MOV_B32 0 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY %0 - ; CHECK-NEXT: dead %0.sub1:sreg_64 = COPY %0.sub0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_MOV_B32 0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY [[S_MOV_B32_]] + ; CHECK-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = COPY [[S_MOV_B32_]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY]].sub1 bb.0: successors: %bb.1 @@ -84,10 +84,10 @@ body: | bb.0: ; CHECK-LABEL: name: lanes_not_tracked_subreg_join_couldnt_join_subrange - ; CHECK: undef %0.sub0:sreg_64_xexec = S_MOV_B32 0 - ; CHECK-NEXT: %0.sub1:sreg_64_xexec = S_MOV_B32 0 - ; CHECK-NEXT: S_NOP 0, implicit %0.sub1 - ; CHECK-NEXT: S_NOP 0, implicit %0 + ; CHECK: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64_xexec = S_MOV_B32 0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64_xexec = S_MOV_B32 0 + ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub1 + ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]] ; CHECK-NEXT: S_ENDPGM 0 undef %0.sub0:sreg_64_xexec = S_MOV_B32 0 %1:sreg_64 = COPY %0 @@ -105,12 +105,12 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub0:sreg_64_xexec = S_MOV_B32 0 - ; CHECK-NEXT: %0.sub1:sreg_64_xexec = COPY %0.sub0 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64_xexec = S_MOV_B32 0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64_xexec = COPY [[S_MOV_B32_]].sub0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: S_NOP 0, implicit %0.sub1 - ; CHECK-NEXT: S_ENDPGM 0, implicit %0 + ; CHECK-NEXT: S_NOP 0, implicit [[S_MOV_B32_]].sub1 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_MOV_B32_]] bb.0: successors: %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/regcoalesce-keep-valid-lanes-implicit-def-bug39602.mir 
b/llvm/test/CodeGen/AMDGPU/regcoalesce-keep-valid-lanes-implicit-def-bug39602.mir index 1c4900ae85f58..18eb5586fdecf 100644 --- a/llvm/test/CodeGen/AMDGPU/regcoalesce-keep-valid-lanes-implicit-def-bug39602.mir +++ b/llvm/test/CodeGen/AMDGPU/regcoalesce-keep-valid-lanes-implicit-def-bug39602.mir @@ -12,12 +12,12 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub1:sreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: undef [[DEF:%[0-9]+]].sub1:sreg_64 = IMPLICIT_DEF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: %0.sub0:sreg_64 = S_MOV_B32 0 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY %0 - ; CHECK-NEXT: dead %0.sub1:sreg_64 = COPY %0.sub0 + ; CHECK-NEXT: [[DEF:%[0-9]+]].sub0:sreg_64 = S_MOV_B32 0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY [[DEF]] + ; CHECK-NEXT: dead [[DEF:%[0-9]+]].sub1:sreg_64 = COPY [[DEF]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY]].sub1 bb.0: successors: %bb.1 @@ -41,12 +41,12 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub1:sreg_64 = S_MOV_B32 -1 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 -1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: %0.sub0:sreg_64 = S_MOV_B32 0 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY %0 - ; CHECK-NEXT: dead %0.sub1:sreg_64 = COPY %0.sub0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = S_MOV_B32 0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY [[S_MOV_B32_]] + ; CHECK-NEXT: dead [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = COPY [[S_MOV_B32_]].sub0 ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY]].sub1 bb.0: successors: %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/regcoalescer-resolve-lane-conflict-by-subranges.mir b/llvm/test/CodeGen/AMDGPU/regcoalescer-resolve-lane-conflict-by-subranges.mir index 82983aa117518..d0245ff1a73ae 100644 --- a/llvm/test/CodeGen/AMDGPU/regcoalescer-resolve-lane-conflict-by-subranges.mir +++ 
b/llvm/test/CodeGen/AMDGPU/regcoalescer-resolve-lane-conflict-by-subranges.mir @@ -21,11 +21,11 @@ body: | ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_]].sub0:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub1, implicit $exec + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub1, implicit $exec ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: - ; GCN-NEXT: dead %3:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub2, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec + ; GCN-NEXT: dead [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub2, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 @@ -71,7 +71,7 @@ body: | ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: - ; GCN-NEXT: dead %3:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub2, [[COPY1]], implicit $exec + ; GCN-NEXT: dead [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub2, [[COPY1]], implicit $exec ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 @@ -112,12 +112,12 @@ body: | ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_]].sub0:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub1, implicit $exec - ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_]].sub1:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub1, implicit $exec + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub1:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ 
$}} ; GCN-NEXT: bb.2: - ; GCN-NEXT: dead %3:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub1, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec + ; GCN-NEXT: dead [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub1, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 @@ -133,7 +133,6 @@ body: | successors: %bb.2 %2:vgpr_32 = V_AND_B32_e64 %1.sub0, %1.sub1, implicit $exec - ; %1.sub1 was re-defined %1.sub1:vreg_128 = V_AND_B32_e64 %2, %2, implicit $exec S_BRANCH %bb.2 @@ -161,13 +160,13 @@ body: | ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_]].sub0:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub1, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec - ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_]].sub2:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub1, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]].sub2:vreg_128 = V_AND_B32_e64 [[GLOBAL_LOAD_DWORDX4_]].sub0, [[GLOBAL_LOAD_DWORDX4_]].sub0, implicit $exec ; GCN-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: - ; GCN-NEXT: dead %3:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub1, [[GLOBAL_LOAD_DWORDX4_]].sub2, implicit $exec + ; GCN-NEXT: dead [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORDX4_]].sub1, [[GLOBAL_LOAD_DWORDX4_]].sub2, implicit $exec ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir b/llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir index 84649f9563696..c7fa879187cc3 100644 --- a/llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/remat-dead-subreg.mir @@ -16,9 +16,9 @@ body: | 
; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: renamable $sgpr2 = S_MOV_B32 2, implicit $m0 ; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 3, implicit $m0 - ; GCN-NEXT: dead %4:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr2, implicit killed $sgpr1 + ; GCN-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr2, implicit killed $sgpr1 ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: dead %5:vgpr_32 = V_MOV_B32_e32 killed $sgpr1, implicit $exec + ; GCN-NEXT: dead [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 killed $sgpr1, implicit $exec ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 $m0 = IMPLICIT_DEF %0:sreg_64_xexec = S_MOV_B64 1, implicit $m0 @@ -41,9 +41,9 @@ body: | ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr2, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: renamable $sgpr2 = S_MOV_B32 3, implicit $m0 ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) - ; GCN-NEXT: dead %4:vgpr_32 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit killed $sgpr0, implicit killed $sgpr2 + ; GCN-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit killed $sgpr0, implicit killed $sgpr2 ; GCN-NEXT: renamable $sgpr0 = S_MUL_I32 renamable $sgpr5, 3 - ; GCN-NEXT: dead %5:vgpr_32 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; GCN-NEXT: dead [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 killed $sgpr0, implicit $exec ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr5 $m0 = IMPLICIT_DEF %0:sreg_64_xexec = S_MOV_B64 1, implicit $m0 @@ -66,9 +66,9 @@ body: | ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable 
$sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 3, implicit $m0 ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MOV_B64 2, implicit $m0 - ; GCN-NEXT: dead %5:vgpr_32 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit killed $sgpr0_sgpr1, implicit killed $sgpr4_sgpr5 + ; GCN-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit killed $sgpr0_sgpr1, implicit killed $sgpr4_sgpr5 ; GCN-NEXT: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5) - ; GCN-NEXT: dead %6:vreg_64 = V_MOV_B64_PSEUDO killed $sgpr4_sgpr5, implicit $exec + ; GCN-NEXT: dead [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO killed $sgpr4_sgpr5, implicit $exec ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr8, implicit renamable $sgpr11 %0:sreg_64 = IMPLICIT_DEF %1:sgpr_128 = S_LOAD_DWORDX4_IMM %0, 1, 0 @@ -91,9 +91,9 @@ body: | ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 2, implicit $m0 ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_MOV_B64 3, implicit $m0 - ; GCN-NEXT: dead %4:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr4_sgpr5, implicit killed $sgpr2_sgpr3 + ; GCN-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr4_sgpr5, implicit killed $sgpr2_sgpr3 ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5) - ; GCN-NEXT: dead %5:vreg_64 = V_MOV_B64_PSEUDO killed $sgpr2_sgpr3, implicit $exec + ; GCN-NEXT: dead [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO killed $sgpr2_sgpr3, implicit $exec ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 $m0 = 
IMPLICIT_DEF %0:sreg_64_xexec = S_MOV_B64 1, implicit $m0 diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir index f087ae458454c..3f88f98e34371 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -21,28 +21,28 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $sgpr6_sgpr7 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub3:vreg_512 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 0, [[V_MOV_B32_e32_]], implicit $exec - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY %0 + ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_512 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 0, [[V_MOV_B32_e32_1]], implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY [[V_MOV_B32_e32_]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32), align 8, addrspace 5) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]].sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32), align 8, addrspace 5) ; CHECK-NEXT: dead [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %7:vgpr_32, 0, 0, implicit $exec - ; CHECK-NEXT: dead [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec + ; CHECK-NEXT: dead [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = 
DS_READ_B64_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec ; CHECK-NEXT: dead [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128 = DS_READ_B128_gfx9 [[V_ADD_U32_e32_]], 0, 0, implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 - ; CHECK-NEXT: undef %11.sub1:vreg_512 = COPY [[COPY]].sub1 + ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_512 = COPY [[COPY]].sub1 ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead [[COPY1]], 851978 /* regdef:VGPR_16 */, def dead [[COPY]].sub1, 2147483657 /* reguse tiedto:$0 */, [[COPY1]], 2147549193 /* reguse tiedto:$1 */, [[COPY]].sub1 - ; CHECK-NEXT: %11.sub0:vreg_512 = COPY [[COPY]].sub0 - ; CHECK-NEXT: %11.sub3:vreg_512 = COPY [[COPY]].sub3 - ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: dead [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 4, [[V_MOV_B32_e32_1]], implicit-def dead $vcc, implicit $exec - ; CHECK-NEXT: %11.sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_]] - ; CHECK-NEXT: %11.sub5:vreg_512 = COPY undef [[V_MOV_B32_e32_]] - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_512 = COPY %11 + ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_512 = COPY [[COPY]].sub0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub3:vreg_512 = COPY [[COPY]].sub3 + ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: dead [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 4, [[V_MOV_B32_e32_2]], implicit-def dead $vcc, implicit $exec + ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub5:vreg_512 = COPY undef [[V_MOV_B32_e32_1]] + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY [[COPY2]] ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: liveins: $sgpr6_sgpr7 diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir index e4f56cc328e47..add7825a224ed 100644 --- 
a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir @@ -27,33 +27,33 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]] - ; CHECK-NEXT: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec - ; CHECK-NEXT: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec + ; CHECK-NEXT: undef [[V_ADD_F32_e32_:%[0-9]+]].sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec + ; CHECK-NEXT: dead undef [[V_ADD_F32_e32_:%[0-9]+]].sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, implicit $exec - ; CHECK-NEXT: undef %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec + ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec - ; CHECK-NEXT: undef %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec - ; CHECK-NEXT: %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $mode, implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, implicit $exec - ; CHECK-NEXT: undef %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF1]], 0, 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec + ; CHECK-NEXT: undef [[V_ADD_F32_e32_1:%[0-9]+]].sub1:vreg_64 = V_ADD_F32_e32 
[[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_ADD_F32_e32_1:%[0-9]+]].sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $mode, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[V_ADD_F32_e32_1]], [[V_MOV_B32_e32_]], 32, 0, implicit $exec + ; CHECK-NEXT: undef [[GLOBAL_LOAD_DWORD2:%[0-9]+]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF1]], 0, 0, implicit $exec ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[DEF2]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, implicit $exec - ; CHECK-NEXT: %11.sub1:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD2:%[0-9]+]].sub1:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK-NEXT: dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, implicit $exec - ; CHECK-NEXT: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF4]], 0, 0, implicit $exec - ; CHECK-NEXT: dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF5]], 0, 0, implicit $exec + ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD3:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[GLOBAL_LOAD_DWORD2]], 0, 0, implicit $exec + ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD4:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF4]], 0, 0, implicit $exec + ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD5:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF5]], 0, 0, implicit $exec ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 2, [[DEF2]], implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit 
$exec - ; CHECK-NEXT: S_NOP 0, implicit [[DEF7]], implicit [[V_LSHLREV_B64_e64_]].sub0, implicit [[DEF6]], implicit [[V_MOV_B32_e32_]] - ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF5]], [[V_MOV_B32_e32_1]], 0, 0, implicit $exec + ; CHECK-NEXT: [[V_LSHLREV_B64_e64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64_e64 2, [[DEF2]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit [[DEF7]], implicit [[V_LSHLREV_B64_e64_]].sub0, implicit [[DEF6]], implicit [[V_MOV_B32_e32_1]] + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[DEF5]], [[V_MOV_B32_e32_2]], 0, 0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir index 4c32af8b01000..6d79837feb128 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -29,9 +29,9 @@ body: | ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %2:vgpr_32, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr101, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5) ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sgpr_64 = S_LOAD_DWORDX2_IMM [[COPY]](p4), 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4) ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 5329 - ; CHECK-NEXT: undef %5.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4, implicit $exec + ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: 
[[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4, implicit $exec ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF @@ -41,42 +41,42 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead %11 ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, implicit $exec :: (store (s32), addrspace 1) - ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]] + ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]] ; CHECK-NEXT: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 3) ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %15, 851978 /* regdef:VGPR_16 */, def %16 - ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec - ; CHECK-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec + ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec + ; CHECK-NEXT: [[DS_READ_B32_gfx9_1:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec ; CHECK-NEXT: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec ; CHECK-NEXT: INLINEASM &"def $0 $1", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %21, 851978 /* regdef:VGPR_16 */, def %22 - ; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_2]], 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_3]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_2]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_3]](tied-def 5), 851977 /* reguse:VGPR_16 */, %15, 851977 /* reguse:VGPR_16 */, %16, 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_1]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_3]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_2]] + ; CHECK-NEXT: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_2]], 0, 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_3]], 851978 /* regdef:VGPR_16 */, def dead [[V_MOV_B32_e32_4]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B64_gfx9_]].sub0, 2147483657 /* reguse tiedto:$0 */, [[V_MOV_B32_e32_3]](tied-def 3), 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_4]](tied-def 5), 851977 /* reguse:VGPR_16 */, %15, 851977 /* reguse:VGPR_16 */, %16, 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_1]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_3]], 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_2]] ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store (s32), addrspace 3) ; CHECK-NEXT: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store (s32), addrspace 3) - ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store (s64), addrspace 3) - ; CHECK-NEXT: undef %31.sub1:vreg_64 = FLAT_LOAD_DWORD undef %32:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32)) + ; 
CHECK-NEXT: DS_WRITE_B64_gfx9 undef %30:vgpr_32, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store (s64), addrspace 3) + ; CHECK-NEXT: undef [[FLAT_LOAD_DWORD:%[0-9]+]].sub1:vreg_64 = FLAT_LOAD_DWORD undef %32:vreg_64, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32)) ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[DEF2]], implicit $exec ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 - ; CHECK-NEXT: [[DEF]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]] + ; CHECK-NEXT: [[DEF:%[0-9]+]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_1]] ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_ADD_U32_e32_]], [[S_MOV_B32_]], implicit $exec ; CHECK-NEXT: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U32_e64 64, [[V_ADD_U32_e32_]], implicit $exec - ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_ADD_U32_e32_]], [[V_CMP_GT_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_ADD_U32_e32_]], [[V_CMP_GT_U32_e64_]], implicit $exec ; CHECK-NEXT: [[V_SUB_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[V_MUL_LO_U32_e64_]], [[DEF1]], implicit $exec - ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[V_CNDMASK_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[DEF2]], [[S_MOV_B32_]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_SUB_U32_e32_]], [[DEF]].sub0, implicit $exec - ; CHECK-NEXT: [[V_SUB_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[V_MUL_LO_U32_e64_1]], [[V_MUL_LO_U32_e64_]], implicit $exec - ; CHECK-NEXT: [[DEF]].sub0:vreg_64 = V_ADD_U32_e32 [[V_SUB_U32_e32_1]], [[V_ADD_U32_e32_1]], implicit $exec - ; CHECK-NEXT: undef %38.sub0:vreg_64, %39:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[DEF]].sub0, 0, implicit $exec - ; CHECK-NEXT: undef %40.sub1:vreg_64, dead %41:sreg_64_xexec = 
V_ADDC_U32_e64 [[COPY1]], [[DEF]].sub1, %39, 0, implicit $exec - ; CHECK-NEXT: undef %42.sub0:sgpr_64 = V_READFIRSTLANE_B32 %38.sub0, implicit $exec - ; CHECK-NEXT: %42.sub1:sgpr_64 = V_READFIRSTLANE_B32 %40.sub1, implicit $exec - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %42, 0, 0 :: (load (s32), addrspace 1) + ; CHECK-NEXT: [[V_SUB_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[DEF1]], [[V_MUL_LO_U32_e64_]], implicit $exec + ; CHECK-NEXT: [[DEF:%[0-9]+]].sub0:vreg_64 = V_ADD_U32_e32 [[V_SUB_U32_e32_1]], [[V_ADD_U32_e32_1]], implicit $exec + ; CHECK-NEXT: undef [[V_ADD_CO_U32_e64_:%[0-9]+]].sub0:vreg_64, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[DEF]].sub0, 0, implicit $exec + ; CHECK-NEXT: undef [[V_ADDC_U32_e64_:%[0-9]+]].sub1:vreg_64, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY1]], [[DEF]].sub1, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_:%[0-9]+]].sub0:sgpr_64 = V_READFIRSTLANE_B32 [[V_ADD_CO_U32_e64_]].sub0, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub1:sgpr_64 = V_READFIRSTLANE_B32 [[V_ADDC_U32_e64_]].sub1, implicit $exec + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[V_READFIRSTLANE_B32_]], 0, 0 :: (load (s32), addrspace 1) ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; CHECK-NEXT: [[DS_READ_B32_gfx9_4:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %45:vgpr_32, 0, 0, implicit $exec :: (load (s32), addrspace 3) ; CHECK-NEXT: GLOBAL_STORE_DWORD undef %46:vreg_64, [[DS_READ_B32_gfx9_4]], 0, 0, implicit $exec :: (store (s32), addrspace 1) - ; CHECK-NEXT: %31.sub0:vreg_64 = COPY [[S_LOAD_DWORD_IMM]], implicit $exec - ; CHECK-NEXT: DS_WRITE_B64_gfx9 undef %47:vgpr_32, %31, 0, 0, implicit $exec :: (store (s64), addrspace 3) + ; CHECK-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]].sub0:vreg_64 = COPY [[S_LOAD_DWORD_IMM]], implicit $exec + ; 
CHECK-NEXT: DS_WRITE_B64_gfx9 undef %47:vgpr_32, [[FLAT_LOAD_DWORD]], 0, 0, implicit $exec :: (store (s64), addrspace 3) ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: liveins: $sgpr4_sgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir index 937b43f455587..0b1fd441256d8 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-fpmode.mir @@ -63,7 +63,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32)) - ; CHECK-NEXT: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, implicit $exec :: (load (s32)) + ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, implicit $exec :: (load (s32)) ; CHECK-NEXT: S_DENORM_MODE 0, implicit-def $mode, implicit $mode ; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 0, [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORD]], [[V_ADD_F32_e32_]], implicit $exec @@ -89,7 +89,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 0, 0, implicit $exec :: (load (s32)) - ; CHECK-NEXT: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, implicit $exec :: (load (s32)) + ; CHECK-NEXT: dead [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY]], 4, 0, implicit $exec :: (load (s32)) ; CHECK-NEXT: S_ROUND_MODE 0, implicit-def $mode, implicit $mode ; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e32 0, [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[GLOBAL_LOAD_DWORD]], [[V_ADD_F32_e32_]], implicit $exec diff --git 
a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir index 4eebd8f2e574d..9429d1565962e 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier.mir @@ -13,31 +13,31 @@ body: | ; CHECK-LABEL: name: test ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub3:vreg_128 = COPY $vgpr9 - ; CHECK-NEXT: undef %1.sub2:vreg_128 = COPY $vgpr8 - ; CHECK-NEXT: undef %2.sub1:vreg_128 = COPY $vgpr7 - ; CHECK-NEXT: undef %3.sub0:vreg_128 = COPY $vgpr6 - ; CHECK-NEXT: undef %4.sub3:vreg_128 = COPY $vgpr5 - ; CHECK-NEXT: undef %5.sub2:vreg_128 = COPY $vgpr4 - ; CHECK-NEXT: undef %8.sub1:vreg_64 = COPY $vgpr1 - ; CHECK-NEXT: %8.sub0:vreg_64 = COPY $vgpr0 - ; CHECK-NEXT: undef %6.sub1:vreg_128 = COPY $vgpr3 - ; CHECK-NEXT: undef %7.sub0:vreg_128 = COPY $vgpr2 - ; CHECK-NEXT: undef %9.sub0:sgpr_128 = V_READFIRSTLANE_B32 %7.sub0, implicit $exec - ; CHECK-NEXT: %9.sub1:sgpr_128 = V_READFIRSTLANE_B32 %6.sub1, implicit $exec + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub3:vreg_128 = COPY $vgpr9 + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub2:vreg_128 = COPY $vgpr8 + ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub1:vreg_128 = COPY $vgpr7 + ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_128 = COPY $vgpr6 + ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub3:vreg_128 = COPY $vgpr5 + ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub2:vreg_128 = COPY $vgpr4 + ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY6:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub1:vreg_128 = COPY $vgpr3 + ; CHECK-NEXT: undef [[COPY8:%[0-9]+]].sub0:vreg_128 = COPY $vgpr2 + ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY8]].sub0, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub1:sgpr_128 = V_READFIRSTLANE_B32 [[COPY7]].sub1, 
implicit $exec ; CHECK-NEXT: S_BARRIER - ; CHECK-NEXT: %9.sub2:sgpr_128 = V_READFIRSTLANE_B32 %5.sub2, implicit $exec - ; CHECK-NEXT: %9.sub3:sgpr_128 = V_READFIRSTLANE_B32 %4.sub3, implicit $exec - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %9, 0, 0, 0, 0, implicit $exec - ; CHECK-NEXT: undef %12.sub0:sgpr_128 = V_READFIRSTLANE_B32 %3.sub0, implicit $exec - ; CHECK-NEXT: %12.sub1:sgpr_128 = V_READFIRSTLANE_B32 %2.sub1, implicit $exec - ; CHECK-NEXT: %12.sub2:sgpr_128 = V_READFIRSTLANE_B32 %1.sub2, implicit $exec - ; CHECK-NEXT: %12.sub3:sgpr_128 = V_READFIRSTLANE_B32 %0.sub3, implicit $exec - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %12, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY5]].sub2, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY4]].sub3, implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: undef [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub0:sgpr_128 = V_READFIRSTLANE_B32 [[COPY3]].sub0, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub1:sgpr_128 = V_READFIRSTLANE_B32 [[COPY2]].sub1, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub2:sgpr_128 = V_READFIRSTLANE_B32 [[COPY1]].sub2, implicit $exec + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub3:sgpr_128 = V_READFIRSTLANE_B32 [[COPY]].sub3, implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[V_READFIRSTLANE_B32_1]], 0, 0, 0, 0, implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 [[BUFFER_LOAD_DWORD_OFFSET]], [[BUFFER_LOAD_DWORD_OFFSET]], implicit $exec ; CHECK-NEXT: [[V_MUL_LO_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 
[[BUFFER_LOAD_DWORD_OFFSET1]], [[BUFFER_LOAD_DWORD_OFFSET1]], implicit $exec ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MUL_LO_U32_e64_]], [[V_MUL_LO_U32_e64_1]], implicit $exec - ; CHECK-NEXT: GLOBAL_STORE_DWORD %8, [[V_ADD_U32_e32_]], 0, 0, implicit $exec + ; CHECK-NEXT: GLOBAL_STORE_DWORD [[COPY6]], [[V_ADD_U32_e32_]], 0, 0, implicit $exec ; CHECK-NEXT: S_ENDPGM 0 undef %43.sub3:vreg_128 = COPY $vgpr9 undef %42.sub2:vreg_128 = COPY $vgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir index b5beabd287c2a..796a70cfe8a39 100644 --- a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir +++ b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir @@ -8,9 +8,9 @@ body: | ; GCN-LABEL: name: simple ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, 
implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode @@ -26,10 +26,10 @@ body: | ; GCN-LABEL: name: salu_in_between ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $sgpr0 = S_MOV_B32 $sgpr2 - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode @@ -46,12 +46,12 @@ body: | ; GCN-LABEL: name: valu_write_in_between ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, 
implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: $vgpr20 = V_MOV_B32_indirect_read 1, implicit $exec, implicit $m0 - ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr20 = V_MOV_B32_indirect_read 1, implicit $exec, implicit $m0 + ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode @@ -68,12 +68,12 @@ body: | ; GCN-LABEL: name: valu_read_in_between ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, 
implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: V_NOP_e32 implicit $exec, implicit $vgpr0 - ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: V_NOP_e32 implicit $exec, implicit $vgpr0 + ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode @@ -90,12 +90,12 @@ body: | ; GCN-LABEL: name: changed_index ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: $sgpr2 = S_MOV_B32 1 - ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $sgpr2 = S_MOV_B32 1 + ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode @@ -112,12 +112,12 @@ body: | ; GCN-LABEL: name: implicitly_changed_index ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: S_NOP 0, implicit-def $sgpr2 - ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit-def $sgpr2 + ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode @@ -134,12 +134,12 @@ body: | ; GCN-LABEL: name: changed_m0 ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: $m0 = S_MOV_B32 1 - ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $m0 = S_MOV_B32 1 + ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode @@ -156,12 +156,12 @@ body: | ; GCN-LABEL: name: implicitly_changed_m0 ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: S_NOP 0, implicit-def $m0 - ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: S_NOP 0, implicit-def $m0 + ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode @@ -178,9 +178,9 @@ body: | ; GCN-LABEL: name: same_imm_index ; GCN: S_SET_GPR_IDX_ON 1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON 1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode @@ -196,11 +196,11 @@ body: | ; GCN-LABEL: name: different_imm_index ; GCN: S_SET_GPR_IDX_ON 1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: S_SET_GPR_IDX_ON 2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + 
; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: S_SET_GPR_IDX_ON 2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON 1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode @@ -216,11 +216,11 @@ body: | ; GCN-LABEL: name: different_gpr_index ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: S_SET_GPR_IDX_ON killed $sgpr1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode @@ -236,14 +236,14 @@ body: | ; GCN-LABEL: name: different_gpr_index_then_same_index ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: S_SET_GPR_IDX_ON $sgpr1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: S_SET_GPR_IDX_ON $sgpr1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode @@ -262,11 +262,11 @@ body: | ; GCN-LABEL: name: use_m0_with_idx_off ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -282,10 +282,10 @@ body: | ; GCN-LABEL: name: three_in_a_row ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $vgpr17 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $vgpr18 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: $vgpr17 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: $vgpr18 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode @@ -304,12 +304,12 @@ body: | ; GCN-LABEL: name: different_gpr_index_then_two_same_indexes ; GCN: S_SET_GPR_IDX_ON $sgpr1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = 
V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON $sgpr1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode @@ -328,12 +328,12 @@ body: | ; GCN-LABEL: name: two_same_indexes_then_different ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: S_SET_GPR_IDX_ON killed $sgpr1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr1, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode @@ -352,10 +352,10 @@ body: | ; 
GCN-LABEL: name: indirect_mov ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: V_MOV_B32_indirect_write undef $vgpr0, undef $vgpr3, implicit $exec, implicit $m0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3(tied-def 4) - ; GCN: V_MOV_B32_indirect_write undef $vgpr0, undef $vgpr3, implicit $exec, implicit $m0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3(tied-def 4) - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: V_MOV_B32_indirect_write undef $vgpr0, undef $vgpr3, implicit $exec, implicit $m0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3(tied-def 4) + ; GCN-NEXT: V_MOV_B32_indirect_write undef $vgpr0, undef $vgpr3, implicit $exec, implicit $m0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3(tied-def 4) + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 V_MOV_B32_indirect_write undef $vgpr0, undef $vgpr3, implicit $exec, implicit $m0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3(tied-def 4) @@ -371,13 +371,13 @@ body: | bb.0: ; GCN-LABEL: name: simple_bundle ; GCN: BUNDLE implicit-def $m0, implicit-def 
$m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { - ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: } - ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: } + ; GCN-NEXT: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: } + ; GCN-NEXT: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: } BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -396,14 +396,14 @@ body: | bb.0: ; GCN-LABEL: name: salu_in_between_bundle ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { - ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: } - ; GCN: $sgpr0 = S_MOV_B32 $sgpr2 - ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def 
$vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: } + ; GCN-NEXT: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: } + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: } BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def 
$m0, implicit $mode, implicit undef $m0 $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -423,16 +423,16 @@ body: | bb.0: ; GCN-LABEL: name: valu_in_between_bundle ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { - ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: } - ; GCN: $vgpr20 = V_MOV_B32_indirect_read 1, implicit $exec, implicit $m0 - ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { - ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: } + ; GCN-NEXT: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit 
$mode, implicit undef $m0 + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: } + ; GCN-NEXT: $vgpr20 = V_MOV_B32_indirect_read 1, implicit $exec, implicit $m0 + ; GCN-NEXT: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: } BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -452,16 +452,16 @@ body: | bb.0: ; GCN-LABEL: name: changed_index_bundle ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def 
$m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { - ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: } - ; GCN: $sgpr2 = S_MOV_B32 1 - ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { - ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: } + ; GCN-NEXT: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: } + ; GCN-NEXT: $sgpr2 = S_MOV_B32 1 + ; GCN-NEXT: BUNDLE implicit-def $m0, implicit-def $m0_lo16, 
implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + ; GCN-NEXT: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: } BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -480,13 +480,15 @@ name: simple_cbranch_vccz body: | ; GCN-LABEL: name: simple_cbranch_vccz ; GCN: bb.0: - ; GCN: successors: %bb.1(0x80000000) - ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: S_CBRANCH_VCCZ %bb.1, implicit $vcc - ; GCN: bb.1: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: S_CBRANCH_VCCZ %bb.1, implicit $vcc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: bb.0: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -503,12 +505,14 @@ name: simple_cbranch_execz body: | ; GCN-LABEL: name: simple_cbranch_execz ; GCN: bb.0: - ; GCN: successors: - ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 - ; GCN: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode - ; GCN: bb.1: + 
; GCN-NEXT: successors: + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: bb.0: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir b/llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir index 07a6b4d3ea0fb..37d7a758d6fd7 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir +++ b/llvm/test/CodeGen/AMDGPU/shrink-instructions-flags.mir @@ -15,7 +15,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: %2:vgpr_32 = nnan nofpexcept V_ADD_F32_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_ADD_F32_e32_:%[0-9]+]]:vgpr_32 = nnan nofpexcept V_ADD_F32_e32 [[COPY]], [[COPY1]], implicit $mode, implicit $exec ; CHECK-NEXT: S_NOP 0 %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir index f4c13b51157d4..ed2148ab5a198 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir @@ -11,6 
+11,7 @@ body: | ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; ; GFX11-LABEL: name: mad_cvv_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -31,6 +32,7 @@ body: | ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; ; GFX11-LABEL: name: mad_vcv_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -51,6 +53,7 @@ body: | ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; ; GFX11-LABEL: name: mad_vvc_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -71,6 +74,7 @@ body: | ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; ; GFX11-LABEL: name: mad_vsc_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF @@ -91,6 +95,7 @@ body: | ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; ; GFX11-LABEL: name: fma_cvv_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -111,6 +116,7 @@ body: | ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; ; GFX11-LABEL: name: fma_vcv_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -131,6 +137,7 @@ body: | ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; ; 
GFX11-LABEL: name: fma_vvc_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -151,6 +158,7 @@ body: | ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; ; GFX11-LABEL: name: fma_vsc_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF @@ -171,6 +179,7 @@ body: | ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; ; GFX11-LABEL: name: mad_cvv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -191,6 +200,7 @@ body: | ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; ; GFX11-LABEL: name: mad_vcv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -211,6 +221,7 @@ body: | ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; ; GFX11-LABEL: name: mad_vvc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -231,6 +242,7 @@ body: | ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; ; GFX11-LABEL: name: mad_vsc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF @@ -251,6 +263,7 @@ body: | ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; ; GFX11-LABEL: name: fma_cvv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -271,6 +284,7 @@ body: | ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, 
$vgpr1, implicit $mode, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; ; GFX11-LABEL: name: fma_vcv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -291,6 +305,7 @@ body: | ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; ; GFX11-LABEL: name: fma_vvc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -311,6 +326,7 @@ body: | ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; ; GFX11-LABEL: name: fma_vsc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir index 90924b3345fa0..a54c0accce783 100644 --- a/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir +++ b/llvm/test/CodeGen/AMDGPU/si-fold-aligned-vgprs.mir @@ -16,6 +16,7 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF ; GFX908-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]], 0, 0, implicit $exec + ; ; GFX90A-LABEL: name: aligned_vgpr_64 ; GFX90A: liveins: $vgpr0_vgpr1 ; GFX90A-NEXT: {{ $}} @@ -40,6 +41,7 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; GFX908-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]], 0, 0, implicit $exec + ; ; GFX90A-LABEL: name: unaligned_vgpr_64 ; GFX90A: liveins: $vgpr0_vgpr1 ; GFX90A-NEXT: {{ $}} @@ -65,6 +67,7 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF ; GFX908-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]].sub0_sub1, 0, 0, implicit $exec + ; ; GFX90A-LABEL: name: aligned_vgpr_96_sub0_subg1 ; GFX90A: 
liveins: $vgpr0_vgpr1 ; GFX90A-NEXT: {{ $}} @@ -89,6 +92,7 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF ; GFX908-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]].sub1_sub2, 0, 0, implicit $exec + ; ; GFX90A-LABEL: name: unaligned_vgpr_96_sub1_sub2 ; GFX90A: liveins: $vgpr0_vgpr1 ; GFX90A-NEXT: {{ $}} @@ -114,6 +118,7 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_96_align2 = IMPLICIT_DEF ; GFX908-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]], 0, 0, implicit $exec + ; ; GFX90A-LABEL: name: aligned_vgpr_96 ; GFX90A: liveins: $vgpr0_vgpr1 ; GFX90A-NEXT: {{ $}} @@ -138,6 +143,7 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_96 = IMPLICIT_DEF ; GFX908-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]], 0, 0, implicit $exec + ; ; GFX90A-LABEL: name: unaligned_vgpr_96 ; GFX90A: liveins: $vgpr0_vgpr1 ; GFX90A-NEXT: {{ $}} @@ -163,6 +169,7 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF ; GFX908-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]].sub0_sub1, 0, 0, implicit $exec + ; ; GFX90A-LABEL: name: aligned_vgpr_128_sub0_sub1 ; GFX90A: liveins: $vgpr0_vgpr1 ; GFX90A-NEXT: {{ $}} @@ -187,6 +194,7 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF ; GFX908-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]].sub2_sub3, 0, 0, implicit $exec + ; ; GFX90A-LABEL: name: aligned_vgpr_128_sub2_sub3 ; GFX90A: liveins: $vgpr0_vgpr1 ; GFX90A-NEXT: {{ $}} @@ -211,6 +219,7 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF ; GFX908-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[DEF]].sub1_sub2, 0, 0, implicit $exec + ; ; 
GFX90A-LABEL: name: unaligned_vgpr_128_sub1_sub2 ; GFX90A: liveins: $vgpr0_vgpr1 ; GFX90A-NEXT: {{ $}} @@ -236,6 +245,7 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF ; GFX908-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]].sub0_sub1_sub2, 0, 0, implicit $exec + ; ; GFX90A-LABEL: name: aligned_vgpr_128_sub0_sub1_sub2 ; GFX90A: liveins: $vgpr0_vgpr1 ; GFX90A-NEXT: {{ $}} @@ -260,6 +270,7 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF ; GFX908-NEXT: GLOBAL_STORE_DWORDX3 [[COPY]], [[DEF]].sub1_sub2_sub3, 0, 0, implicit $exec + ; ; GFX90A-LABEL: name: unaligned_vgpr_128_sub1_sub2_sub3 ; GFX90A: liveins: $vgpr0_vgpr1 ; GFX90A-NEXT: {{ $}} @@ -285,6 +296,7 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF ; GFX908-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[DEF]], 0, 0, implicit $exec + ; ; GFX90A-LABEL: name: aligned_vgpr_128 ; GFX90A: liveins: $vgpr0_vgpr1 ; GFX90A-NEXT: {{ $}} @@ -309,6 +321,7 @@ body: | ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 ; GFX908-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF ; GFX908-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[DEF]], 0, 0, implicit $exec + ; ; GFX90A-LABEL: name: unaligned_vgpr_128 ; GFX90A: liveins: $vgpr0_vgpr1 ; GFX90A-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir index e342c2b83524d..eddad05d976bd 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir +++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir @@ -11,7 +11,7 @@ body: | ; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 16, 0 ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0 = S_AND_B32 
[[S_LOAD_DWORD_IMM]], 255, implicit-def $scc - ; GCN-NEXT: dead %3:sreg_32_xm0 = S_AND_B32 65535, [[S_AND_B32_]], implicit-def $scc + ; GCN-NEXT: dead [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0 = S_AND_B32 65535, [[S_AND_B32_]], implicit-def $scc ; GCN-NEXT: S_ENDPGM 0 %0:sgpr_64 = COPY $sgpr4_sgpr5 %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 16, 0 @@ -30,7 +30,7 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY]], undef %1:sreg_64, implicit-def dead $scc - ; GCN-NEXT: dead %0:sreg_64 = S_XOR_B64 [[S_AND_B64_]], [[COPY]], implicit-def dead $scc + ; GCN-NEXT: dead [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[S_AND_B64_]], [[COPY]], implicit-def dead $scc ; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; GCN-NEXT: S_BRANCH %bb.1 @@ -195,7 +195,7 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9 + ; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10 ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_SLEEP 3 ; GCN-NEXT: S_NOP 0 @@ -368,7 +368,7 @@ body: | ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec ; GCN-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY2]], implicit $exec - ; GCN-NEXT: dead %5:sreg_64_xexec = S_MOV_B64 0 + ; GCN-NEXT: dead [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.3(0x80000000) @@ -383,7 +383,7 @@ body: | ; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc ; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[S_AND_B64_]], [[COPY4]], implicit-def dead $scc ; GCN-NEXT: $exec = S_MOV_B64_term [[S_AND_B64_]] - ; 
GCN-NEXT: dead %8:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec + ; GCN-NEXT: dead [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term [[S_XOR_B64_]], implicit $exec ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.mir b/llvm/test/CodeGen/AMDGPU/spill-agpr.mir index 16f7e15f267ee..981a853cb097c 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.mir @@ -30,6 +30,7 @@ body: | ; GFX908-SPILLED-NEXT: $agpr0 = SI_SPILL_A32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX908-SPILLED-NEXT: $agpr1 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1 + ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr32 ; GFX908-EXPANDED: bb.0: ; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -53,6 +54,7 @@ body: | ; GFX908-EXPANDED-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec ; GFX908-EXPANDED-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1 + ; ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr32 ; GFX90A-SPILLED: bb.0: ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) @@ -72,6 +74,7 @@ body: | ; GFX90A-SPILLED-NEXT: $agpr0 = SI_SPILL_A32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX90A-SPILLED-NEXT: $agpr1 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1 + ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr32 ; GFX90A-EXPANDED: bb.0: ; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -130,6 +133,7 @@ body: 
| ; GFX908-SPILLED-NEXT: bb.2: ; GFX908-SPILLED-NEXT: $agpr0_agpr1 = SI_SPILL_A64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1 + ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr64 ; GFX908-EXPANDED: bb.0: ; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -152,6 +156,7 @@ body: | ; GFX908-EXPANDED-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit-def $agpr0_agpr1 ; GFX908-EXPANDED-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1 ; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1 + ; ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr64 ; GFX90A-SPILLED: bb.0: ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) @@ -168,6 +173,7 @@ body: | ; GFX90A-SPILLED-NEXT: bb.2: ; GFX90A-SPILLED-NEXT: $agpr0_agpr1 = SI_SPILL_A64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1 + ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr64 ; GFX90A-EXPANDED: bb.0: ; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -239,6 +245,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247 ; GFX908-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255 ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0 + ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr32_used_all_vgprs ; GFX908-EXPANDED: bb.0: ; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -272,6 +279,7 @@ body: | ; GFX908-EXPANDED-NEXT: S_NOP 0, implicit undef $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247 ; GFX908-EXPANDED-NEXT: S_NOP 0, implicit undef $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255 ; GFX908-EXPANDED-NEXT: S_NOP 0, 
implicit killed renamable $agpr0 + ; ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr32_used_all_vgprs ; GFX90A-SPILLED: bb.0: ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) @@ -303,6 +311,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247 ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255 ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0 + ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr32_used_all_vgprs ; GFX90A-EXPANDED: bb.0: ; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -385,6 +394,7 @@ body: | ; GFX908-SPILLED-NEXT: bb.2: ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2 = SI_SPILL_A96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s96) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2 + ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr96 ; GFX908-EXPANDED: bb.0: ; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -409,6 +419,7 @@ body: | ; GFX908-EXPANDED-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-EXPANDED-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2 ; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2 + ; ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr96 ; GFX90A-SPILLED: bb.0: ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) @@ -425,6 +436,7 @@ body: | ; GFX90A-SPILLED-NEXT: bb.2: ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2 = SI_SPILL_A96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s96) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2 + ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr96 ; GFX90A-EXPANDED: bb.0: ; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -483,6 +495,7 @@ body: | ; GFX908-SPILLED-NEXT: bb.2: ; GFX908-SPILLED-NEXT: 
$agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3 + ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr128 ; GFX908-EXPANDED: bb.0: ; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -509,6 +522,7 @@ body: | ; GFX908-EXPANDED-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-EXPANDED-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 ; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3 + ; ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr128 ; GFX90A-SPILLED: bb.0: ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) @@ -525,6 +539,7 @@ body: | ; GFX90A-SPILLED-NEXT: bb.2: ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3 + ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr128 ; GFX90A-EXPANDED: bb.0: ; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -585,6 +600,7 @@ body: | ; GFX908-SPILLED-NEXT: bb.2: ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_A160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s160) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4 + ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr160 ; GFX908-EXPANDED: bb.0: ; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -613,6 +629,7 @@ body: | ; GFX908-EXPANDED-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-EXPANDED-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4 ; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable 
$agpr0_agpr1_agpr2_agpr3_agpr4 + ; ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr160 ; GFX90A-SPILLED: bb.0: ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) @@ -629,6 +646,7 @@ body: | ; GFX90A-SPILLED-NEXT: bb.2: ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_A160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s160) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4 + ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr160 ; GFX90A-EXPANDED: bb.0: ; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -691,6 +709,7 @@ body: | ; GFX908-SPILLED-NEXT: bb.2: ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_A192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s192) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr192 ; GFX908-EXPANDED: bb.0: ; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -721,6 +740,7 @@ body: | ; GFX908-EXPANDED-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-EXPANDED-NEXT: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr192 ; GFX90A-SPILLED: bb.0: ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) @@ -737,6 +757,7 @@ body: | ; GFX90A-SPILLED-NEXT: bb.2: ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_A192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s192) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 + ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr192 ; GFX90A-EXPANDED: bb.0: ; GFX90A-EXPANDED-NEXT: successors: 
%bb.1(0x80000000) @@ -801,6 +822,7 @@ body: | ; GFX908-SPILLED-NEXT: bb.2: ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_A256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr256 ; GFX908-EXPANDED: bb.0: ; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -835,6 +857,7 @@ body: | ; GFX908-EXPANDED-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-EXPANDED-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr256 ; GFX90A-SPILLED: bb.0: ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) @@ -851,6 +874,7 @@ body: | ; GFX90A-SPILLED-NEXT: bb.2: ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_A256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr256 ; GFX90A-EXPANDED: bb.0: ; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -919,6 +943,7 @@ body: | ; GFX908-SPILLED-NEXT: bb.2: ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = SI_SPILL_A288_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s288) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 + ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr288 ; GFX908-EXPANDED: bb.0: ; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -955,6 +980,7 @@ 
body: | ; GFX908-EXPANDED-NEXT: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-EXPANDED-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 ; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 + ; ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr288 ; GFX90A-SPILLED: bb.0: ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) @@ -971,6 +997,7 @@ body: | ; GFX90A-SPILLED-NEXT: bb.2: ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = SI_SPILL_A288_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s288) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 + ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr288 ; GFX90A-EXPANDED: bb.0: ; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -1041,6 +1068,7 @@ body: | ; GFX908-SPILLED-NEXT: bb.2: ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = SI_SPILL_A320_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s320) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 + ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr320 ; GFX908-EXPANDED: bb.0: ; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -1079,6 +1107,7 @@ body: | ; GFX908-EXPANDED-NEXT: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-EXPANDED-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 ; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 + ; ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr320 ; GFX90A-SPILLED: bb.0: ; GFX90A-SPILLED-NEXT: successors: 
%bb.1(0x80000000) @@ -1095,6 +1124,7 @@ body: | ; GFX90A-SPILLED-NEXT: bb.2: ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = SI_SPILL_A320_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s320) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 + ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr320 ; GFX90A-EXPANDED: bb.0: ; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -1167,6 +1197,7 @@ body: | ; GFX908-SPILLED-NEXT: bb.2: ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = SI_SPILL_A352_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s352) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 + ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr352 ; GFX908-EXPANDED: bb.0: ; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -1207,6 +1238,7 @@ body: | ; GFX908-EXPANDED-NEXT: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-EXPANDED-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 ; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 + ; ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr352 ; GFX90A-SPILLED: bb.0: ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) @@ -1223,6 +1255,7 @@ body: | ; GFX90A-SPILLED-NEXT: bb.2: ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = SI_SPILL_A352_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s352) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 + ; ; 
GFX90A-EXPANDED-LABEL: name: spill_restore_agpr352 ; GFX90A-EXPANDED: bb.0: ; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -1297,6 +1330,7 @@ body: | ; GFX908-SPILLED-NEXT: bb.2: ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = SI_SPILL_A384_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s384) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 + ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr384 ; GFX908-EXPANDED: bb.0: ; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -1339,6 +1373,7 @@ body: | ; GFX908-EXPANDED-NEXT: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-EXPANDED-NEXT: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 ; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 + ; ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr384 ; GFX90A-SPILLED: bb.0: ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) @@ -1355,6 +1390,7 @@ body: | ; GFX90A-SPILLED-NEXT: bb.2: ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = SI_SPILL_A384_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s384) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 + ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr384 ; GFX90A-EXPANDED: bb.0: ; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -1431,6 +1467,7 @@ body: | ; GFX908-SPILLED-NEXT: bb.2: ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_A512_RESTORE %stack.0, $sgpr32, 0, implicit $exec 
:: (load (s512) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr512 ; GFX908-EXPANDED: bb.0: ; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -1481,6 +1518,7 @@ body: | ; GFX908-EXPANDED-NEXT: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-EXPANDED-NEXT: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr512 ; GFX90A-SPILLED: bb.0: ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) @@ -1497,6 +1535,7 @@ body: | ; GFX90A-SPILLED-NEXT: bb.2: ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_A512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr512 ; GFX90A-EXPANDED: bb.0: ; GFX90A-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -1581,6 +1620,7 @@ body: | ; GFX908-SPILLED-NEXT: bb.2: ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_A1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s1024) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit 
killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr1024 ; GFX908-EXPANDED: bb.0: ; GFX908-EXPANDED-NEXT: successors: %bb.1(0x80000000) @@ -1663,6 +1703,7 @@ body: | ; GFX908-EXPANDED-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec ; GFX908-EXPANDED-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; GFX908-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; ; GFX90A-SPILLED-LABEL: name: spill_restore_agpr1024 ; GFX90A-SPILLED: bb.0: ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) @@ -1679,6 +1720,7 @@ body: | ; GFX90A-SPILLED-NEXT: bb.2: ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_A1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s1024) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr1024 ; GFX90A-EXPANDED: bb.0: ; GFX90A-EXPANDED-NEXT: successors: 
%bb.1(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir index 9bac6bbd97595..e54e5898f8b53 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir @@ -19,14 +19,14 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %5.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec - ; CHECK-NEXT: dead [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e32 0, %5.sub1, implicit $mode, implicit $exec - ; CHECK-NEXT: undef %7.sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef %7.sub1, implicit $mode, implicit $exec - ; CHECK-NEXT: SI_SPILL_V64_SAVE %7, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec + ; CHECK-NEXT: dead [[V_MUL_F32_e32_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e32 0, [[V_MOV_B32_e32_]].sub1, implicit $mode, implicit $exec + ; CHECK-NEXT: undef [[V_MAC_F32_e32_:%[0-9]+]].sub1:vreg_64 = V_MAC_F32_e32 0, undef %1:vgpr_32, undef [[V_MAC_F32_e32_]].sub1, implicit $mode, implicit $exec + ; CHECK-NEXT: SI_SPILL_V64_SAVE [[V_MAC_F32_e32_]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: undef %6.sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec - ; CHECK-NEXT: S_NOP 0, implicit %6.sub1 + ; CHECK-NEXT: undef [[V_MOV_B32_e32_1:%[0-9]+]].sub1:vreg_64 = V_MOV_B32_e32 1786773504, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_1]].sub1 ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: S_NOP 0, implicit [[SI_SPILL_V64_RESTORE]].sub1 ; CHECK-NEXT: S_NOP 0, implicit undef 
%9.sub0:vreg_64 @@ -59,13 +59,13 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %1.sub2:vreg_128 = V_MOV_B32_e32 1786773504, implicit $exec + ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128 = V_MOV_B32_e32 1786773504, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: S_NOP 0, implicit %1.sub2 + ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub2 ; CHECK-NEXT: S_NOP 0, implicit undef %4.sub0:vreg_128 - ; CHECK-NEXT: undef %2.sub2:vreg_128 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: S_NOP 0, implicit %2.sub2 + ; CHECK-NEXT: undef [[V_MOV_B32_e32_1:%[0-9]+]].sub2:vreg_128 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_1]].sub2 bb.0: successors: %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/split-liverange-overlapping-copies.mir b/llvm/test/CodeGen/AMDGPU/split-liverange-overlapping-copies.mir index f4cf0f43e456b..eca6efaafa9d4 100644 --- a/llvm/test/CodeGen/AMDGPU/split-liverange-overlapping-copies.mir +++ b/llvm/test/CodeGen/AMDGPU/split-liverange-overlapping-copies.mir @@ -34,33 +34,33 @@ body: | ; CHECK-NEXT: dead [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: S_NOP 0, implicit [[DEF1]] ; CHECK-NEXT: S_NOP 0, implicit [[DEF1]] - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_1024_align2 = IMPLICIT_DEF ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %6.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024_align2 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 { - ; CHECK-NEXT: internal 
%6.sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16:av_1024_align2 = COPY [[COPY]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16 - ; CHECK-NEXT: internal %6.sub29_sub30_sub31:av_1024_align2 = COPY [[COPY]].sub29_sub30_sub31 + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024_align2 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 { + ; CHECK-NEXT: internal [[COPY1]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16:av_1024_align2 = COPY [[COPY]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16 + ; CHECK-NEXT: internal [[COPY1]].sub29_sub30_sub31:av_1024_align2 = COPY [[COPY]].sub29_sub30_sub31 ; CHECK-NEXT: } - ; CHECK-NEXT: %6.sub0:av_1024_align2 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit %6.sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub0:av_1024_align2 = IMPLICIT_DEF + ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]].sub0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit %6 + ; CHECK-NEXT: S_NOP 0, 
implicit [[COPY1]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:av_1024_align2 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_1024_align2 = IMPLICIT_DEF ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: undef %4.sub0:vreg_1024_align2 = COPY [[DEF]] - ; CHECK-NEXT: S_NOP 0, implicit %4 + ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:vreg_1024_align2 = COPY [[DEF]] + ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]] bb.0: %0:vgpr_32 = IMPLICIT_DEF %1:vreg_1024_align2 = IMPLICIT_DEF @@ -110,34 +110,34 @@ body: | ; CHECK-NEXT: dead [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: S_NOP 0, implicit [[DEF1]] ; CHECK-NEXT: S_NOP 0, implicit [[DEF1]] - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_1024 = IMPLICIT_DEF ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %6.sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 { - ; CHECK-NEXT: internal %6.sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16:av_1024 = COPY [[COPY]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16 - ; CHECK-NEXT: internal %6.sub29_sub30:av_1024 = COPY [[COPY]].sub29_sub30 + ; 
CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16:av_1024 = COPY [[COPY]].sub1_sub2_sub3_sub4_sub5_sub6_sub7_sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15_sub16 { + ; CHECK-NEXT: internal [[COPY1]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16:av_1024 = COPY [[COPY]].sub17_lo16_sub17_hi16_sub18_lo16_sub18_hi16_sub19_lo16_sub19_hi16_sub20_lo16_sub20_hi16_sub21_lo16_sub21_hi16_sub22_lo16_sub22_hi16_sub23_lo16_sub23_hi16_sub24_lo16_sub24_hi16_sub25_lo16_sub25_hi16_sub26_lo16_sub26_hi16_sub27_lo16_sub27_hi16_sub28_lo16_sub28_hi16 + ; CHECK-NEXT: internal [[COPY1]].sub29_sub30:av_1024 = COPY [[COPY]].sub29_sub30 ; CHECK-NEXT: } - ; CHECK-NEXT: %6.sub0:av_1024 = IMPLICIT_DEF - ; CHECK-NEXT: %6.sub31:av_1024 = IMPLICIT_DEF - ; CHECK-NEXT: S_NOP 0, implicit %6.sub0, implicit %6.sub31 + ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub0:av_1024 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub31:av_1024 = IMPLICIT_DEF + ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]].sub0, implicit [[COPY1]].sub31 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit %6 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]] ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF3:%[0-9]+]]:av_1024 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_1024 = IMPLICIT_DEF ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: undef %4.sub0:vreg_1024 = COPY [[DEF]] - ; CHECK-NEXT: S_NOP 0, implicit %4 + ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:vreg_1024 = COPY [[DEF]] + ; CHECK-NEXT: S_NOP 0, implicit 
[[COPY2]] bb.0: %0:vgpr_32 = IMPLICIT_DEF %1:vreg_1024 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir b/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir index 2c4178d242ca8..896986ff9b02b 100644 --- a/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir +++ b/llvm/test/CodeGen/AMDGPU/split-mbb-lis-subrange.mir @@ -34,8 +34,8 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: %s0c:vgpr_32 = V_ADD_F32_e64 0, %s0a, 0, %const, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: %s0d:vgpr_32 = V_ADD_F32_e64 0, %s0b, 0, %const, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY %s0c - ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %s0d + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %s0c + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY %s0d ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: @@ -45,8 +45,8 @@ body: | ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: %phi1:vgpr_32 = COPY [[COPY3]] - ; GCN-NEXT: %phi0:vgpr_32 = COPY [[COPY2]] + ; GCN-NEXT: %phi1:vgpr_32 = COPY [[COPY1]] + ; GCN-NEXT: %phi0:vgpr_32 = COPY [[COPY]] ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir index b50d71cb79e9e..8f53ec2f992da 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir @@ -15,72 +15,72 @@ body: | ; RA-NEXT: {{ $}} ; RA-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; RA-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; RA-NEXT: undef %2.sub1:sgpr_1024 = S_MOV_B32 -1 - ; RA-NEXT: %2.sub0:sgpr_1024 = S_MOV_B32 -1 - ; RA-NEXT: undef %3.sub0:sgpr_1024 = S_MOV_B32 0 + ; RA-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_1024 = S_MOV_B32 -1 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_1024 = S_MOV_B32 -1 + ; RA-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_1024 = S_MOV_B32 0 ; RA-NEXT: {{ $}} 
; RA-NEXT: bb.1: ; RA-NEXT: successors: %bb.2(0x80000000) ; RA-NEXT: {{ $}} - ; RA-NEXT: %2.sub2:sgpr_1024 = COPY %2.sub0 - ; RA-NEXT: %2.sub3:sgpr_1024 = COPY %2.sub1 - ; RA-NEXT: %2.sub4:sgpr_1024 = COPY %2.sub0 - ; RA-NEXT: %2.sub5:sgpr_1024 = COPY %2.sub1 - ; RA-NEXT: %2.sub6:sgpr_1024 = COPY %2.sub0 - ; RA-NEXT: %2.sub7:sgpr_1024 = COPY %2.sub1 - ; RA-NEXT: %2.sub8:sgpr_1024 = COPY %2.sub0 - ; RA-NEXT: %2.sub9:sgpr_1024 = COPY %2.sub1 - ; RA-NEXT: %2.sub10:sgpr_1024 = COPY %2.sub0 - ; RA-NEXT: %2.sub11:sgpr_1024 = COPY %2.sub1 - ; RA-NEXT: %2.sub12:sgpr_1024 = COPY %2.sub0 - ; RA-NEXT: %2.sub13:sgpr_1024 = COPY %2.sub1 - ; RA-NEXT: %2.sub14:sgpr_1024 = COPY %2.sub0 - ; RA-NEXT: %2.sub15:sgpr_1024 = COPY %2.sub1 - ; RA-NEXT: %2.sub16:sgpr_1024 = COPY %2.sub0 - ; RA-NEXT: %2.sub17:sgpr_1024 = COPY %2.sub1 - ; RA-NEXT: %2.sub18:sgpr_1024 = COPY %2.sub0 - ; RA-NEXT: %2.sub19:sgpr_1024 = COPY %2.sub1 - ; RA-NEXT: %2.sub20:sgpr_1024 = COPY %2.sub0 - ; RA-NEXT: %2.sub21:sgpr_1024 = COPY %2.sub1 - ; RA-NEXT: %2.sub22:sgpr_1024 = COPY %2.sub0 - ; RA-NEXT: %2.sub23:sgpr_1024 = COPY %2.sub1 - ; RA-NEXT: %2.sub24:sgpr_1024 = COPY %2.sub0 - ; RA-NEXT: %2.sub25:sgpr_1024 = COPY %2.sub1 - ; RA-NEXT: %2.sub26:sgpr_1024 = COPY %2.sub0 - ; RA-NEXT: %2.sub27:sgpr_1024 = COPY %2.sub1 - ; RA-NEXT: %2.sub28:sgpr_1024 = COPY %2.sub0 - ; RA-NEXT: %2.sub29:sgpr_1024 = COPY %2.sub1 - ; RA-NEXT: %3.sub1:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub2:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub3:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub4:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub5:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub6:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub7:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub8:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub9:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub10:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub11:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub12:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub13:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: 
%3.sub14:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub15:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub16:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub17:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub18:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub19:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub20:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub21:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub22:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub23:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub24:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub25:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub26:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub27:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub28:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub29:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub30:sgpr_1024 = COPY %3.sub0 - ; RA-NEXT: %3.sub31:sgpr_1024 = COPY %3.sub0 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub4:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub5:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub6:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub7:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub8:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub9:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub10:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub11:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub12:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub13:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub14:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub15:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub16:sgpr_1024 = COPY 
[[S_MOV_B32_]].sub0 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub17:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub18:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub19:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub20:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub21:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub22:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub23:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub24:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub25:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub26:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub27:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub28:sgpr_1024 = COPY [[S_MOV_B32_]].sub0 + ; RA-NEXT: [[S_MOV_B32_:%[0-9]+]].sub29:sgpr_1024 = COPY [[S_MOV_B32_]].sub1 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub1:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub2:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub3:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub4:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub5:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub6:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub7:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub8:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub9:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub10:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub11:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub12:sgpr_1024 = COPY 
[[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub13:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub14:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub15:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub16:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub17:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub18:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub19:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub20:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub21:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub22:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub23:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub24:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub25:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub26:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub27:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub28:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub29:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub30:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 + ; RA-NEXT: [[S_MOV_B32_1:%[0-9]+]].sub31:sgpr_1024 = COPY [[S_MOV_B32_1]].sub0 ; RA-NEXT: {{ $}} ; RA-NEXT: bb.2: ; RA-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) @@ -257,53 +257,53 @@ body: | ; RA: [[DEF:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; RA-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; RA-NEXT: [[DEF2:%[0-9]+]]:sgpr_512 = IMPLICIT_DEF - ; RA-NEXT: [[DEF2]].sub4:sgpr_512 = S_MOV_B32 -1 - ; RA-NEXT: [[DEF2]].sub5:sgpr_512 = S_MOV_B32 -1 - ; RA-NEXT: [[DEF2]].sub10:sgpr_512 = S_MOV_B32 -1 - ; 
RA-NEXT: [[DEF2]].sub11:sgpr_512 = S_MOV_B32 -1 - ; RA-NEXT: [[DEF2]].sub7:sgpr_512 = S_MOV_B32 -1 - ; RA-NEXT: [[DEF2]].sub8:sgpr_512 = S_MOV_B32 -1 - ; RA-NEXT: [[DEF2]].sub13:sgpr_512 = S_MOV_B32 -1 - ; RA-NEXT: [[DEF2]].sub14:sgpr_512 = S_MOV_B32 -1 - ; RA-NEXT: undef %16.sub4_sub5:sgpr_512 = COPY [[DEF2]].sub4_sub5 { - ; RA-NEXT: internal %16.sub10_sub11:sgpr_512 = COPY [[DEF2]].sub10_sub11 - ; RA-NEXT: internal %16.sub7:sgpr_512 = COPY [[DEF2]].sub7 - ; RA-NEXT: internal %16.sub8:sgpr_512 = COPY [[DEF2]].sub8 - ; RA-NEXT: internal %16.sub13:sgpr_512 = COPY [[DEF2]].sub13 - ; RA-NEXT: internal %16.sub14:sgpr_512 = COPY [[DEF2]].sub14 + ; RA-NEXT: [[DEF2:%[0-9]+]].sub4:sgpr_512 = S_MOV_B32 -1 + ; RA-NEXT: [[DEF2:%[0-9]+]].sub5:sgpr_512 = S_MOV_B32 -1 + ; RA-NEXT: [[DEF2:%[0-9]+]].sub10:sgpr_512 = S_MOV_B32 -1 + ; RA-NEXT: [[DEF2:%[0-9]+]].sub11:sgpr_512 = S_MOV_B32 -1 + ; RA-NEXT: [[DEF2:%[0-9]+]].sub7:sgpr_512 = S_MOV_B32 -1 + ; RA-NEXT: [[DEF2:%[0-9]+]].sub8:sgpr_512 = S_MOV_B32 -1 + ; RA-NEXT: [[DEF2:%[0-9]+]].sub13:sgpr_512 = S_MOV_B32 -1 + ; RA-NEXT: [[DEF2:%[0-9]+]].sub14:sgpr_512 = S_MOV_B32 -1 + ; RA-NEXT: undef [[COPY:%[0-9]+]].sub4_sub5:sgpr_512 = COPY [[DEF2]].sub4_sub5 { + ; RA-NEXT: internal [[COPY]].sub10_sub11:sgpr_512 = COPY [[DEF2]].sub10_sub11 + ; RA-NEXT: internal [[COPY]].sub7:sgpr_512 = COPY [[DEF2]].sub7 + ; RA-NEXT: internal [[COPY]].sub8:sgpr_512 = COPY [[DEF2]].sub8 + ; RA-NEXT: internal [[COPY]].sub13:sgpr_512 = COPY [[DEF2]].sub13 + ; RA-NEXT: internal [[COPY]].sub14:sgpr_512 = COPY [[DEF2]].sub14 ; RA-NEXT: } - ; RA-NEXT: undef %18.sub4_sub5:sgpr_512 = COPY %16.sub4_sub5 { - ; RA-NEXT: internal %18.sub10_sub11:sgpr_512 = COPY %16.sub10_sub11 - ; RA-NEXT: internal %18.sub7:sgpr_512 = COPY %16.sub7 - ; RA-NEXT: internal %18.sub8:sgpr_512 = COPY %16.sub8 - ; RA-NEXT: internal %18.sub13:sgpr_512 = COPY %16.sub13 - ; RA-NEXT: internal %18.sub14:sgpr_512 = COPY %16.sub14 + ; RA-NEXT: undef [[COPY1:%[0-9]+]].sub4_sub5:sgpr_512 = COPY 
[[COPY]].sub4_sub5 { + ; RA-NEXT: internal [[COPY1]].sub10_sub11:sgpr_512 = COPY [[COPY]].sub10_sub11 + ; RA-NEXT: internal [[COPY1]].sub7:sgpr_512 = COPY [[COPY]].sub7 + ; RA-NEXT: internal [[COPY1]].sub8:sgpr_512 = COPY [[COPY]].sub8 + ; RA-NEXT: internal [[COPY1]].sub13:sgpr_512 = COPY [[COPY]].sub13 + ; RA-NEXT: internal [[COPY1]].sub14:sgpr_512 = COPY [[COPY]].sub14 ; RA-NEXT: } - ; RA-NEXT: SI_SPILL_S512_SAVE %18, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s512) into %stack.0, align 4, addrspace 5) + ; RA-NEXT: SI_SPILL_S512_SAVE [[COPY1]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s512) into %stack.0, align 4, addrspace 5) ; RA-NEXT: S_NOP 0, implicit-def $sgpr8, implicit-def $sgpr12, implicit-def $sgpr16, implicit-def $sgpr20, implicit-def $sgpr24, implicit-def $sgpr28, implicit-def $sgpr32, implicit-def $sgpr36, implicit-def $sgpr40, implicit-def $sgpr44, implicit-def $sgpr48, implicit-def $sgpr52, implicit-def $sgpr56, implicit-def $sgpr60, implicit-def $sgpr64, implicit-def $sgpr68, implicit-def $sgpr72, implicit-def $sgpr74, implicit-def $sgpr78, implicit-def $sgpr82, implicit-def $sgpr86, implicit-def $sgpr90, implicit-def $sgpr94, implicit-def $sgpr98 ; RA-NEXT: [[SI_SPILL_S512_RESTORE:%[0-9]+]]:sgpr_512 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.0, align 4, addrspace 5) - ; RA-NEXT: undef %17.sub4_sub5:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub4_sub5 { - ; RA-NEXT: internal %17.sub10_sub11:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub10_sub11 - ; RA-NEXT: internal %17.sub7:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub7 - ; RA-NEXT: internal %17.sub8:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub8 - ; RA-NEXT: internal %17.sub13:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub13 - ; RA-NEXT: internal %17.sub14:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub14 + ; RA-NEXT: undef [[COPY2:%[0-9]+]].sub4_sub5:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub4_sub5 { + ; RA-NEXT: internal 
[[COPY2]].sub10_sub11:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub10_sub11 + ; RA-NEXT: internal [[COPY2]].sub7:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub7 + ; RA-NEXT: internal [[COPY2]].sub8:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub8 + ; RA-NEXT: internal [[COPY2]].sub13:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub13 + ; RA-NEXT: internal [[COPY2]].sub14:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub14 ; RA-NEXT: } - ; RA-NEXT: undef %14.sub4_sub5:sgpr_512 = COPY %17.sub4_sub5 { - ; RA-NEXT: internal %14.sub10_sub11:sgpr_512 = COPY %17.sub10_sub11 - ; RA-NEXT: internal %14.sub7:sgpr_512 = COPY %17.sub7 - ; RA-NEXT: internal %14.sub8:sgpr_512 = COPY %17.sub8 - ; RA-NEXT: internal %14.sub13:sgpr_512 = COPY %17.sub13 - ; RA-NEXT: internal %14.sub14:sgpr_512 = COPY %17.sub14 + ; RA-NEXT: undef [[COPY3:%[0-9]+]].sub4_sub5:sgpr_512 = COPY [[COPY2]].sub4_sub5 { + ; RA-NEXT: internal [[COPY3]].sub10_sub11:sgpr_512 = COPY [[COPY2]].sub10_sub11 + ; RA-NEXT: internal [[COPY3]].sub7:sgpr_512 = COPY [[COPY2]].sub7 + ; RA-NEXT: internal [[COPY3]].sub8:sgpr_512 = COPY [[COPY2]].sub8 + ; RA-NEXT: internal [[COPY3]].sub13:sgpr_512 = COPY [[COPY2]].sub13 + ; RA-NEXT: internal [[COPY3]].sub14:sgpr_512 = COPY [[COPY2]].sub14 ; RA-NEXT: } - ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], %14.sub4, 0 :: (dereferenceable invariant load (s32)) - ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], %14.sub5, 0 :: (dereferenceable invariant load (s32)) - ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], %14.sub10, 0 :: (dereferenceable invariant load (s32)) - ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], %14.sub11, 0 :: (dereferenceable invariant load (s32)) - ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], %14.sub7, 
0 :: (dereferenceable invariant load (s32)) - ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], %14.sub8, 0 :: (dereferenceable invariant load (s32)) - ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], %14.sub13, 0 :: (dereferenceable invariant load (s32)) - ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], %14.sub14, 0 :: (dereferenceable invariant load (s32)) + ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], [[COPY3]].sub4, 0 :: (dereferenceable invariant load (s32)) + ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], [[COPY3]].sub5, 0 :: (dereferenceable invariant load (s32)) + ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], [[COPY3]].sub10, 0 :: (dereferenceable invariant load (s32)) + ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], [[COPY3]].sub11, 0 :: (dereferenceable invariant load (s32)) + ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], [[COPY3]].sub7, 0 :: (dereferenceable invariant load (s32)) + ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], [[COPY3]].sub8, 0 :: (dereferenceable invariant load (s32)) + ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], [[COPY3]].sub13, 0 :: (dereferenceable invariant load (s32)) + ; RA-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[DEF]], [[COPY3]].sub14, 0 :: (dereferenceable invariant load (s32)) ; RA-NEXT: S_NOP 0, implicit [[DEF]], implicit [[DEF1]], implicit [[S_BUFFER_LOAD_DWORD_SGPR]], implicit [[S_BUFFER_LOAD_DWORD_SGPR1]], implicit 
[[S_BUFFER_LOAD_DWORD_SGPR2]], implicit [[S_BUFFER_LOAD_DWORD_SGPR3]], implicit [[S_BUFFER_LOAD_DWORD_SGPR4]], implicit [[S_BUFFER_LOAD_DWORD_SGPR5]], implicit [[S_BUFFER_LOAD_DWORD_SGPR6]], implicit [[S_BUFFER_LOAD_DWORD_SGPR7]] ; ; VR-LABEL: name: splitkit_copy_unbundle_reorder @@ -349,7 +349,6 @@ body: | %2.sub13:sgpr_512 = S_MOV_B32 -1 %2.sub14:sgpr_512 = S_MOV_B32 -1 - ; Clobber registers S_NOP 0, implicit-def $sgpr8, implicit-def $sgpr12, implicit-def $sgpr16, implicit-def $sgpr20, implicit-def $sgpr24, implicit-def $sgpr28, implicit-def $sgpr32, implicit-def $sgpr36, implicit-def $sgpr40, implicit-def $sgpr44, implicit-def $sgpr48, implicit-def $sgpr52, implicit-def $sgpr56, implicit-def $sgpr60, implicit-def $sgpr64, implicit-def $sgpr68, implicit-def $sgpr72, implicit-def $sgpr74, implicit-def $sgpr78, implicit-def $sgpr82, implicit-def $sgpr86, implicit-def $sgpr90, implicit-def $sgpr94, implicit-def $sgpr98 %5:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR %0:sgpr_128, %2.sub4:sgpr_512, 0 :: (dereferenceable invariant load (s32)) diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir index 70e5e8a00c3d9..42db92b15acf5 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir +++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir @@ -16,449 +16,449 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1 ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128), align 4, addrspace 4) - ; CHECK-NEXT: undef %2.sub3:sgpr_128 = S_MOV_B32 61440 - ; CHECK-NEXT: %2.sub2:sgpr_128 = S_MOV_B32 -1 - ; CHECK-NEXT: %2.sub0:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub0 - ; CHECK-NEXT: %2.sub1:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub1 - ; CHECK-NEXT: undef %3.sub0:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub2 - ; CHECK-NEXT: %3.sub1:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub3 - ; 
CHECK-NEXT: %3.sub2:sgpr_128 = COPY %2.sub2 - ; CHECK-NEXT: %3.sub3:sgpr_128 = COPY %2.sub3 - ; CHECK-NEXT: early-clobber %4:vreg_128, early-clobber %5:vreg_128, early-clobber %6:vreg_128, early-clobber %7:vreg_128 = BUNDLE %3, implicit $exec { - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 0, 0, 0, implicit $exec :: (load (s128), align 128, addrspace 1) - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 16, 0, 0, implicit $exec :: (load (s128), addrspace 1) - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 32, 0, 0, implicit $exec :: (load (s128), align 32, addrspace 1) - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 48, 0, 0, implicit $exec :: (load (s128), addrspace 1) - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %47.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec - ; CHECK-NEXT: undef %55.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec - ; CHECK-NEXT: undef %63.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec - ; CHECK-NEXT: undef %71.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec - ; CHECK-NEXT: undef %79.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec - ; CHECK-NEXT: undef %87.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec - ; CHECK-NEXT: undef %95.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec - ; CHECK-NEXT: undef %101.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec - ; CHECK-NEXT: undef %107.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec - ; CHECK-NEXT: undef %113.sub2:vreg_128 = 
V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec - ; CHECK-NEXT: undef %154.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec - ; CHECK-NEXT: undef %209.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec - ; CHECK-NEXT: undef %188.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec - ; CHECK-NEXT: undef %123.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec - ; CHECK-NEXT: undef %129.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec - ; CHECK-NEXT: undef %135.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, implicit $exec :: (load (s128), align 64, addrspace 1) - ; CHECK-NEXT: undef %141.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec - ; CHECK-NEXT: undef %147.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec - ; CHECK-NEXT: undef %159.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec - ; CHECK-NEXT: undef %165.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, implicit $exec :: (load (s128), addrspace 1) - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, implicit $exec :: (load (s128), align 32, addrspace 1) - ; CHECK-NEXT: undef %36.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec - ; CHECK-NEXT: undef %37.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec - ; CHECK-NEXT: undef 
%38.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec - ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 112, 0, 0, implicit $exec :: (load (s128), addrspace 1) - ; CHECK-NEXT: undef %40.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub1, implicit $exec - ; CHECK-NEXT: undef %41.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub0, implicit $exec - ; CHECK-NEXT: undef %42.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec - ; CHECK-NEXT: undef %43.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 - ; CHECK-NEXT: undef %48.sub2:vreg_128 = COPY %47.sub2 - ; CHECK-NEXT: %48.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec - ; CHECK-NEXT: undef %51.sub0:vreg_128 = COPY %48.sub0 { - ; CHECK-NEXT: internal %51.sub2:vreg_128 = COPY %48.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: SI_SPILL_V128_SAVE %51, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: undef %56.sub2:vreg_128 = COPY %55.sub2 - ; CHECK-NEXT: %56.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec - ; CHECK-NEXT: undef %59.sub0:vreg_128 = COPY %56.sub0 { - ; CHECK-NEXT: internal %59.sub2:vreg_128 = COPY %56.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: SI_SPILL_V128_SAVE %59, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: undef %64.sub2:vreg_128 = COPY %63.sub2 - ; CHECK-NEXT: %64.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec - ; CHECK-NEXT: undef %67.sub0:vreg_128 = COPY %64.sub0 { - ; CHECK-NEXT: internal %67.sub2:vreg_128 = COPY %64.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: 
SI_SPILL_V128_SAVE %67, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: undef %72.sub2:vreg_128 = COPY %71.sub2 - ; CHECK-NEXT: %72.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec - ; CHECK-NEXT: undef %75.sub0:vreg_128 = COPY %72.sub0 { - ; CHECK-NEXT: internal %75.sub2:vreg_128 = COPY %72.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: SI_SPILL_V128_SAVE %75, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5) - ; CHECK-NEXT: undef %80.sub2:vreg_128 = COPY %79.sub2 - ; CHECK-NEXT: %80.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec - ; CHECK-NEXT: undef %83.sub0:vreg_128 = COPY %80.sub0 { - ; CHECK-NEXT: internal %83.sub2:vreg_128 = COPY %80.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: SI_SPILL_V128_SAVE %83, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: undef %88.sub2:vreg_128 = COPY %87.sub2 - ; CHECK-NEXT: %88.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec - ; CHECK-NEXT: undef %91.sub0:vreg_128 = COPY %88.sub0 { - ; CHECK-NEXT: internal %91.sub2:vreg_128 = COPY %88.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: SI_SPILL_V128_SAVE %91, %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5) - ; CHECK-NEXT: undef %96.sub2:vreg_128 = COPY %95.sub2 - ; CHECK-NEXT: %96.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec - ; CHECK-NEXT: undef %155.sub0:vreg_128 = COPY %96.sub0 { - ; CHECK-NEXT: internal %155.sub2:vreg_128 = COPY %96.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: SI_SPILL_V128_SAVE %155, %stack.7, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.7, align 4, addrspace 5) - ; CHECK-NEXT: undef %102.sub2:vreg_128 = COPY %101.sub2 - ; CHECK-NEXT: %102.sub0:vreg_128 = V_AND_B32_e32 
[[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec - ; CHECK-NEXT: undef %117.sub0:vreg_128 = COPY %102.sub0 { - ; CHECK-NEXT: internal %117.sub2:vreg_128 = COPY %102.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: SI_SPILL_V128_SAVE %117, %stack.6, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.6, align 4, addrspace 5) - ; CHECK-NEXT: undef %108.sub2:vreg_128 = COPY %107.sub2 - ; CHECK-NEXT: %108.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec - ; CHECK-NEXT: undef %110.sub0:vreg_128 = COPY %108.sub0 { - ; CHECK-NEXT: internal %110.sub2:vreg_128 = COPY %108.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %114.sub2:vreg_128 = COPY %113.sub2 - ; CHECK-NEXT: %114.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec - ; CHECK-NEXT: undef %116.sub0:vreg_128 = COPY %114.sub0 { - ; CHECK-NEXT: internal %116.sub2:vreg_128 = COPY %114.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: %154.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec - ; CHECK-NEXT: undef %177.sub0:vreg_128 = COPY %154.sub0 { - ; CHECK-NEXT: internal %177.sub2:vreg_128 = COPY %154.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %179.sub0:vreg_128 = COPY %177.sub0 { - ; CHECK-NEXT: internal %179.sub2:vreg_128 = COPY %177.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: SI_SPILL_V128_SAVE %179, %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5) - ; CHECK-NEXT: undef %210.sub2:vreg_128 = COPY %209.sub2 - ; CHECK-NEXT: %210.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec - ; CHECK-NEXT: undef %213.sub0:vreg_128 = COPY %210.sub0 { - ; CHECK-NEXT: internal %213.sub2:vreg_128 = COPY %210.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: SI_SPILL_V128_SAVE %213, %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5) - ; CHECK-NEXT: undef %189.sub2:vreg_128 = 
COPY %188.sub2 - ; CHECK-NEXT: %189.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec - ; CHECK-NEXT: undef %192.sub0:vreg_128 = COPY %189.sub0 { - ; CHECK-NEXT: internal %192.sub2:vreg_128 = COPY %189.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: SI_SPILL_V128_SAVE %192, %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5) - ; CHECK-NEXT: undef %124.sub2:vreg_128 = COPY %123.sub2 - ; CHECK-NEXT: %124.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec - ; CHECK-NEXT: undef %126.sub0:vreg_128 = COPY %124.sub0 { - ; CHECK-NEXT: internal %126.sub2:vreg_128 = COPY %124.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %130.sub2:vreg_128 = COPY %129.sub2 - ; CHECK-NEXT: %130.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec - ; CHECK-NEXT: undef %205.sub0:vreg_128 = COPY %130.sub0 { - ; CHECK-NEXT: internal %205.sub2:vreg_128 = COPY %130.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: SI_SPILL_V128_SAVE %205, %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5) - ; CHECK-NEXT: undef %136.sub2:vreg_128 = COPY %135.sub2 - ; CHECK-NEXT: %136.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec - ; CHECK-NEXT: undef %138.sub0:vreg_128 = COPY %136.sub0 { - ; CHECK-NEXT: internal %138.sub2:vreg_128 = COPY %136.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %142.sub2:vreg_128 = COPY %141.sub2 - ; CHECK-NEXT: %142.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec - ; CHECK-NEXT: undef %144.sub0:vreg_128 = COPY %142.sub0 { - ; CHECK-NEXT: internal %144.sub2:vreg_128 = COPY %142.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %148.sub2:vreg_128 = COPY %147.sub2 - ; CHECK-NEXT: %148.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec - ; 
CHECK-NEXT: undef %150.sub0:vreg_128 = COPY %148.sub0 { - ; CHECK-NEXT: internal %150.sub2:vreg_128 = COPY %148.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %160.sub2:vreg_128 = COPY %159.sub2 - ; CHECK-NEXT: %160.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec - ; CHECK-NEXT: undef %162.sub0:vreg_128 = COPY %160.sub0 { - ; CHECK-NEXT: internal %162.sub2:vreg_128 = COPY %160.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %166.sub2:vreg_128 = COPY %165.sub2 - ; CHECK-NEXT: %166.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec - ; CHECK-NEXT: undef %168.sub0:vreg_128 = COPY %166.sub0 { - ; CHECK-NEXT: internal %168.sub2:vreg_128 = COPY %166.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %175.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec - ; CHECK-NEXT: undef %172.sub2:vreg_128 = COPY %175.sub2 - ; CHECK-NEXT: %172.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec - ; CHECK-NEXT: undef %174.sub0:vreg_128 = COPY %172.sub0 { - ; CHECK-NEXT: internal %174.sub2:vreg_128 = COPY %172.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %187.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec - ; CHECK-NEXT: undef %184.sub2:vreg_128 = COPY %187.sub2 - ; CHECK-NEXT: %184.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec - ; CHECK-NEXT: undef %186.sub0:vreg_128 = COPY %184.sub0 { - ; CHECK-NEXT: internal %186.sub2:vreg_128 = COPY %184.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %200.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec - ; CHECK-NEXT: undef %197.sub2:vreg_128 = COPY %200.sub2 - ; CHECK-NEXT: %197.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec - ; CHECK-NEXT: undef %199.sub0:vreg_128 = COPY %197.sub0 { - ; 
CHECK-NEXT: internal %199.sub2:vreg_128 = COPY %197.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %220.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec - ; CHECK-NEXT: undef %204.sub2:vreg_128 = COPY %220.sub2 - ; CHECK-NEXT: %204.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec - ; CHECK-NEXT: undef %219.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec - ; CHECK-NEXT: undef %218.sub2:vreg_128 = COPY %219.sub2 - ; CHECK-NEXT: %218.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec - ; CHECK-NEXT: %36.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec - ; CHECK-NEXT: %37.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec - ; CHECK-NEXT: %38.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec - ; CHECK-NEXT: %40.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub1, implicit $exec - ; CHECK-NEXT: %41.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub0, implicit $exec - ; CHECK-NEXT: %42.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec - ; CHECK-NEXT: %43.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec - ; CHECK-NEXT: %43.sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: %43.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %43, %2, 0, 480, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) - ; CHECK-NEXT: %42.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %42.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %42, %2, 0, 496, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: %41.sub1:vreg_128 = COPY 
%43.sub1 - ; CHECK-NEXT: %41.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %41, %2, 0, 448, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1) - ; CHECK-NEXT: %40.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %40.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %40, %2, 0, 464, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: %38.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %38.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %38, %2, 0, 416, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) - ; CHECK-NEXT: %37.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %37.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %37, %2, 0, 432, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: %36.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %36.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %36, %2, 0, 384, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) - ; CHECK-NEXT: undef %216.sub0:vreg_128 = COPY %218.sub0 { - ; CHECK-NEXT: internal %216.sub2:vreg_128 = COPY %218.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: %216.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %216.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %216, %2, 0, 400, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: undef %202.sub0:vreg_128 = COPY %204.sub0 { - ; CHECK-NEXT: internal %202.sub2:vreg_128 = COPY %204.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: %202.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %202.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %202, %2, 0, 352, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) - ; CHECK-NEXT: undef %198.sub0:vreg_128 = COPY %199.sub0 { - ; CHECK-NEXT: internal %198.sub2:vreg_128 = COPY %199.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %195.sub0:vreg_128 = COPY %198.sub0 { - ; CHECK-NEXT: internal 
%195.sub2:vreg_128 = COPY %198.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: %195.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %195.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %195, %2, 0, 368, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: undef %185.sub0:vreg_128 = COPY %186.sub0 { - ; CHECK-NEXT: internal %185.sub2:vreg_128 = COPY %186.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %182.sub0:vreg_128 = COPY %185.sub0 { - ; CHECK-NEXT: internal %182.sub2:vreg_128 = COPY %185.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: %182.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %182.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %182, %2, 0, 320, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1) - ; CHECK-NEXT: undef %173.sub0:vreg_128 = COPY %174.sub0 { - ; CHECK-NEXT: internal %173.sub2:vreg_128 = COPY %174.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %170.sub0:vreg_128 = COPY %173.sub0 { - ; CHECK-NEXT: internal %170.sub2:vreg_128 = COPY %173.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: %170.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %170.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %170, %2, 0, 336, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: undef %167.sub0:vreg_128 = COPY %168.sub0 { - ; CHECK-NEXT: internal %167.sub2:vreg_128 = COPY %168.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %164.sub0:vreg_128 = COPY %167.sub0 { - ; CHECK-NEXT: internal %164.sub2:vreg_128 = COPY %167.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: %164.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %164.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %164, %2, 0, 288, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) - ; CHECK-NEXT: undef %161.sub0:vreg_128 = COPY %162.sub0 { - ; CHECK-NEXT: internal %161.sub2:vreg_128 = COPY %162.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %158.sub0:vreg_128 = COPY %161.sub0 { - ; CHECK-NEXT: internal 
%158.sub2:vreg_128 = COPY %161.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: %158.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %158.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %158, %2, 0, 304, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: undef %149.sub0:vreg_128 = COPY %150.sub0 { - ; CHECK-NEXT: internal %149.sub2:vreg_128 = COPY %150.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %146.sub0:vreg_128 = COPY %149.sub0 { - ; CHECK-NEXT: internal %146.sub2:vreg_128 = COPY %149.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: %146.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %146.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %146, %2, 0, 256, 0, 0, implicit $exec :: (store (s128), align 256, addrspace 1) - ; CHECK-NEXT: undef %143.sub0:vreg_128 = COPY %144.sub0 { - ; CHECK-NEXT: internal %143.sub2:vreg_128 = COPY %144.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %140.sub0:vreg_128 = COPY %143.sub0 { - ; CHECK-NEXT: internal %140.sub2:vreg_128 = COPY %143.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: %140.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %140.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %140, %2, 0, 272, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: undef %137.sub0:vreg_128 = COPY %138.sub0 { - ; CHECK-NEXT: internal %137.sub2:vreg_128 = COPY %138.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: undef %134.sub0:vreg_128 = COPY %137.sub0 { - ; CHECK-NEXT: internal %134.sub2:vreg_128 = COPY %137.sub2 - ; CHECK-NEXT: } - ; CHECK-NEXT: %134.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %134.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %134, %2, 0, 224, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 61440 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = COPY 
[[S_LOAD_DWORDX4_IMM]].sub0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub1 + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:sgpr_128 = COPY [[S_LOAD_DWORDX4_IMM]].sub3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub3 + ; CHECK-NEXT: early-clobber %4:vreg_128, early-clobber %5:vreg_128, early-clobber %6:vreg_128, early-clobber %7:vreg_128 = BUNDLE [[COPY1]], implicit $exec { + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 0, 0, 0, implicit $exec :: (load (s128), align 128, addrspace 1) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 16, 0, 0, implicit $exec :: (load (s128), addrspace 1) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 32, 0, 0, implicit $exec :: (load (s128), align 32, addrspace 1) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 48, 0, 0, implicit $exec :: (load (s128), addrspace 1) + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_1:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_2:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_3:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_4:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, 
implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_5:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_6:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_7:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_8:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_9:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_10:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_11:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_12:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_13:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_14:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_15:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 64, 0, 0, implicit $exec :: (load (s128), align 64, addrspace 1) + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_16:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec + ; CHECK-NEXT: undef 
[[V_LSHRREV_B32_e32_17:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_18:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_19:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 80, 0, 0, implicit $exec :: (load (s128), addrspace 1) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 96, 0, 0, implicit $exec :: (load (s128), align 32, addrspace 1) + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_20:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_21:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_22:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec + ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 112, 0, 0, implicit $exec :: (load (s128), addrspace 1) + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_23:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub1, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_24:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub0, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_25:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_26:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 
65535 + ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_]].sub2 + ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec + ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:vreg_128 = COPY [[COPY2]].sub0 { + ; CHECK-NEXT: internal [[COPY3]].sub2:vreg_128 = COPY [[COPY2]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY3]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY4:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_1]].sub2 + ; CHECK-NEXT: [[COPY4:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec + ; CHECK-NEXT: undef [[COPY5:%[0-9]+]].sub0:vreg_128 = COPY [[COPY4]].sub0 { + ; CHECK-NEXT: internal [[COPY5]].sub2:vreg_128 = COPY [[COPY4]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY5]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY6:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_2]].sub2 + ; CHECK-NEXT: [[COPY6:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec + ; CHECK-NEXT: undef [[COPY7:%[0-9]+]].sub0:vreg_128 = COPY [[COPY6]].sub0 { + ; CHECK-NEXT: internal [[COPY7]].sub2:vreg_128 = COPY [[COPY6]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY7]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY8:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_3]].sub2 + ; CHECK-NEXT: [[COPY8:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec + ; CHECK-NEXT: undef [[COPY9:%[0-9]+]].sub0:vreg_128 = COPY [[COPY8]].sub0 { + ; CHECK-NEXT: internal [[COPY9]].sub2:vreg_128 = COPY [[COPY8]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: 
SI_SPILL_V128_SAVE [[COPY9]], %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY10:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_4]].sub2 + ; CHECK-NEXT: [[COPY10:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec + ; CHECK-NEXT: undef [[COPY11:%[0-9]+]].sub0:vreg_128 = COPY [[COPY10]].sub0 { + ; CHECK-NEXT: internal [[COPY11]].sub2:vreg_128 = COPY [[COPY10]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY11]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY12:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_5]].sub2 + ; CHECK-NEXT: [[COPY12:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec + ; CHECK-NEXT: undef [[COPY13:%[0-9]+]].sub0:vreg_128 = COPY [[COPY12]].sub0 { + ; CHECK-NEXT: internal [[COPY13]].sub2:vreg_128 = COPY [[COPY12]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY13]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY14:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_6]].sub2 + ; CHECK-NEXT: [[COPY14:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec + ; CHECK-NEXT: undef [[COPY15:%[0-9]+]].sub0:vreg_128 = COPY [[COPY14]].sub0 { + ; CHECK-NEXT: internal [[COPY15]].sub2:vreg_128 = COPY [[COPY14]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY15]], %stack.7, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.7, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY16:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_7]].sub2 + ; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec + ; CHECK-NEXT: undef 
[[COPY17:%[0-9]+]].sub0:vreg_128 = COPY [[COPY16]].sub0 { + ; CHECK-NEXT: internal [[COPY17]].sub2:vreg_128 = COPY [[COPY16]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY17]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.6, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY18:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_8]].sub2 + ; CHECK-NEXT: [[COPY18:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec + ; CHECK-NEXT: undef [[COPY19:%[0-9]+]].sub0:vreg_128 = COPY [[COPY18]].sub0 { + ; CHECK-NEXT: internal [[COPY19]].sub2:vreg_128 = COPY [[COPY18]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[COPY20:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_9]].sub2 + ; CHECK-NEXT: [[COPY20:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec + ; CHECK-NEXT: undef [[COPY21:%[0-9]+]].sub0:vreg_128 = COPY [[COPY20]].sub0 { + ; CHECK-NEXT: internal [[COPY21]].sub2:vreg_128 = COPY [[COPY20]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_10:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec + ; CHECK-NEXT: undef [[COPY22:%[0-9]+]].sub0:vreg_128 = COPY [[V_LSHRREV_B32_e32_10]].sub0 { + ; CHECK-NEXT: internal [[COPY22]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_10]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[COPY23:%[0-9]+]].sub0:vreg_128 = COPY [[COPY22]].sub0 { + ; CHECK-NEXT: internal [[COPY23]].sub2:vreg_128 = COPY [[COPY22]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY23]], %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY24:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_11]].sub2 + ; CHECK-NEXT: [[COPY24:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec + ; CHECK-NEXT: undef 
[[COPY25:%[0-9]+]].sub0:vreg_128 = COPY [[COPY24]].sub0 { + ; CHECK-NEXT: internal [[COPY25]].sub2:vreg_128 = COPY [[COPY24]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY25]], %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY26:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_12]].sub2 + ; CHECK-NEXT: [[COPY26:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec + ; CHECK-NEXT: undef [[COPY27:%[0-9]+]].sub0:vreg_128 = COPY [[COPY26]].sub0 { + ; CHECK-NEXT: internal [[COPY27]].sub2:vreg_128 = COPY [[COPY26]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY27]], %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY28:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_13]].sub2 + ; CHECK-NEXT: [[COPY28:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec + ; CHECK-NEXT: undef [[COPY29:%[0-9]+]].sub0:vreg_128 = COPY [[COPY28]].sub0 { + ; CHECK-NEXT: internal [[COPY29]].sub2:vreg_128 = COPY [[COPY28]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[COPY30:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_14]].sub2 + ; CHECK-NEXT: [[COPY30:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec + ; CHECK-NEXT: undef [[COPY31:%[0-9]+]].sub0:vreg_128 = COPY [[COPY30]].sub0 { + ; CHECK-NEXT: internal [[COPY31]].sub2:vreg_128 = COPY [[COPY30]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY31]], %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5) + ; CHECK-NEXT: undef [[COPY32:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_15]].sub2 + ; CHECK-NEXT: [[COPY32:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec + ; 
CHECK-NEXT: undef [[COPY33:%[0-9]+]].sub0:vreg_128 = COPY [[COPY32]].sub0 { + ; CHECK-NEXT: internal [[COPY33]].sub2:vreg_128 = COPY [[COPY32]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[COPY34:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_16]].sub2 + ; CHECK-NEXT: [[COPY34:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec + ; CHECK-NEXT: undef [[COPY35:%[0-9]+]].sub0:vreg_128 = COPY [[COPY34]].sub0 { + ; CHECK-NEXT: internal [[COPY35]].sub2:vreg_128 = COPY [[COPY34]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[COPY36:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_17]].sub2 + ; CHECK-NEXT: [[COPY36:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec + ; CHECK-NEXT: undef [[COPY37:%[0-9]+]].sub0:vreg_128 = COPY [[COPY36]].sub0 { + ; CHECK-NEXT: internal [[COPY37]].sub2:vreg_128 = COPY [[COPY36]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[COPY38:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_18]].sub2 + ; CHECK-NEXT: [[COPY38:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec + ; CHECK-NEXT: undef [[COPY39:%[0-9]+]].sub0:vreg_128 = COPY [[COPY38]].sub0 { + ; CHECK-NEXT: internal [[COPY39]].sub2:vreg_128 = COPY [[COPY38]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[COPY40:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_19]].sub2 + ; CHECK-NEXT: [[COPY40:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec + ; CHECK-NEXT: undef [[COPY41:%[0-9]+]].sub0:vreg_128 = COPY [[COPY40]].sub0 { + ; CHECK-NEXT: internal [[COPY41]].sub2:vreg_128 = COPY [[COPY40]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_27:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec + ; CHECK-NEXT: undef [[COPY42:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub2 + 
; CHECK-NEXT: [[COPY42:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec + ; CHECK-NEXT: undef [[COPY43:%[0-9]+]].sub0:vreg_128 = COPY [[COPY42]].sub0 { + ; CHECK-NEXT: internal [[COPY43]].sub2:vreg_128 = COPY [[COPY42]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_28:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec + ; CHECK-NEXT: undef [[COPY44:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_28]].sub2 + ; CHECK-NEXT: [[COPY44:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec + ; CHECK-NEXT: undef [[COPY45:%[0-9]+]].sub0:vreg_128 = COPY [[COPY44]].sub0 { + ; CHECK-NEXT: internal [[COPY45]].sub2:vreg_128 = COPY [[COPY44]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_29:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec + ; CHECK-NEXT: undef [[COPY46:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_29]].sub2 + ; CHECK-NEXT: [[COPY46:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec + ; CHECK-NEXT: undef [[COPY47:%[0-9]+]].sub0:vreg_128 = COPY [[COPY46]].sub0 { + ; CHECK-NEXT: internal [[COPY47]].sub2:vreg_128 = COPY [[COPY46]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_30:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec + ; CHECK-NEXT: undef [[COPY48:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_30]].sub2 + ; CHECK-NEXT: [[COPY48:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec + ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_31:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec + ; CHECK-NEXT: undef [[COPY49:%[0-9]+]].sub2:vreg_128 = COPY 
[[V_LSHRREV_B32_e32_31]].sub2 + ; CHECK-NEXT: [[COPY49:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_20:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_21:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_22:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_23:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub1, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_24:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_25:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_26:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_26:%[0-9]+]].sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_26:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_26]], [[S_MOV_B32_]], 0, 480, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_25:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_25:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_25]], [[S_MOV_B32_]], 0, 496, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_24:%[0-9]+]].sub1:vreg_128 = COPY 
[[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_24:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_24]], [[S_MOV_B32_]], 0, 448, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1) + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_23:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_23:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_23]], [[S_MOV_B32_]], 0, 464, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_22:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_22:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_22]], [[S_MOV_B32_]], 0, 416, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_21:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_21:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_21]], [[S_MOV_B32_]], 0, 432, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_20:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[V_LSHRREV_B32_e32_20:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_20]], [[S_MOV_B32_]], 0, 384, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + ; CHECK-NEXT: undef [[COPY50:%[0-9]+]].sub0:vreg_128 = COPY [[COPY49]].sub0 { + ; CHECK-NEXT: internal [[COPY50]].sub2:vreg_128 = COPY [[COPY49]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY50:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY50:%[0-9]+]].sub3:vreg_128 = 
COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY50]], [[S_MOV_B32_]], 0, 400, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: undef [[COPY51:%[0-9]+]].sub0:vreg_128 = COPY [[COPY48]].sub0 { + ; CHECK-NEXT: internal [[COPY51]].sub2:vreg_128 = COPY [[COPY48]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY51:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY51:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY51]], [[S_MOV_B32_]], 0, 352, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK-NEXT: undef [[COPY52:%[0-9]+]].sub0:vreg_128 = COPY [[COPY47]].sub0 { + ; CHECK-NEXT: internal [[COPY52]].sub2:vreg_128 = COPY [[COPY47]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[COPY53:%[0-9]+]].sub0:vreg_128 = COPY [[COPY52]].sub0 { + ; CHECK-NEXT: internal [[COPY53]].sub2:vreg_128 = COPY [[COPY52]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY53:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY53:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY53]], [[S_MOV_B32_]], 0, 368, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: undef [[COPY54:%[0-9]+]].sub0:vreg_128 = COPY [[COPY45]].sub0 { + ; CHECK-NEXT: internal [[COPY54]].sub2:vreg_128 = COPY [[COPY45]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[COPY55:%[0-9]+]].sub0:vreg_128 = COPY [[COPY54]].sub0 { + ; CHECK-NEXT: internal [[COPY55]].sub2:vreg_128 = COPY [[COPY54]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY55:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY55:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY55]], [[S_MOV_B32_]], 0, 320, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1) + ; CHECK-NEXT: undef 
[[COPY56:%[0-9]+]].sub0:vreg_128 = COPY [[COPY43]].sub0 { + ; CHECK-NEXT: internal [[COPY56]].sub2:vreg_128 = COPY [[COPY43]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[COPY57:%[0-9]+]].sub0:vreg_128 = COPY [[COPY56]].sub0 { + ; CHECK-NEXT: internal [[COPY57]].sub2:vreg_128 = COPY [[COPY56]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY57:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY57:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY57]], [[S_MOV_B32_]], 0, 336, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: undef [[COPY58:%[0-9]+]].sub0:vreg_128 = COPY [[COPY41]].sub0 { + ; CHECK-NEXT: internal [[COPY58]].sub2:vreg_128 = COPY [[COPY41]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[COPY59:%[0-9]+]].sub0:vreg_128 = COPY [[COPY58]].sub0 { + ; CHECK-NEXT: internal [[COPY59]].sub2:vreg_128 = COPY [[COPY58]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY59:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY59:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY59]], [[S_MOV_B32_]], 0, 288, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK-NEXT: undef [[COPY60:%[0-9]+]].sub0:vreg_128 = COPY [[COPY39]].sub0 { + ; CHECK-NEXT: internal [[COPY60]].sub2:vreg_128 = COPY [[COPY39]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[COPY61:%[0-9]+]].sub0:vreg_128 = COPY [[COPY60]].sub0 { + ; CHECK-NEXT: internal [[COPY61]].sub2:vreg_128 = COPY [[COPY60]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY61:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY61:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY61]], [[S_MOV_B32_]], 0, 304, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: undef [[COPY62:%[0-9]+]].sub0:vreg_128 = COPY 
[[COPY37]].sub0 { + ; CHECK-NEXT: internal [[COPY62]].sub2:vreg_128 = COPY [[COPY37]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[COPY63:%[0-9]+]].sub0:vreg_128 = COPY [[COPY62]].sub0 { + ; CHECK-NEXT: internal [[COPY63]].sub2:vreg_128 = COPY [[COPY62]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY63:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY63:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY63]], [[S_MOV_B32_]], 0, 256, 0, 0, implicit $exec :: (store (s128), align 256, addrspace 1) + ; CHECK-NEXT: undef [[COPY64:%[0-9]+]].sub0:vreg_128 = COPY [[COPY35]].sub0 { + ; CHECK-NEXT: internal [[COPY64]].sub2:vreg_128 = COPY [[COPY35]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[COPY65:%[0-9]+]].sub0:vreg_128 = COPY [[COPY64]].sub0 { + ; CHECK-NEXT: internal [[COPY65]].sub2:vreg_128 = COPY [[COPY64]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY65:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY65:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY65]], [[S_MOV_B32_]], 0, 272, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: undef [[COPY66:%[0-9]+]].sub0:vreg_128 = COPY [[COPY33]].sub0 { + ; CHECK-NEXT: internal [[COPY66]].sub2:vreg_128 = COPY [[COPY33]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: undef [[COPY67:%[0-9]+]].sub0:vreg_128 = COPY [[COPY66]].sub0 { + ; CHECK-NEXT: internal [[COPY67]].sub2:vreg_128 = COPY [[COPY66]].sub2 + ; CHECK-NEXT: } + ; CHECK-NEXT: [[COPY67:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY67:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY67]], [[S_MOV_B32_]], 0, 224, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 
0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5) - ; CHECK-NEXT: undef %131.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub0 { - ; CHECK-NEXT: internal %131.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub2 + ; CHECK-NEXT: undef [[COPY68:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub0 { + ; CHECK-NEXT: internal [[COPY68]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: undef %128.sub0:vreg_128 = COPY %131.sub0 { - ; CHECK-NEXT: internal %128.sub2:vreg_128 = COPY %131.sub2 + ; CHECK-NEXT: undef [[COPY69:%[0-9]+]].sub0:vreg_128 = COPY [[COPY68]].sub0 { + ; CHECK-NEXT: internal [[COPY69]].sub2:vreg_128 = COPY [[COPY68]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: %128.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %128.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %128, %2, 0, 240, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: undef %125.sub0:vreg_128 = COPY %126.sub0 { - ; CHECK-NEXT: internal %125.sub2:vreg_128 = COPY %126.sub2 + ; CHECK-NEXT: [[COPY69:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY69:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY69]], [[S_MOV_B32_]], 0, 240, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: undef [[COPY70:%[0-9]+]].sub0:vreg_128 = COPY [[COPY29]].sub0 { + ; CHECK-NEXT: internal [[COPY70]].sub2:vreg_128 = COPY [[COPY29]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: undef %122.sub0:vreg_128 = COPY %125.sub0 { - ; CHECK-NEXT: internal %122.sub2:vreg_128 = COPY %125.sub2 + ; CHECK-NEXT: undef [[COPY71:%[0-9]+]].sub0:vreg_128 = COPY [[COPY70]].sub0 { + ; CHECK-NEXT: internal [[COPY71]].sub2:vreg_128 = COPY [[COPY70]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: %122.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %122.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %122, %2, 0, 192, 0, 0, 
implicit $exec :: (store (s128), align 64, addrspace 1) + ; CHECK-NEXT: [[COPY71:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY71:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY71]], [[S_MOV_B32_]], 0, 192, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1) ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5) - ; CHECK-NEXT: undef %190.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub0 { - ; CHECK-NEXT: internal %190.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub2 + ; CHECK-NEXT: undef [[COPY72:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub0 { + ; CHECK-NEXT: internal [[COPY72]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: undef %120.sub0:vreg_128 = COPY %190.sub0 { - ; CHECK-NEXT: internal %120.sub2:vreg_128 = COPY %190.sub2 + ; CHECK-NEXT: undef [[COPY73:%[0-9]+]].sub0:vreg_128 = COPY [[COPY72]].sub0 { + ; CHECK-NEXT: internal [[COPY73]].sub2:vreg_128 = COPY [[COPY72]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: %120.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %120.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %120, %2, 0, 208, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: [[COPY73:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY73:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY73]], [[S_MOV_B32_]], 0, 208, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5) - ; CHECK-NEXT: undef %211.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub0 { - ; CHECK-NEXT: internal 
%211.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub2 + ; CHECK-NEXT: undef [[COPY74:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub0 { + ; CHECK-NEXT: internal [[COPY74]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: undef %208.sub0:vreg_128 = COPY %211.sub0 { - ; CHECK-NEXT: internal %208.sub2:vreg_128 = COPY %211.sub2 + ; CHECK-NEXT: undef [[COPY75:%[0-9]+]].sub0:vreg_128 = COPY [[COPY74]].sub0 { + ; CHECK-NEXT: internal [[COPY75]].sub2:vreg_128 = COPY [[COPY74]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: %208.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %208.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %208, %2, 0, 160, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK-NEXT: [[COPY75:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY75:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY75]], [[S_MOV_B32_]], 0, 160, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5) - ; CHECK-NEXT: undef %178.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub0 { - ; CHECK-NEXT: internal %178.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub2 + ; CHECK-NEXT: undef [[COPY76:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub0 { + ; CHECK-NEXT: internal [[COPY76]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: undef %152.sub0:vreg_128 = COPY %178.sub0 { - ; CHECK-NEXT: internal %152.sub2:vreg_128 = COPY %178.sub2 + ; CHECK-NEXT: undef [[COPY77:%[0-9]+]].sub0:vreg_128 = COPY [[COPY76]].sub0 { + ; CHECK-NEXT: internal [[COPY77]].sub2:vreg_128 = COPY [[COPY76]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: %152.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: 
%152.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %152, %2, 0, 176, 0, 0, implicit $exec :: (store (s128), addrspace 1) - ; CHECK-NEXT: undef %115.sub0:vreg_128 = COPY %116.sub0 { - ; CHECK-NEXT: internal %115.sub2:vreg_128 = COPY %116.sub2 + ; CHECK-NEXT: [[COPY77:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY77:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY77]], [[S_MOV_B32_]], 0, 176, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: undef [[COPY78:%[0-9]+]].sub0:vreg_128 = COPY [[COPY21]].sub0 { + ; CHECK-NEXT: internal [[COPY78]].sub2:vreg_128 = COPY [[COPY21]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: undef %112.sub0:vreg_128 = COPY %115.sub0 { - ; CHECK-NEXT: internal %112.sub2:vreg_128 = COPY %115.sub2 + ; CHECK-NEXT: undef [[COPY79:%[0-9]+]].sub0:vreg_128 = COPY [[COPY78]].sub0 { + ; CHECK-NEXT: internal [[COPY79]].sub2:vreg_128 = COPY [[COPY78]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: %112.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %112.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %112, %2, 0, 128, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) - ; CHECK-NEXT: undef %109.sub0:vreg_128 = COPY %110.sub0 { - ; CHECK-NEXT: internal %109.sub2:vreg_128 = COPY %110.sub2 + ; CHECK-NEXT: [[COPY79:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY79:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY79]], [[S_MOV_B32_]], 0, 128, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1) + ; CHECK-NEXT: undef [[COPY80:%[0-9]+]].sub0:vreg_128 = COPY [[COPY19]].sub0 { + ; CHECK-NEXT: internal [[COPY80]].sub2:vreg_128 = COPY [[COPY19]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: undef %106.sub0:vreg_128 = COPY %109.sub0 { - ; CHECK-NEXT: internal %106.sub2:vreg_128 = COPY %109.sub2 + ; 
CHECK-NEXT: undef [[COPY81:%[0-9]+]].sub0:vreg_128 = COPY [[COPY80]].sub0 { + ; CHECK-NEXT: internal [[COPY81]].sub2:vreg_128 = COPY [[COPY80]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: %106.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %106.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %106, %2, 0, 144, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: [[COPY81:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY81:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY81]], [[S_MOV_B32_]], 0, 144, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.6, align 4, addrspace 5) - ; CHECK-NEXT: undef %103.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub0 { - ; CHECK-NEXT: internal %103.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub2 + ; CHECK-NEXT: undef [[COPY82:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub0 { + ; CHECK-NEXT: internal [[COPY82]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: undef %100.sub0:vreg_128 = COPY %103.sub0 { - ; CHECK-NEXT: internal %100.sub2:vreg_128 = COPY %103.sub2 + ; CHECK-NEXT: undef [[COPY83:%[0-9]+]].sub0:vreg_128 = COPY [[COPY82]].sub0 { + ; CHECK-NEXT: internal [[COPY83]].sub2:vreg_128 = COPY [[COPY82]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: %100.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %100.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %100, %2, 0, 96, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK-NEXT: [[COPY83:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY83:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY83]], [[S_MOV_B32_]], 0, 96, 0, 0, 
implicit $exec :: (store (s128), align 32, addrspace 1) ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5) - ; CHECK-NEXT: undef %97.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub0 { - ; CHECK-NEXT: internal %97.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub2 + ; CHECK-NEXT: undef [[COPY84:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub0 { + ; CHECK-NEXT: internal [[COPY84]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: undef %94.sub0:vreg_128 = COPY %97.sub0 { - ; CHECK-NEXT: internal %94.sub2:vreg_128 = COPY %97.sub2 + ; CHECK-NEXT: undef [[COPY85:%[0-9]+]].sub0:vreg_128 = COPY [[COPY84]].sub0 { + ; CHECK-NEXT: internal [[COPY85]].sub2:vreg_128 = COPY [[COPY84]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: %94.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %94.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %94, %2, 0, 112, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: [[COPY85:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY85:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY85]], [[S_MOV_B32_]], 0, 112, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5) - ; CHECK-NEXT: undef %89.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub0 { - ; CHECK-NEXT: internal %89.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub2 + ; CHECK-NEXT: undef [[COPY86:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub0 { + ; CHECK-NEXT: internal [[COPY86]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: undef %86.sub0:vreg_128 = COPY %89.sub0 { - ; 
CHECK-NEXT: internal %86.sub2:vreg_128 = COPY %89.sub2 + ; CHECK-NEXT: undef [[COPY87:%[0-9]+]].sub0:vreg_128 = COPY [[COPY86]].sub0 { + ; CHECK-NEXT: internal [[COPY87]].sub2:vreg_128 = COPY [[COPY86]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: %86.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %86.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %86, %2, 0, 64, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1) + ; CHECK-NEXT: [[COPY87:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY87:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY87]], [[S_MOV_B32_]], 0, 64, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1) ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE7:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: undef %81.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub0 { - ; CHECK-NEXT: internal %81.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub2 + ; CHECK-NEXT: undef [[COPY88:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub0 { + ; CHECK-NEXT: internal [[COPY88]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: undef %78.sub0:vreg_128 = COPY %81.sub0 { - ; CHECK-NEXT: internal %78.sub2:vreg_128 = COPY %81.sub2 + ; CHECK-NEXT: undef [[COPY89:%[0-9]+]].sub0:vreg_128 = COPY [[COPY88]].sub0 { + ; CHECK-NEXT: internal [[COPY89]].sub2:vreg_128 = COPY [[COPY88]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: %78.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %78.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %78, %2, 0, 80, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: [[COPY89:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY89:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: 
BUFFER_STORE_DWORDX4_OFFSET [[COPY89]], [[S_MOV_B32_]], 0, 80, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE8:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5) - ; CHECK-NEXT: undef %73.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub0 { - ; CHECK-NEXT: internal %73.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub2 + ; CHECK-NEXT: undef [[COPY90:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub0 { + ; CHECK-NEXT: internal [[COPY90]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: undef %70.sub0:vreg_128 = COPY %73.sub0 { - ; CHECK-NEXT: internal %70.sub2:vreg_128 = COPY %73.sub2 + ; CHECK-NEXT: undef [[COPY91:%[0-9]+]].sub0:vreg_128 = COPY [[COPY90]].sub0 { + ; CHECK-NEXT: internal [[COPY91]].sub2:vreg_128 = COPY [[COPY90]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: %70.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %70.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %70, %2, 0, 32, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) + ; CHECK-NEXT: [[COPY91:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY91:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY91]], [[S_MOV_B32_]], 0, 32, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1) ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE9:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: undef %65.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub0 { - ; CHECK-NEXT: internal %65.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub2 + ; CHECK-NEXT: undef [[COPY92:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub0 { + ; CHECK-NEXT: internal [[COPY92]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub2 ; 
CHECK-NEXT: } - ; CHECK-NEXT: undef %62.sub0:vreg_128 = COPY %65.sub0 { - ; CHECK-NEXT: internal %62.sub2:vreg_128 = COPY %65.sub2 + ; CHECK-NEXT: undef [[COPY93:%[0-9]+]].sub0:vreg_128 = COPY [[COPY92]].sub0 { + ; CHECK-NEXT: internal [[COPY93]].sub2:vreg_128 = COPY [[COPY92]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: %62.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %62.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %62, %2, 0, 48, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: [[COPY93:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY93:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY93]], [[S_MOV_B32_]], 0, 48, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE10:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: undef %57.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub0 { - ; CHECK-NEXT: internal %57.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub2 + ; CHECK-NEXT: undef [[COPY94:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub0 { + ; CHECK-NEXT: internal [[COPY94]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: undef %54.sub0:vreg_128 = COPY %57.sub0 { - ; CHECK-NEXT: internal %54.sub2:vreg_128 = COPY %57.sub2 + ; CHECK-NEXT: undef [[COPY95:%[0-9]+]].sub0:vreg_128 = COPY [[COPY94]].sub0 { + ; CHECK-NEXT: internal [[COPY95]].sub2:vreg_128 = COPY [[COPY94]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: %54.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %54.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %54, %2, 0, 0, 0, 0, implicit $exec :: (store (s128), align 512, addrspace 1) + ; CHECK-NEXT: [[COPY95:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY95:%[0-9]+]].sub3:vreg_128 
= COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY95]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (store (s128), align 512, addrspace 1) ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE11:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: undef %49.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE11]].sub0 { - ; CHECK-NEXT: internal %49.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE11]].sub2 + ; CHECK-NEXT: undef [[COPY96:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE11]].sub0 { + ; CHECK-NEXT: internal [[COPY96]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE11]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: undef %46.sub0:vreg_128 = COPY %49.sub0 { - ; CHECK-NEXT: internal %46.sub2:vreg_128 = COPY %49.sub2 + ; CHECK-NEXT: undef [[COPY97:%[0-9]+]].sub0:vreg_128 = COPY [[COPY96]].sub0 { + ; CHECK-NEXT: internal [[COPY97]].sub2:vreg_128 = COPY [[COPY96]].sub2 ; CHECK-NEXT: } - ; CHECK-NEXT: %46.sub1:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: %46.sub3:vreg_128 = COPY %43.sub1 - ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %46, %2, 0, 16, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: [[COPY97:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: [[COPY97:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1 + ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY97]], [[S_MOV_B32_]], 0, 16, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: S_ENDPGM 0 %0:sgpr_64(p4) = COPY $sgpr0_sgpr1 %1:sgpr_128 = S_LOAD_DWORDX4_IMM %0(p4), 9, 0 :: (dereferenceable invariant load (s128), align 4, addrspace 4) diff --git a/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir b/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir index e10d882aeb299..b428e859a6d32 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir +++ 
b/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir @@ -20,18 +20,18 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: %0.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %0, 2147549193 /* reguse tiedto:$1 */, %0(tied-def 3) + ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (load (s32), addrspace 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def [[V_MOV_B32_e32_]], 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_]](tied-def 3) ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_]] - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def undef %0.sub0, 851978 /* regdef:VGPR_16 */, def undef %0.sub1 - ; CHECK-NEXT: S_NOP 0, implicit %0.sub1 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def undef [[V_MOV_B32_e32_]].sub0, 851978 /* regdef:VGPR_16 */, def undef [[V_MOV_B32_e32_]].sub1 + ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1 ; CHECK-NEXT: $sgpr10 = S_MOV_B32 -1 ; 
CHECK-NEXT: S_BRANCH %bb.1 bb.0: @@ -61,18 +61,18 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: %0.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (load (s32), addrspace 3) - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def %0, 2147549193 /* reguse tiedto:$1 */, %0(tied-def 3) + ; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (load (s32), addrspace 3) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def [[V_MOV_B32_e32_]], 2147549193 /* reguse tiedto:$1 */, [[V_MOV_B32_e32_]](tied-def 3) ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851977 /* reguse:VGPR_16 */, [[DS_READ_B32_gfx9_]] - ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def undef %0.sub1, 851978 /* regdef:VGPR_16 */, def undef %0.sub0 - ; CHECK-NEXT: S_NOP 0, implicit %0.sub1 + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_16 */, def undef [[V_MOV_B32_e32_]].sub1, 851978 /* regdef:VGPR_16 */, def undef [[V_MOV_B32_e32_]].sub0 + ; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1 ; CHECK-NEXT: $sgpr10 = S_MOV_B32 -1 ; CHECK-NEXT: S_BRANCH %bb.1 bb.0: diff --git 
a/llvm/test/CodeGen/AMDGPU/trans-forwarding-hazards.mir b/llvm/test/CodeGen/AMDGPU/trans-forwarding-hazards.mir index 24e0b11786f58..2843d83d3a2f2 100644 --- a/llvm/test/CodeGen/AMDGPU/trans-forwarding-hazards.mir +++ b/llvm/test/CodeGen/AMDGPU/trans-forwarding-hazards.mir @@ -12,6 +12,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 4095 ; GFX11-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX1150-LABEL: name: trans_use_1_hazard ; GFX1150: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec ; GFX1150-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec @@ -61,6 +62,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 4095 ; GFX11-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX1150-LABEL: name: trans_use_2_hazard ; GFX1150: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec ; GFX1150-NEXT: $sgpr0 = S_MOV_B32 0 @@ -108,6 +110,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 4095 ; GFX11-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX1150-LABEL: name: trans_use_3_hazard ; GFX1150: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec ; GFX1150-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec @@ -207,6 +210,7 @@ body: | ; GFX11-NEXT: $vgpr5 = V_ADD_F32_e32 $vgpr1, $vgpr4, implicit $mode, implicit $exec ; GFX11-NEXT: $vgpr7 = V_ADD_F32_e32 $vgpr3, $vgpr6, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX1150-LABEL: name: trans_use_4_one_depctr_1 ; GFX1150: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec ; GFX1150-NEXT: $vgpr3 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec @@ -231,6 +235,7 @@ body: | ; GFX11-NEXT: $vgpr5 = V_ADD_F32_e32 $vgpr3, $vgpr4, implicit $mode, implicit $exec ; GFX11-NEXT: $vgpr7 = V_ADD_F32_e32 $vgpr1, $vgpr6, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX1150-LABEL: name: trans_use_4_one_depctr_2 
; GFX1150: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec ; GFX1150-NEXT: $vgpr3 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec @@ -255,6 +260,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 4095 ; GFX11-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX1150-LABEL: name: trans_use_4 ; GFX1150: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec ; GFX1150-NEXT: $vgpr10 = V_SQRT_F32_e32 $vgpr11, implicit $mode, implicit $exec @@ -293,6 +299,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 4095 ; GFX11-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX1150-LABEL: name: trans_use_branching_1a ; GFX1150: bb.0: ; GFX1150-NEXT: successors: %bb.2(0x80000000) @@ -353,6 +360,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 4095 ; GFX11-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX1150-LABEL: name: trans_use_branching_1b ; GFX1150: bb.0: ; GFX1150-NEXT: successors: %bb.2(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll index 5b77024852056..eef5f57beb07d 100644 --- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll +++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll @@ -5,8 +5,9 @@ define amdgpu_ps float @simple_test_return_to_epilog(float %a) #0 { ; GCN-LABEL: name: simple_test_return_to_epilog ; GCN: bb.0.entry: - ; GCN: liveins: $vgpr0 - ; GCN: SI_RETURN_TO_EPILOG killed $vgpr0 + ; GCN-NEXT: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 entry: ret float %a } @@ -14,20 +15,26 @@ entry: define amdgpu_ps float @test_return_to_epilog_into_end_block(i32 inreg %a, float %b) #0 { ; GCN-LABEL: name: test_return_to_epilog_into_end_block ; GCN: bb.0.entry: - ; GCN: successors: %bb.1(0x80000000), 
%bb.2(0x00000000) - ; GCN: liveins: $sgpr2, $vgpr0 - ; GCN: S_CMP_LT_I32 killed renamable $sgpr2, 1, implicit-def $scc - ; GCN: S_CBRANCH_SCC1 %bb.2, implicit killed $scc - ; GCN: bb.1.if: - ; GCN: successors: %bb.3(0x80000000) - ; GCN: liveins: $vgpr0 - ; GCN: S_BRANCH %bb.3 - ; GCN: bb.2.else: - ; GCN: successors: - ; GCN: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; GCN: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN: S_WAITCNT_soft 3952 - ; GCN: bb.3: + ; GCN-NEXT: successors: %bb.1(0x80000000), %bb.2(0x00000000) + ; GCN-NEXT: liveins: $sgpr2, $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_CMP_LT_I32 killed renamable $sgpr2, 1, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1.if: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2.else: + ; GCN-NEXT: successors: + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: S_WAITCNT_soft 3952 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: entry: %cc = icmp sgt i32 %a, 0 br i1 %cc, label %if, label %else @@ -41,30 +48,40 @@ else: ; preds = %entry define amdgpu_ps float @test_unify_return_to_epilog_into_end_block(i32 inreg %a, i32 inreg %b, float %c, float %d) #0 { ; GCN-LABEL: name: test_unify_return_to_epilog_into_end_block ; GCN: bb.0.entry: - ; GCN: successors: %bb.1(0x50000000), %bb.2(0x30000000) - ; GCN: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1 - ; GCN: S_CMP_LT_I32 killed renamable $sgpr2, 1, implicit-def $scc - ; GCN: S_CBRANCH_SCC1 %bb.2, implicit killed $scc - ; GCN: bb.1.if: - ; GCN: 
successors: %bb.5(0x80000000) - ; GCN: liveins: $vgpr0 - ; GCN: S_BRANCH %bb.5 - ; GCN: bb.2.else.if.cond: - ; GCN: successors: %bb.3(0x80000000), %bb.4(0x00000000) - ; GCN: liveins: $sgpr3, $vgpr1 - ; GCN: S_CMP_LT_I32 killed renamable $sgpr3, 1, implicit-def $scc - ; GCN: S_CBRANCH_SCC1 %bb.4, implicit killed $scc - ; GCN: bb.3.else.if: - ; GCN: successors: %bb.5(0x80000000) - ; GCN: liveins: $vgpr1 - ; GCN: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec - ; GCN: S_BRANCH %bb.5 - ; GCN: bb.4.else: - ; GCN: successors: - ; GCN: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; GCN: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) - ; GCN: S_WAITCNT_soft 3952 - ; GCN: bb.5: + ; GCN-NEXT: successors: %bb.1(0x50000000), %bb.2(0x30000000) + ; GCN-NEXT: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_CMP_LT_I32 killed renamable $sgpr2, 1, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1.if: + ; GCN-NEXT: successors: %bb.5(0x80000000) + ; GCN-NEXT: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_BRANCH %bb.5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2.else.if.cond: + ; GCN-NEXT: successors: %bb.3(0x80000000), %bb.4(0x00000000) + ; GCN-NEXT: liveins: $sgpr3, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_CMP_LT_I32 killed renamable $sgpr3, 1, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.4, implicit killed $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3.else.if: + ; GCN-NEXT: successors: %bb.5(0x80000000) + ; GCN-NEXT: liveins: $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.4.else: + ; GCN-NEXT: successors: + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: 
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1) + ; GCN-NEXT: S_WAITCNT_soft 3952 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.5: entry: %cc = icmp sgt i32 %a, 0 br i1 %cc, label %if, label %else.if.cond @@ -81,60 +98,77 @@ else: ; preds = %else.if.cond } define amdgpu_ps { <4 x float> } @test_return_to_epilog_with_optimized_kill(float %val) #0 { -; GCN-LABEL: name: test_return_to_epilog_with_optimized_kill -; GCN: bb.0 (%ir-block.0): -; GCN: successors: %bb.3(0x40000000), %bb.1(0x40000000) -; GCN: liveins: $vgpr0 -; GCN: renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec -; GCN: $sgpr0_sgpr1 = S_MOV_B64 $exec -; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec -; GCN: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec -; GCN: renamable $sgpr2_sgpr3 = S_XOR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def dead $scc -; GCN: S_CBRANCH_EXECNZ %bb.3, implicit $exec -; GCN: bb.1.Flow1: -; GCN: successors: %bb.6(0x40000000), %bb.2(0x40000000) -; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 -; GCN: $sgpr2_sgpr3 = S_ANDN2_SAVEEXEC_B64 killed $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec -; GCN: S_CBRANCH_EXECNZ %bb.6, implicit $exec -; GCN: bb.2.end: -; GCN: successors: %bb.9(0x80000000) -; GCN: liveins: $sgpr2_sgpr3 -; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc -; GCN: S_BRANCH %bb.9 -; GCN: bb.3.flow.preheader: -; GCN: successors: %bb.4(0x80000000) -; GCN: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3 -; GCN: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec -; GCN: renamable $sgpr4_sgpr5 = S_MOV_B64 0 -; GCN: bb.4.flow: -; GCN: successors: %bb.5(0x04000000), %bb.4(0x7c000000) -; GCN: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3, 
$sgpr4_sgpr5 -; GCN: renamable $sgpr6_sgpr7 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc -; GCN: renamable $sgpr4_sgpr5 = S_OR_B64 killed renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def $scc -; GCN: $exec = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc -; GCN: S_CBRANCH_EXECNZ %bb.4, implicit $exec -; GCN: bb.5.Flow: -; GCN: successors: %bb.6(0x40000000), %bb.2(0x40000000) -; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5 -; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc -; GCN: $sgpr2_sgpr3 = S_ANDN2_SAVEEXEC_B64 killed $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec -; GCN: S_CBRANCH_EXECZ %bb.2, implicit $exec -; GCN: bb.6.kill0: -; GCN: successors: %bb.7(0x40000000), %bb.8(0x40000000) -; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 -; GCN: dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc -; GCN: S_CBRANCH_SCC0 %bb.8, implicit $scc -; GCN: bb.7.kill0: -; GCN: successors: %bb.9(0x80000000) -; GCN: liveins: $sgpr2_sgpr3, $scc -; GCN: $exec = S_MOV_B64 0 -; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc -; GCN: S_BRANCH %bb.9 -; GCN: bb.8: -; GCN: $exec = S_MOV_B64 0 -; GCN: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec -; GCN: S_ENDPGM 0 -; GCN: bb.9: + ; GCN-LABEL: name: test_return_to_epilog_with_optimized_kill + ; GCN: bb.0 (%ir-block.0): + ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr1 = nofpexcept V_RCP_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec + ; GCN-NEXT: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr1, implicit-def $vcc, implicit $mode, implicit $exec + ; GCN-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GCN-NEXT: renamable 
$sgpr2_sgpr3 = S_XOR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def dead $scc + ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.3, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1.Flow1: + ; GCN-NEXT: successors: %bb.6(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2_sgpr3 = S_ANDN2_SAVEEXEC_B64 killed $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec + ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.6, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2.end: + ; GCN-NEXT: successors: %bb.9(0x80000000) + ; GCN-NEXT: liveins: $sgpr2_sgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc + ; GCN-NEXT: S_BRANCH %bb.9 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3.flow.preheader: + ; GCN-NEXT: successors: %bb.4(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr2_sgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: nofpexcept V_CMP_NGT_F32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_MOV_B64 0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.4.flow: + ; GCN-NEXT: successors: %bb.5(0x04000000), %bb.4(0x7c000000) + ; GCN-NEXT: liveins: $vcc, $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $sgpr6_sgpr7 = S_AND_B64 $exec, renamable $vcc, implicit-def $scc + ; GCN-NEXT: renamable $sgpr4_sgpr5 = S_OR_B64 killed renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def $scc + ; GCN-NEXT: $exec = S_ANDN2_B64 $exec, renamable $sgpr4_sgpr5, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_EXECNZ %bb.4, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.5.Flow: + ; GCN-NEXT: successors: %bb.6(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc + ; GCN-NEXT: $sgpr2_sgpr3 = S_ANDN2_SAVEEXEC_B64 killed 
$sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec + ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.6.kill0: + ; GCN-NEXT: successors: %bb.7(0x40000000), %bb.8(0x40000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.8, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.7.kill0: + ; GCN-NEXT: successors: %bb.9(0x80000000) + ; GCN-NEXT: liveins: $sgpr2_sgpr3, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_MOV_B64 0 + ; GCN-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc + ; GCN-NEXT: S_BRANCH %bb.9 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.8: + ; GCN-NEXT: $exec = S_MOV_B64 0 + ; GCN-NEXT: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.9: %.i0 = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, %val %cmp0 = fcmp olt float %.i0, 0.000000e+00 br i1 %cmp0, label %kill0, label %flow diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll index d1cc927b55657..e668c1d2b7f3d 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll @@ -9,16 +9,16 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY killed $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed $sgpr1 - ; CHECK-NEXT: undef %0.sub0:sreg_64 = COPY killed [[COPY]] - ; CHECK-NEXT: %0.sub1:sreg_64 = COPY killed [[COPY1]] - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = 
S_LOAD_DWORD_IMM killed %0, 8, 0 :: (invariant load (s32) from %ir.ptr + 8, addrspace 4) - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 - ; CHECK-NEXT: $sgpr0 = COPY killed [[COPY2]] - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 - ; CHECK-NEXT: $sgpr1 = COPY killed [[COPY3]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]] - ; CHECK-NEXT: $sgpr2 = COPY killed [[COPY4]] + ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:sreg_64 = COPY killed [[COPY]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:sreg_64 = COPY killed [[COPY1]] + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[COPY2]], 8, 0 :: (invariant load (s32) from %ir.ptr + 8, addrspace 4) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 + ; CHECK-NEXT: $sgpr0 = COPY killed [[COPY3]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 + ; CHECK-NEXT: $sgpr1 = COPY killed [[COPY4]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]] + ; CHECK-NEXT: $sgpr2 = COPY killed [[COPY5]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit killed $sgpr0, implicit killed $sgpr1, implicit killed $sgpr2 %load = load <3 x i32>, ptr addrspace(4) %ptr, align 4 ret <3 x i32> %load diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-regsequence.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-regsequence.mir index 3b641092fb990..f45a918d1d0f8 100644 --- a/llvm/test/CodeGen/AMDGPU/twoaddr-regsequence.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-regsequence.mir @@ -12,9 +12,9 @@ body: | ; CHECK-LABEL: name: f ; CHECK: liveins: $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %2.sub0:vreg_64 = COPY $vgpr0 - ; CHECK-NEXT: %2.sub1:vreg_64 = COPY $vgpr1 - ; CHECK-NEXT: $vgpr2_vgpr3 = 
COPY %2 + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1 + ; CHECK-NEXT: $vgpr2_vgpr3 = COPY [[COPY]] ; CHECK-NEXT: S_NOP 0, implicit $vgpr2_vgpr3 %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/undef-subreg-use-after-coalesce.mir b/llvm/test/CodeGen/AMDGPU/undef-subreg-use-after-coalesce.mir index 99c955319c648..c71bc2b9c456d 100644 --- a/llvm/test/CodeGen/AMDGPU/undef-subreg-use-after-coalesce.mir +++ b/llvm/test/CodeGen/AMDGPU/undef-subreg-use-after-coalesce.mir @@ -12,9 +12,9 @@ tracksRegLiveness: true body: | bb.0: ; CHECK-LABEL: name: undef_subreg_use_after_full_copy_coalesce_0 - ; CHECK: undef %0.sub0:vreg_96 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: dead %0.sub1:vreg_96 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit undef %0.sub2 + ; CHECK: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_96 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_96 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit undef [[V_MOV_B32_e32_]].sub2 undef %0.sub0:vreg_96 = V_MOV_B32_e32 0, implicit $exec %0.sub1:vreg_96 = V_MOV_B32_e32 0, implicit $exec %1:vreg_96 = COPY killed %0 @@ -30,8 +30,8 @@ tracksRegLiveness: true body: | bb.0: ; CHECK-LABEL: name: undef_subreg_use_after_full_copy_coalesce_composed - ; CHECK: undef %0.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: dead %0.sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: dead [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK-NEXT: S_ENDPGM 0, implicit [[DEF]].sub1 undef %0.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec @@ -66,10 +66,10 @@ body: | ; CHECK-LABEL: name: undef_subreg_use_after_full_copy_coalesce_1 ; CHECK: 
liveins: $vgpr0, $vgpr1, $vgpr2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %2.sub0:vreg_96 = COPY $vgpr0 - ; CHECK-NEXT: %2.sub1:vreg_96 = COPY $vgpr1 - ; CHECK-NEXT: S_NOP 0, implicit undef %2.sub2 - ; CHECK-NEXT: S_NOP 0, implicit %2.sub1 + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96 = COPY $vgpr1 + ; CHECK-NEXT: S_NOP 0, implicit undef [[COPY]].sub2 + ; CHECK-NEXT: S_NOP 0, implicit [[COPY]].sub1 ; CHECK-NEXT: S_ENDPGM 0 %0:vgpr_32 = COPY killed $vgpr0 %1:vgpr_32 = COPY killed $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir index 801092238a104..65feaf23ae2cb 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir @@ -51,6 +51,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_getpc1 ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec ; GFX12-NEXT: $sgpr0_sgpr1 = S_GETPC_B64 @@ -75,6 +76,7 @@ body: | ; GFX11-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, target-flags(amdgpu-rel32-lo) @mem + 16, implicit-def $scc, implicit $scc ; GFX11-NEXT: } ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_getpc2 ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec ; GFX12-NEXT: BUNDLE implicit-def $sgpr0_sgpr1 { @@ -101,6 +103,7 @@ body: | ; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_vcc1 ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec ; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc @@ -119,6 +122,7 @@ body: | ; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; 
GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_vcc2 ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec ; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc @@ -137,6 +141,7 @@ body: | ; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_cndmask_dpp1 ; GFX12: $vgpr0 = V_CNDMASK_B32_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, 1, 15, 15, 1, implicit $vcc, implicit $exec ; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc @@ -155,6 +160,7 @@ body: | ; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_cndmask_dpp2 ; GFX12: $vgpr0 = V_CNDMASK_B32_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec ; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc @@ -173,6 +179,7 @@ body: | ; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_cndmask_dpp4 ; GFX12: $vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec ; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc @@ -191,6 +198,7 @@ body: | ; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_addc1 ; GFX12: $vgpr1, $vcc = V_ADDC_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec ; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc @@ -209,6 +217,7 @@ body: | ; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_addc2 ; GFX12: $vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec ; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, 
implicit $scc @@ -227,6 +236,7 @@ body: | ; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_addc3 ; GFX12: $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec ; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc @@ -245,6 +255,7 @@ body: | ; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_addc4 ; GFX12: $vgpr0, $sgpr2_sgpr3 = V_ADDC_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec ; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc @@ -263,6 +274,7 @@ body: | ; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_subb1 ; GFX12: $vgpr1, $vcc = V_SUBB_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec ; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc @@ -281,6 +293,7 @@ body: | ; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_subb2 ; GFX12: $vgpr1 = V_SUBB_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec ; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc @@ -299,6 +312,7 @@ body: | ; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_subb3 ; GFX12: $vgpr0 = V_SUBB_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec ; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc @@ -317,6 +331,7 @@ body: | ; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: 
mask_hazard_subb4 ; GFX12: $vgpr0, $sgpr2_sgpr3 = V_SUBB_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec ; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc @@ -335,6 +350,7 @@ body: | ; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_subbrev1 ; GFX12: $vgpr1, $vcc = V_SUBBREV_U32_e64 0, $vgpr1, $sgpr2_sgpr3, 0, implicit $exec ; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc @@ -353,6 +369,7 @@ body: | ; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_subbrev2 ; GFX12: $vgpr1 = V_SUBBREV_U32_e32 0, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec ; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc @@ -371,6 +388,7 @@ body: | ; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_subbrev3 ; GFX12: $vgpr0 = V_SUBBREV_U32_dpp $vgpr0, $vgpr1, $vgpr2, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec ; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc @@ -389,6 +407,7 @@ body: | ; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_subbrev4 ; GFX12: $vgpr0, $sgpr2_sgpr3 = V_SUBBREV_U32_e64_dpp $vgpr0, $vgpr1, $vgpr2, $sgpr2_sgpr3, 0, 1, 15, 15, 1, implicit $exec ; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc @@ -407,6 +426,7 @@ body: | ; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_div_fmas_f32 ; GFX12: $vgpr0 = V_DIV_FMAS_F32_e64 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec ; GFX12-NEXT: $vcc = 
S_CSELECT_B64 -1, 0, implicit $scc @@ -425,6 +445,7 @@ body: | ; GFX11-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_div_fmas_f64 ; GFX12: $vgpr0_vgpr1 = V_DIV_FMAS_F64_e64 0, $vgpr0_vgpr1, 0, $vgpr2_vgpr3, 0, $vgpr4_vgpr5, 0, 0, implicit $mode, implicit $vcc, implicit $exec ; GFX12-NEXT: $vcc = S_CSELECT_B64 -1, 0, implicit $scc @@ -444,6 +465,7 @@ body: | ; GFX11-NEXT: $sgpr2 = S_MOV_B32 0 ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_subreg1 ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec ; GFX12-NEXT: $sgpr2 = S_MOV_B32 0 @@ -463,6 +485,7 @@ body: | ; GFX11-NEXT: $sgpr3 = S_MOV_B32 0 ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_subreg2 ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec ; GFX12-NEXT: $sgpr3 = S_MOV_B32 0 @@ -483,6 +506,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: $sgpr3 = S_MOV_B32 0 ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_subreg3 ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, implicit $exec ; GFX12-NEXT: $sgpr2 = S_MOV_B32 0 @@ -505,6 +529,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: $sgpr2 = S_MOV_B32 $vcc_lo ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_subreg4 ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec ; GFX12-NEXT: $vcc_lo = S_MOV_B32 0 @@ -527,6 +552,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: $sgpr2 = S_MOV_B32 $vcc_hi ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_subreg5 ; GFX12: $vgpr1 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec ; GFX12-NEXT: $vcc_hi = S_MOV_B32 0 @@ -550,6 +576,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: $sgpr0 = S_ADD_U32 
$sgpr0, 0, implicit-def $scc ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_waitcnt ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec ; GFX12-NEXT: S_WAITCNT 0 @@ -576,6 +603,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_gap1 ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec ; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec @@ -603,6 +631,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_gap2 ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec ; GFX12-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit $mode @@ -628,6 +657,7 @@ body: | ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534 ; GFX11-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, 0, implicit-def $scc ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: mask_hazard_gap3 ; GFX12: $vgpr1 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr2, $sgpr0_sgpr1, implicit $exec ; GFX12-NEXT: $vgpr2 = V_WRITELANE_B32 $exec_lo, 0, $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir index 5cbecacc4ae7e..08f5550f3b08a 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-remat.mir @@ -12,8 +12,8 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef %4.sub0:vreg_96 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: %4.sub1:vreg_96 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: undef [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_96 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_96 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0 ; CHECK-NEXT: 
$exec = S_MOV_B64_term [[COPY]] ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec @@ -22,11 +22,11 @@ body: | ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %4.sub0:vreg_96 = V_MUL_F32_e32 %4.sub0, %4.sub0, implicit $mode, implicit $exec - ; CHECK-NEXT: %4.sub1:vreg_96 = V_MUL_F32_e32 %4.sub1, %4.sub1, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_96 = V_MUL_F32_e32 [[V_MOV_B32_e32_]].sub0, [[V_MOV_B32_e32_]].sub0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_96 = V_MUL_F32_e32 [[V_MOV_B32_e32_]].sub1, [[V_MOV_B32_e32_]].sub1, implicit $mode, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: S_ENDPGM 0, implicit %4 + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]] bb.0: liveins: $sgpr0 %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir b/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir index 93b98df2f7dba..dd3572c027c86 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-scc-clobber.mir @@ -38,6 +38,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.2: ; MUBUF-NEXT: S_ENDPGM 0 + ; ; GFX9-FLATSCR-LABEL: name: vgpr32_save_clobber_scc ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -56,6 +57,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.2: ; GFX9-FLATSCR-NEXT: S_ENDPGM 0 + ; ; GFX10-FLATSCR-LABEL: name: vgpr32_save_clobber_scc ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -73,6 +75,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.2: ; GFX10-FLATSCR-NEXT: S_ENDPGM 0 + ; ; VMEM-GFX8-LABEL: name: vgpr32_save_clobber_scc ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -135,6 +138,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: 
bb.2: ; MUBUF-NEXT: S_ENDPGM 0 + ; ; GFX9-FLATSCR-LABEL: name: vgpr64_save_clobber_scc ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -153,6 +157,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.2: ; GFX9-FLATSCR-NEXT: S_ENDPGM 0 + ; ; GFX10-FLATSCR-LABEL: name: vgpr64_save_clobber_scc ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -170,6 +175,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.2: ; GFX10-FLATSCR-NEXT: S_ENDPGM 0 + ; ; VMEM-GFX8-LABEL: name: vgpr64_save_clobber_scc ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -230,6 +236,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.2: ; MUBUF-NEXT: S_ENDPGM 0 + ; ; GFX9-FLATSCR-LABEL: name: vgpr32_restore_clobber_scc ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -247,6 +254,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.2: ; GFX9-FLATSCR-NEXT: S_ENDPGM 0 + ; ; GFX10-FLATSCR-LABEL: name: vgpr32_restore_clobber_scc ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -263,6 +271,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.2: ; GFX10-FLATSCR-NEXT: S_ENDPGM 0 + ; ; VMEM-GFX8-LABEL: name: vgpr32_restore_clobber_scc ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -322,6 +331,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.2: ; MUBUF-NEXT: S_ENDPGM 0 + ; ; GFX9-FLATSCR-LABEL: name: vgpr64_restore_clobber_scc ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -339,6 +349,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.2: ; GFX9-FLATSCR-NEXT: S_ENDPGM 0 + ; ; GFX10-FLATSCR-LABEL: name: vgpr64_restore_clobber_scc ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -355,6 
+366,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.2: ; GFX10-FLATSCR-NEXT: S_ENDPGM 0 + ; ; VMEM-GFX8-LABEL: name: vgpr64_restore_clobber_scc ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -418,6 +430,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.2: ; MUBUF-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX9-FLATSCR-LABEL: name: vgpr32_restore_clobber_scc_emergency_stack_slot ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -439,6 +452,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.2: ; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX10-FLATSCR-LABEL: name: vgpr32_restore_clobber_scc_emergency_stack_slot ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -459,6 +473,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.2: ; GFX10-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; VMEM-GFX8-LABEL: name: vgpr32_restore_clobber_scc_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -528,6 +543,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.2: ; MUBUF-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX9-FLATSCR-LABEL: name: vgpr64_restore_clobber_scc_emergency_stack_slot ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -549,6 +565,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.2: ; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX10-FLATSCR-LABEL: name: vgpr64_restore_clobber_scc_emergency_stack_slot ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -569,6 +586,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.2: ; GFX10-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; VMEM-GFX8-LABEL: name: vgpr64_restore_clobber_scc_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: 
%bb.2(0x40000000), %bb.1(0x40000000) @@ -640,6 +658,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.2: ; MUBUF-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX9-FLATSCR-LABEL: name: vgpr96_restore_clobber_scc_emergency_stack_slot ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -661,6 +680,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.2: ; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX10-FLATSCR-LABEL: name: vgpr96_restore_clobber_scc_emergency_stack_slot ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -681,6 +701,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.2: ; GFX10-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; VMEM-GFX8-LABEL: name: vgpr96_restore_clobber_scc_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -751,6 +772,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.2: ; MUBUF-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX9-FLATSCR-LABEL: name: vgpr32_save_clobber_scc_emergency_stack_slot ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -772,6 +794,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.2: ; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX10-FLATSCR-LABEL: name: vgpr32_save_clobber_scc_emergency_stack_slot ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -792,6 +815,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.2: ; GFX10-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; VMEM-GFX8-LABEL: name: vgpr32_save_clobber_scc_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -861,6 +885,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.2: ; MUBUF-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX9-FLATSCR-LABEL: name: vgpr64_save_clobber_scc_emergency_stack_slot ; 
GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -882,6 +907,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.2: ; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX10-FLATSCR-LABEL: name: vgpr64_save_clobber_scc_emergency_stack_slot ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -902,6 +928,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.2: ; GFX10-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; VMEM-GFX8-LABEL: name: vgpr64_save_clobber_scc_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -973,6 +1000,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.2: ; MUBUF-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX9-FLATSCR-LABEL: name: vgpr96_save_clobber_scc_emergency_stack_slot ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -994,6 +1022,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.2: ; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX10-FLATSCR-LABEL: name: vgpr96_save_clobber_scc_emergency_stack_slot ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -1014,6 +1043,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.2: ; GFX10-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; VMEM-GFX8-LABEL: name: vgpr96_save_clobber_scc_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -1080,6 +1110,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.2: ; MUBUF-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX9-FLATSCR-LABEL: name: mubuf_load_restore_clobber_scc ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -1099,6 +1130,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.2: ; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; 
GFX10-FLATSCR-LABEL: name: mubuf_load_restore_clobber_scc ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -1118,6 +1150,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.2: ; GFX10-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; VMEM-GFX8-LABEL: name: mubuf_load_restore_clobber_scc ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -1184,6 +1217,7 @@ body: | ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: bb.2: ; MUBUF-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX9-FLATSCR-LABEL: name: mubuf_load_restore_clobber_scc_no_vgprs_emergency_stack_slot ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -1207,6 +1241,7 @@ body: | ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: bb.2: ; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; GFX10-FLATSCR-LABEL: name: mubuf_load_restore_clobber_scc_no_vgprs_emergency_stack_slot ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -1230,6 +1265,7 @@ body: | ; GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: bb.2: ; GFX10-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs + ; ; VMEM-GFX8-LABEL: name: mubuf_load_restore_clobber_scc_no_vgprs_emergency_stack_slot ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -1298,6 +1334,7 @@ body: | ; MUBUF-NEXT: liveins: $vgpr0 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: S_ENDPGM 0, implicit $vgpr0 + ; ; GFX9-FLATSCR-LABEL: name: v_mov_clobber_scc ; GFX9-FLATSCR: bb.0: ; GFX9-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -1318,6 +1355,7 @@ body: | ; GFX9-FLATSCR-NEXT: liveins: $vgpr0 ; GFX9-FLATSCR-NEXT: {{ $}} ; GFX9-FLATSCR-NEXT: S_ENDPGM 0, implicit $vgpr0 + ; ; GFX10-FLATSCR-LABEL: name: v_mov_clobber_scc ; GFX10-FLATSCR: bb.0: ; GFX10-FLATSCR-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) @@ -1338,6 +1376,7 @@ body: | ; GFX10-FLATSCR-NEXT: liveins: $vgpr0 ; 
GFX10-FLATSCR-NEXT: {{ $}} ; GFX10-FLATSCR-NEXT: S_ENDPGM 0, implicit $vgpr0 + ; ; VMEM-GFX8-LABEL: name: v_mov_clobber_scc ; VMEM-GFX8: bb.0: ; VMEM-GFX8-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) diff --git a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir index 92cdc27b653ad..3c1da043bcd6c 100644 --- a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir +++ b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir @@ -38,12 +38,14 @@ body: | ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; ; PAIR-GFX11-LABEL: name: vopd_schedule ; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; ; PAIR-GFX12-LABEL: name: vopd_schedule ; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $vgpr1 = IMPLICIT_DEF @@ -73,12 +75,14 @@ body: | ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 10, killed $vgpr3, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; ; PAIR-GFX11-LABEL: name: vopd_fmamk ; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 10, killed $vgpr3, killed $vgpr1, $vgpr1, killed $vgpr2, 
implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; ; PAIR-GFX12-LABEL: name: vopd_fmamk ; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $vgpr0 = IMPLICIT_DEF @@ -108,6 +112,7 @@ body: | ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 10, killed $vgpr4, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; ; PAIR-LABEL: name: vopd_fmamk_fail ; PAIR: $vgpr1 = IMPLICIT_DEF ; PAIR-NEXT: $vgpr2 = V_XOR_B32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec @@ -142,6 +147,7 @@ body: | ; SCHED-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc ; SCHED-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; ; PAIR-GFX11-LABEL: name: vopd_cndmask ; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF @@ -153,6 +159,7 @@ body: | ; PAIR-GFX11-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc ; PAIR-GFX11-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; ; PAIR-GFX12-LABEL: name: vopd_cndmask ; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $vgpr0 = IMPLICIT_DEF @@ -192,10 +199,12 @@ body: | ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec ; SCHED-NEXT: $vgpr3 = V_ADD_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; ; PAIR-GFX11-LABEL: name: vopd_mov ; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 killed 
$vgpr0, killed $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; ; PAIR-GFX12-LABEL: name: vopd_mov ; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $vgpr1 = IMPLICIT_DEF @@ -218,10 +227,12 @@ body: | ; SCHED-NEXT: $sgpr7 = IMPLICIT_DEF ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec ; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr7, implicit $exec + ; ; PAIR-GFX11-LABEL: name: vopd_mov_mov ; PAIR-GFX11: $sgpr0 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $sgpr7 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx11 killed $sgpr0, killed $sgpr7, implicit $exec, implicit $exec, implicit $exec + ; ; PAIR-GFX12-LABEL: name: vopd_mov_mov ; PAIR-GFX12: $sgpr0 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $sgpr7 = IMPLICIT_DEF @@ -247,6 +258,7 @@ body: | ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 99, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; ; PAIR-LABEL: name: vopd_constants_fail ; PAIR: $vgpr2 = IMPLICIT_DEF ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF @@ -276,12 +288,14 @@ body: | ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; ; PAIR-GFX11-LABEL: name: vopd_constants_inlinable ; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 100, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; ; PAIR-GFX12-LABEL: name: vopd_constants_inlinable ; PAIR-GFX12: $vgpr2 = 
IMPLICIT_DEF ; PAIR-GFX12-NEXT: $vgpr0 = IMPLICIT_DEF @@ -312,12 +326,14 @@ body: | ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; ; PAIR-GFX11-LABEL: name: vopd_constants_same ; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 100, killed $vgpr3, 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; ; PAIR-GFX12-LABEL: name: vopd_constants_same ; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $vgpr0 = IMPLICIT_DEF @@ -345,10 +361,12 @@ body: | ; SCHED-NEXT: $sgpr0 = IMPLICIT_DEF ; SCHED-NEXT: $vgpr1 = V_MOV_B32_e32 981467136, implicit $exec ; SCHED-NEXT: $vgpr2 = V_FMAAK_F32 killed $sgpr0, killed $vgpr0, 981467136, implicit $mode, implicit $exec + ; ; PAIR-GFX11-LABEL: name: vopd_mov_fmaak_constants_same ; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $sgpr0 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_FMAAK_F32_gfx11 981467136, killed $sgpr0, killed $vgpr0, 981467136, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; ; PAIR-GFX12-LABEL: name: vopd_mov_fmaak_constants_same ; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $sgpr0 = IMPLICIT_DEF @@ -373,11 +391,13 @@ body: | ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec ; SCHED-NEXT: DBG_VALUE $vgpr0, 0, 0 ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; ; PAIR-GFX11-LABEL: name: vopd_debug ; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: 
$vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 killed $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: DBG_VALUE $vgpr0, 0, 0 + ; ; PAIR-GFX12-LABEL: name: vopd_debug ; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $vgpr1 = IMPLICIT_DEF @@ -416,6 +436,7 @@ body: | ; SCHED-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc ; SCHED-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; ; PAIR-GFX11-LABEL: name: vopd_schedule_unconstrained ; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF @@ -431,6 +452,7 @@ body: | ; PAIR-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc ; PAIR-GFX11-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; ; PAIR-GFX12-LABEL: name: vopd_schedule_unconstrained ; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $vgpr3 = IMPLICIT_DEF @@ -507,6 +529,7 @@ body: | ; SCHED-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc ; SCHED-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; ; PAIR-GFX11-LABEL: name: vopd_schedule_unconstrained_2 ; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF @@ -531,6 +554,7 @@ body: | ; PAIR-GFX11-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc ; PAIR-GFX11-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr32 = 
V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; ; PAIR-GFX12-LABEL: name: vopd_schedule_unconstrained_2 ; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $vgpr3 = IMPLICIT_DEF @@ -607,11 +631,13 @@ body: | ; SCHED-NEXT: $vgpr3 = V_ADD_F32_e32 killed $vgpr0, killed $vgpr1, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr4 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec ; SCHED-NEXT: $vgpr5 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + ; ; PAIR-GFX11-LABEL: name: vopd_mov_fixup ; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 target-flags(amdgpu-abs32-lo) @lds, killed $vgpr0, killed $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr4, $vgpr5 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx11 target-flags(amdgpu-abs32-lo) @lds, target-flags(amdgpu-abs32-lo) @lds, implicit $exec, implicit $exec, implicit $exec + ; ; PAIR-GFX12-LABEL: name: vopd_mov_fixup ; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $vgpr1 = IMPLICIT_DEF @@ -635,6 +661,7 @@ body: | ; SCHED-LABEL: name: vopd_mov_fixup_fail ; SCHED: $vgpr0 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec ; SCHED-NEXT: $vgpr1 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds + 4, implicit $exec + ; ; PAIR-LABEL: name: vopd_mov_fixup_fail ; PAIR: $vgpr0 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec ; PAIR-NEXT: $vgpr1 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds + 4, implicit $exec @@ -652,6 +679,7 @@ body: | ; SCHED: $vgpr0 = IMPLICIT_DEF ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit-def $vgpr2_vgpr3, implicit $exec ; SCHED-NEXT: $vgpr5 = V_ADD_F32_e32 killed $vgpr0, killed $vgpr3, implicit $mode, implicit $exec + ; ; PAIR-LABEL: name: vopd_no_combine_dependent_subreg ; PAIR: $vgpr0 = IMPLICIT_DEF ; PAIR-NEXT: $vgpr2 = V_MOV_B32_e32 0, 
implicit-def $vgpr2_vgpr3, implicit $exec @@ -672,11 +700,13 @@ body: | ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr1, implicit $exec ; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr5, implicit $exec + ; ; PAIR-GFX11-LABEL: name: vopd_mov_mov_same_src_bank ; PAIR-GFX11: $vgpr1 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr5 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr1, implicit $exec ; PAIR-GFX11-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr5, implicit $exec + ; ; PAIR-GFX12-LABEL: name: vopd_mov_mov_same_src_bank ; PAIR-GFX12: $vgpr1 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $vgpr5 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vinterp.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vinterp.mir index e764d36b3b299..f382800bfd391 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vinterp.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vinterp.mir @@ -9,14 +9,16 @@ body: | bb.0: liveins: $vgpr0 ; GFX11-LABEL: name: waitcnt-vinterp - ; GFX11: $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec - ; GFX11: $vgpr2 = LDS_PARAM_LOAD 0, 1, 0, implicit $m0, implicit $exec - ; GFX11: $vgpr3 = LDS_PARAM_LOAD 0, 2, 0, implicit $m0, implicit $exec - ; GFX11: $vgpr4 = LDS_PARAM_LOAD 0, 3, 0, implicit $m0, implicit $exec - ; GFX11: $vgpr5 = V_INTERP_P10_F16_F32_inreg 0, $vgpr1, 0, $vgpr0, 0, $vgpr1, 0, 0, 2, implicit $m0, implicit $exec, implicit $mode - ; GFX11: $vgpr6 = V_INTERP_P10_F16_F32_inreg 0, $vgpr2, 0, $vgpr0, 0, $vgpr2, 0, 0, 2, implicit $m0, implicit $exec, implicit $mode - ; GFX11: $vgpr7 = V_INTERP_P10_F16_F32_inreg 0, $vgpr3, 0, $vgpr0, 0, $vgpr3, 0, 0, 1, implicit $m0, implicit $exec, implicit $mode - ; GFX11: $vgpr8 = V_INTERP_P10_F16_F32_inreg 0, $vgpr4, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, implicit $m0, implicit $exec, implicit $mode + ; GFX11: liveins: $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec + ; GFX11-NEXT: $vgpr2 = LDS_PARAM_LOAD 0, 1, 0, 
implicit $m0, implicit $exec + ; GFX11-NEXT: $vgpr3 = LDS_PARAM_LOAD 0, 2, 0, implicit $m0, implicit $exec + ; GFX11-NEXT: $vgpr4 = LDS_PARAM_LOAD 0, 3, 0, implicit $m0, implicit $exec + ; GFX11-NEXT: $vgpr5 = V_INTERP_P10_F16_F32_inreg 0, $vgpr1, 0, $vgpr0, 0, $vgpr1, 0, 0, 2, implicit $m0, implicit $exec, implicit $mode + ; GFX11-NEXT: $vgpr6 = V_INTERP_P10_F16_F32_inreg 0, $vgpr2, 0, $vgpr0, 0, $vgpr2, 0, 0, 2, implicit $m0, implicit $exec, implicit $mode + ; GFX11-NEXT: $vgpr7 = V_INTERP_P10_F16_F32_inreg 0, $vgpr3, 0, $vgpr0, 0, $vgpr3, 0, 0, 1, implicit $m0, implicit $exec, implicit $mode + ; GFX11-NEXT: $vgpr8 = V_INTERP_P10_F16_F32_inreg 0, $vgpr4, 0, $vgpr0, 0, $vgpr4, 0, 0, 0, implicit $m0, implicit $exec, implicit $mode $vgpr1 = LDS_PARAM_LOAD 0, 0, 0, implicit $m0, implicit $exec $vgpr2 = LDS_PARAM_LOAD 0, 1, 0, implicit $m0, implicit $exec $vgpr3 = LDS_PARAM_LOAD 0, 2, 0, implicit $m0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/wqm-terminators.mir b/llvm/test/CodeGen/AMDGPU/wqm-terminators.mir index b4df02e47d2ac..8d75bb3b1280f 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm-terminators.mir +++ b/llvm/test/CodeGen/AMDGPU/wqm-terminators.mir @@ -34,12 +34,12 @@ body: | ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_256 = IMPLICIT_DEF ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF ; CHECK-NEXT: S_CMP_EQ_U32 [[COPY1]], 0, implicit-def $scc - ; CHECK-NEXT: undef %5.sub0:vreg_64 = V_MUL_F32_e64 0, [[COPY2]].sub0, 0, [[COPY2]].sub1, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: %5.sub1:vreg_64 = V_MUL_F32_e64 0, [[COPY2]].sub0, 0, [[COPY2]].sub1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: undef [[V_MUL_F32_e64_:%[0-9]+]].sub0:vreg_64 = V_MUL_F32_e64 0, [[COPY2]].sub0, 0, [[COPY2]].sub1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MUL_F32_e64_:%[0-9]+]].sub1:vreg_64 = V_MUL_F32_e64 0, [[COPY2]].sub0, 0, [[COPY2]].sub1, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0 = COPY $scc ; CHECK-NEXT: 
$exec_lo = S_AND_B32 $exec_lo, [[COPY]], implicit-def $scc ; CHECK-NEXT: $scc = COPY [[COPY3]] - ; CHECK-NEXT: [[IMAGE_SAMPLE_V3_V2_gfx10_:%[0-9]+]]:vreg_96 = IMAGE_SAMPLE_V3_V2_gfx10 %5, [[DEF]], [[DEF1]], 7, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8) + ; CHECK-NEXT: [[IMAGE_SAMPLE_V3_V2_gfx10_:%[0-9]+]]:vreg_96 = IMAGE_SAMPLE_V3_V2_gfx10 [[V_MUL_F32_e64_]], [[DEF]], [[DEF1]], 7, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8) ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc ; CHECK-NEXT: S_BRANCH %bb.1 ; CHECK-NEXT: {{ $}} From f0663772463335bed253491fa380e1a9c2d78484 Mon Sep 17 00:00:00 2001 From: cmtice Date: Mon, 26 Feb 2024 21:54:10 -0800 Subject: [PATCH 407/546] [LLVM][DWARF] Make dwarf::getDebugNamesBucketCount return a pair. (#83047) llvm::dwarf::getDebugNamesBucketCount directly returns the bucket count, via return statement, but it also returns the hash count via a parameter. This changes the function to return them both as a std::pair, in the return statement. It also changes the name of the function to make it clear it returns both values. --- llvm/include/llvm/BinaryFormat/Dwarf.h | 22 +++++++++++----------- llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp | 4 +++- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.h b/llvm/include/llvm/BinaryFormat/Dwarf.h index 44c0030251b37..a53e79bf6e39c 100644 --- a/llvm/include/llvm/BinaryFormat/Dwarf.h +++ b/llvm/include/llvm/BinaryFormat/Dwarf.h @@ -615,21 +615,21 @@ enum AcceleratorTable { // Uniquify the string hashes and calculate the bucket count for the // DWARF v5 Accelerator Table. NOTE: This function effectively consumes the -// 'hashes' input parameter. -inline uint32_t getDebugNamesBucketCount(MutableArrayRef hashes, - uint32_t &uniqueHashCount) { +// 'Hashes' input parameter. 
+inline std::pair +getDebugNamesBucketAndHashCount(MutableArrayRef Hashes) { uint32_t BucketCount = 0; - sort(hashes); - uniqueHashCount = llvm::unique(hashes) - hashes.begin(); - if (uniqueHashCount > 1024) - BucketCount = uniqueHashCount / 4; - else if (uniqueHashCount > 16) - BucketCount = uniqueHashCount / 2; + sort(Hashes); + uint32_t UniqueHashCount = llvm::unique(Hashes) - Hashes.begin(); + if (UniqueHashCount > 1024) + BucketCount = UniqueHashCount / 4; + else if (UniqueHashCount > 16) + BucketCount = UniqueHashCount / 2; else - BucketCount = std::max(uniqueHashCount, 1); + BucketCount = std::max(UniqueHashCount, 1); - return BucketCount; + return {BucketCount, UniqueHashCount}; } // Constants for the GNU pubnames/pubtypes extensions supporting gdb index. diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp index 23fc9b2e0410e..9e1727a0b8d13 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp @@ -38,7 +38,9 @@ void AccelTableBase::computeBucketCount() { for (const auto &E : Entries) Uniques.push_back(E.second.HashValue); - BucketCount = llvm::dwarf::getDebugNamesBucketCount(Uniques, UniqueHashCount); + auto Counts = llvm::dwarf::getDebugNamesBucketAndHashCount(Uniques); + BucketCount = Counts.first; + UniqueHashCount = Counts.second; } void AccelTableBase::finalize(AsmPrinter *Asm, StringRef Prefix) { From c88beb4112d5bbf07d76a615ab7f13ba2ba023e6 Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Tue, 27 Feb 2024 14:08:36 +0800 Subject: [PATCH 408/546] MIPS: Fix asm constraints "f" and "r" for softfloat (#79116) This include 2 fixes: 1. Disallow 'f' for softfloat. 2. Allow 'r' for softfloat. Currently, 'f' is accpeted by clang, then LLVM meets an internal error. 'r' is rejected by LLVM by: couldn't allocate input reg for constraint 'r'. 
Fixes: #64241, #63632 --------- Co-authored-by: Fangrui Song --- clang/lib/Basic/Targets/Mips.h | 4 +- .../CodeGen/Mips/inline-asm-constraints.c | 11 +++++ clang/test/Sema/inline-asm-validate-mips.c | 9 ++++ llvm/lib/Target/Mips/MipsISelLowering.cpp | 10 ++-- .../Mips/inlineasm-constraints-softfloat.ll | 48 +++++++++++++++++++ 5 files changed, 78 insertions(+), 4 deletions(-) create mode 100644 clang/test/CodeGen/Mips/inline-asm-constraints.c create mode 100644 clang/test/Sema/inline-asm-validate-mips.c create mode 100644 llvm/test/CodeGen/Mips/inlineasm-constraints-softfloat.ll diff --git a/clang/lib/Basic/Targets/Mips.h b/clang/lib/Basic/Targets/Mips.h index f46b95abfd75c..23d4e1b598fa1 100644 --- a/clang/lib/Basic/Targets/Mips.h +++ b/clang/lib/Basic/Targets/Mips.h @@ -237,12 +237,14 @@ class LLVM_LIBRARY_VISIBILITY MipsTargetInfo : public TargetInfo { case 'r': // CPU registers. case 'd': // Equivalent to "r" unless generating MIPS16 code. case 'y': // Equivalent to "r", backward compatibility only. - case 'f': // floating-point registers. case 'c': // $25 for indirect jumps case 'l': // lo register case 'x': // hilo register pair Info.setAllowsRegister(); return true; + case 'f': // floating-point registers. 
+ Info.setAllowsRegister(); + return FloatABI != SoftFloat; case 'I': // Signed 16-bit constant case 'J': // Integer 0 case 'K': // Unsigned 16-bit constant diff --git a/clang/test/CodeGen/Mips/inline-asm-constraints.c b/clang/test/CodeGen/Mips/inline-asm-constraints.c new file mode 100644 index 0000000000000..88afe8735083b --- /dev/null +++ b/clang/test/CodeGen/Mips/inline-asm-constraints.c @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -emit-llvm -triple mips -target-feature +soft-float %s -o - | FileCheck %s --check-prefix=SOFT_FLOAT + +// SOFT_FLOAT: call void asm sideeffect "", "r,~{$1}"(float %1) +void read_float(float *p) { + __asm__("" ::"r"(*p)); +} + +// SOFT_FLOAT: call void asm sideeffect "", "r,~{$1}"(double %1) +void read_double(double *p) { + __asm__("" :: "r"(*p)); +} diff --git a/clang/test/Sema/inline-asm-validate-mips.c b/clang/test/Sema/inline-asm-validate-mips.c new file mode 100644 index 0000000000000..7da248fe417b5 --- /dev/null +++ b/clang/test/Sema/inline-asm-validate-mips.c @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -triple mips64 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple mips64 -target-feature +soft-float -fsyntax-only -verify=softfloat %s + +// expected-no-diagnostics + +void test_f(float p) { + float result = p; + __asm__("" :: "f"(result)); // softfloat-error{{invalid input constraint 'f' in asm}} +} diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index b2812f87914df..97e830cec27ca 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -4128,14 +4128,18 @@ MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 'd': // Address register. Same as 'r' unless generating MIPS16 code. case 'y': // Same as 'r'. Exists for compatibility. 
case 'r': - if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8 || VT == MVT::i1) { + if ((VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8 || + VT == MVT::i1) || + (VT == MVT::f32 && Subtarget.useSoftFloat())) { if (Subtarget.inMips16Mode()) return std::make_pair(0U, &Mips::CPU16RegsRegClass); return std::make_pair(0U, &Mips::GPR32RegClass); } - if (VT == MVT::i64 && !Subtarget.isGP64bit()) + if ((VT == MVT::i64 || (VT == MVT::f64 && Subtarget.useSoftFloat())) && + !Subtarget.isGP64bit()) return std::make_pair(0U, &Mips::GPR32RegClass); - if (VT == MVT::i64 && Subtarget.isGP64bit()) + if ((VT == MVT::i64 || (VT == MVT::f64 && Subtarget.useSoftFloat())) && + Subtarget.isGP64bit()) return std::make_pair(0U, &Mips::GPR64RegClass); // This will generate an error message return std::make_pair(0U, nullptr); diff --git a/llvm/test/CodeGen/Mips/inlineasm-constraints-softfloat.ll b/llvm/test/CodeGen/Mips/inlineasm-constraints-softfloat.ll new file mode 100644 index 0000000000000..705570f808ce0 --- /dev/null +++ b/llvm/test/CodeGen/Mips/inlineasm-constraints-softfloat.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -march=mips < %s | FileCheck %s --check-prefix=MIPS32 +; RUN: llc -march=mips64 < %s | FileCheck %s --check-prefix=MIPS64 + +define dso_local void @read_double(ptr nocapture noundef readonly %0) local_unnamed_addr #0 { +; MIPS32-LABEL: read_double: +; MIPS32: # %bb.0: +; MIPS32-NEXT: lw $2, 4($4) +; MIPS32-NEXT: lw $3, 0($4) +; MIPS32-NEXT: #APP +; MIPS32-NEXT: #NO_APP +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: nop +; +; MIPS64-LABEL: read_double: +; MIPS64: # %bb.0: +; MIPS64-NEXT: ld $2, 0($4) +; MIPS64-NEXT: #APP +; MIPS64-NEXT: #NO_APP +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: nop + %2 = load double, ptr %0, align 8 + tail call void asm sideeffect "", "r,~{$1}"(double %2) + ret void +} + +define dso_local void @read_float(ptr nocapture noundef readonly %0) local_unnamed_addr #0 { 
+; MIPS32-LABEL: read_float: +; MIPS32: # %bb.0: +; MIPS32-NEXT: lw $2, 0($4) +; MIPS32-NEXT: #APP +; MIPS32-NEXT: #NO_APP +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: nop +; +; MIPS64-LABEL: read_float: +; MIPS64: # %bb.0: +; MIPS64-NEXT: lw $2, 0($4) +; MIPS64-NEXT: #APP +; MIPS64-NEXT: #NO_APP +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: nop + %2 = load float, ptr %0, align 8 + tail call void asm sideeffect "", "r,~{$1}"(float %2) + ret void +} + +attributes #0 = { "target-features"="+soft-float" "use-soft-float"="true" } From c19a81931ce47278967056927aa87a7685e5447c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20W=C3=B3jt?= Date: Tue, 27 Feb 2024 07:27:02 +0100 Subject: [PATCH 409/546] [libc++] protect absent atomic types with ifdef (#82351) Otherwise modules.std-module.sh.cpp test fails with following error: error: no member named 'atomic_signed_lock_free' in namespace 'std' when the types are not available. --- libcxx/modules/std/atomic.inc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libcxx/modules/std/atomic.inc b/libcxx/modules/std/atomic.inc index 88b31ccdb2084..2b54cef863e57 100644 --- a/libcxx/modules/std/atomic.inc +++ b/libcxx/modules/std/atomic.inc @@ -111,8 +111,10 @@ export namespace std { using std::atomic_uintmax_t; using std::atomic_uintptr_t; +#ifndef _LIBCPP_NO_LOCK_FREE_TYPES using std::atomic_signed_lock_free; using std::atomic_unsigned_lock_free; +#endif // [atomics.flag], flag type and operations using std::atomic_flag; From 0ed61db6fdf683f8def06e6a6d206d02b821cd81 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Tue, 27 Feb 2024 15:31:04 +0900 Subject: [PATCH 410/546] [MC/DC] Refactor: Isolate the final result out of TestVector (#82282) To reduce conditional judges in the loop in `findIndependencePairs()`, I have tried a couple of tweaks. * Isolate the final result in `TestVectors` `using TestVectors = llvm::SmallVector>;` The final result was just piggybacked on `TestVector`, so it has been isolated. 
* Filter out and sort `ExecVectors` by the final result It will cost more in constructing `ExecVectors`, but it can reduce at least one conditional judgement in the loop. --- .../ProfileData/Coverage/CoverageMapping.h | 6 +-- .../ProfileData/Coverage/CoverageMapping.cpp | 37 +++++++++++++------ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h index f8f549eea2e09..27b1c32b1fa54 100644 --- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -385,7 +385,7 @@ struct MCDCRecord { enum CondState { MCDC_DontCare = -1, MCDC_False = 0, MCDC_True = 1 }; using TestVector = llvm::SmallVector; - using TestVectors = llvm::SmallVector; + using TestVectors = llvm::SmallVector>; using BoolVector = llvm::SmallVector; using TVRowPair = std::pair; using TVPairMap = llvm::DenseMap; @@ -426,13 +426,13 @@ struct MCDCRecord { /// accessing conditions in the TestVectors requires a translation from a /// ordinal position to actual condition ID. This is done via PosToID[]. CondState getTVCondition(unsigned TestVectorIndex, unsigned Condition) { - return TV[TestVectorIndex][PosToID[Condition]]; + return TV[TestVectorIndex].first[PosToID[Condition]]; } /// Return the Result evaluation for an executed test vector. /// See MCDCRecordProcessor::RecordTestVector(). 
CondState getTVResult(unsigned TestVectorIndex) { - return TV[TestVectorIndex][getNumConditions()]; + return TV[TestVectorIndex].second; } /// Determine whether a given condition (indicated by Condition) is covered diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index 3f892d7aff2f4..64545164d597a 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -370,9 +370,16 @@ class MCDCRecordProcessor : NextIDsBuilder, mcdc::TVIdxBuilder { /// Mapping of calculated MC/DC Independence Pairs for each condition. MCDCRecord::TVPairMap IndependencePairs; + /// Storage for ExecVectors + /// ExecVectors is the alias of its 0th element. + std::array ExecVectorsByCond; + /// Actual executed Test Vectors for the boolean expression, based on /// ExecutedTestVectorBitmap. - MCDCRecord::TestVectors ExecVectors; + MCDCRecord::TestVectors &ExecVectors; + + /// Number of False items in ExecVectors + unsigned NumExecVectorsF; #ifndef NDEBUG DenseSet TVIdxs; @@ -385,7 +392,8 @@ class MCDCRecordProcessor : NextIDsBuilder, mcdc::TVIdxBuilder { : NextIDsBuilder(Branches), TVIdxBuilder(this->NextIDs), Bitmap(Bitmap), Region(Region), DecisionParams(Region.getDecisionParams()), Branches(Branches), NumConditions(DecisionParams.NumConditions), - Folded(NumConditions, false), IndependencePairs(NumConditions) {} + Folded(NumConditions, false), IndependencePairs(NumConditions), + ExecVectors(ExecVectorsByCond[false]) {} private: // Walk the binary decision diagram and try assigning both false and true to @@ -415,11 +423,9 @@ class MCDCRecordProcessor : NextIDsBuilder, mcdc::TVIdxBuilder { continue; // Copy the completed test vector to the vector of testvectors. - ExecVectors.push_back(TV); - // The final value (T,F) is equal to the last non-dontcare state on the // path (in a short-circuiting system). 
- ExecVectors.back().push_back(MCDCCond); + ExecVectorsByCond[MCDCCond].push_back({TV, MCDCCond}); } // Reset back to DontCare. @@ -437,6 +443,14 @@ class MCDCRecordProcessor : NextIDsBuilder, mcdc::TVIdxBuilder { buildTestVector(TV, 0, 0, 0); assert(TVIdxs.size() == unsigned(NumTestVectors) && "TVIdxs wasn't fulfilled"); + + // Fill ExecVectors order by False items and True items. + // ExecVectors is the alias of ExecVectorsByCond[false], so + // Append ExecVectorsByCond[true] on it. + NumExecVectorsF = ExecVectors.size(); + auto &ExecVectorsT = ExecVectorsByCond[true]; + ExecVectors.append(std::make_move_iterator(ExecVectorsT.begin()), + std::make_move_iterator(ExecVectorsT.end())); } // Find an independence pair for each condition: @@ -445,13 +459,12 @@ class MCDCRecordProcessor : NextIDsBuilder, mcdc::TVIdxBuilder { // - All other conditions' values must be equal or marked as "don't care". void findIndependencePairs() { unsigned NumTVs = ExecVectors.size(); - for (unsigned I = 1; I < NumTVs; ++I) { - const MCDCRecord::TestVector &A = ExecVectors[I]; - for (unsigned J = 0; J < I; ++J) { - const MCDCRecord::TestVector &B = ExecVectors[J]; - // Enumerate two execution vectors whose outcomes are different. 
- if (A[NumConditions] == B[NumConditions]) - continue; + for (unsigned I = NumExecVectorsF; I < NumTVs; ++I) { + const auto &[A, ACond] = ExecVectors[I]; + assert(ACond == MCDCRecord::MCDC_True); + for (unsigned J = 0; J < NumExecVectorsF; ++J) { + const auto &[B, BCond] = ExecVectors[J]; + assert(BCond == MCDCRecord::MCDC_False); unsigned Flip = NumConditions, Idx; for (Idx = 0; Idx < NumConditions; ++Idx) { MCDCRecord::CondState ACond = A[Idx], BCond = B[Idx]; From ed1aabef1da1b074b71ad523978ea836d6e7d2e7 Mon Sep 17 00:00:00 2001 From: mahesh-attarde <145317060+mahesh-attarde@users.noreply.github.com> Date: Mon, 26 Feb 2024 22:39:09 -0800 Subject: [PATCH 411/546] remove expensive copy of options passed (#82577) Codegen option is passed copy by value which 138 bytes of copy and used as read-only. Making pass by const reference. --- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 3 ++- llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index 82a17e882b3c4..00eb9b096a935 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -111,7 +111,8 @@ namespace llvm { /// construction. 
template class CodeGenPassBuilder { public: - explicit CodeGenPassBuilder(LLVMTargetMachine &TM, CGPassBuilderOption Opts, + explicit CodeGenPassBuilder(LLVMTargetMachine &TM, + const CGPassBuilderOption &Opts, PassInstrumentationCallbacks *PIC) : TM(TM), Opt(Opts), PIC(PIC) { // Target could set CGPassBuilderOption::MISchedPostRA to true to achieve diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp index a620ba911ec61..6ef28b373cc93 100644 --- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp +++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp @@ -22,7 +22,7 @@ namespace { class X86CodeGenPassBuilder : public CodeGenPassBuilder { public: explicit X86CodeGenPassBuilder(LLVMTargetMachine &TM, - CGPassBuilderOption Opts, + const CGPassBuilderOption &Opts, PassInstrumentationCallbacks *PIC) : CodeGenPassBuilder(TM, Opts, PIC) {} void addPreISel(AddIRPass &addPass) const; From d7c80bba698bded48c1df4b4bb7424a181aa6195 Mon Sep 17 00:00:00 2001 From: leecheechen Date: Tue, 27 Feb 2024 15:38:11 +0800 Subject: [PATCH 412/546] [llvm][LoongArch] Improve loongarch_lasx_xvpermi_q instrinsic (#82984) For instruction xvpermi.q, only [1:0] and [5:4] bits of operands[3] are used. The unused bits in operands[3] need to be set to 0 to avoid causing undefined behavior. 
--- .../LoongArch/LoongArchISelLowering.cpp | 25 +++++++++++++++- .../CodeGen/LoongArch/lasx/intrinsic-permi.ll | 30 +++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 76c1a14fe0156..3324dd2e8fc21 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -968,6 +968,28 @@ static SDValue checkIntrinsicImmArg(SDValue Op, unsigned ImmOp, return SDValue(); } +static SDValue checkAndModifyXVPERMI_QIntrinsicImmArg(SDValue Op, + SelectionDAG &DAG) { + SDValue Op3 = Op->getOperand(3); + uint64_t Imm = Op3->getAsZExtVal(); + // Check the range of ImmArg. + if (!isUInt<8>(Imm)) { + DAG.getContext()->emitError(Op->getOperationName(0) + + ": argument out of range."); + return DAG.getNode(ISD::UNDEF, SDLoc(Op), Op.getValueType()); + } + + // For instruction xvpermi.q, only [1:0] and [5:4] bits of operands[3] + // are used. The unused bits in operands[3] need to be set to 0 to avoid + // causing undefined behavior on LA464. 
+ if ((Imm & 0x33) != Imm) { + Op3 = DAG.getTargetConstant(Imm & 0x33, SDLoc(Op), Op3.getValueType()); + DAG.UpdateNodeOperands(Op.getNode(), Op->getOperand(0), Op->getOperand(1), + Op->getOperand(2), Op3); + } + return SDValue(); +} + SDValue LoongArchTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { @@ -1225,13 +1247,14 @@ LoongArchTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::loongarch_lsx_vextrins_d: case Intrinsic::loongarch_lasx_xvshuf4i_d: case Intrinsic::loongarch_lasx_xvpermi_w: - case Intrinsic::loongarch_lasx_xvpermi_q: case Intrinsic::loongarch_lasx_xvbitseli_b: case Intrinsic::loongarch_lasx_xvextrins_b: case Intrinsic::loongarch_lasx_xvextrins_h: case Intrinsic::loongarch_lasx_xvextrins_w: case Intrinsic::loongarch_lasx_xvextrins_d: return checkIntrinsicImmArg<8>(Op, 3, DAG); + case Intrinsic::loongarch_lasx_xvpermi_q: + return checkAndModifyXVPERMI_QIntrinsicImmArg(Op, DAG); case Intrinsic::loongarch_lsx_vrepli_b: case Intrinsic::loongarch_lsx_vrepli_h: case Intrinsic::loongarch_lsx_vrepli_w: diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi.ll index 0d9f9daabc448..92669d2e58955 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-permi.ll @@ -36,3 +36,33 @@ entry: %res = call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> %va, <32 x i8> %vb, i32 1) ret <32 x i8> %res } + +define <32 x i8> @lasx_xvpermi_q_204(<32 x i8> %va, <32 x i8> %vb) nounwind { +; CHECK-LABEL: lasx_xvpermi_q_204: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 0 +; CHECK-NEXT: ret +entry: + %res = call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> %va, <32 x i8> %vb, i32 204) + ret <32 x i8> %res +} + +define <32 x i8> @lasx_xvpermi_q_221(<32 x i8> %va, <32 x i8> %vb) nounwind { +; CHECK-LABEL: lasx_xvpermi_q_221: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 
17 +; CHECK-NEXT: ret +entry: + %res = call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> %va, <32 x i8> %vb, i32 221) + ret <32 x i8> %res +} + +define <32 x i8> @lasx_xvpermi_q_255(<32 x i8> %va, <32 x i8> %vb) nounwind { +; CHECK-LABEL: lasx_xvpermi_q_255: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 51 +; CHECK-NEXT: ret +entry: + %res = call <32 x i8> @llvm.loongarch.lasx.xvpermi.q(<32 x i8> %va, <32 x i8> %vb, i32 255) + ret <32 x i8> %res +} From 512a8a78a7f3bffc41cd7c00788eb099519f4625 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Tue, 27 Feb 2024 16:46:49 +0900 Subject: [PATCH 413/546] [MC/DC] Introduce `class TestVector` with a pair of `BitVector` (#82174) This replaces `SmallVector` and emulates it. - -------- True False DontCare - Values: True False False - Visited: True True False `findIndependencePairs()` can be optimized with logical ops. FIXME: Specialize `findIndependencePairs()` for the single word. --- .../ProfileData/Coverage/CoverageMapping.h | 53 ++++++++++++++++++- .../ProfileData/Coverage/CoverageMapping.cpp | 33 ++++-------- 2 files changed, 63 insertions(+), 23 deletions(-) diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h index 27b1c32b1fa54..7a8b6639f2971 100644 --- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -384,7 +384,58 @@ struct MCDCRecord { /// are effectively ignored. enum CondState { MCDC_DontCare = -1, MCDC_False = 0, MCDC_True = 1 }; - using TestVector = llvm::SmallVector; + /// Emulate SmallVector with a pair of BitVector. + /// + /// True False DontCare (Impossible) + /// Values: True False False True + /// Visited: True True False False + class TestVector { + BitVector Values; /// True/False (False when DontCare) + BitVector Visited; /// ~DontCare + + public: + /// Default values are filled with DontCare. 
+ TestVector(unsigned N) : Values(N), Visited(N) {} + + /// Emulate RHS SmallVector::operator[] + CondState operator[](int I) const { + return (Visited[I] ? (Values[I] ? MCDC_True : MCDC_False) + : MCDC_DontCare); + } + + /// Equivalent to buildTestVector's Index. + auto getIndex() const { return Values.getData()[0]; } + + /// Set the condition \p Val at position \p I. + /// This emulates LHS SmallVector::operator[]. + void set(int I, CondState Val) { + Visited[I] = (Val != MCDC_DontCare); + Values[I] = (Val == MCDC_True); + } + + /// Emulate SmallVector::push_back. + void push_back(CondState Val) { + Visited.push_back(Val != MCDC_DontCare); + Values.push_back(Val == MCDC_True); + assert(Values.size() == Visited.size()); + } + + /// For each element: + /// - False if either is DontCare + /// - False if both have the same value + /// - True if both have the opposite value + /// ((A.Values ^ B.Values) & A.Visited & B.Visited) + /// Dedicated to findIndependencePairs(). + auto getDifferences(const TestVector &B) const { + const auto &A = *this; + BitVector AB = A.Values; + AB ^= B.Values; + AB &= A.Visited; + AB &= B.Visited; + return AB; + } + }; + using TestVectors = llvm::SmallVector>; using BoolVector = llvm::SmallVector; using TVRowPair = std::pair; diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index 64545164d597a..6c77ce017c036 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -400,26 +400,23 @@ class MCDCRecordProcessor : NextIDsBuilder, mcdc::TVIdxBuilder { // each node. When a terminal node (ID == 0) is reached, fill in the value in // the truth table. 
void buildTestVector(MCDCRecord::TestVector &TV, mcdc::ConditionID ID, - int TVIdx, unsigned Index) { - assert((Index & (1 << ID)) == 0); - + int TVIdx) { for (auto MCDCCond : {MCDCRecord::MCDC_False, MCDCRecord::MCDC_True}) { static_assert(MCDCRecord::MCDC_False == 0); static_assert(MCDCRecord::MCDC_True == 1); - Index |= MCDCCond << ID; - TV[ID] = MCDCCond; + TV.set(ID, MCDCCond); auto NextID = NextIDs[ID][MCDCCond]; auto NextTVIdx = TVIdx + Indices[ID][MCDCCond]; assert(NextID == SavedNodes[ID].NextIDs[MCDCCond]); if (NextID >= 0) { - buildTestVector(TV, NextID, NextTVIdx, Index); + buildTestVector(TV, NextID, NextTVIdx); continue; } assert(TVIdx < SavedNodes[ID].Width); assert(TVIdxs.insert(NextTVIdx).second && "Duplicate TVIdx"); - if (!Bitmap[DecisionParams.BitmapIdx * CHAR_BIT + Index]) + if (!Bitmap[DecisionParams.BitmapIdx * CHAR_BIT + TV.getIndex()]) continue; // Copy the completed test vector to the vector of testvectors. @@ -429,7 +426,7 @@ class MCDCRecordProcessor : NextIDsBuilder, mcdc::TVIdxBuilder { } // Reset back to DontCare. - TV[ID] = MCDCRecord::MCDC_DontCare; + TV.set(ID, MCDCRecord::MCDC_DontCare); } /// Walk the bits in the bitmap. A bit set to '1' indicates that the test @@ -439,8 +436,8 @@ class MCDCRecordProcessor : NextIDsBuilder, mcdc::TVIdxBuilder { // We start at the root node (ID == 0) with all values being DontCare. // `TVIdx` starts with 0 and is in the traversal. // `Index` encodes the bitmask of true values and is initially 0. 
- MCDCRecord::TestVector TV(NumConditions, MCDCRecord::MCDC_DontCare); - buildTestVector(TV, 0, 0, 0); + MCDCRecord::TestVector TV(NumConditions); + buildTestVector(TV, 0, 0); assert(TVIdxs.size() == unsigned(NumTestVectors) && "TVIdxs wasn't fulfilled"); @@ -465,20 +462,12 @@ class MCDCRecordProcessor : NextIDsBuilder, mcdc::TVIdxBuilder { for (unsigned J = 0; J < NumExecVectorsF; ++J) { const auto &[B, BCond] = ExecVectors[J]; assert(BCond == MCDCRecord::MCDC_False); - unsigned Flip = NumConditions, Idx; - for (Idx = 0; Idx < NumConditions; ++Idx) { - MCDCRecord::CondState ACond = A[Idx], BCond = B[Idx]; - if (ACond == BCond || ACond == MCDCRecord::MCDC_DontCare || - BCond == MCDCRecord::MCDC_DontCare) - continue; - if (Flip != NumConditions) - break; - Flip = Idx; - } // If the two vectors differ in exactly one condition, ignoring DontCare // conditions, we have found an independence pair. - if (Idx == NumConditions && Flip != NumConditions) - IndependencePairs.insert({Flip, std::make_pair(J + 1, I + 1)}); + auto AB = A.getDifferences(B); + if (AB.count() == 1) + IndependencePairs.insert( + {AB.find_first(), std::make_pair(J + 1, I + 1)}); } } } From 575bf33a27307e1ff2713f239b81ac5329b40096 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Tue, 27 Feb 2024 08:45:01 +0000 Subject: [PATCH 414/546] [mlir][ArmSME] Remove XFAIL for load-store-128-bit-tile https://gitlab.com/qemu-project/qemu/-/issues/1833 was fixed by https://gitlab.com/qemu-project/qemu/-/commit/4b3520fd93cd49cc56dfcab45d90735cc2e35af7 which was included in v8.1.1. The buildbot is now using v8.1.3 so the test is passing. 
https://lab.llvm.org/buildbot/#/builders/179/builds/9468 --- .../Vector/CPU/ArmSME/load-store-128-bit-tile.mlir | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir index 06b1c107cb2c1..2b8899b6c6fc3 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir @@ -7,14 +7,6 @@ // RUN: %{compile} | %{run} | FileCheck %s -/// Note: The SME ST1Q/LD1Q instructions are currently broken in QEMU -/// see: https://gitlab.com/qemu-project/qemu/-/issues/1833 -/// This test is expected to fail until a fixed version of QEMU can be used. - -/// FIXME: Remove the 'XFAIL' below once a fixed QEMU version is available -/// (and installed on CI buildbot). -// XFAIL: {{.*}} - func.func @print_i8s(%bytes: memref, %len: index) { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index From aaec22ffbc9fb0a3b12957166ebf35a7bb2c31e0 Mon Sep 17 00:00:00 2001 From: martinboehme Date: Tue, 27 Feb 2024 09:49:16 +0100 Subject: [PATCH 415/546] [clang][dataflow] Remove deprecated `ValueModel::merge()` function. (#82602) I'm not aware of any remaining overrides of this function. While I'm here, change an outdated comment in DataflowAnalysis.h that still referred to `merge()`. I've made the comment more general, referring simply to `ValueModel`, as we shouldn't really be repeating the documentation of that class here anyway. 
--- .../Analysis/FlowSensitive/DataflowAnalysis.h | 7 ++-- .../FlowSensitive/DataflowEnvironment.h | 32 +------------------ 2 files changed, 4 insertions(+), 35 deletions(-) diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h index b95095d2184c0..3c84704d0d6ce 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h @@ -54,10 +54,9 @@ namespace dataflow { /// Environment &Env)` - applies the analysis transfer /// function for a given edge from a CFG block of a conditional statement. /// -/// `Derived` can optionally override the following members: -/// * `bool merge(QualType, const Value &, const Value &, Value &, -/// Environment &)` - joins distinct values. This could be a strict -/// lattice join or a more general widening operation. +/// `Derived` can optionally override the virtual functions in the +/// `Environment::ValueModel` interface (which is an indirect base class of +/// this class). /// /// `LatticeT` is a bounded join-semilattice that is used by `Derived` and must /// provide the following public members: diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h index 0aecc749bf415..7f8c70d169376 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h @@ -79,32 +79,6 @@ class Environment { return ComparisonResult::Unknown; } - /// DEPRECATED. Override `join` and/or `widen`, instead. - /// - /// Modifies `MergedVal` to approximate both `Val1` and `Val2`. This could - /// be a strict lattice join or a more general widening operation. - /// - /// If this function returns true, `MergedVal` will be assigned to a storage - /// location of type `Type` in `MergedEnv`. 
- /// - /// `Env1` and `Env2` can be used to query child values and path condition - /// implications of `Val1` and `Val2` respectively. - /// - /// Requirements: - /// - /// `Val1` and `Val2` must be distinct. - /// - /// `Val1`, `Val2`, and `MergedVal` must model values of type `Type`. - /// - /// `Val1` and `Val2` must be assigned to the same storage location in - /// `Env1` and `Env2` respectively. - virtual bool merge(QualType Type, const Value &Val1, - const Environment &Env1, const Value &Val2, - const Environment &Env2, Value &MergedVal, - Environment &MergedEnv) { - return true; - } - /// Modifies `JoinedVal` to approximate both `Val1` and `Val2`. This should /// obey the properties of a lattice join. /// @@ -121,11 +95,7 @@ class Environment { /// `Env1` and `Env2` respectively. virtual void join(QualType Type, const Value &Val1, const Environment &Env1, const Value &Val2, const Environment &Env2, - Value &JoinedVal, Environment &JoinedEnv) { - [[maybe_unused]] bool ShouldKeep = - merge(Type, Val1, Env1, Val2, Env2, JoinedVal, JoinedEnv); - assert(ShouldKeep && "dropping merged value is unsupported"); - } + Value &JoinedVal, Environment &JoinedEnv) {} /// This function may widen the current value -- replace it with an /// approximation that can reach a fixed point more quickly than iterated From 4e9fe860d2dd462e514586a3b90ad373dc07b797 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 27 Feb 2024 09:25:02 +0000 Subject: [PATCH 416/546] [ConstraintElim] Add additional sext tests with unsigned predicates. 
--- .../{sext.ll => sext-signed-predicates.ll} | 82 +++++++---- .../sext-unsigned-predicates.ll | 136 ++++++++++++++++++ 2 files changed, 188 insertions(+), 30 deletions(-) rename llvm/test/Transforms/ConstraintElimination/{sext.ll => sext-signed-predicates.ll} (80%) create mode 100644 llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll diff --git a/llvm/test/Transforms/ConstraintElimination/sext.ll b/llvm/test/Transforms/ConstraintElimination/sext-signed-predicates.ll similarity index 80% rename from llvm/test/Transforms/ConstraintElimination/sext.ll rename to llvm/test/Transforms/ConstraintElimination/sext-signed-predicates.ll index 5a8a37d0d5703..34f023e4ba5f1 100644 --- a/llvm/test/Transforms/ConstraintElimination/sext.ll +++ b/llvm/test/Transforms/ConstraintElimination/sext-signed-predicates.ll @@ -127,36 +127,6 @@ else: ; Negative tests -define i1 @cmp_sext_unsigned(i32 %a, i32 %b){ -; CHECK-LABEL: define i1 @cmp_sext_unsigned( -; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]] -; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] -; CHECK: then: -; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A]] to i64 -; CHECK-NEXT: [[SB:%.*]] = sext i32 [[B]] to i64 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 -; CHECK-NEXT: [[CMP2:%.*]] = icmp uge i64 [[SB]], [[ADD]] -; CHECK-NEXT: ret i1 [[CMP2]] -; CHECK: else: -; CHECK-NEXT: ret i1 false -; -entry: - %cmp = icmp slt i32 %a, %b - br i1 %cmp, label %then, label %else - -then: - %sa = sext i32 %a to i64 - %sb = sext i32 %b to i64 - %add = add nsw i64 %sa, 1 - %cmp2 = icmp uge i64 %sb, %add - ret i1 %cmp2 - -else: - ret i1 false -} - define i1 @cmp_zext(i32 %a, i32 %b){ ; CHECK-LABEL: define i1 @cmp_zext( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { @@ -246,3 +216,55 @@ then: else: ret i1 false } + +declare void @use(i1) + +define void @sge_sext(i16 %x, i32 %y) { +; CHECK-LABEL: define void @sge_sext( +; CHECK-SAME: 
i16 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_EXT:%.*]] = sext i16 [[X]] to i32 +; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[X_EXT]], [[Y]] +; CHECK-NEXT: [[C_2:%.*]] = icmp sge i32 [[Y]], -10 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_1]], [[C_2]] +; CHECK-NEXT: br i1 [[AND]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[T_3:%.*]] = icmp sge i32 [[X_EXT]], -9 +; CHECK-NEXT: call void @use(i1 [[T_3]]) +; CHECK-NEXT: [[C_3:%.*]] = icmp sge i32 [[X_EXT]], -9 +; CHECK-NEXT: call void @use(i1 [[C_3]]) +; CHECK-NEXT: [[C_4:%.*]] = icmp sge i32 [[Y]], [[X_EXT]] +; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: [[C_5:%.*]] = icmp sge i16 [[X]], -9 +; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: ret void +; CHECK: bb2: +; CHECK-NEXT: ret void +; +entry: + %x.ext = sext i16 %x to i32 + %c.1 = icmp sge i32 %x.ext, %y + %c.2 = icmp sge i32 %y, -10 + %and = and i1 %c.1, %c.2 + br i1 %and, label %bb1, label %bb2 + +bb1: + %t.1 = icmp sge i32 %x.ext, %y + call void @use(i1 %t.1) + %t.2 = icmp sge i16 %x, -10 + call void @use(i1 %t.2) + %t.3 = icmp sge i32 %x.ext, -9 + call void @use(i1 %t.3) + %c.3 = icmp sge i32 %x.ext, -9 + call void @use(i1 %c.3) + %c.4 = icmp sge i32 %y, %x.ext + call void @use(i1 %c.4) + %c.5 = icmp sge i16 %x, -9 + call void @use(i1 %c.5) + ret void + +bb2: + ret void +} diff --git a/llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll b/llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll new file mode 100644 index 0000000000000..ac3e57768ae55 --- /dev/null +++ b/llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=constraint-elimination -S %s | FileCheck %s + +declare void @use(i1) + +define void @uge_sext(i16 %x, i32 %y) { +; 
CHECK-LABEL: define void @uge_sext( +; CHECK-SAME: i16 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_EXT:%.*]] = sext i16 [[X]] to i32 +; CHECK-NEXT: [[C_1:%.*]] = icmp uge i32 [[X_EXT]], [[Y]] +; CHECK-NEXT: [[C_2:%.*]] = icmp uge i32 [[Y]], -10 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_1]], [[C_2]] +; CHECK-NEXT: br i1 [[AND]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[C_3:%.*]] = icmp uge i16 [[X]], -10 +; CHECK-NEXT: call void @use(i1 [[C_3]]) +; CHECK-NEXT: [[C_4:%.*]] = icmp uge i32 [[X_EXT]], -9 +; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: [[C_5:%.*]] = icmp uge i32 [[X_EXT]], -9 +; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: [[C_6:%.*]] = icmp uge i32 [[Y]], [[X_EXT]] +; CHECK-NEXT: call void @use(i1 [[C_6]]) +; CHECK-NEXT: [[C_7:%.*]] = icmp uge i16 [[X]], -9 +; CHECK-NEXT: call void @use(i1 [[C_7]]) +; CHECK-NEXT: ret void +; CHECK: bb2: +; CHECK-NEXT: ret void +; +entry: + %x.ext = sext i16 %x to i32 + %c.1 = icmp uge i32 %x.ext, %y + %c.2 = icmp uge i32 %y, -10 + %and = and i1 %c.1, %c.2 + br i1 %and, label %bb1, label %bb2 + +bb1: + %t.1 = icmp uge i32 %x.ext, %y + call void @use(i1 %t.1) + %c.3 = icmp uge i16 %x, -10 + call void @use(i1 %c.3) + %c.4 = icmp uge i32 %x.ext, -9 + call void @use(i1 %c.4) + %c.5 = icmp uge i32 %x.ext, -9 + call void @use(i1 %c.5) + %c.6 = icmp uge i32 %y, %x.ext + call void @use(i1 %c.6) + %c.7 = icmp uge i16 %x, -9 + call void @use(i1 %c.7) + ret void + +bb2: + ret void +} + +define void @uge_sext_known_positive(i16 %x, i32 %y) { +; CHECK-LABEL: define void @uge_sext_known_positive( +; CHECK-SAME: i16 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_EXT:%.*]] = sext i16 [[X]] to i32 +; CHECK-NEXT: [[C_2:%.*]] = icmp sge i16 [[X]], 0 +; CHECK-NEXT: [[C_1:%.*]] = icmp uge i32 [[X_EXT]], 10 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_2]], [[C_1]] +; CHECK-NEXT: br i1 [[AND]], label [[BB1:%.*]], label 
[[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[T_2:%.*]] = icmp uge i16 [[X]], 10 +; CHECK-NEXT: call void @use(i1 [[T_2]]) +; CHECK-NEXT: [[C_3:%.*]] = icmp uge i32 [[X_EXT]], 11 +; CHECK-NEXT: call void @use(i1 [[C_3]]) +; CHECK-NEXT: [[C_4:%.*]] = icmp uge i32 [[X_EXT]], 11 +; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: [[C_5:%.*]] = icmp uge i32 [[Y]], [[X_EXT]] +; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: [[C_6:%.*]] = icmp uge i16 [[X]], 11 +; CHECK-NEXT: call void @use(i1 [[C_6]]) +; CHECK-NEXT: ret void +; CHECK: bb2: +; CHECK-NEXT: ret void +; +entry: + %x.ext = sext i16 %x to i32 + %c.2 = icmp sge i16 %x, 0 + %c.1 = icmp uge i32 %x.ext, 10 + %and = and i1 %c.2, %c.1 + br i1 %and, label %bb1, label %bb2 + +bb1: + %t.1 = icmp uge i32 %x.ext, 10 + call void @use(i1 %t.1) + %t.2 = icmp uge i16 %x, 10 + call void @use(i1 %t.2) + %c.3 = icmp uge i32 %x.ext, 11 + call void @use(i1 %c.3) + %c.4 = icmp uge i32 %x.ext, 11 + call void @use(i1 %c.4) + %c.5 = icmp uge i32 %y, %x.ext + call void @use(i1 %c.5) + %c.6 = icmp uge i16 %x, 11 + call void @use(i1 %c.6) + ret void + +bb2: + ret void +} + +define i1 @cmp_sext_unsigned(i32 %a, i32 %b){ +; CHECK-LABEL: define i1 @cmp_sext_unsigned( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], [[B]] +; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[SA:%.*]] = sext i32 [[A]] to i64 +; CHECK-NEXT: [[SB:%.*]] = sext i32 [[B]] to i64 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[SA]], 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp uge i64 [[SB]], [[ADD]] +; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK: else: +; CHECK-NEXT: ret i1 false +; +entry: + %cmp = icmp slt i32 %a, %b + br i1 %cmp, label %then, label %else + +then: + %sa = sext i32 %a to i64 + %sb = sext i32 %b to i64 + %add = add nsw i64 %sa, 1 + %cmp2 = icmp uge i64 %sb, %add + ret i1 %cmp2 + +else: + ret i1 false 
+} From a1bbba06eabc5ddf533e611d15fddcfd0a8e79db Mon Sep 17 00:00:00 2001 From: David Spickett Date: Tue, 27 Feb 2024 09:32:18 +0000 Subject: [PATCH 417/546] [lldb][test][Windows] Fix NonStop tests on case insensitive file systems On a case insensitive file sytem, the build dir for `test_multiple_c` and `test_multiple_C` are the same and therefore the log files are in the same place. This means one tries to clear the log file for the other. To fix this, make the names unique by adding the meaning of each protocol packet. https://sourceware.org/gdb/current/onlinedocs/gdb.html/Packets.html#Packets --- lldb/test/API/tools/lldb-server/TestNonStop.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lldb/test/API/tools/lldb-server/TestNonStop.py b/lldb/test/API/tools/lldb-server/TestNonStop.py index b2d1361477221..62bda48ee049b 100644 --- a/lldb/test/API/tools/lldb-server/TestNonStop.py +++ b/lldb/test/API/tools/lldb-server/TestNonStop.py @@ -220,15 +220,15 @@ def multiple_resume_test(self, second_command): self.expect_gdbremote_sequence() @add_test_categories(["llgs"]) - def test_multiple_C(self): + def test_multiple_C_continue_with_signal(self): self.multiple_resume_test("C05") @add_test_categories(["llgs"]) - def test_multiple_c(self): + def test_multiple_c_continue_with_addr(self): self.multiple_resume_test("c") @add_test_categories(["llgs"]) - def test_multiple_s(self): + def test_multiple_s_single_step_with_addr(self): self.multiple_resume_test("s") @skipIfWindows From e421c12e47e2912d685dafeb8289b7118ad8b2e1 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 27 Feb 2024 09:38:39 +0000 Subject: [PATCH 418/546] [VPlan] Remove left-over CHECK-NOT line. This removes a CHECK-NOT: vector.body line from the test which seems to imply the test does not get vectorized, but it does now. This line was left over from when the test was pre-committed, remove it. 
--- .../LoopVectorize/first-order-recurrence-chains-vplan.ll | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll index 15e77f3a48470..c04178a1c13e2 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll @@ -91,8 +91,6 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) { ; CHECK-NEXT: middle.block: ; CHECK-NEXT: No successors ; CHECK-NEXT: } - -; CHECK-NOT: vector.body: ; entry: br label %loop From 288d317fff42b46151b132ca2bdc4b0ca6d28d6f Mon Sep 17 00:00:00 2001 From: Kai Sasaki Date: Tue, 27 Feb 2024 18:51:24 +0900 Subject: [PATCH 419/546] [mlir][complex] Support Fastmath flag in conversion of complex.div to standard (#82729) Support Fastmath flag to convert `complex.div` to standard dialects. See: https://discourse.llvm.org/t/rfc-fastmath-flags-support-in-complex-dialect/71981 --- .../ComplexToStandard/ComplexToStandard.cpp | 83 +++++++------ .../convert-to-standard.mlir | 113 +++++++++++++++++- 2 files changed, 159 insertions(+), 37 deletions(-) diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp index cc315110f9be2..20525b962ae96 100644 --- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp +++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp @@ -257,6 +257,7 @@ struct DivOpConversion : public OpConversionPattern { auto loc = op.getLoc(); auto type = cast(adaptor.getLhs().getType()); auto elementType = cast(type.getElementType()); + arith::FastMathFlagsAttr fmf = op.getFastMathFlagsAttr(); Value lhsReal = rewriter.create(loc, elementType, adaptor.getLhs()); @@ -290,45 +291,51 @@ struct DivOpConversion : public OpConversionPattern { // // See https://dl.acm.org/citation.cfm?id=368661 for 
more details. Value rhsRealImagRatio = - rewriter.create(loc, rhsReal, rhsImag); + rewriter.create(loc, rhsReal, rhsImag, fmf); Value rhsRealImagDenom = rewriter.create( loc, rhsImag, - rewriter.create(loc, rhsRealImagRatio, rhsReal)); + rewriter.create(loc, rhsRealImagRatio, rhsReal, fmf), + fmf); Value realNumerator1 = rewriter.create( - loc, rewriter.create(loc, lhsReal, rhsRealImagRatio), - lhsImag); - Value resultReal1 = - rewriter.create(loc, realNumerator1, rhsRealImagDenom); + loc, + rewriter.create(loc, lhsReal, rhsRealImagRatio, fmf), + lhsImag, fmf); + Value resultReal1 = rewriter.create(loc, realNumerator1, + rhsRealImagDenom, fmf); Value imagNumerator1 = rewriter.create( - loc, rewriter.create(loc, lhsImag, rhsRealImagRatio), - lhsReal); - Value resultImag1 = - rewriter.create(loc, imagNumerator1, rhsRealImagDenom); + loc, + rewriter.create(loc, lhsImag, rhsRealImagRatio, fmf), + lhsReal, fmf); + Value resultImag1 = rewriter.create(loc, imagNumerator1, + rhsRealImagDenom, fmf); Value rhsImagRealRatio = - rewriter.create(loc, rhsImag, rhsReal); + rewriter.create(loc, rhsImag, rhsReal, fmf); Value rhsImagRealDenom = rewriter.create( loc, rhsReal, - rewriter.create(loc, rhsImagRealRatio, rhsImag)); + rewriter.create(loc, rhsImagRealRatio, rhsImag, fmf), + fmf); Value realNumerator2 = rewriter.create( loc, lhsReal, - rewriter.create(loc, lhsImag, rhsImagRealRatio)); - Value resultReal2 = - rewriter.create(loc, realNumerator2, rhsImagRealDenom); + rewriter.create(loc, lhsImag, rhsImagRealRatio, fmf), + fmf); + Value resultReal2 = rewriter.create(loc, realNumerator2, + rhsImagRealDenom, fmf); Value imagNumerator2 = rewriter.create( loc, lhsImag, - rewriter.create(loc, lhsReal, rhsImagRealRatio)); - Value resultImag2 = - rewriter.create(loc, imagNumerator2, rhsImagRealDenom); + rewriter.create(loc, lhsReal, rhsImagRealRatio, fmf), + fmf); + Value resultImag2 = rewriter.create(loc, imagNumerator2, + rhsImagRealDenom, fmf); // Consider corner cases. // Case 1. 
Zero denominator, numerator contains at most one NaN value. Value zero = rewriter.create( loc, elementType, rewriter.getZeroAttr(elementType)); - Value rhsRealAbs = rewriter.create(loc, rhsReal); + Value rhsRealAbs = rewriter.create(loc, rhsReal, fmf); Value rhsRealIsZero = rewriter.create( loc, arith::CmpFPredicate::OEQ, rhsRealAbs, zero); - Value rhsImagAbs = rewriter.create(loc, rhsImag); + Value rhsImagAbs = rewriter.create(loc, rhsImag, fmf); Value rhsImagIsZero = rewriter.create( loc, arith::CmpFPredicate::OEQ, rhsImagAbs, zero); Value lhsRealIsNotNaN = rewriter.create( @@ -347,9 +354,9 @@ struct DivOpConversion : public OpConversionPattern { Value infWithSignOfRhsReal = rewriter.create(loc, inf, rhsReal); Value infinityResultReal = - rewriter.create(loc, infWithSignOfRhsReal, lhsReal); + rewriter.create(loc, infWithSignOfRhsReal, lhsReal, fmf); Value infinityResultImag = - rewriter.create(loc, infWithSignOfRhsReal, lhsImag); + rewriter.create(loc, infWithSignOfRhsReal, lhsImag, fmf); // Case 2. Infinite numerator, finite denominator. 
Value rhsRealFinite = rewriter.create( @@ -358,10 +365,10 @@ struct DivOpConversion : public OpConversionPattern { loc, arith::CmpFPredicate::ONE, rhsImagAbs, inf); Value rhsFinite = rewriter.create(loc, rhsRealFinite, rhsImagFinite); - Value lhsRealAbs = rewriter.create(loc, lhsReal); + Value lhsRealAbs = rewriter.create(loc, lhsReal, fmf); Value lhsRealInfinite = rewriter.create( loc, arith::CmpFPredicate::OEQ, lhsRealAbs, inf); - Value lhsImagAbs = rewriter.create(loc, lhsImag); + Value lhsImagAbs = rewriter.create(loc, lhsImag, fmf); Value lhsImagInfinite = rewriter.create( loc, arith::CmpFPredicate::OEQ, lhsImagAbs, inf); Value lhsInfinite = @@ -377,21 +384,23 @@ struct DivOpConversion : public OpConversionPattern { loc, rewriter.create(loc, lhsImagInfinite, one, zero), lhsImag); Value lhsRealIsInfWithSignTimesRhsReal = - rewriter.create(loc, lhsRealIsInfWithSign, rhsReal); + rewriter.create(loc, lhsRealIsInfWithSign, rhsReal, fmf); Value lhsImagIsInfWithSignTimesRhsImag = - rewriter.create(loc, lhsImagIsInfWithSign, rhsImag); + rewriter.create(loc, lhsImagIsInfWithSign, rhsImag, fmf); Value resultReal3 = rewriter.create( loc, inf, rewriter.create(loc, lhsRealIsInfWithSignTimesRhsReal, - lhsImagIsInfWithSignTimesRhsImag)); + lhsImagIsInfWithSignTimesRhsImag, fmf), + fmf); Value lhsRealIsInfWithSignTimesRhsImag = - rewriter.create(loc, lhsRealIsInfWithSign, rhsImag); + rewriter.create(loc, lhsRealIsInfWithSign, rhsImag, fmf); Value lhsImagIsInfWithSignTimesRhsReal = - rewriter.create(loc, lhsImagIsInfWithSign, rhsReal); + rewriter.create(loc, lhsImagIsInfWithSign, rhsReal, fmf); Value resultImag3 = rewriter.create( loc, inf, rewriter.create(loc, lhsImagIsInfWithSignTimesRhsReal, - lhsRealIsInfWithSignTimesRhsImag)); + lhsRealIsInfWithSignTimesRhsImag, fmf), + fmf); // Case 3: Finite numerator, infinite denominator. 
Value lhsRealFinite = rewriter.create( @@ -415,21 +424,23 @@ struct DivOpConversion : public OpConversionPattern { loc, rewriter.create(loc, rhsImagInfinite, one, zero), rhsImag); Value rhsRealIsInfWithSignTimesLhsReal = - rewriter.create(loc, lhsReal, rhsRealIsInfWithSign); + rewriter.create(loc, lhsReal, rhsRealIsInfWithSign, fmf); Value rhsImagIsInfWithSignTimesLhsImag = - rewriter.create(loc, lhsImag, rhsImagIsInfWithSign); + rewriter.create(loc, lhsImag, rhsImagIsInfWithSign, fmf); Value resultReal4 = rewriter.create( loc, zero, rewriter.create(loc, rhsRealIsInfWithSignTimesLhsReal, - rhsImagIsInfWithSignTimesLhsImag)); + rhsImagIsInfWithSignTimesLhsImag, fmf), + fmf); Value rhsRealIsInfWithSignTimesLhsImag = - rewriter.create(loc, lhsImag, rhsRealIsInfWithSign); + rewriter.create(loc, lhsImag, rhsRealIsInfWithSign, fmf); Value rhsImagIsInfWithSignTimesLhsReal = - rewriter.create(loc, lhsReal, rhsImagIsInfWithSign); + rewriter.create(loc, lhsReal, rhsImagIsInfWithSign, fmf); Value resultImag4 = rewriter.create( loc, zero, rewriter.create(loc, rhsRealIsInfWithSignTimesLhsImag, - rhsImagIsInfWithSignTimesLhsReal)); + rhsImagIsInfWithSignTimesLhsReal, fmf), + fmf); Value realAbsSmallerThanImagAbs = rewriter.create( loc, arith::CmpFPredicate::OLT, rhsRealAbs, rhsImagAbs); diff --git a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir index 1fe843b1447ab..39af7dd02a62d 100644 --- a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir +++ b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir @@ -1045,4 +1045,115 @@ func.func @complex_mul_with_fmf(%lhs: complex, %rhs: complex) -> compl // CHECK: %[[FINAL_IMAG:.*]] = arith.select %[[RECALC3]], %[[NEW_IMAG_TIMES_INF]], %[[IMAG]] : f32 // CHECK: %[[RESULT:.*]] = complex.create %[[FINAL_REAL]], %[[FINAL_IMAG]] : complex -// CHECK: return %[[RESULT]] : complex \ No newline at end of file +// CHECK: return %[[RESULT]] : 
complex + +// ----- + +// CHECK-LABEL: func @complex_div_with_fmf +// CHECK-SAME: (%[[LHS:.*]]: complex, %[[RHS:.*]]: complex) +func.func @complex_div_with_fmf(%lhs: complex, %rhs: complex) -> complex { + %div = complex.div %lhs, %rhs fastmath : complex + return %div : complex +} +// CHECK: %[[LHS_REAL:.*]] = complex.re %[[LHS]] : complex +// CHECK: %[[LHS_IMAG:.*]] = complex.im %[[LHS]] : complex +// CHECK: %[[RHS_REAL:.*]] = complex.re %[[RHS]] : complex +// CHECK: %[[RHS_IMAG:.*]] = complex.im %[[RHS]] : complex + +// CHECK: %[[RHS_REAL_IMAG_RATIO:.*]] = arith.divf %[[RHS_REAL]], %[[RHS_IMAG]] fastmath : f32 +// CHECK: %[[RHS_REAL_TIMES_RHS_REAL_IMAG_RATIO:.*]] = arith.mulf %[[RHS_REAL_IMAG_RATIO]], %[[RHS_REAL]] fastmath : f32 +// CHECK: %[[RHS_REAL_IMAG_DENOM:.*]] = arith.addf %[[RHS_IMAG]], %[[RHS_REAL_TIMES_RHS_REAL_IMAG_RATIO]] fastmath : f32 +// CHECK: %[[LHS_REAL_TIMES_RHS_REAL_IMAG_RATIO:.*]] = arith.mulf %[[LHS_REAL]], %[[RHS_REAL_IMAG_RATIO]] fastmath : f32 +// CHECK: %[[REAL_NUMERATOR_1:.*]] = arith.addf %[[LHS_REAL_TIMES_RHS_REAL_IMAG_RATIO]], %[[LHS_IMAG]] fastmath : f32 +// CHECK: %[[RESULT_REAL_1:.*]] = arith.divf %[[REAL_NUMERATOR_1]], %[[RHS_REAL_IMAG_DENOM]] fastmath : f32 +// CHECK: %[[LHS_IMAG_TIMES_RHS_REAL_IMAG_RATIO:.*]] = arith.mulf %[[LHS_IMAG]], %[[RHS_REAL_IMAG_RATIO]] fastmath : f32 +// CHECK: %[[IMAG_NUMERATOR_1:.*]] = arith.subf %[[LHS_IMAG_TIMES_RHS_REAL_IMAG_RATIO]], %[[LHS_REAL]] fastmath : f32 +// CHECK: %[[RESULT_IMAG_1:.*]] = arith.divf %[[IMAG_NUMERATOR_1]], %[[RHS_REAL_IMAG_DENOM]] fastmath : f32 + +// CHECK: %[[RHS_IMAG_REAL_RATIO:.*]] = arith.divf %[[RHS_IMAG]], %[[RHS_REAL]] fastmath : f32 +// CHECK: %[[RHS_IMAG_TIMES_RHS_IMAG_REAL_RATIO:.*]] = arith.mulf %[[RHS_IMAG_REAL_RATIO]], %[[RHS_IMAG]] fastmath : f32 +// CHECK: %[[RHS_IMAG_REAL_DENOM:.*]] = arith.addf %[[RHS_REAL]], %[[RHS_IMAG_TIMES_RHS_IMAG_REAL_RATIO]] fastmath : f32 +// CHECK: %[[LHS_IMAG_TIMES_RHS_IMAG_REAL_RATIO:.*]] = arith.mulf %[[LHS_IMAG]], 
%[[RHS_IMAG_REAL_RATIO]] fastmath : f32 +// CHECK: %[[REAL_NUMERATOR_2:.*]] = arith.addf %[[LHS_REAL]], %[[LHS_IMAG_TIMES_RHS_IMAG_REAL_RATIO]] fastmath : f32 +// CHECK: %[[RESULT_REAL_2:.*]] = arith.divf %[[REAL_NUMERATOR_2]], %[[RHS_IMAG_REAL_DENOM]] fastmath : f32 +// CHECK: %[[LHS_REAL_TIMES_RHS_IMAG_REAL_RATIO:.*]] = arith.mulf %[[LHS_REAL]], %[[RHS_IMAG_REAL_RATIO]] fastmath : f32 +// CHECK: %[[IMAG_NUMERATOR_2:.*]] = arith.subf %[[LHS_IMAG]], %[[LHS_REAL_TIMES_RHS_IMAG_REAL_RATIO]] fastmath : f32 +// CHECK: %[[RESULT_IMAG_2:.*]] = arith.divf %[[IMAG_NUMERATOR_2]], %[[RHS_IMAG_REAL_DENOM]] fastmath : f32 + +// Case 1. Zero denominator, numerator contains at most one NaN value. +// CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[RHS_REAL_ABS:.*]] = math.absf %[[RHS_REAL]] fastmath : f32 +// CHECK: %[[RHS_REAL_ABS_IS_ZERO:.*]] = arith.cmpf oeq, %[[RHS_REAL_ABS]], %[[ZERO]] : f32 +// CHECK: %[[RHS_IMAG_ABS:.*]] = math.absf %[[RHS_IMAG]] fastmath : f32 +// CHECK: %[[RHS_IMAG_ABS_IS_ZERO:.*]] = arith.cmpf oeq, %[[RHS_IMAG_ABS]], %[[ZERO]] : f32 +// CHECK: %[[LHS_REAL_IS_NOT_NAN:.*]] = arith.cmpf ord, %[[LHS_REAL]], %[[ZERO]] : f32 +// CHECK: %[[LHS_IMAG_IS_NOT_NAN:.*]] = arith.cmpf ord, %[[LHS_IMAG]], %[[ZERO]] : f32 +// CHECK: %[[LHS_CONTAINS_NOT_NAN_VALUE:.*]] = arith.ori %[[LHS_REAL_IS_NOT_NAN]], %[[LHS_IMAG_IS_NOT_NAN]] : i1 +// CHECK: %[[RHS_IS_ZERO:.*]] = arith.andi %[[RHS_REAL_ABS_IS_ZERO]], %[[RHS_IMAG_ABS_IS_ZERO]] : i1 +// CHECK: %[[RESULT_IS_INFINITY:.*]] = arith.andi %[[LHS_CONTAINS_NOT_NAN_VALUE]], %[[RHS_IS_ZERO]] : i1 +// CHECK: %[[INF:.*]] = arith.constant 0x7F800000 : f32 +// CHECK: %[[INF_WITH_SIGN_OF_RHS_REAL:.*]] = math.copysign %[[INF]], %[[RHS_REAL]] : f32 +// CHECK: %[[INFINITY_RESULT_REAL:.*]] = arith.mulf %[[INF_WITH_SIGN_OF_RHS_REAL]], %[[LHS_REAL]] fastmath : f32 +// CHECK: %[[INFINITY_RESULT_IMAG:.*]] = arith.mulf %[[INF_WITH_SIGN_OF_RHS_REAL]], %[[LHS_IMAG]] fastmath : f32 + +// Case 2. 
Infinite numerator, finite denominator. +// CHECK: %[[RHS_REAL_FINITE:.*]] = arith.cmpf one, %[[RHS_REAL_ABS]], %[[INF]] : f32 +// CHECK: %[[RHS_IMAG_FINITE:.*]] = arith.cmpf one, %[[RHS_IMAG_ABS]], %[[INF]] : f32 +// CHECK: %[[RHS_IS_FINITE:.*]] = arith.andi %[[RHS_REAL_FINITE]], %[[RHS_IMAG_FINITE]] : i1 +// CHECK: %[[LHS_REAL_ABS:.*]] = math.absf %[[LHS_REAL]] fastmath : f32 +// CHECK: %[[LHS_REAL_INFINITE:.*]] = arith.cmpf oeq, %[[LHS_REAL_ABS]], %[[INF]] : f32 +// CHECK: %[[LHS_IMAG_ABS:.*]] = math.absf %[[LHS_IMAG]] fastmath : f32 +// CHECK: %[[LHS_IMAG_INFINITE:.*]] = arith.cmpf oeq, %[[LHS_IMAG_ABS]], %[[INF]] : f32 +// CHECK: %[[LHS_IS_INFINITE:.*]] = arith.ori %[[LHS_REAL_INFINITE]], %[[LHS_IMAG_INFINITE]] : i1 +// CHECK: %[[INF_NUM_FINITE_DENOM:.*]] = arith.andi %[[LHS_IS_INFINITE]], %[[RHS_IS_FINITE]] : i1 +// CHECK: %[[ONE:.*]] = arith.constant 1.000000e+00 : f32 +// CHECK: %[[LHS_REAL_IS_INF:.*]] = arith.select %[[LHS_REAL_INFINITE]], %[[ONE]], %[[ZERO]] : f32 +// CHECK: %[[LHS_REAL_IS_INF_WITH_SIGN:.*]] = math.copysign %[[LHS_REAL_IS_INF]], %[[LHS_REAL]] : f32 +// CHECK: %[[LHS_IMAG_IS_INF:.*]] = arith.select %[[LHS_IMAG_INFINITE]], %[[ONE]], %[[ZERO]] : f32 +// CHECK: %[[LHS_IMAG_IS_INF_WITH_SIGN:.*]] = math.copysign %[[LHS_IMAG_IS_INF]], %[[LHS_IMAG]] : f32 +// CHECK: %[[LHS_REAL_IS_INF_WITH_SIGN_TIMES_RHS_REAL:.*]] = arith.mulf %[[LHS_REAL_IS_INF_WITH_SIGN]], %[[RHS_REAL]] fastmath : f32 +// CHECK: %[[LHS_IMAG_IS_INF_WITH_SIGN_TIMES_RHS_IMAG:.*]] = arith.mulf %[[LHS_IMAG_IS_INF_WITH_SIGN]], %[[RHS_IMAG]] fastmath : f32 +// CHECK: %[[INF_MULTIPLICATOR_1:.*]] = arith.addf %[[LHS_REAL_IS_INF_WITH_SIGN_TIMES_RHS_REAL]], %[[LHS_IMAG_IS_INF_WITH_SIGN_TIMES_RHS_IMAG]] fastmath : f32 +// CHECK: %[[RESULT_REAL_3:.*]] = arith.mulf %[[INF]], %[[INF_MULTIPLICATOR_1]] fastmath : f32 +// CHECK: %[[LHS_REAL_IS_INF_WITH_SIGN_TIMES_RHS_IMAG:.*]] = arith.mulf %[[LHS_REAL_IS_INF_WITH_SIGN]], %[[RHS_IMAG]] fastmath : f32 +// CHECK: 
%[[LHS_IMAG_IS_INF_WITH_SIGN_TIMES_RHS_REAL:.*]] = arith.mulf %[[LHS_IMAG_IS_INF_WITH_SIGN]], %[[RHS_REAL]] fastmath : f32 +// CHECK: %[[INF_MULTIPLICATOR_2:.*]] = arith.subf %[[LHS_IMAG_IS_INF_WITH_SIGN_TIMES_RHS_REAL]], %[[LHS_REAL_IS_INF_WITH_SIGN_TIMES_RHS_IMAG]] fastmath : f32 +// CHECK: %[[RESULT_IMAG_3:.*]] = arith.mulf %[[INF]], %[[INF_MULTIPLICATOR_2]] fastmath : f32 + +// Case 3. Finite numerator, infinite denominator. +// CHECK: %[[LHS_REAL_FINITE:.*]] = arith.cmpf one, %[[LHS_REAL_ABS]], %[[INF]] : f32 +// CHECK: %[[LHS_IMAG_FINITE:.*]] = arith.cmpf one, %[[LHS_IMAG_ABS]], %[[INF]] : f32 +// CHECK: %[[LHS_IS_FINITE:.*]] = arith.andi %[[LHS_REAL_FINITE]], %[[LHS_IMAG_FINITE]] : i1 +// CHECK: %[[RHS_REAL_INFINITE:.*]] = arith.cmpf oeq, %[[RHS_REAL_ABS]], %[[INF]] : f32 +// CHECK: %[[RHS_IMAG_INFINITE:.*]] = arith.cmpf oeq, %[[RHS_IMAG_ABS]], %[[INF]] : f32 +// CHECK: %[[RHS_IS_INFINITE:.*]] = arith.ori %[[RHS_REAL_INFINITE]], %[[RHS_IMAG_INFINITE]] : i1 +// CHECK: %[[FINITE_NUM_INFINITE_DENOM:.*]] = arith.andi %[[LHS_IS_FINITE]], %[[RHS_IS_INFINITE]] : i1 +// CHECK: %[[RHS_REAL_IS_INF:.*]] = arith.select %[[RHS_REAL_INFINITE]], %[[ONE]], %[[ZERO]] : f32 +// CHECK: %[[RHS_REAL_IS_INF_WITH_SIGN:.*]] = math.copysign %[[RHS_REAL_IS_INF]], %[[RHS_REAL]] : f32 +// CHECK: %[[RHS_IMAG_IS_INF:.*]] = arith.select %[[RHS_IMAG_INFINITE]], %[[ONE]], %[[ZERO]] : f32 +// CHECK: %[[RHS_IMAG_IS_INF_WITH_SIGN:.*]] = math.copysign %[[RHS_IMAG_IS_INF]], %[[RHS_IMAG]] : f32 +// CHECK: %[[RHS_REAL_IS_INF_WITH_SIGN_TIMES_LHS_REAL:.*]] = arith.mulf %[[LHS_REAL]], %[[RHS_REAL_IS_INF_WITH_SIGN]] fastmath : f32 +// CHECK: %[[RHS_IMAG_IS_INF_WITH_SIGN_TIMES_LHS_IMAG:.*]] = arith.mulf %[[LHS_IMAG]], %[[RHS_IMAG_IS_INF_WITH_SIGN]] fastmath : f32 +// CHECK: %[[ZERO_MULTIPLICATOR_1:.*]] = arith.addf %[[RHS_REAL_IS_INF_WITH_SIGN_TIMES_LHS_REAL]], %[[RHS_IMAG_IS_INF_WITH_SIGN_TIMES_LHS_IMAG]] fastmath : f32 +// CHECK: %[[RESULT_REAL_4:.*]] = arith.mulf %[[ZERO]], %[[ZERO_MULTIPLICATOR_1]] 
fastmath : f32 +// CHECK: %[[RHS_REAL_IS_INF_WITH_SIGN_TIMES_LHS_IMAG:.*]] = arith.mulf %[[LHS_IMAG]], %[[RHS_REAL_IS_INF_WITH_SIGN]] fastmath : f32 +// CHECK: %[[RHS_IMAG_IS_INF_WITH_SIGN_TIMES_LHS_REAL:.*]] = arith.mulf %[[LHS_REAL]], %[[RHS_IMAG_IS_INF_WITH_SIGN]] fastmath : f32 +// CHECK: %[[ZERO_MULTIPLICATOR_2:.*]] = arith.subf %[[RHS_REAL_IS_INF_WITH_SIGN_TIMES_LHS_IMAG]], %[[RHS_IMAG_IS_INF_WITH_SIGN_TIMES_LHS_REAL]] fastmath : f32 +// CHECK: %[[RESULT_IMAG_4:.*]] = arith.mulf %[[ZERO]], %[[ZERO_MULTIPLICATOR_2]] fastmath : f32 + +// CHECK: %[[REAL_ABS_SMALLER_THAN_IMAG_ABS:.*]] = arith.cmpf olt, %[[RHS_REAL_ABS]], %[[RHS_IMAG_ABS]] : f32 +// CHECK: %[[RESULT_REAL:.*]] = arith.select %[[REAL_ABS_SMALLER_THAN_IMAG_ABS]], %[[RESULT_REAL_1]], %[[RESULT_REAL_2]] : f32 +// CHECK: %[[RESULT_IMAG:.*]] = arith.select %[[REAL_ABS_SMALLER_THAN_IMAG_ABS]], %[[RESULT_IMAG_1]], %[[RESULT_IMAG_2]] : f32 +// CHECK: %[[RESULT_REAL_SPECIAL_CASE_3:.*]] = arith.select %[[FINITE_NUM_INFINITE_DENOM]], %[[RESULT_REAL_4]], %[[RESULT_REAL]] : f32 +// CHECK: %[[RESULT_IMAG_SPECIAL_CASE_3:.*]] = arith.select %[[FINITE_NUM_INFINITE_DENOM]], %[[RESULT_IMAG_4]], %[[RESULT_IMAG]] : f32 +// CHECK: %[[RESULT_REAL_SPECIAL_CASE_2:.*]] = arith.select %[[INF_NUM_FINITE_DENOM]], %[[RESULT_REAL_3]], %[[RESULT_REAL_SPECIAL_CASE_3]] : f32 +// CHECK: %[[RESULT_IMAG_SPECIAL_CASE_2:.*]] = arith.select %[[INF_NUM_FINITE_DENOM]], %[[RESULT_IMAG_3]], %[[RESULT_IMAG_SPECIAL_CASE_3]] : f32 +// CHECK: %[[RESULT_REAL_SPECIAL_CASE_1:.*]] = arith.select %[[RESULT_IS_INFINITY]], %[[INFINITY_RESULT_REAL]], %[[RESULT_REAL_SPECIAL_CASE_2]] : f32 +// CHECK: %[[RESULT_IMAG_SPECIAL_CASE_1:.*]] = arith.select %[[RESULT_IS_INFINITY]], %[[INFINITY_RESULT_IMAG]], %[[RESULT_IMAG_SPECIAL_CASE_2]] : f32 +// CHECK: %[[RESULT_REAL_IS_NAN:.*]] = arith.cmpf uno, %[[RESULT_REAL]], %[[ZERO]] : f32 +// CHECK: %[[RESULT_IMAG_IS_NAN:.*]] = arith.cmpf uno, %[[RESULT_IMAG]], %[[ZERO]] : f32 +// CHECK: %[[RESULT_IS_NAN:.*]] = 
arith.andi %[[RESULT_REAL_IS_NAN]], %[[RESULT_IMAG_IS_NAN]] : i1 +// CHECK: %[[RESULT_REAL_WITH_SPECIAL_CASES:.*]] = arith.select %[[RESULT_IS_NAN]], %[[RESULT_REAL_SPECIAL_CASE_1]], %[[RESULT_REAL]] : f32 +// CHECK: %[[RESULT_IMAG_WITH_SPECIAL_CASES:.*]] = arith.select %[[RESULT_IS_NAN]], %[[RESULT_IMAG_SPECIAL_CASE_1]], %[[RESULT_IMAG]] : f32 +// CHECK: %[[RESULT:.*]] = complex.create %[[RESULT_REAL_WITH_SPECIAL_CASES]], %[[RESULT_IMAG_WITH_SPECIAL_CASES]] : complex +// CHECK: return %[[RESULT]] : complex From 9796b0e9f91bc8ff00d2616fa3656e1ca848c05c Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy <89994100+VyacheslavLevytskyy@users.noreply.github.com> Date: Tue, 27 Feb 2024 10:58:04 +0100 Subject: [PATCH 420/546] Add support for the 'freeze' instruction (#82979) This PR is to add support for the 'freeze' instruction: https://llvm.org/docs/LangRef.html#freeze-instruction There is no way to implement `freeze` correctly without support on SPIR-V standard side, but we may at least address a simple (static) case when undef/poison value presence is obvious. The main benefit of even incomplete `freeze` support is preventing of translation from crashing due to lack of support on legalization and instruction selection steps. 
--- .../Target/SPIRV/SPIRVInstructionSelector.cpp | 44 +++++++++++++++++++ llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 2 +- llvm/test/CodeGen/SPIRV/freeze.ll | 37 ++++++++++++++++ 3 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/SPIRV/freeze.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 7258d3b4d88ed..f1e18f0601b87 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -150,6 +150,8 @@ class SPIRVInstructionSelector : public InstructionSelector { bool selectOpUndef(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; + bool selectFreeze(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; bool selectIntrinsic(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; bool selectExtractVal(Register ResVReg, const SPIRVType *ResType, @@ -284,6 +286,8 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, return selectGlobalValue(ResVReg, I); case TargetOpcode::G_IMPLICIT_DEF: return selectOpUndef(ResVReg, ResType, I); + case TargetOpcode::G_FREEZE: + return selectFreeze(ResVReg, ResType, I); case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: @@ -1014,6 +1018,46 @@ bool SPIRVInstructionSelector::selectBitreverse(Register ResVReg, .constrainAllUses(TII, TRI, RBI); } +bool SPIRVInstructionSelector::selectFreeze(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + // There is no way to implement `freeze` correctly without support on SPIR-V + // standard side, but we may at least address a simple (static) case when + // undef/poison value presence is obvious. The main benefit of even + // incomplete `freeze` support is preventing of translation from crashing due + // to lack of support on legalization and instruction selection steps. 
+ if (!I.getOperand(0).isReg() || !I.getOperand(1).isReg()) + return false; + Register OpReg = I.getOperand(1).getReg(); + if (MachineInstr *Def = MRI->getVRegDef(OpReg)) { + Register Reg; + switch (Def->getOpcode()) { + case SPIRV::ASSIGN_TYPE: + if (MachineInstr *AssignToDef = + MRI->getVRegDef(Def->getOperand(1).getReg())) { + if (AssignToDef->getOpcode() == TargetOpcode::G_IMPLICIT_DEF) + Reg = Def->getOperand(2).getReg(); + } + break; + case SPIRV::OpUndef: + Reg = Def->getOperand(1).getReg(); + break; + } + unsigned DestOpCode; + if (Reg.isValid()) { + DestOpCode = SPIRV::OpConstantNull; + } else { + DestOpCode = TargetOpcode::COPY; + Reg = OpReg; + } + return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(DestOpCode)) + .addDef(I.getOperand(0).getReg()) + .addUse(Reg) + .constrainAllUses(TII, TRI, RBI); + } + return false; +} + bool SPIRVInstructionSelector::selectConstVector(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 4f2e7a240fc2c..aedca798b4f6a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -184,7 +184,7 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { return Query.Types[0].getSizeInBits() == Query.Types[1].getSizeInBits(); })))); - getActionDefinitionsBuilder(G_IMPLICIT_DEF).alwaysLegal(); + getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}).alwaysLegal(); getActionDefinitionsBuilder(G_INTTOPTR) .legalForCartesianProduct(allPtrs, allIntScalars); diff --git a/llvm/test/CodeGen/SPIRV/freeze.ll b/llvm/test/CodeGen/SPIRV/freeze.ll new file mode 100644 index 0000000000000..fe4335722fe8b --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/freeze.ll @@ -0,0 +1,37 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} 
+ +; CHECK: OpName %[[Arg1:.*]] "arg1" +; CHECK: OpName %[[Arg2:.*]] "arg2" +; CHECK: OpName %[[NotAStaticPoison:.*]] "poison1" +; CHECK: OpName %[[NotAStaticPoison]] "nil0" +; CHECK: OpName %[[StaticPoisonIntFreeze:.*]] "nil1" +; CHECK: OpName %[[StaticPoisonFloatFreeze:.*]] "nil2" +; CHECK: OpName %[[Arg1]] "val1" +; CHECK: OpName %[[Const100:.*]] "val2" +; CHECK: OpName %[[Const100]] "val3" +; CHECK: OpDecorate +; CHECK-DAG: %[[FloatTy:.*]] = OpTypeFloat 32 +; CHECK-DAG: %[[ShortTy:.*]] = OpTypeInt 16 0 +; CHECK-DAG: %[[IntTy:.*]] = OpTypeInt 32 0 +; CHECK-DAG: %[[Undef:.*]] = OpUndef %[[ShortTy]] +; CHECK-DAG: %[[Const100]] = OpConstant %[[IntTy]] 100 +; CHECK-DAG: %[[StaticPoisonIntFreeze]] = OpConstantNull %[[IntTy]] +; CHECK-DAG: %[[StaticPoisonFloatFreeze]] = OpConstantNull %[[FloatTy]] +; CHECK: %[[Arg1]] = OpFunctionParameter %[[FloatTy]] +; CHECK: %[[NotAStaticPoison]] = OpIAdd %[[ShortTy]] %[[Arg2]] %[[Undef]] + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define spir_func void @foo(float %arg1, i16 %arg2) { +entry: + %poison1 = add i16 %arg2, undef + %nil0 = freeze i16 %poison1 + %nil1 = freeze i32 undef + %nil2 = freeze float poison + %val1 = freeze float %arg1 + %val2 = freeze i32 100 + %val3 = freeze i32 %val2 + ret void +} From ada70f50a5591839d0c81a167ca64ce98ab2d088 Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy <89994100+VyacheslavLevytskyy@users.noreply.github.com> Date: Tue, 27 Feb 2024 10:58:45 +0100 Subject: [PATCH 421/546] [SPIR-V]: add SPIR-V extension: SPV_INTEL_variable_length_array (#83002) This PR adds SPIR-V extension SPV_INTEL_variable_length_array that allows to allocate local arrays whose number of elements is unknown at compile time: * add a new SPIR-V internal intrinsic:int_spv_alloca_array * legalize G_STACKSAVE and G_STACKRESTORE * implement allocation of arrays (previously getArraySize() of AllocaInst was 
not used) * add tests --- llvm/include/llvm/IR/IntrinsicsSPIRV.td | 1 + llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 18 ++- llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 9 ++ .../Target/SPIRV/SPIRVInstructionSelector.cpp | 56 +++++++++ llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 2 + llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 8 ++ llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp | 4 + .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 2 + .../SPV_INTEL_variable_length_array/vararr.ll | 54 +++++++++ .../vararr_spec_const.ll | 110 ++++++++++++++++++ 10 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr.ll create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr_spec_const.ll diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 057dc64e88c26..d6eabc5d24079 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -33,6 +33,7 @@ let TargetPrefix = "spv" in { def int_spv_cmpxchg : Intrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_vararg_ty]>; def int_spv_unreachable : Intrinsic<[], []>; def int_spv_alloca : Intrinsic<[llvm_any_ty], []>; + def int_spv_alloca_array : Intrinsic<[llvm_any_ty], [llvm_anyint_ty]>; def int_spv_undef : Intrinsic<[llvm_i32_ty], []>; // Expect, Assume Intrinsics diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index e32cd50be56e3..afb24bfb32239 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -500,9 +500,25 @@ Instruction *SPIRVEmitIntrinsics::visitStoreInst(StoreInst &I) { } Instruction *SPIRVEmitIntrinsics::visitAllocaInst(AllocaInst &I) { + Value *ArraySize = nullptr; + if (I.isArrayAllocation()) { + const SPIRVSubtarget *STI = TM->getSubtargetImpl(*I.getFunction()); + if (!STI->canUseExtension( + 
SPIRV::Extension::SPV_INTEL_variable_length_array)) + report_fatal_error( + "array allocation: this instruction requires the following " + "SPIR-V extension: SPV_INTEL_variable_length_array", + false); + ArraySize = I.getArraySize(); + } + TrackConstants = false; Type *PtrTy = I.getType(); - auto *NewI = IRB->CreateIntrinsic(Intrinsic::spv_alloca, {PtrTy}, {}); + auto *NewI = + ArraySize + ? IRB->CreateIntrinsic(Intrinsic::spv_alloca_array, + {PtrTy, ArraySize->getType()}, {ArraySize}) + : IRB->CreateIntrinsic(Intrinsic::spv_alloca, {PtrTy}, {}); std::string InstName = I.hasName() ? I.getName().str() : ""; I.replaceAllUsesWith(NewI); I.eraseFromParent(); diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index 7c5252e8cb372..fe8c909236cde 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -287,6 +287,15 @@ def OpPtrNotEqual: Op<402, (outs ID:$res), (ins TYPE:$resType, ID:$a, ID:$b), def OpPtrDiff: Op<403, (outs ID:$res), (ins TYPE:$resType, ID:$a, ID:$b), "$res = OpPtrDiff $resType $a $b">; +// - SPV_INTEL_variable_length_array + +def OpVariableLengthArrayINTEL: Op<5818, (outs ID:$res), (ins TYPE:$type, ID:$length), + "$res = OpVariableLengthArrayINTEL $type $length">; +def OpSaveMemoryINTEL: Op<5819, (outs ID:$res), (ins TYPE:$type), + "$res = OpSaveMemoryINTEL $type">; +def OpRestoreMemoryINTEL: Op<5820, (outs), (ins ID:$ptr), + "OpRestoreMemoryINTEL $ptr">; + // 3.42.9 Function Instructions def OpFunction: Op<54, (outs ID:$func), diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index f1e18f0601b87..9b38073ec3bcf 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -99,6 +99,10 @@ class SPIRVInstructionSelector : public InstructionSelector { MachineInstr &I) const; bool selectStore(MachineInstr &I) const; + bool 
selectStackSave(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + bool selectStackRestore(MachineInstr &I) const; + bool selectMemOperation(Register ResVReg, MachineInstr &I) const; bool selectAtomicRMW(Register ResVReg, const SPIRVType *ResType, @@ -167,6 +171,8 @@ class SPIRVInstructionSelector : public InstructionSelector { bool selectFrameIndex(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; + bool selectAllocaArray(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; bool selectBranch(MachineInstr &I) const; bool selectBranchCond(MachineInstr &I) const; @@ -508,6 +514,11 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, case TargetOpcode::G_FENCE: return selectFence(I); + case TargetOpcode::G_STACKSAVE: + return selectStackSave(ResVReg, ResType, I); + case TargetOpcode::G_STACKRESTORE: + return selectStackRestore(I); + default: return false; } @@ -653,6 +664,35 @@ bool SPIRVInstructionSelector::selectStore(MachineInstr &I) const { return MIB.constrainAllUses(TII, TRI, RBI); } +bool SPIRVInstructionSelector::selectStackSave(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + if (!STI.canUseExtension(SPIRV::Extension::SPV_INTEL_variable_length_array)) + report_fatal_error( + "llvm.stacksave intrinsic: this instruction requires the following " + "SPIR-V extension: SPV_INTEL_variable_length_array", + false); + MachineBasicBlock &BB = *I.getParent(); + return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSaveMemoryINTEL)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .constrainAllUses(TII, TRI, RBI); +} + +bool SPIRVInstructionSelector::selectStackRestore(MachineInstr &I) const { + if (!STI.canUseExtension(SPIRV::Extension::SPV_INTEL_variable_length_array)) + report_fatal_error( + "llvm.stackrestore intrinsic: this instruction requires the following " + "SPIR-V extension: SPV_INTEL_variable_length_array", + false); + if (!I.getOperand(0).isReg()) + return 
false; + MachineBasicBlock &BB = *I.getParent(); + return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpRestoreMemoryINTEL)) + .addUse(I.getOperand(0).getReg()) + .constrainAllUses(TII, TRI, RBI); +} + bool SPIRVInstructionSelector::selectMemOperation(Register ResVReg, MachineInstr &I) const { MachineBasicBlock &BB = *I.getParent(); @@ -1505,6 +1545,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, break; case Intrinsic::spv_alloca: return selectFrameIndex(ResVReg, ResType, I); + case Intrinsic::spv_alloca_array: + return selectAllocaArray(ResVReg, ResType, I); case Intrinsic::spv_assume: if (STI.canUseExtension(SPIRV::Extension::SPV_KHR_expect_assume)) BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpAssumeTrueKHR)) @@ -1524,6 +1566,20 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return true; } +bool SPIRVInstructionSelector::selectAllocaArray(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + // there was an allocation size parameter to the allocation instruction + // that is not 1 + MachineBasicBlock &BB = *I.getParent(); + return BuildMI(BB, I, I.getDebugLoc(), + TII.get(SPIRV::OpVariableLengthArrayINTEL)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(I.getOperand(2).getReg()) + .constrainAllUses(TII, TRI, RBI); +} + bool SPIRVInstructionSelector::selectFrameIndex(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index aedca798b4f6a..049ca4ac818c4 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -186,6 +186,8 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}).alwaysLegal(); + getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).alwaysLegal(); + getActionDefinitionsBuilder(G_INTTOPTR) 
.legalForCartesianProduct(allPtrs, allIntScalars); getActionDefinitionsBuilder(G_PTRTOINT) diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 3be28c97d9538..ac3d6b362d350 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1110,6 +1110,14 @@ void addInstrRequirements(const MachineInstr &MI, case SPIRV::OpAtomicFMaxEXT: AddAtomicFloatRequirements(MI, Reqs, ST); break; + case SPIRV::OpVariableLengthArrayINTEL: + case SPIRV::OpSaveMemoryINTEL: + case SPIRV::OpRestoreMemoryINTEL: + if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_variable_length_array)) { + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_variable_length_array); + Reqs.addCapability(SPIRV::Capability::VariableLengthArrayINTEL); + } + break; default: break; } diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index 79f16146ccd94..0e8952dc6a9c9 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -85,6 +85,10 @@ cl::list Extensions( "SPV_KHR_subgroup_rotate", "Adds a new instruction that enables rotating values across " "invocations within a subgroup."), + clEnumValN(SPIRV::Extension::SPV_INTEL_variable_length_array, + "SPV_INTEL_variable_length_array", + "Allows to allocate local arrays whose number of elements " + "is unknown at compile time."), clEnumValN(SPIRV::Extension::SPV_INTEL_function_pointers, "SPV_INTEL_function_pointers", "Allows translation of function pointers."))); diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index b022b97408d7d..211c22340eb82 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -296,6 +296,7 @@ defm SPV_INTEL_fpga_latency_control : ExtensionOperand<101>; defm SPV_INTEL_fpga_argument_interfaces : ExtensionOperand<102>; defm 
SPV_INTEL_optnone : ExtensionOperand<103>; defm SPV_INTEL_function_pointers : ExtensionOperand<104>; +defm SPV_INTEL_variable_length_array : ExtensionOperand<105>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -462,6 +463,7 @@ defm AtomicFloat16AddEXT : CapabilityOperand<6095, 0, 0, [SPV_EXT_shader_atomic_ defm AtomicFloat16MinMaxEXT : CapabilityOperand<5616, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; defm AtomicFloat32MinMaxEXT : CapabilityOperand<5612, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atomic_float_min_max], []>; +defm VariableLengthArrayINTEL : CapabilityOperand<5817, 0, 0, [SPV_INTEL_variable_length_array], []>; defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>; defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>; diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr.ll new file mode 100644 index 0000000000000..897aab70852d2 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr.ll @@ -0,0 +1,54 @@ +; Modified from: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/test/extensions/INTEL/SPV_INTEL_variable_length_array/basic.ll + +; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_INTEL_variable_length_array %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_INTEL_variable_length_array %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: array allocation: this 
instruction requires the following SPIR-V extension: SPV_INTEL_variable_length_array + +; CHECK-SPIRV: Capability VariableLengthArrayINTEL +; CHECK-SPIRV: Extension "SPV_INTEL_variable_length_array" + +; CHECK-SPIRV-DAG: OpName %[[Len:.*]] "a" +; CHECK-SPIRV-DAG: %[[Long:.*]] = OpTypeInt 64 0 +; CHECK-SPIRV-DAG: %[[Int:.*]] = OpTypeInt 32 0 +; CHECK-SPIRV-DAG: %[[Char:.*]] = OpTypeInt 8 0 +; CHECK-SPIRV-DAG: %[[CharPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[Char]] +; CHECK-SPIRV-DAG: %[[IntPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[Int]] +; CHECK-SPIRV: %[[Len]] = OpFunctionParameter %[[Long:.*]] +; CHECK-SPIRV: %[[SavedMem1:.*]] = OpSaveMemoryINTEL %[[CharPtr]] +; CHECK-SPIRV: OpVariableLengthArrayINTEL %[[IntPtr]] %[[Len]] +; CHECK-SPIRV: OpRestoreMemoryINTEL %[[SavedMem1]] +; CHECK-SPIRV: %[[SavedMem2:.*]] = OpSaveMemoryINTEL %[[CharPtr]] +; CHECK-SPIRV: OpVariableLengthArrayINTEL %[[IntPtr]] %[[Len]] +; CHECK-SPIRV: OpRestoreMemoryINTEL %[[SavedMem2]] + +target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir" + +define dso_local spir_func i32 @foo(i64 %a, i64 %b) { +entry: + %vector1 = alloca [42 x i32], align 16 + call void @llvm.lifetime.start.p0(i64 168, ptr nonnull %vector1) + %stack1 = call ptr @llvm.stacksave.p0() + %vla = alloca i32, i64 %a, align 16 + %arrayidx = getelementptr inbounds i32, ptr %vla, i64 %b + %elem1 = load i32, ptr %arrayidx, align 4 + call void @llvm.stackrestore.p0(ptr %stack1) + %stack2 = call ptr @llvm.stacksave.p0() + %vla2 = alloca i32, i64 %a, align 16 + %arrayidx3 = getelementptr inbounds [42 x i32], ptr %vector1, i64 0, i64 %b + %elemt = load i32, ptr %arrayidx3, align 4 + %add = add nsw i32 %elemt, %elem1 + %arrayidx4 = getelementptr inbounds i32, ptr %vla2, i64 %b + %elem2 = load i32, ptr %arrayidx4, align 4 + %add5 = add nsw i32 %add, %elem2 + call void @llvm.stackrestore.p0(ptr %stack2) + call void @llvm.lifetime.end.p0(i64 168, ptr nonnull 
%vector1) + ret i32 %add5 +} + +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) +declare ptr @llvm.stacksave.p0() +declare void @llvm.stackrestore.p0(ptr) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr_spec_const.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr_spec_const.ll new file mode 100644 index 0000000000000..fbac43e51f399 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_variable_length_array/vararr_spec_const.ll @@ -0,0 +1,110 @@ +; Modified from: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/test/extensions/INTEL/SPV_INTEL_variable_length_array/vla_spec_const.ll + +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_INTEL_variable_length_array %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_INTEL_variable_length_array %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV: Capability VariableLengthArrayINTEL +; CHECK-SPIRV: Extension "SPV_INTEL_variable_length_array" +; CHECK-SPIRV: OpDecorate %[[SpecConst:.*]] SpecId 0 +; CHECK-SPIRV-DAG: %[[Long:.*]] = OpTypeInt 64 0 +; CHECK-SPIRV-DAG: %[[Int:.*]] = OpTypeInt 32 0 +; CHECK-SPIRV-DAG: %[[IntPtr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[Int]] +; CHECK-SPIRV: %[[SpecConst]] = OpSpecConstant %[[Long]] +; CHECK-SPIRV-LABEL: FunctionEnd +; CHECK-SPIRV: %[[SpecConstVal:.*]] = OpFunctionCall %[[Long]] +; CHECK-SPIRV: OpSaveMemoryINTEL +; CHECK-SPIRV: OpVariableLengthArrayINTEL %[[IntPtr]] %[[SpecConstVal]] +; CHECK-SPIRV: OpRestoreMemoryINTEL + +; CHECK-SPIRV: OpFunction %[[Long]] +; CHECK-SPIRV: ReturnValue %[[SpecConst]] + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-linux" + 
+%"class._ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEEUlvE_.anon" = type { %"class._ZTSN2cl4sycl12experimental13spec_constantIm13MyUInt64ConstEE.cl::sycl::experimental::spec_constant" } +%"class._ZTSN2cl4sycl12experimental13spec_constantIm13MyUInt64ConstEE.cl::sycl::experimental::spec_constant" = type { i8 } + +$_ZTS17SpecializedKernel = comdat any + +$_ZNK2cl4sycl12experimental13spec_constantIm13MyUInt64ConstE3getEv = comdat any + +; Function Attrs: norecurse +define weak_odr dso_local spir_kernel void @_ZTS17SpecializedKernel() #0 comdat !kernel_arg_addr_space !4 !kernel_arg_access_qual !4 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !4 { +entry: + %p = alloca %"class._ZTSZZ4mainENK3$_0clERN2cl4sycl7handlerEEUlvE_.anon", align 1 + call void @llvm.lifetime.start.p0(i64 1, ptr %p) #4 + %p4 = addrspacecast ptr %p to ptr addrspace(4) + call spir_func void @"_ZZZ4mainENK3$_0clERN2cl4sycl7handlerEENKUlvE_clEv"(ptr addrspace(4) %p4) + call void @llvm.lifetime.end.p0(i64 1, ptr %p) #4 + ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: inlinehint norecurse +define internal spir_func void @"_ZZZ4mainENK3$_0clERN2cl4sycl7handlerEENKUlvE_clEv"(ptr addrspace(4) %this) #2 align 2 { +entry: + %this.addr = alloca ptr addrspace(4), align 8 + %saved_stack = alloca ptr, align 8 + %__vla_expr0 = alloca i64, align 8 + store ptr addrspace(4) %this, ptr %this.addr, align 8, !tbaa !5 + %this1 = load ptr addrspace(4), ptr %this.addr, align 8 + %call = call spir_func i64 @_ZNK2cl4sycl12experimental13spec_constantIm13MyUInt64ConstE3getEv(ptr addrspace(4) %this1) + %p = call ptr @llvm.stacksave.p0() + store ptr %p, ptr %saved_stack, align 8 + %vla = alloca i32, i64 %call, align 4 + store i64 %call, ptr %__vla_expr0, align 8 + store i32 42, ptr %vla, align 4, !tbaa !9 + %torestore = load ptr, ptr %saved_stack, align 8 + call void @llvm.stackrestore.p0(ptr %torestore) + 
ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: norecurse +define linkonce_odr dso_local spir_func i64 @_ZNK2cl4sycl12experimental13spec_constantIm13MyUInt64ConstE3getEv(ptr addrspace(4) %this) #3 comdat align 2 { +entry: + %this.addr = alloca ptr addrspace(4), align 8 + %TName = alloca ptr addrspace(4), align 8 + store ptr addrspace(4) %this, ptr %this.addr, align 8, !tbaa !5 + call void @llvm.lifetime.start.p0(i64 8, ptr %TName) #4 + %p = call i64 @_Z20__spirv_SpecConstantix(i32 0, i64 0), !SYCL_SPEC_CONST_SYM_ID !11 + call void @llvm.lifetime.end.p0(i64 8, ptr %TName) #4 + ret i64 %p +} + +; Function Attrs: nounwind +declare ptr @llvm.stacksave.p0() #4 + +; Function Attrs: nounwind +declare void @llvm.stackrestore.p0(ptr) #4 + +declare i64 @_Z20__spirv_SpecConstantix(i32, i64) + +attributes #0 = { norecurse "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="/work/intel/vla_spec_const.cpp" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { inlinehint norecurse "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { norecurse "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" 
"less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } + +!llvm.module.flags = !{!0} +!opencl.spir.version = !{!1} +!spirv.Source = !{!2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{i32 4, i32 100000} +!3 = !{!"clang version 12.0.0"} +!4 = !{} +!5 = !{!6, !6, i64 0} +!6 = !{!"any pointer", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C++ TBAA"} +!9 = !{!10, !10, i64 0} +!10 = !{!"int", !7, i64 0} +!11 = !{!"_ZTS13MyUInt64Const", i32 0} From aa436493ab7ad4cf323b0189c15c59ac9dc293c7 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 27 Feb 2024 08:24:13 +0000 Subject: [PATCH 422/546] Reapply "[RemoveDIs] Print non-intrinsic debug info in textual IR output (#79281)" Fixes the prior issue in which the symbol for a cl-arg was unavailable to some binaries. This reverts commit dc06d75ab27b4dcae2940fc386fadd06f70faffe. 
--- llvm/include/llvm/IR/Module.h | 14 +++ llvm/include/llvm/IR/PrintPasses.h | 19 ++++ llvm/lib/IR/AsmWriter.cpp | 75 +++++++--------- llvm/lib/IR/BasicBlock.cpp | 4 - llvm/lib/IR/IRPrintingPasses.cpp | 27 +++--- llvm/lib/IR/Module.cpp | 22 +++++ llvm/lib/IRPrinter/IRPrintingPasses.cpp | 24 ++---- .../Generic/inline-alloca-ordering.ll | 7 +- .../DebugInfo/Generic/inline-dbg-values.ll | 8 +- llvm/test/DebugInfo/dpvalue-print-nocrash.ll | 3 +- .../print-non-instruction-debug-info.ll | 86 +++++++++++++++++++ .../pr33641_remove_arg_dbgvalue.ll | 18 ++-- .../callsite-split-preserve-debug.ll | 4 +- .../debug-info-on-skipped-selects.ll | 2 +- .../DeadArgElim/2010-04-30-DbgInfo.ll | 35 ++++++-- .../IROutliner/outlining-debug-statements.ll | 7 -- llvm/test/Transforms/LoopRotate/dbgvalue.ll | 4 +- .../LoopRotate/delete-dbg-values.ll | 4 +- ...e-that-exception-unwind-path-is-visited.ll | 5 +- llvm/test/Transforms/SROA/dbg-inline.ll | 50 +++++------ 20 files changed, 272 insertions(+), 146 deletions(-) create mode 100644 llvm/test/DebugInfo/print-non-instruction-debug-info.ll diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h index 68a89dc45c283..f5f3c66f0ae1e 100644 --- a/llvm/include/llvm/IR/Module.h +++ b/llvm/include/llvm/IR/Module.h @@ -218,11 +218,18 @@ class LLVM_EXTERNAL_VISIBILITY Module { /// \ref BasicBlock. bool IsNewDbgInfoFormat; + /// Used when converting this module to the new debug info format; removes all + /// declarations of debug intrinsics that are replaced by non-intrinsic + /// records in the new format. + void removeDebugIntrinsicDeclarations(); + /// \see BasicBlock::convertToNewDbgValues. void convertToNewDbgValues() { for (auto &F : *this) { F.convertToNewDbgValues(); } + // Remove the declarations of the old debug intrinsics, if any exist. 
+ removeDebugIntrinsicDeclarations(); IsNewDbgInfoFormat = true; } @@ -234,6 +241,13 @@ class LLVM_EXTERNAL_VISIBILITY Module { IsNewDbgInfoFormat = false; } + void setIsNewDbgInfoFormat(bool UseNewFormat) { + if (UseNewFormat && !IsNewDbgInfoFormat) + convertToNewDbgValues(); + else if (!UseNewFormat && IsNewDbgInfoFormat) + convertFromNewDbgValues(); + } + /// The Module constructor. Note that there is no default constructor. You /// must provide a name for the module upon construction. explicit Module(StringRef ModuleID, LLVMContext& C); diff --git a/llvm/include/llvm/IR/PrintPasses.h b/llvm/include/llvm/IR/PrintPasses.h index 95b97e76c867c..3803bd05cbe57 100644 --- a/llvm/include/llvm/IR/PrintPasses.h +++ b/llvm/include/llvm/IR/PrintPasses.h @@ -78,6 +78,25 @@ std::string doSystemDiff(StringRef Before, StringRef After, StringRef OldLineFormat, StringRef NewLineFormat, StringRef UnchangedLineFormat); +/// Used to temporarily set the debug info format of a function, module, or +/// basic block for the duration of this object's lifetime, after which the +/// prior state will be restored. +template class ScopedDbgInfoFormatSetter { + T &Obj; + bool OldState; + +public: + ScopedDbgInfoFormatSetter(T &Obj, bool NewState) + : Obj(Obj), OldState(Obj.IsNewDbgInfoFormat) { + Obj.setIsNewDbgInfoFormat(NewState); + } + ~ScopedDbgInfoFormatSetter() { Obj.setIsNewDbgInfoFormat(OldState); } +}; + +template +ScopedDbgInfoFormatSetter(T &Obj, bool NewState) + -> ScopedDbgInfoFormatSetter; + } // namespace llvm #endif // LLVM_IR_PRINTPASSES_H diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index fba404c9b027c..a3da6ca811489 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -861,7 +861,7 @@ class SlotTracker : public AbstractSlotTrackerStorage { /// Add all of the metadata from an instruction. void processInstructionMetadata(const Instruction &I); - /// Add all of the metadata from an instruction. 
+ /// Add all of the metadata from a DbgRecord. void processDbgRecordMetadata(const DbgRecord &DPV); }; @@ -1140,6 +1140,9 @@ void SlotTracker::processFunctionMetadata(const Function &F) { void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) { if (const DPValue *DPV = dyn_cast(&DR)) { + // Process metadata used by DbgRecords; we only specifically care about the + // DILocalVariable, DILocation, and DIAssignID fields, as the Value and + // Expression fields should only be printed inline and so do not use a slot. CreateMetadataSlot(DPV->getVariable()); if (DPV->isDbgAssign()) CreateMetadataSlot(DPV->getAssignID()); @@ -2703,6 +2706,7 @@ class AssemblyWriter { void printDPValue(const DPValue &DPI); void printDPLabel(const DPLabel &DPL); void printDbgRecord(const DbgRecord &DPI); + void printDbgRecordLine(const DbgRecord &DPI); void printUseListOrder(const Value *V, const std::vector &Shuffle); void printUseLists(const Function *F); @@ -3885,9 +3889,6 @@ void AssemblyWriter::printTypeIdentities() { /// printFunction - Print all aspects of a function. void AssemblyWriter::printFunction(const Function *F) { - bool ConvertBack = F->IsNewDbgInfoFormat; - if (ConvertBack) - const_cast(F)->convertFromNewDbgValues(); if (AnnotationWriter) AnnotationWriter->emitFunctionAnnot(F, Out); if (F->isMaterializable()) @@ -4030,8 +4031,6 @@ void AssemblyWriter::printFunction(const Function *F) { Out << "}\n"; } - if (ConvertBack) - const_cast(F)->convertToNewDbgValues(); Machine.purgeFunction(); } @@ -4098,6 +4097,8 @@ void AssemblyWriter::printBasicBlock(const BasicBlock *BB) { // Output all of the instructions in the basic block... 
for (const Instruction &I : *BB) { + for (const DbgRecord &DR : I.getDbgValueRange()) + printDbgRecordLine(DR); printInstructionLine(I); } @@ -4611,12 +4612,10 @@ void AssemblyWriter::printDbgRecord(const DbgRecord &DR) { llvm_unreachable("Unexpected DbgRecord kind"); } -void AssemblyWriter::printDPValue(const DPValue &Value) { - // There's no formal representation of a DPValue -- print purely as a - // debugging aid. - Out << " DPValue "; - - switch (Value.getType()) { +void AssemblyWriter::printDPValue(const DPValue &DPV) { + auto WriterCtx = getContext(); + Out << "#dbg_"; + switch (DPV.getType()) { case DPValue::LocationType::Value: Out << "value"; break; @@ -4629,35 +4628,39 @@ void AssemblyWriter::printDPValue(const DPValue &Value) { default: llvm_unreachable("Tried to print a DPValue with an invalid LocationType!"); } - Out << " { "; - auto WriterCtx = getContext(); - WriteAsOperandInternal(Out, Value.getRawLocation(), WriterCtx, true); + Out << "("; + WriteAsOperandInternal(Out, DPV.getRawLocation(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, Value.getVariable(), WriterCtx, true); + WriteAsOperandInternal(Out, DPV.getVariable(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, Value.getExpression(), WriterCtx, true); + WriteAsOperandInternal(Out, DPV.getExpression(), WriterCtx, true); Out << ", "; - if (Value.isDbgAssign()) { - WriteAsOperandInternal(Out, Value.getAssignID(), WriterCtx, true); + if (DPV.isDbgAssign()) { + WriteAsOperandInternal(Out, DPV.getAssignID(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, Value.getRawAddress(), WriterCtx, true); + WriteAsOperandInternal(Out, DPV.getRawAddress(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, Value.getAddressExpression(), WriterCtx, true); + WriteAsOperandInternal(Out, DPV.getAddressExpression(), WriterCtx, true); Out << ", "; } - WriteAsOperandInternal(Out, Value.getDebugLoc().get(), WriterCtx, true); - Out << " marker @" << 
Value.getMarker(); - Out << " }"; + WriteAsOperandInternal(Out, DPV.getDebugLoc().getAsMDNode(), WriterCtx, true); + Out << ")"; +} + +/// printDbgRecordLine - Print a DbgRecord with indentation and a newline +/// character. +void AssemblyWriter::printDbgRecordLine(const DbgRecord &DR) { + // Print lengthier indentation to bring out-of-line with instructions. + Out << " "; + printDbgRecord(DR); + Out << '\n'; } void AssemblyWriter::printDPLabel(const DPLabel &Label) { - // There's no formal representation of a DPLabel -- print purely as - // a debugging aid. - Out << " DPLabel { "; auto WriterCtx = getContext(); + Out << "#dbg_label("; WriteAsOperandInternal(Out, Label.getLabel(), WriterCtx, true); - Out << " marker @" << Label.getMarker(); - Out << " }"; + Out << ")"; } void AssemblyWriter::printMetadataAttachments( @@ -4805,19 +4808,11 @@ void BasicBlock::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, void Module::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, bool ShouldPreserveUseListOrder, bool IsForDebug) const { - // RemoveDIs: always print with debug-info in intrinsic format. - bool ConvertAfter = IsNewDbgInfoFormat; - if (IsNewDbgInfoFormat) - const_cast(this)->convertFromNewDbgValues(); - SlotTracker SlotTable(this); formatted_raw_ostream OS(ROS); AssemblyWriter W(OS, SlotTable, this, AAW, IsForDebug, ShouldPreserveUseListOrder); W.printModule(this); - - if (ConvertAfter) - const_cast(this)->convertToNewDbgValues(); } void NamedMDNode::print(raw_ostream &ROS, bool IsForDebug) const { @@ -4908,8 +4903,6 @@ void DPValue::print(raw_ostream &ROS, bool IsForDebug) const { void DPMarker::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { - // There's no formal representation of a DPMarker -- print purely as a - // debugging aid. 
formatted_raw_ostream OS(ROS); SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = @@ -4931,8 +4924,6 @@ void DPLabel::print(raw_ostream &ROS, bool IsForDebug) const { void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { - // There's no formal representation of a DPValue -- print purely as a - // debugging aid. formatted_raw_ostream OS(ROS); SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = @@ -4950,8 +4941,6 @@ void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST, void DPLabel::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { - // There's no formal representation of a DbgLabelRecord -- print purely as - // a debugging aid. formatted_raw_ostream OS(ROS); SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 6ea876fde5ec6..c110c4c1437c3 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -61,10 +61,6 @@ DPMarker *BasicBlock::createMarker(InstListType::iterator It) { } void BasicBlock::convertToNewDbgValues() { - // Is the command line option set? 
- if (!UseNewDbgInfoFormat) - return; - IsNewDbgInfoFormat = true; // Iterate over all instructions in the instruction list, collecting dbg.value diff --git a/llvm/lib/IR/IRPrintingPasses.cpp b/llvm/lib/IR/IRPrintingPasses.cpp index b19210e776ed5..f63b89a06cf0b 100644 --- a/llvm/lib/IR/IRPrintingPasses.cpp +++ b/llvm/lib/IR/IRPrintingPasses.cpp @@ -23,6 +23,11 @@ using namespace llvm; +cl::opt WriteNewDbgInfoFormat( + "write-experimental-debuginfo", + cl::desc("Write debug info in the new non-intrinsic format"), + cl::init(false)); + namespace { class PrintModulePassWrapper : public ModulePass { @@ -39,11 +44,9 @@ class PrintModulePassWrapper : public ModulePass { ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {} bool runOnModule(Module &M) override { - // RemoveDIs: there's no textual representation of the DPValue debug-info, - // convert to dbg.values before writing out. - bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat; - if (IsNewDbgInfoFormat) - M.convertFromNewDbgValues(); + // RemoveDIs: Regardless of the format we've processed this module in, use + // `WriteNewDbgInfoFormat` to determine which format we use to write it. + ScopedDbgInfoFormatSetter FormatSetter(M, WriteNewDbgInfoFormat); if (llvm::isFunctionInPrintList("*")) { if (!Banner.empty()) @@ -62,9 +65,6 @@ class PrintModulePassWrapper : public ModulePass { } } - if (IsNewDbgInfoFormat) - M.convertToNewDbgValues(); - return false; } @@ -87,11 +87,9 @@ class PrintFunctionPassWrapper : public FunctionPass { // This pass just prints a banner followed by the function as it's processed. bool runOnFunction(Function &F) override { - // RemoveDIs: there's no textual representation of the DPValue debug-info, - // convert to dbg.values before writing out. 
- bool IsNewDbgInfoFormat = F.IsNewDbgInfoFormat; - if (IsNewDbgInfoFormat) - F.convertFromNewDbgValues(); + // RemoveDIs: Regardless of the format we've processed this function in, use + // `WriteNewDbgInfoFormat` to determine which format we use to write it. + ScopedDbgInfoFormatSetter FormatSetter(F, WriteNewDbgInfoFormat); if (isFunctionInPrintList(F.getName())) { if (forcePrintModuleIR()) @@ -101,9 +99,6 @@ class PrintFunctionPassWrapper : public FunctionPass { OS << Banner << '\n' << static_cast(F); } - if (IsNewDbgInfoFormat) - F.convertToNewDbgValues(); - return false; } diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 1946db2ee0be7..a8696ed9e3ce5 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -85,6 +85,28 @@ Module::~Module() { IFuncList.clear(); } +void Module::removeDebugIntrinsicDeclarations() { + auto *DeclareIntrinsicFn = + Intrinsic::getDeclaration(this, Intrinsic::dbg_declare); + assert((!isMaterialized() || DeclareIntrinsicFn->hasZeroLiveUses()) && + "Debug declare intrinsic should have had uses removed."); + DeclareIntrinsicFn->eraseFromParent(); + auto *ValueIntrinsicFn = + Intrinsic::getDeclaration(this, Intrinsic::dbg_value); + assert((!isMaterialized() || ValueIntrinsicFn->hasZeroLiveUses()) && + "Debug value intrinsic should have had uses removed."); + ValueIntrinsicFn->eraseFromParent(); + auto *AssignIntrinsicFn = + Intrinsic::getDeclaration(this, Intrinsic::dbg_assign); + assert((!isMaterialized() || AssignIntrinsicFn->hasZeroLiveUses()) && + "Debug assign intrinsic should have had uses removed."); + AssignIntrinsicFn->eraseFromParent(); + auto *LabelntrinsicFn = Intrinsic::getDeclaration(this, Intrinsic::dbg_label); + assert((!isMaterialized() || LabelntrinsicFn->hasZeroLiveUses()) && + "Debug label intrinsic should have had uses removed."); + LabelntrinsicFn->eraseFromParent(); +} + std::unique_ptr Module::createRNG(const StringRef Name) const { SmallString<32> Salt(Name); diff --git 
a/llvm/lib/IRPrinter/IRPrintingPasses.cpp b/llvm/lib/IRPrinter/IRPrintingPasses.cpp index 52b242b4dcd52..b799eef7fb706 100644 --- a/llvm/lib/IRPrinter/IRPrintingPasses.cpp +++ b/llvm/lib/IRPrinter/IRPrintingPasses.cpp @@ -22,6 +22,8 @@ using namespace llvm; +extern cl::opt WriteNewDbgInfoFormat; + PrintModulePass::PrintModulePass() : OS(dbgs()) {} PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, bool ShouldPreserveUseListOrder, @@ -31,11 +33,9 @@ PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, EmitSummaryIndex(EmitSummaryIndex) {} PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &AM) { - // RemoveDIs: there's no textual representation of the DPValue debug-info, - // convert to dbg.values before writing out. - bool ShouldConvert = M.IsNewDbgInfoFormat; - if (ShouldConvert) - M.convertFromNewDbgValues(); + // RemoveDIs: Regardless of the format we've processed this module in, use + // `WriteNewDbgInfoFormat` to determine which format we use to write it. + ScopedDbgInfoFormatSetter FormatSetter(M, WriteNewDbgInfoFormat); if (llvm::isFunctionInPrintList("*")) { if (!Banner.empty()) @@ -63,9 +63,6 @@ PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &AM) { Index->print(OS); } - if (ShouldConvert) - M.convertToNewDbgValues(); - return PreservedAnalyses::all(); } @@ -75,11 +72,9 @@ PrintFunctionPass::PrintFunctionPass(raw_ostream &OS, const std::string &Banner) PreservedAnalyses PrintFunctionPass::run(Function &F, FunctionAnalysisManager &) { - // RemoveDIs: there's no textual representation of the DPValue debug-info, - // convert to dbg.values before writing out. - bool ShouldConvert = F.IsNewDbgInfoFormat; - if (ShouldConvert) - F.convertFromNewDbgValues(); + // RemoveDIs: Regardless of the format we've processed this function in, use + // `WriteNewDbgInfoFormat` to determine which format we use to write it. 
+ ScopedDbgInfoFormatSetter FormatSetter(F, WriteNewDbgInfoFormat); if (isFunctionInPrintList(F.getName())) { if (forcePrintModuleIR()) @@ -88,8 +83,5 @@ PreservedAnalyses PrintFunctionPass::run(Function &F, OS << Banner << '\n' << static_cast(F); } - if (ShouldConvert) - F.convertToNewDbgValues(); - return PreservedAnalyses::all(); } diff --git a/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll b/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll index 9ff3d80af80d1..9f401ceb5b6f4 100644 --- a/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll +++ b/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll @@ -15,8 +15,6 @@ ;; doesn't transfer the dbg.value to the entry block. This needs Special ;; Handling once we get rid of debug-intrinsics. -; CHECK: declare void @llvm.dbg.value(metadata, - ; CHECK: define i32 @bar() ; CHECK-NEXT: %1 = alloca [65 x i32], align 16 ; CHECK-NEXT: call void @ext() @@ -24,9 +22,10 @@ ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 0, metadata !10, metadata !DIExpression()), !dbg !12 ; CHECK-NEXT: call void @init(ptr %1) +; CHECK: declare void @llvm.dbg.value(metadata, + declare void @ext() declare void @init(ptr) -declare void @llvm.dbg.value(metadata, metadata, metadata) define internal i32 @foo() !dbg !4 { %1 = alloca [65 x i32], align 16 @@ -42,6 +41,8 @@ define i32 @bar() !dbg !16 { ret i32 %1 } +declare void @llvm.dbg.value(metadata, metadata, metadata) + !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!8, !9} !llvm.ident = !{!10} diff --git a/llvm/test/DebugInfo/Generic/inline-dbg-values.ll b/llvm/test/DebugInfo/Generic/inline-dbg-values.ll index 7d580d6fba37a..b0390b9f78f41 100644 --- a/llvm/test/DebugInfo/Generic/inline-dbg-values.ll +++ b/llvm/test/DebugInfo/Generic/inline-dbg-values.ll @@ -7,8 +7,6 @@ ;; ;; test should be inlined into test2 -; CHECK: declare void @llvm.dbg.value(metadata, - ; CHECK: define i32 @test2 ; CHECK-NEXT: entry: ; CHECK: %k.addr.i = alloca i32, align 4 @@ -47,6 +45,8 @@ ; CHECK-NEXT: call 
void @llvm.dbg.value(metadata i32 1, metadata ![[KVAR]], metadata !DIExpression()), !dbg ![[KLINE]] ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 1, metadata ![[K2VAR]], metadata !DIExpression()), !dbg ![[GLINE]] ; +; CHECK: declare void @llvm.dbg.value(metadata, +; ;; Test that the metadata maps onto the correct things, and that the DILocations ;; attached to the intrinsics have been inlined. ; @@ -100,8 +100,6 @@ return: ; preds = %if.end, %if.then ret i32 %3, !dbg !20 } -declare void @llvm.dbg.value(metadata, metadata, metadata) #1 - declare i32 @_Z8test_exti(i32) define i32 @test2(i32 %foo, i32 %bar) !dbg !10 { @@ -118,6 +116,8 @@ try.cont: ; preds = %catch, %invoke.cont ret i32 0, !dbg !30 } +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!31} diff --git a/llvm/test/DebugInfo/dpvalue-print-nocrash.ll b/llvm/test/DebugInfo/dpvalue-print-nocrash.ll index 9f120af13ac9c..0a618c6780d1d 100755 --- a/llvm/test/DebugInfo/dpvalue-print-nocrash.ll +++ b/llvm/test/DebugInfo/dpvalue-print-nocrash.ll @@ -2,8 +2,7 @@ ; RUN: opt -passes="instcombine" -debug %s -o /dev/null 2>&1 | FileCheck %s ; REQUIRES: asserts -; CHECK: CLONE: DPValue value { -; CHECK-SAME: marker @0x0 +; CHECK: CLONE: #dbg_value( define ptr @func_10(i32 %p_11) { entry: diff --git a/llvm/test/DebugInfo/print-non-instruction-debug-info.ll b/llvm/test/DebugInfo/print-non-instruction-debug-info.ll new file mode 100644 index 0000000000000..f8271df146fe9 --- /dev/null +++ b/llvm/test/DebugInfo/print-non-instruction-debug-info.ll @@ -0,0 +1,86 @@ +;; Test that we can write in the new debug info format. 
+; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=false --write-experimental-debuginfo=false < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,OLDDBG --implicit-check-not=llvm.dbg +; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=false --write-experimental-debuginfo=true < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,NEWDBG --implicit-check-not=llvm.dbg + +;; Test also that the new flag is independent of the flag that enables use of +;; these non-instruction debug info during LLVM passes. +; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=true --write-experimental-debuginfo=false < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,OLDDBG --implicit-check-not=llvm.dbg +; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=true --write-experimental-debuginfo=true < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,NEWDBG --implicit-check-not=llvm.dbg + +; CHECK: @f(i32 %[[VAL_A:[0-9a-zA-Z]+]]) +; CHECK-NEXT: entry: +; OLDDBG-NEXT: call void @llvm.dbg.value(metadata i32 %[[VAL_A]], metadata ![[VAR_A:[0-9]+]], metadata !DIExpression()), !dbg ![[LOC_1:[0-9]+]] +; NEWDBG-NEXT: {{^}} #dbg_value(i32 %[[VAL_A]], ![[VAR_A:[0-9]+]], !DIExpression(), ![[LOC_1:[0-9]+]]) +; CHECK-NEXT: {{^}} %[[VAL_B:[0-9a-zA-Z]+]] = alloca +; OLDDBG-NEXT: call void @llvm.dbg.declare(metadata ptr %[[VAL_B]], metadata ![[VAR_B:[0-9]+]], metadata !DIExpression()), !dbg ![[LOC_2:[0-9]+]] +; NEWDBG-NEXT: {{^}} #dbg_declare(ptr %[[VAL_B]], ![[VAR_B:[0-9]+]], !DIExpression(), ![[LOC_2:[0-9]+]]) +; CHECK-NEXT: {{^}} %[[VAL_ADD:[0-9a-zA-Z]+]] = add i32 %[[VAL_A]], 5 +; OLDDBG-NEXT: call void @llvm.dbg.value(metadata !DIArgList(i32 %[[VAL_A]], i32 %[[VAL_ADD]]), metadata ![[VAR_A]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus)), !dbg ![[LOC_3:[0-9]+]] +; NEWDBG-NEXT: {{^}} #dbg_value(!DIArgList(i32 %[[VAL_A]], i32 %[[VAL_ADD]]), ![[VAR_A]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), 
![[LOC_3:[0-9]+]]) +; OLDDBG-NEXT: call void @llvm.dbg.label(metadata ![[LABEL_ID:[0-9]+]]) +; NEWDBG-NEXT: {{^}} #dbg_label(![[LABEL_ID:[0-9]+]]) +; CHECK-NEXT: {{^}} store i32 %[[VAL_ADD]]{{.+}}, !DIAssignID ![[ASSIGNID:[0-9]+]] +; OLDDBG-NEXT: call void @llvm.dbg.assign(metadata i32 %[[VAL_ADD]], metadata ![[VAR_B]], metadata !DIExpression(), metadata ![[ASSIGNID]], metadata ptr %[[VAL_B]], metadata !DIExpression()), !dbg ![[LOC_4:[0-9]+]] +; NEWDBG-NEXT: {{^}} #dbg_assign(i32 %[[VAL_ADD]], ![[VAR_B]], !DIExpression(), ![[ASSIGNID]], ptr %[[VAL_B]], !DIExpression(), ![[LOC_4:[0-9]+]]) +; CHECK-NEXT: {{^}} ret i32 + +; OLDDBG-DAG: declare void @llvm.dbg.value +; OLDDBG-DAG: declare void @llvm.dbg.declare +; OLDDBG-DAG: declare void @llvm.dbg.assign + +; CHECK-DAG: llvm.dbg.cu +; CHECK-DAG: ![[VAR_A]] = !DILocalVariable(name: "a" +; CHECK-DAG: ![[VAR_B]] = !DILocalVariable(name: "b" +; CHECK-DAG: ![[LOC_1]] = !DILocation(line: 3, column: 15 +; CHECK-DAG: ![[LOC_2]] = !DILocation(line: 3, column: 20 +; CHECK-DAG: ![[LOC_3]] = !DILocation(line: 3, column: 25 +; CHECK-DAG: ![[LOC_4]] = !DILocation(line: 3, column: 30 +; CHECK-DAG: ![[LABEL_ID]] = !DILabel( + +define dso_local i32 @f(i32 %a) !dbg !7 { +entry: + call void @llvm.dbg.value(metadata i32 %a, metadata !20, metadata !DIExpression()), !dbg !30 + %b = alloca i32, !dbg !30, !DIAssignID !40 + call void @llvm.dbg.declare(metadata ptr %b, metadata !21, metadata !DIExpression()), !dbg !31 + %add = add i32 %a, 5, !dbg !31 + call void @llvm.dbg.value(metadata !DIArgList(i32 %a, i32 %add), metadata !20, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus)), !dbg !32 + call void @llvm.dbg.label(metadata !50), !dbg !32 + store i32 %add, ptr %b, !dbg !32, !DIAssignID !40 + call void @llvm.dbg.assign(metadata i32 %add, metadata !21, metadata !DIExpression(), metadata !40, metadata ptr %b, metadata !DIExpression()), !dbg !33 + ret i32 %add, !dbg !33 + +} + +declare void @llvm.dbg.value(metadata, 
metadata, metadata) +declare void @llvm.dbg.declare(metadata, metadata, metadata) +declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) +declare void @llvm.dbg.label(metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 18.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "print.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 5} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 18.0.0"} +!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !13) +!8 = !DISubroutineType(types: !9) +!9 = !{!12, !12} +!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!13 = !{!20, !21} +!20 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 3, type: !12) +!21 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 3, type: !12) +!30 = !DILocation(line: 3, column: 15, scope: !7) +!31 = !DILocation(line: 3, column: 20, scope: !7) +!32 = !DILocation(line: 3, column: 25, scope: !7) +!33 = !DILocation(line: 3, column: 30, scope: !7) +!40 = distinct !DIAssignID() +!50 = !DILabel(scope: !7, name: "label", file: !1, line: 3) diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll index 83226e56b5acd..e0777f9ecee8b 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll @@ -30,8 +30,7 @@ define internal void @bar(%p_t %p) { ; CGSCC: Function Attrs: mustprogress nofree 
norecurse nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@bar ; CGSCC-SAME: (ptr nocapture nofree readnone [[P:%.*]]) #[[ATTR0]] { -; CGSCC-NEXT: call void @llvm.dbg.value(metadata ptr [[P]], metadata [[META3:![0-9]+]], metadata !DIExpression()) -; CGSCC-SAME: !dbg [[DBG5:![0-9]+]] +; CGSCC-NEXT: tail call void @llvm.dbg.value(metadata ptr [[P]], metadata [[META3:![0-9]+]], metadata !DIExpression()), !dbg [[DBG5:![0-9]+]] ; CGSCC-NEXT: ret void ; call void @llvm.dbg.value(metadata %p_t %p, metadata !4, metadata !5), !dbg !6 @@ -52,21 +51,20 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !6 = !DILocation(line: 1, column: 1, scope: !3) ;. ; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } -; TUNIT: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. ; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; CGSCC: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. -; TUNIT: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) -; TUNIT: [[META1:![0-9]+]] = !DIFile(filename: "test.c", directory: "") +; TUNIT: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +; TUNIT: [[META1]] = !DIFile(filename: "test.c", directory: "") ; TUNIT: [[META2:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} ;. 
-; CGSCC: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) -; CGSCC: [[META1:![0-9]+]] = !DIFile(filename: "test.c", directory: "") +; CGSCC: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +; CGSCC: [[META1]] = !DIFile(filename: "test.c", directory: "") ; CGSCC: [[META2:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; CGSCC: [[META3]] = !DILocalVariable(name: "p", scope: !4) -; CGSCC: [[META4:![0-9]+]] = distinct !DISubprogram(name: "bar", scope: null, spFlags: DISPFlagDefinition, unit: !0) -; CGSCC: [[DBG5]] = !DILocation(line: 1, column: 1, scope: !4) +; CGSCC: [[META3]] = !DILocalVariable(name: "p", scope: [[META4:![0-9]+]]) +; CGSCC: [[META4]] = distinct !DISubprogram(name: "bar", scope: null, spFlags: DISPFlagDefinition, unit: [[META0]]) +; CGSCC: [[DBG5]] = !DILocation(line: 1, column: 1, scope: [[META4]]) ;. ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; TUNIT: {{.*}} diff --git a/llvm/test/Transforms/CallSiteSplitting/callsite-split-preserve-debug.ll b/llvm/test/Transforms/CallSiteSplitting/callsite-split-preserve-debug.ll index e185286304a68..3fdf62bf57502 100644 --- a/llvm/test/Transforms/CallSiteSplitting/callsite-split-preserve-debug.ll +++ b/llvm/test/Transforms/CallSiteSplitting/callsite-split-preserve-debug.ll @@ -3,8 +3,6 @@ ;; Test that DebugLocs are preserved, and that dbg.values are duplicated. 
-; CHECK: declare void @llvm.dbg.value(metadata, - ; CHECK-LABEL: @test1 ; CHECK: call void @llvm.dbg.value(metadata i32 0, ; CHECK-NEXT: [[R1:%.+]] = call i32 @callee(i32 0, i32 %dd), !dbg [[DBG1:!.*]] @@ -43,6 +41,8 @@ CallSite: ; preds = %land.rhs, %entry ; CHECK-NEXT: phi i32 [ [[R1]], %Header.split ], [ [[R2]], %TBB.split ], !dbg [[DBG_CALL]] ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 1, +; CHECK: declare void @llvm.dbg.value(metadata, + define void @test2(ptr %ptr, i32 %i) !dbg !19 { Header: %tobool = icmp ne i32 %i, 10 diff --git a/llvm/test/Transforms/CodeGenPrepare/debug-info-on-skipped-selects.ll b/llvm/test/Transforms/CodeGenPrepare/debug-info-on-skipped-selects.ll index ffd4a0170ac9d..ce34b16a4d7a3 100644 --- a/llvm/test/Transforms/CodeGenPrepare/debug-info-on-skipped-selects.ll +++ b/llvm/test/Transforms/CodeGenPrepare/debug-info-on-skipped-selects.ll @@ -5,8 +5,8 @@ ; Test that when we skip over multiple selects in CGP, that the debug-info ; attached to those selects is still fixed up. 
-; CHECK: declare void @llvm.dbg.value(metadata, ; CHECK: call void @llvm.dbg.value(metadata ptr %sunkaddr, +; CHECK: declare void @llvm.dbg.value(metadata, source_filename = "reduced.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll b/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll index 0dee72f4f6b6f..d84dd10e7d28b 100644 --- a/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll +++ b/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll @@ -8,12 +8,21 @@ define ptr @vfs_addname(ptr %name, i32 %len, i32 %hash, i32 %flags) nounwind ssp !dbg !1 { ; +; CHECK-LABEL: define {{[^@]+}}@vfs_addname +; CHECK-SAME: (ptr [[NAME:%.*]], i32 [[LEN:%.*]], i32 [[HASH:%.*]], i32 [[FLAGS:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG4:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata ptr [[NAME]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12:![0-9]+]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[LEN]], metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[HASH]], metadata [[META14:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[FLAGS]], metadata [[META15:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]] +; CHECK-NEXT: [[TMP0:%.*]] = call fastcc ptr @add_name_internal(ptr [[NAME]], i32 [[HASH]]) #[[ATTR3:[0-9]+]], !dbg [[DBG16:![0-9]+]] +; CHECK-NEXT: ret ptr [[TMP0]], !dbg [[DBG16]] +; entry: call void @llvm.dbg.value(metadata ptr %name, metadata !0, metadata !DIExpression()), !dbg !DILocation(scope: !1) call void @llvm.dbg.value(metadata i32 %len, metadata !10, metadata !DIExpression()), !dbg !DILocation(scope: !1) call void @llvm.dbg.value(metadata i32 %hash, metadata !11, metadata !DIExpression()), !dbg !DILocation(scope: !1) call 
void @llvm.dbg.value(metadata i32 %flags, metadata !12, metadata !DIExpression()), !dbg !DILocation(scope: !1) -; CHECK: call fastcc ptr @add_name_internal(ptr %name, i32 %hash) [[NUW:#[0-9]+]], !dbg !{{[0-9]+}} %0 = call fastcc ptr @add_name_internal(ptr %name, i32 %len, i32 %hash, i8 zeroext 0, i32 %flags) nounwind, !dbg !13 ; [#uses=1] ret ptr %0, !dbg !13 } @@ -22,6 +31,24 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone define internal fastcc ptr @add_name_internal(ptr %name, i32 %len, i32 %hash, i8 zeroext %extra, i32 %flags) noinline nounwind ssp !dbg !16 { ; +; CHECK-LABEL: define {{[^@]+}}@add_name_internal +; CHECK-SAME: (ptr [[NAME:%.*]], i32 [[HASH:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG18:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata ptr [[NAME]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23:![0-9]+]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 poison, metadata [[META24:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[HASH]], metadata [[META25:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i8 poison, metadata [[META26:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 poison, metadata [[META27:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[HASH]], 0, !dbg [[DBG28:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP0]], label [[BB:%.*]], label [[BB1:%.*]], !dbg [[DBG28]] +; CHECK: bb: +; CHECK-NEXT: br label [[BB2:%.*]], !dbg [[DBG30:![0-9]+]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB2]], !dbg [[DBG31:![0-9]+]] +; CHECK: bb2: +; CHECK-NEXT: [[DOT0:%.*]] = phi ptr [ @.str, [[BB]] ], [ [[NAME]], [[BB1]] ] +; CHECK-NEXT: ret ptr [[DOT0]], !dbg [[DBG31]] +; entry: call void @llvm.dbg.value(metadata ptr %name, metadata !15, 
metadata !DIExpression()), !dbg !DILocation(scope: !16) call void @llvm.dbg.value(metadata i32 %len, metadata !20, metadata !DIExpression()), !dbg !DILocation(scope: !16) @@ -45,9 +72,7 @@ bb2: ; preds = %bb1, %bb declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone ; CHECK: attributes #0 = { nounwind ssp } -; CHECK: attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: attributes #2 = { noinline nounwind ssp } -; CHECK: attributes [[NUW]] = { nounwind } +; CHECK: attributes #1 = { noinline nounwind ssp } !llvm.dbg.cu = !{!3} !llvm.module.flags = !{!30} @@ -68,9 +93,7 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone !14 = distinct !DILexicalBlock(line: 12, column: 0, file: !28, scope: !1) !15 = !DILocalVariable(name: "name", line: 17, arg: 1, scope: !16, file: !2, type: !6) ; CHECK: !DISubprogram(name: "add_name_internal" -; CHECK-SAME: type: ![[MD:[0-9]+]] !16 = distinct !DISubprogram(name: "add_name_internal", linkageName: "add_name_internal", line: 22, isLocal: true, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, file: !28, scope: !2, type: !17) -; CHECK: ![[MD]] = !DISubroutineType(cc: DW_CC_nocall, types: !{{[0-9]+}}) !17 = !DISubroutineType(types: !18) !18 = !{!6, !6, !9, !9, !19, !9} !19 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned char", size: 8, align: 8, encoding: DW_ATE_unsigned_char) diff --git a/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll b/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll index f932788c73e2a..bf846c310a525 100644 --- a/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll +++ b/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll @@ -51,14 +51,7 @@ entry: ret void } -; CHECK: define internal void @outlined_ir_func_0(ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]]) #1 { ; CHECK: entry_to_outline: -; CHECK-NEXT: store i32 2, ptr [[ARG0]], align 4 -; 
CHECK-NEXT: store i32 3, ptr [[ARG1]], align 4 -; CHECK-NEXT: store i32 4, ptr [[ARG2]], align 4 -; CHECK-NEXT: [[AL:%.*]] = load i32, ptr [[ARG0]], align 4 -; CHECK-NEXT: [[BL:%.*]] = load i32, ptr [[ARG1]], align 4 -; CHECK-NEXT: [[CL:%.*]] = load i32, ptr [[ARG2]], align 4 !0 = !DIFile(filename: "foo.c", directory: "/tmp") !1 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) diff --git a/llvm/test/Transforms/LoopRotate/dbgvalue.ll b/llvm/test/Transforms/LoopRotate/dbgvalue.ll index 92cc886bc81c1..331ca9efdd4a0 100644 --- a/llvm/test/Transforms/LoopRotate/dbgvalue.ll +++ b/llvm/test/Transforms/LoopRotate/dbgvalue.ll @@ -4,8 +4,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone -; CHECK: declare void @llvm.dbg.value(metadata, - ; This function rotates the exit conditon into the entry block, moving the ; dbg.values with it. Check that they resolve through the PHIs to the arguments ; only in the entry block. 
In the loop block, the dbg.values shift down below @@ -265,6 +263,8 @@ L0.latch: br label %L0, !dbg !77 } +; CHECK: declare void @llvm.dbg.value(metadata, + !llvm.module.flags = !{!20} !llvm.dbg.cu = !{!2} diff --git a/llvm/test/Transforms/LoopRotate/delete-dbg-values.ll b/llvm/test/Transforms/LoopRotate/delete-dbg-values.ll index bce5ed02b43bf..8c23c0276024c 100644 --- a/llvm/test/Transforms/LoopRotate/delete-dbg-values.ll +++ b/llvm/test/Transforms/LoopRotate/delete-dbg-values.ll @@ -10,8 +10,6 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -; CHECK: declare void @llvm.dbg.value(metadata, - ; CHECK-LABEL: define void @_ZNK4llvm5APInt4sextEj(ptr ; CHECK-LABEL: entry: ; CHECK: call void @llvm.dbg.value(metadata i32 0, metadata ![[SRC:[0-9]+]], @@ -22,6 +20,8 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: call void @llvm.dbg.value(metadata i32 0, metadata ![[SINK]], ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 0, metadata ![[SRC]], +; CHECK: declare void @llvm.dbg.value(metadata, + declare void @llvm.dbg.value(metadata, metadata, metadata) define void @_ZNK4llvm5APInt4sextEj(ptr %agg.result) !dbg !5 { diff --git a/llvm/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll b/llvm/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll index e4c123e02540a..22d14f52e9664 100644 --- a/llvm/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll +++ b/llvm/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll @@ -103,10 +103,9 @@ declare void @NSLog(ptr, ...) 
declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone ; CHECK: attributes #0 = { ssp uwtable } -; CHECK: attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: attributes #2 = { nonlazybind } +; CHECK: attributes #1 = { nonlazybind } ; CHECK: attributes [[NUW]] = { nounwind } -; CHECK: attributes #4 = { noinline ssp uwtable } +; CHECK: attributes #3 = { noinline ssp uwtable } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!33, !34, !35, !36, !61} diff --git a/llvm/test/Transforms/SROA/dbg-inline.ll b/llvm/test/Transforms/SROA/dbg-inline.ll index 454ca13230bfa..3e7683ad2db77 100644 --- a/llvm/test/Transforms/SROA/dbg-inline.ll +++ b/llvm/test/Transforms/SROA/dbg-inline.ll @@ -19,10 +19,10 @@ target triple = "x86_64-apple-macosx10.15.0" define i64 @_Z1g4pair(i64 %p.coerce0, i64 %p.coerce1) #0 !dbg !8 { ; CHECK-LABEL: @_Z1g4pair( ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE0:%.*]], metadata [[META16:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG17:![0-9]+]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE0]], metadata [[META18:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG20:![0-9]+]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE1:%.*]], metadata [[META16]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG17]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE1]], metadata [[META18]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG20]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 [[P_COERCE0:%.*]], metadata [[META16:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG17:![0-9]+]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 [[P_COERCE0]], metadata [[META18:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG20:![0-9]+]] +; CHECK-NEXT: tail call void 
@llvm.dbg.value(metadata i64 [[P_COERCE1:%.*]], metadata [[META16]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG17]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 [[P_COERCE1]], metadata [[META18]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG20]] ; CHECK-NEXT: ret i64 [[P_COERCE0]], !dbg [[DBG22:![0-9]+]] ; entry: @@ -77,32 +77,32 @@ attributes #2 = { argmemonly nounwind willreturn } !26 = !DILocation(line: 10, column: 3, scope: !8) ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { noinline ssp uwtable } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. 
-; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 12.0.0 (git@github.com:llvm/llvm-project 5110fd0343c2d06c8ae538741fbef13ece5e68de)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None, sysroot: "/") -; CHECK: [[META1:![0-9]+]] = !DIFile(filename: "/tmp/inlinesplit.cpp", directory: "/Volumes/Data/llvm-project") -; CHECK: [[META2:![0-9]+]] = !{} +; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], nameTableKind: None, sysroot: "/") +; CHECK: [[META1]] = !DIFile(filename: "/tmp/inlinesplit.cpp", directory: {{.*}}) +; CHECK: [[META2]] = !{} ; CHECK: [[META3:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; CHECK: [[META4:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} ; CHECK: [[META5:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} ; CHECK: [[META6:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK: [[META7:![0-9]+]] = distinct !DISubprogram(name: "g", linkageName: "_Z1g4pair", scope: !8, file: !8, line: 9, type: !9, scopeLine: 9, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) -; CHECK: [[META8:![0-9]+]] = !DIFile(filename: "/tmp/inlinesplit.cpp", directory: "") -; CHECK: [[META9:![0-9]+]] = !DISubroutineType(types: !10) -; CHECK: [[META10:![0-9]+]] = !{!11, !12} -; CHECK: [[META11:![0-9]+]] = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned) -; CHECK: [[META12:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "pair", file: !8, line: 1, size: 128, flags: DIFlagTypePassByValue, elements: !13, identifier: "_ZTS4pair") -; CHECK: [[META13:![0-9]+]] = !{!14, !15} -; CHECK: [[META14:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !12, file: !8, line: 1, baseType: !11, 
size: 64) -; CHECK: [[META15:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !12, file: !8, line: 1, baseType: !11, size: 64, offset: 64) -; CHECK: [[META16]] = !DILocalVariable(name: "p", arg: 1, scope: !7, file: !8, line: 9, type: !12) -; CHECK: [[DBG17]] = !DILocation(line: 0, scope: !7) -; CHECK: [[META18]] = !DILocalVariable(name: "p", arg: 1, scope: !19, file: !8, line: 5, type: !12) -; CHECK: [[META19:![0-9]+]] = distinct !DISubprogram(name: "f", linkageName: "_ZL1f4pair", scope: !8, file: !8, line: 5, type: !9, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !2) -; CHECK: [[DBG20]] = !DILocation(line: 0, scope: !19, inlinedAt: !21) -; CHECK: [[META21:![0-9]+]] = distinct !DILocation(line: 10, column: 10, scope: !7) -; CHECK: [[DBG22]] = !DILocation(line: 10, column: 3, scope: !7) +; CHECK: [[META7:![0-9]+]] = distinct !DISubprogram(name: "g", linkageName: "_Z1g4pair", scope: [[META8:![0-9]+]], file: [[META8]], line: 9, type: [[META9:![0-9]+]], scopeLine: 9, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[META8]] = !DIFile(filename: "/tmp/inlinesplit.cpp", directory: "") +; CHECK: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) +; CHECK: [[META10]] = !{[[META11:![0-9]+]], [[META12:![0-9]+]]} +; CHECK: [[META11]] = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned) +; CHECK: [[META12]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "pair", file: [[META8]], line: 1, size: 128, flags: DIFlagTypePassByValue, elements: [[META13:![0-9]+]], identifier: "_ZTS4pair") +; CHECK: [[META13]] = !{[[META14:![0-9]+]], [[META15:![0-9]+]]} +; CHECK: [[META14]] = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: [[META12]], file: [[META8]], line: 1, baseType: [[META11]], size: 64) +; CHECK: [[META15]] = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: [[META12]], file: 
[[META8]], line: 1, baseType: [[META11]], size: 64, offset: 64) +; CHECK: [[META16]] = !DILocalVariable(name: "p", arg: 1, scope: [[META7]], file: [[META8]], line: 9, type: [[META12]]) +; CHECK: [[DBG17]] = !DILocation(line: 0, scope: [[META7]]) +; CHECK: [[META18]] = !DILocalVariable(name: "p", arg: 1, scope: [[META19:![0-9]+]], file: [[META8]], line: 5, type: [[META12]]) +; CHECK: [[META19]] = distinct !DISubprogram(name: "f", linkageName: "_ZL1f4pair", scope: [[META8]], file: [[META8]], line: 5, type: [[META9]], scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[DBG20]] = !DILocation(line: 0, scope: [[META19]], inlinedAt: [[META21:![0-9]+]]) +; CHECK: [[META21]] = distinct !DILocation(line: 10, column: 10, scope: [[META7]]) +; CHECK: [[DBG22]] = !DILocation(line: 10, column: 3, scope: [[META7]]) ;. ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK-MODIFY-CFG: {{.*}} From d128448efdd4e2bf3c9bc9a5b43ae642aa78026f Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 27 Feb 2024 10:17:24 +0000 Subject: [PATCH 423/546] Revert "Reapply "[RemoveDIs] Print non-intrinsic debug info in textual IR output (#79281)"" Reverted due to some test failures on some buildbots. https://lab.llvm.org/buildbot/#/builders/67/builds/14669 This reverts commit aa436493ab7ad4cf323b0189c15c59ac9dc293c7. 
--- llvm/include/llvm/IR/Module.h | 14 --- llvm/include/llvm/IR/PrintPasses.h | 19 ---- llvm/lib/IR/AsmWriter.cpp | 75 +++++++++------- llvm/lib/IR/BasicBlock.cpp | 4 + llvm/lib/IR/IRPrintingPasses.cpp | 27 +++--- llvm/lib/IR/Module.cpp | 22 ----- llvm/lib/IRPrinter/IRPrintingPasses.cpp | 24 ++++-- .../Generic/inline-alloca-ordering.ll | 7 +- .../DebugInfo/Generic/inline-dbg-values.ll | 8 +- llvm/test/DebugInfo/dpvalue-print-nocrash.ll | 3 +- .../print-non-instruction-debug-info.ll | 86 ------------------- .../pr33641_remove_arg_dbgvalue.ll | 18 ++-- .../callsite-split-preserve-debug.ll | 4 +- .../debug-info-on-skipped-selects.ll | 2 +- .../DeadArgElim/2010-04-30-DbgInfo.ll | 35 ++------ .../IROutliner/outlining-debug-statements.ll | 7 ++ llvm/test/Transforms/LoopRotate/dbgvalue.ll | 4 +- .../LoopRotate/delete-dbg-values.ll | 4 +- ...e-that-exception-unwind-path-is-visited.ll | 5 +- llvm/test/Transforms/SROA/dbg-inline.ll | 50 +++++------ 20 files changed, 146 insertions(+), 272 deletions(-) delete mode 100644 llvm/test/DebugInfo/print-non-instruction-debug-info.ll diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h index f5f3c66f0ae1e..68a89dc45c283 100644 --- a/llvm/include/llvm/IR/Module.h +++ b/llvm/include/llvm/IR/Module.h @@ -218,18 +218,11 @@ class LLVM_EXTERNAL_VISIBILITY Module { /// \ref BasicBlock. bool IsNewDbgInfoFormat; - /// Used when converting this module to the new debug info format; removes all - /// declarations of debug intrinsics that are replaced by non-intrinsic - /// records in the new format. - void removeDebugIntrinsicDeclarations(); - /// \see BasicBlock::convertToNewDbgValues. void convertToNewDbgValues() { for (auto &F : *this) { F.convertToNewDbgValues(); } - // Remove the declarations of the old debug intrinsics, if any exist. 
- removeDebugIntrinsicDeclarations(); IsNewDbgInfoFormat = true; } @@ -241,13 +234,6 @@ class LLVM_EXTERNAL_VISIBILITY Module { IsNewDbgInfoFormat = false; } - void setIsNewDbgInfoFormat(bool UseNewFormat) { - if (UseNewFormat && !IsNewDbgInfoFormat) - convertToNewDbgValues(); - else if (!UseNewFormat && IsNewDbgInfoFormat) - convertFromNewDbgValues(); - } - /// The Module constructor. Note that there is no default constructor. You /// must provide a name for the module upon construction. explicit Module(StringRef ModuleID, LLVMContext& C); diff --git a/llvm/include/llvm/IR/PrintPasses.h b/llvm/include/llvm/IR/PrintPasses.h index 3803bd05cbe57..95b97e76c867c 100644 --- a/llvm/include/llvm/IR/PrintPasses.h +++ b/llvm/include/llvm/IR/PrintPasses.h @@ -78,25 +78,6 @@ std::string doSystemDiff(StringRef Before, StringRef After, StringRef OldLineFormat, StringRef NewLineFormat, StringRef UnchangedLineFormat); -/// Used to temporarily set the debug info format of a function, module, or -/// basic block for the duration of this object's lifetime, after which the -/// prior state will be restored. -template class ScopedDbgInfoFormatSetter { - T &Obj; - bool OldState; - -public: - ScopedDbgInfoFormatSetter(T &Obj, bool NewState) - : Obj(Obj), OldState(Obj.IsNewDbgInfoFormat) { - Obj.setIsNewDbgInfoFormat(NewState); - } - ~ScopedDbgInfoFormatSetter() { Obj.setIsNewDbgInfoFormat(OldState); } -}; - -template -ScopedDbgInfoFormatSetter(T &Obj, bool NewState) - -> ScopedDbgInfoFormatSetter; - } // namespace llvm #endif // LLVM_IR_PRINTPASSES_H diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index a3da6ca811489..fba404c9b027c 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -861,7 +861,7 @@ class SlotTracker : public AbstractSlotTrackerStorage { /// Add all of the metadata from an instruction. void processInstructionMetadata(const Instruction &I); - /// Add all of the metadata from a DbgRecord. 
+ /// Add all of the metadata from an instruction. void processDbgRecordMetadata(const DbgRecord &DPV); }; @@ -1140,9 +1140,6 @@ void SlotTracker::processFunctionMetadata(const Function &F) { void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) { if (const DPValue *DPV = dyn_cast(&DR)) { - // Process metadata used by DbgRecords; we only specifically care about the - // DILocalVariable, DILocation, and DIAssignID fields, as the Value and - // Expression fields should only be printed inline and so do not use a slot. CreateMetadataSlot(DPV->getVariable()); if (DPV->isDbgAssign()) CreateMetadataSlot(DPV->getAssignID()); @@ -2706,7 +2703,6 @@ class AssemblyWriter { void printDPValue(const DPValue &DPI); void printDPLabel(const DPLabel &DPL); void printDbgRecord(const DbgRecord &DPI); - void printDbgRecordLine(const DbgRecord &DPI); void printUseListOrder(const Value *V, const std::vector &Shuffle); void printUseLists(const Function *F); @@ -3889,6 +3885,9 @@ void AssemblyWriter::printTypeIdentities() { /// printFunction - Print all aspects of a function. void AssemblyWriter::printFunction(const Function *F) { + bool ConvertBack = F->IsNewDbgInfoFormat; + if (ConvertBack) + const_cast(F)->convertFromNewDbgValues(); if (AnnotationWriter) AnnotationWriter->emitFunctionAnnot(F, Out); if (F->isMaterializable()) @@ -4031,6 +4030,8 @@ void AssemblyWriter::printFunction(const Function *F) { Out << "}\n"; } + if (ConvertBack) + const_cast(F)->convertToNewDbgValues(); Machine.purgeFunction(); } @@ -4097,8 +4098,6 @@ void AssemblyWriter::printBasicBlock(const BasicBlock *BB) { // Output all of the instructions in the basic block... 
for (const Instruction &I : *BB) { - for (const DbgRecord &DR : I.getDbgValueRange()) - printDbgRecordLine(DR); printInstructionLine(I); } @@ -4612,10 +4611,12 @@ void AssemblyWriter::printDbgRecord(const DbgRecord &DR) { llvm_unreachable("Unexpected DbgRecord kind"); } -void AssemblyWriter::printDPValue(const DPValue &DPV) { - auto WriterCtx = getContext(); - Out << "#dbg_"; - switch (DPV.getType()) { +void AssemblyWriter::printDPValue(const DPValue &Value) { + // There's no formal representation of a DPValue -- print purely as a + // debugging aid. + Out << " DPValue "; + + switch (Value.getType()) { case DPValue::LocationType::Value: Out << "value"; break; @@ -4628,39 +4629,35 @@ void AssemblyWriter::printDPValue(const DPValue &DPV) { default: llvm_unreachable("Tried to print a DPValue with an invalid LocationType!"); } - Out << "("; - WriteAsOperandInternal(Out, DPV.getRawLocation(), WriterCtx, true); + Out << " { "; + auto WriterCtx = getContext(); + WriteAsOperandInternal(Out, Value.getRawLocation(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, DPV.getVariable(), WriterCtx, true); + WriteAsOperandInternal(Out, Value.getVariable(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, DPV.getExpression(), WriterCtx, true); + WriteAsOperandInternal(Out, Value.getExpression(), WriterCtx, true); Out << ", "; - if (DPV.isDbgAssign()) { - WriteAsOperandInternal(Out, DPV.getAssignID(), WriterCtx, true); + if (Value.isDbgAssign()) { + WriteAsOperandInternal(Out, Value.getAssignID(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, DPV.getRawAddress(), WriterCtx, true); + WriteAsOperandInternal(Out, Value.getRawAddress(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, DPV.getAddressExpression(), WriterCtx, true); + WriteAsOperandInternal(Out, Value.getAddressExpression(), WriterCtx, true); Out << ", "; } - WriteAsOperandInternal(Out, DPV.getDebugLoc().getAsMDNode(), WriterCtx, true); - Out << ")"; -} - -/// 
printDbgRecordLine - Print a DbgRecord with indentation and a newline -/// character. -void AssemblyWriter::printDbgRecordLine(const DbgRecord &DR) { - // Print lengthier indentation to bring out-of-line with instructions. - Out << " "; - printDbgRecord(DR); - Out << '\n'; + WriteAsOperandInternal(Out, Value.getDebugLoc().get(), WriterCtx, true); + Out << " marker @" << Value.getMarker(); + Out << " }"; } void AssemblyWriter::printDPLabel(const DPLabel &Label) { + // There's no formal representation of a DPLabel -- print purely as + // a debugging aid. + Out << " DPLabel { "; auto WriterCtx = getContext(); - Out << "#dbg_label("; WriteAsOperandInternal(Out, Label.getLabel(), WriterCtx, true); - Out << ")"; + Out << " marker @" << Label.getMarker(); + Out << " }"; } void AssemblyWriter::printMetadataAttachments( @@ -4808,11 +4805,19 @@ void BasicBlock::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, void Module::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, bool ShouldPreserveUseListOrder, bool IsForDebug) const { + // RemoveDIs: always print with debug-info in intrinsic format. + bool ConvertAfter = IsNewDbgInfoFormat; + if (IsNewDbgInfoFormat) + const_cast(this)->convertFromNewDbgValues(); + SlotTracker SlotTable(this); formatted_raw_ostream OS(ROS); AssemblyWriter W(OS, SlotTable, this, AAW, IsForDebug, ShouldPreserveUseListOrder); W.printModule(this); + + if (ConvertAfter) + const_cast(this)->convertToNewDbgValues(); } void NamedMDNode::print(raw_ostream &ROS, bool IsForDebug) const { @@ -4903,6 +4908,8 @@ void DPValue::print(raw_ostream &ROS, bool IsForDebug) const { void DPMarker::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { + // There's no formal representation of a DPMarker -- print purely as a + // debugging aid. 
formatted_raw_ostream OS(ROS); SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = @@ -4924,6 +4931,8 @@ void DPLabel::print(raw_ostream &ROS, bool IsForDebug) const { void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { + // There's no formal representation of a DPValue -- print purely as a + // debugging aid. formatted_raw_ostream OS(ROS); SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = @@ -4941,6 +4950,8 @@ void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST, void DPLabel::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { + // There's no formal representation of a DbgLabelRecord -- print purely as + // a debugging aid. formatted_raw_ostream OS(ROS); SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index c110c4c1437c3..6ea876fde5ec6 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -61,6 +61,10 @@ DPMarker *BasicBlock::createMarker(InstListType::iterator It) { } void BasicBlock::convertToNewDbgValues() { + // Is the command line option set? 
+ if (!UseNewDbgInfoFormat) + return; + IsNewDbgInfoFormat = true; // Iterate over all instructions in the instruction list, collecting dbg.value diff --git a/llvm/lib/IR/IRPrintingPasses.cpp b/llvm/lib/IR/IRPrintingPasses.cpp index f63b89a06cf0b..b19210e776ed5 100644 --- a/llvm/lib/IR/IRPrintingPasses.cpp +++ b/llvm/lib/IR/IRPrintingPasses.cpp @@ -23,11 +23,6 @@ using namespace llvm; -cl::opt WriteNewDbgInfoFormat( - "write-experimental-debuginfo", - cl::desc("Write debug info in the new non-intrinsic format"), - cl::init(false)); - namespace { class PrintModulePassWrapper : public ModulePass { @@ -44,9 +39,11 @@ class PrintModulePassWrapper : public ModulePass { ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {} bool runOnModule(Module &M) override { - // RemoveDIs: Regardless of the format we've processed this module in, use - // `WriteNewDbgInfoFormat` to determine which format we use to write it. - ScopedDbgInfoFormatSetter FormatSetter(M, WriteNewDbgInfoFormat); + // RemoveDIs: there's no textual representation of the DPValue debug-info, + // convert to dbg.values before writing out. + bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat; + if (IsNewDbgInfoFormat) + M.convertFromNewDbgValues(); if (llvm::isFunctionInPrintList("*")) { if (!Banner.empty()) @@ -65,6 +62,9 @@ class PrintModulePassWrapper : public ModulePass { } } + if (IsNewDbgInfoFormat) + M.convertToNewDbgValues(); + return false; } @@ -87,9 +87,11 @@ class PrintFunctionPassWrapper : public FunctionPass { // This pass just prints a banner followed by the function as it's processed. bool runOnFunction(Function &F) override { - // RemoveDIs: Regardless of the format we've processed this function in, use - // `WriteNewDbgInfoFormat` to determine which format we use to write it. - ScopedDbgInfoFormatSetter FormatSetter(F, WriteNewDbgInfoFormat); + // RemoveDIs: there's no textual representation of the DPValue debug-info, + // convert to dbg.values before writing out. 
+ bool IsNewDbgInfoFormat = F.IsNewDbgInfoFormat; + if (IsNewDbgInfoFormat) + F.convertFromNewDbgValues(); if (isFunctionInPrintList(F.getName())) { if (forcePrintModuleIR()) @@ -99,6 +101,9 @@ class PrintFunctionPassWrapper : public FunctionPass { OS << Banner << '\n' << static_cast(F); } + if (IsNewDbgInfoFormat) + F.convertToNewDbgValues(); + return false; } diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index a8696ed9e3ce5..1946db2ee0be7 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -85,28 +85,6 @@ Module::~Module() { IFuncList.clear(); } -void Module::removeDebugIntrinsicDeclarations() { - auto *DeclareIntrinsicFn = - Intrinsic::getDeclaration(this, Intrinsic::dbg_declare); - assert((!isMaterialized() || DeclareIntrinsicFn->hasZeroLiveUses()) && - "Debug declare intrinsic should have had uses removed."); - DeclareIntrinsicFn->eraseFromParent(); - auto *ValueIntrinsicFn = - Intrinsic::getDeclaration(this, Intrinsic::dbg_value); - assert((!isMaterialized() || ValueIntrinsicFn->hasZeroLiveUses()) && - "Debug value intrinsic should have had uses removed."); - ValueIntrinsicFn->eraseFromParent(); - auto *AssignIntrinsicFn = - Intrinsic::getDeclaration(this, Intrinsic::dbg_assign); - assert((!isMaterialized() || AssignIntrinsicFn->hasZeroLiveUses()) && - "Debug assign intrinsic should have had uses removed."); - AssignIntrinsicFn->eraseFromParent(); - auto *LabelntrinsicFn = Intrinsic::getDeclaration(this, Intrinsic::dbg_label); - assert((!isMaterialized() || LabelntrinsicFn->hasZeroLiveUses()) && - "Debug label intrinsic should have had uses removed."); - LabelntrinsicFn->eraseFromParent(); -} - std::unique_ptr Module::createRNG(const StringRef Name) const { SmallString<32> Salt(Name); diff --git a/llvm/lib/IRPrinter/IRPrintingPasses.cpp b/llvm/lib/IRPrinter/IRPrintingPasses.cpp index b799eef7fb706..52b242b4dcd52 100644 --- a/llvm/lib/IRPrinter/IRPrintingPasses.cpp +++ b/llvm/lib/IRPrinter/IRPrintingPasses.cpp @@ -22,8 +22,6 @@ 
using namespace llvm; -extern cl::opt WriteNewDbgInfoFormat; - PrintModulePass::PrintModulePass() : OS(dbgs()) {} PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, bool ShouldPreserveUseListOrder, @@ -33,9 +31,11 @@ PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, EmitSummaryIndex(EmitSummaryIndex) {} PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &AM) { - // RemoveDIs: Regardless of the format we've processed this module in, use - // `WriteNewDbgInfoFormat` to determine which format we use to write it. - ScopedDbgInfoFormatSetter FormatSetter(M, WriteNewDbgInfoFormat); + // RemoveDIs: there's no textual representation of the DPValue debug-info, + // convert to dbg.values before writing out. + bool ShouldConvert = M.IsNewDbgInfoFormat; + if (ShouldConvert) + M.convertFromNewDbgValues(); if (llvm::isFunctionInPrintList("*")) { if (!Banner.empty()) @@ -63,6 +63,9 @@ PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &AM) { Index->print(OS); } + if (ShouldConvert) + M.convertToNewDbgValues(); + return PreservedAnalyses::all(); } @@ -72,9 +75,11 @@ PrintFunctionPass::PrintFunctionPass(raw_ostream &OS, const std::string &Banner) PreservedAnalyses PrintFunctionPass::run(Function &F, FunctionAnalysisManager &) { - // RemoveDIs: Regardless of the format we've processed this function in, use - // `WriteNewDbgInfoFormat` to determine which format we use to write it. - ScopedDbgInfoFormatSetter FormatSetter(F, WriteNewDbgInfoFormat); + // RemoveDIs: there's no textual representation of the DPValue debug-info, + // convert to dbg.values before writing out. 
+ bool ShouldConvert = F.IsNewDbgInfoFormat; + if (ShouldConvert) + F.convertFromNewDbgValues(); if (isFunctionInPrintList(F.getName())) { if (forcePrintModuleIR()) @@ -83,5 +88,8 @@ PreservedAnalyses PrintFunctionPass::run(Function &F, OS << Banner << '\n' << static_cast(F); } + if (ShouldConvert) + F.convertToNewDbgValues(); + return PreservedAnalyses::all(); } diff --git a/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll b/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll index 9f401ceb5b6f4..9ff3d80af80d1 100644 --- a/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll +++ b/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll @@ -15,6 +15,8 @@ ;; doesn't transfer the dbg.value to the entry block. This needs Special ;; Handling once we get rid of debug-intrinsics. +; CHECK: declare void @llvm.dbg.value(metadata, + ; CHECK: define i32 @bar() ; CHECK-NEXT: %1 = alloca [65 x i32], align 16 ; CHECK-NEXT: call void @ext() @@ -22,10 +24,9 @@ ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 0, metadata !10, metadata !DIExpression()), !dbg !12 ; CHECK-NEXT: call void @init(ptr %1) -; CHECK: declare void @llvm.dbg.value(metadata, - declare void @ext() declare void @init(ptr) +declare void @llvm.dbg.value(metadata, metadata, metadata) define internal i32 @foo() !dbg !4 { %1 = alloca [65 x i32], align 16 @@ -41,8 +42,6 @@ define i32 @bar() !dbg !16 { ret i32 %1 } -declare void @llvm.dbg.value(metadata, metadata, metadata) - !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!8, !9} !llvm.ident = !{!10} diff --git a/llvm/test/DebugInfo/Generic/inline-dbg-values.ll b/llvm/test/DebugInfo/Generic/inline-dbg-values.ll index b0390b9f78f41..7d580d6fba37a 100644 --- a/llvm/test/DebugInfo/Generic/inline-dbg-values.ll +++ b/llvm/test/DebugInfo/Generic/inline-dbg-values.ll @@ -7,6 +7,8 @@ ;; ;; test should be inlined into test2 +; CHECK: declare void @llvm.dbg.value(metadata, + ; CHECK: define i32 @test2 ; CHECK-NEXT: entry: ; CHECK: %k.addr.i = alloca i32, align 4 @@ -45,8 
+47,6 @@ ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 1, metadata ![[KVAR]], metadata !DIExpression()), !dbg ![[KLINE]] ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 1, metadata ![[K2VAR]], metadata !DIExpression()), !dbg ![[GLINE]] ; -; CHECK: declare void @llvm.dbg.value(metadata, -; ;; Test that the metadata maps onto the correct things, and that the DILocations ;; attached to the intrinsics have been inlined. ; @@ -100,6 +100,8 @@ return: ; preds = %if.end, %if.then ret i32 %3, !dbg !20 } +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + declare i32 @_Z8test_exti(i32) define i32 @test2(i32 %foo, i32 %bar) !dbg !10 { @@ -116,8 +118,6 @@ try.cont: ; preds = %catch, %invoke.cont ret i32 0, !dbg !30 } -declare void @llvm.dbg.value(metadata, metadata, metadata) #1 - !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!31} diff --git a/llvm/test/DebugInfo/dpvalue-print-nocrash.ll b/llvm/test/DebugInfo/dpvalue-print-nocrash.ll index 0a618c6780d1d..9f120af13ac9c 100755 --- a/llvm/test/DebugInfo/dpvalue-print-nocrash.ll +++ b/llvm/test/DebugInfo/dpvalue-print-nocrash.ll @@ -2,7 +2,8 @@ ; RUN: opt -passes="instcombine" -debug %s -o /dev/null 2>&1 | FileCheck %s ; REQUIRES: asserts -; CHECK: CLONE: #dbg_value( +; CHECK: CLONE: DPValue value { +; CHECK-SAME: marker @0x0 define ptr @func_10(i32 %p_11) { entry: diff --git a/llvm/test/DebugInfo/print-non-instruction-debug-info.ll b/llvm/test/DebugInfo/print-non-instruction-debug-info.ll deleted file mode 100644 index f8271df146fe9..0000000000000 --- a/llvm/test/DebugInfo/print-non-instruction-debug-info.ll +++ /dev/null @@ -1,86 +0,0 @@ -;; Test that we can write in the new debug info format. 
-; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=false --write-experimental-debuginfo=false < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,OLDDBG --implicit-check-not=llvm.dbg -; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=false --write-experimental-debuginfo=true < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,NEWDBG --implicit-check-not=llvm.dbg - -;; Test also that the new flag is independent of the flag that enables use of -;; these non-instruction debug info during LLVM passes. -; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=true --write-experimental-debuginfo=false < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,OLDDBG --implicit-check-not=llvm.dbg -; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=true --write-experimental-debuginfo=true < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,NEWDBG --implicit-check-not=llvm.dbg - -; CHECK: @f(i32 %[[VAL_A:[0-9a-zA-Z]+]]) -; CHECK-NEXT: entry: -; OLDDBG-NEXT: call void @llvm.dbg.value(metadata i32 %[[VAL_A]], metadata ![[VAR_A:[0-9]+]], metadata !DIExpression()), !dbg ![[LOC_1:[0-9]+]] -; NEWDBG-NEXT: {{^}} #dbg_value(i32 %[[VAL_A]], ![[VAR_A:[0-9]+]], !DIExpression(), ![[LOC_1:[0-9]+]]) -; CHECK-NEXT: {{^}} %[[VAL_B:[0-9a-zA-Z]+]] = alloca -; OLDDBG-NEXT: call void @llvm.dbg.declare(metadata ptr %[[VAL_B]], metadata ![[VAR_B:[0-9]+]], metadata !DIExpression()), !dbg ![[LOC_2:[0-9]+]] -; NEWDBG-NEXT: {{^}} #dbg_declare(ptr %[[VAL_B]], ![[VAR_B:[0-9]+]], !DIExpression(), ![[LOC_2:[0-9]+]]) -; CHECK-NEXT: {{^}} %[[VAL_ADD:[0-9a-zA-Z]+]] = add i32 %[[VAL_A]], 5 -; OLDDBG-NEXT: call void @llvm.dbg.value(metadata !DIArgList(i32 %[[VAL_A]], i32 %[[VAL_ADD]]), metadata ![[VAR_A]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus)), !dbg ![[LOC_3:[0-9]+]] -; NEWDBG-NEXT: {{^}} #dbg_value(!DIArgList(i32 %[[VAL_A]], i32 %[[VAL_ADD]]), ![[VAR_A]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), 
![[LOC_3:[0-9]+]]) -; OLDDBG-NEXT: call void @llvm.dbg.label(metadata ![[LABEL_ID:[0-9]+]]) -; NEWDBG-NEXT: {{^}} #dbg_label(![[LABEL_ID:[0-9]+]]) -; CHECK-NEXT: {{^}} store i32 %[[VAL_ADD]]{{.+}}, !DIAssignID ![[ASSIGNID:[0-9]+]] -; OLDDBG-NEXT: call void @llvm.dbg.assign(metadata i32 %[[VAL_ADD]], metadata ![[VAR_B]], metadata !DIExpression(), metadata ![[ASSIGNID]], metadata ptr %[[VAL_B]], metadata !DIExpression()), !dbg ![[LOC_4:[0-9]+]] -; NEWDBG-NEXT: {{^}} #dbg_assign(i32 %[[VAL_ADD]], ![[VAR_B]], !DIExpression(), ![[ASSIGNID]], ptr %[[VAL_B]], !DIExpression(), ![[LOC_4:[0-9]+]]) -; CHECK-NEXT: {{^}} ret i32 - -; OLDDBG-DAG: declare void @llvm.dbg.value -; OLDDBG-DAG: declare void @llvm.dbg.declare -; OLDDBG-DAG: declare void @llvm.dbg.assign - -; CHECK-DAG: llvm.dbg.cu -; CHECK-DAG: ![[VAR_A]] = !DILocalVariable(name: "a" -; CHECK-DAG: ![[VAR_B]] = !DILocalVariable(name: "b" -; CHECK-DAG: ![[LOC_1]] = !DILocation(line: 3, column: 15 -; CHECK-DAG: ![[LOC_2]] = !DILocation(line: 3, column: 20 -; CHECK-DAG: ![[LOC_3]] = !DILocation(line: 3, column: 25 -; CHECK-DAG: ![[LOC_4]] = !DILocation(line: 3, column: 30 -; CHECK-DAG: ![[LABEL_ID]] = !DILabel( - -define dso_local i32 @f(i32 %a) !dbg !7 { -entry: - call void @llvm.dbg.value(metadata i32 %a, metadata !20, metadata !DIExpression()), !dbg !30 - %b = alloca i32, !dbg !30, !DIAssignID !40 - call void @llvm.dbg.declare(metadata ptr %b, metadata !21, metadata !DIExpression()), !dbg !31 - %add = add i32 %a, 5, !dbg !31 - call void @llvm.dbg.value(metadata !DIArgList(i32 %a, i32 %add), metadata !20, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus)), !dbg !32 - call void @llvm.dbg.label(metadata !50), !dbg !32 - store i32 %add, ptr %b, !dbg !32, !DIAssignID !40 - call void @llvm.dbg.assign(metadata i32 %add, metadata !21, metadata !DIExpression(), metadata !40, metadata ptr %b, metadata !DIExpression()), !dbg !33 - ret i32 %add, !dbg !33 - -} - -declare void @llvm.dbg.value(metadata, 
metadata, metadata) -declare void @llvm.dbg.declare(metadata, metadata, metadata) -declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) -declare void @llvm.dbg.label(metadata) - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 18.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) -!1 = !DIFile(filename: "print.c", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 5} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 18.0.0"} -!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !13) -!8 = !DISubroutineType(types: !9) -!9 = !{!12, !12} -!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!13 = !{!20, !21} -!20 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 3, type: !12) -!21 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 3, type: !12) -!30 = !DILocation(line: 3, column: 15, scope: !7) -!31 = !DILocation(line: 3, column: 20, scope: !7) -!32 = !DILocation(line: 3, column: 25, scope: !7) -!33 = !DILocation(line: 3, column: 30, scope: !7) -!40 = distinct !DIAssignID() -!50 = !DILabel(scope: !7, name: "label", file: !1, line: 3) diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll index e0777f9ecee8b..83226e56b5acd 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll @@ -30,7 +30,8 @@ define internal void @bar(%p_t %p) { ; CGSCC: Function Attrs: mustprogress nofree 
norecurse nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@bar ; CGSCC-SAME: (ptr nocapture nofree readnone [[P:%.*]]) #[[ATTR0]] { -; CGSCC-NEXT: tail call void @llvm.dbg.value(metadata ptr [[P]], metadata [[META3:![0-9]+]], metadata !DIExpression()), !dbg [[DBG5:![0-9]+]] +; CGSCC-NEXT: call void @llvm.dbg.value(metadata ptr [[P]], metadata [[META3:![0-9]+]], metadata !DIExpression()) +; CGSCC-SAME: !dbg [[DBG5:![0-9]+]] ; CGSCC-NEXT: ret void ; call void @llvm.dbg.value(metadata %p_t %p, metadata !4, metadata !5), !dbg !6 @@ -51,20 +52,21 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !6 = !DILocation(line: 1, column: 1, scope: !3) ;. ; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } +; TUNIT: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. ; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; CGSCC: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. -; TUNIT: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) -; TUNIT: [[META1]] = !DIFile(filename: "test.c", directory: "") +; TUNIT: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +; TUNIT: [[META1:![0-9]+]] = !DIFile(filename: "test.c", directory: "") ; TUNIT: [[META2:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} ;. 
-; CGSCC: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) -; CGSCC: [[META1]] = !DIFile(filename: "test.c", directory: "") +; CGSCC: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +; CGSCC: [[META1:![0-9]+]] = !DIFile(filename: "test.c", directory: "") ; CGSCC: [[META2:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; CGSCC: [[META3]] = !DILocalVariable(name: "p", scope: [[META4:![0-9]+]]) -; CGSCC: [[META4]] = distinct !DISubprogram(name: "bar", scope: null, spFlags: DISPFlagDefinition, unit: [[META0]]) -; CGSCC: [[DBG5]] = !DILocation(line: 1, column: 1, scope: [[META4]]) +; CGSCC: [[META3]] = !DILocalVariable(name: "p", scope: !4) +; CGSCC: [[META4:![0-9]+]] = distinct !DISubprogram(name: "bar", scope: null, spFlags: DISPFlagDefinition, unit: !0) +; CGSCC: [[DBG5]] = !DILocation(line: 1, column: 1, scope: !4) ;. ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; TUNIT: {{.*}} diff --git a/llvm/test/Transforms/CallSiteSplitting/callsite-split-preserve-debug.ll b/llvm/test/Transforms/CallSiteSplitting/callsite-split-preserve-debug.ll index 3fdf62bf57502..e185286304a68 100644 --- a/llvm/test/Transforms/CallSiteSplitting/callsite-split-preserve-debug.ll +++ b/llvm/test/Transforms/CallSiteSplitting/callsite-split-preserve-debug.ll @@ -3,6 +3,8 @@ ;; Test that DebugLocs are preserved, and that dbg.values are duplicated. 
+; CHECK: declare void @llvm.dbg.value(metadata, + ; CHECK-LABEL: @test1 ; CHECK: call void @llvm.dbg.value(metadata i32 0, ; CHECK-NEXT: [[R1:%.+]] = call i32 @callee(i32 0, i32 %dd), !dbg [[DBG1:!.*]] @@ -41,8 +43,6 @@ CallSite: ; preds = %land.rhs, %entry ; CHECK-NEXT: phi i32 [ [[R1]], %Header.split ], [ [[R2]], %TBB.split ], !dbg [[DBG_CALL]] ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 1, -; CHECK: declare void @llvm.dbg.value(metadata, - define void @test2(ptr %ptr, i32 %i) !dbg !19 { Header: %tobool = icmp ne i32 %i, 10 diff --git a/llvm/test/Transforms/CodeGenPrepare/debug-info-on-skipped-selects.ll b/llvm/test/Transforms/CodeGenPrepare/debug-info-on-skipped-selects.ll index ce34b16a4d7a3..ffd4a0170ac9d 100644 --- a/llvm/test/Transforms/CodeGenPrepare/debug-info-on-skipped-selects.ll +++ b/llvm/test/Transforms/CodeGenPrepare/debug-info-on-skipped-selects.ll @@ -5,8 +5,8 @@ ; Test that when we skip over multiple selects in CGP, that the debug-info ; attached to those selects is still fixed up. 
-; CHECK: call void @llvm.dbg.value(metadata ptr %sunkaddr, ; CHECK: declare void @llvm.dbg.value(metadata, +; CHECK: call void @llvm.dbg.value(metadata ptr %sunkaddr, source_filename = "reduced.ll" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll b/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll index d84dd10e7d28b..0dee72f4f6b6f 100644 --- a/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll +++ b/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll @@ -8,21 +8,12 @@ define ptr @vfs_addname(ptr %name, i32 %len, i32 %hash, i32 %flags) nounwind ssp !dbg !1 { ; -; CHECK-LABEL: define {{[^@]+}}@vfs_addname -; CHECK-SAME: (ptr [[NAME:%.*]], i32 [[LEN:%.*]], i32 [[HASH:%.*]], i32 [[FLAGS:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG4:![0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata ptr [[NAME]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12:![0-9]+]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[LEN]], metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[HASH]], metadata [[META14:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[FLAGS]], metadata [[META15:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]] -; CHECK-NEXT: [[TMP0:%.*]] = call fastcc ptr @add_name_internal(ptr [[NAME]], i32 [[HASH]]) #[[ATTR3:[0-9]+]], !dbg [[DBG16:![0-9]+]] -; CHECK-NEXT: ret ptr [[TMP0]], !dbg [[DBG16]] -; entry: call void @llvm.dbg.value(metadata ptr %name, metadata !0, metadata !DIExpression()), !dbg !DILocation(scope: !1) call void @llvm.dbg.value(metadata i32 %len, metadata !10, metadata !DIExpression()), !dbg !DILocation(scope: !1) call void @llvm.dbg.value(metadata i32 %hash, metadata !11, metadata !DIExpression()), !dbg !DILocation(scope: 
!1) call void @llvm.dbg.value(metadata i32 %flags, metadata !12, metadata !DIExpression()), !dbg !DILocation(scope: !1) +; CHECK: call fastcc ptr @add_name_internal(ptr %name, i32 %hash) [[NUW:#[0-9]+]], !dbg !{{[0-9]+}} %0 = call fastcc ptr @add_name_internal(ptr %name, i32 %len, i32 %hash, i8 zeroext 0, i32 %flags) nounwind, !dbg !13 ; [#uses=1] ret ptr %0, !dbg !13 } @@ -31,24 +22,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone define internal fastcc ptr @add_name_internal(ptr %name, i32 %len, i32 %hash, i8 zeroext %extra, i32 %flags) noinline nounwind ssp !dbg !16 { ; -; CHECK-LABEL: define {{[^@]+}}@add_name_internal -; CHECK-SAME: (ptr [[NAME:%.*]], i32 [[HASH:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG18:![0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata ptr [[NAME]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23:![0-9]+]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 poison, metadata [[META24:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[HASH]], metadata [[META25:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i8 poison, metadata [[META26:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 poison, metadata [[META27:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] -; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[HASH]], 0, !dbg [[DBG28:![0-9]+]] -; CHECK-NEXT: br i1 [[TMP0]], label [[BB:%.*]], label [[BB1:%.*]], !dbg [[DBG28]] -; CHECK: bb: -; CHECK-NEXT: br label [[BB2:%.*]], !dbg [[DBG30:![0-9]+]] -; CHECK: bb1: -; CHECK-NEXT: br label [[BB2]], !dbg [[DBG31:![0-9]+]] -; CHECK: bb2: -; CHECK-NEXT: [[DOT0:%.*]] = phi ptr [ @.str, [[BB]] ], [ [[NAME]], [[BB1]] ] -; CHECK-NEXT: ret ptr [[DOT0]], !dbg [[DBG31]] -; entry: call void @llvm.dbg.value(metadata ptr %name, metadata 
!15, metadata !DIExpression()), !dbg !DILocation(scope: !16) call void @llvm.dbg.value(metadata i32 %len, metadata !20, metadata !DIExpression()), !dbg !DILocation(scope: !16) @@ -72,7 +45,9 @@ bb2: ; preds = %bb1, %bb declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone ; CHECK: attributes #0 = { nounwind ssp } -; CHECK: attributes #1 = { noinline nounwind ssp } +; CHECK: attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #2 = { noinline nounwind ssp } +; CHECK: attributes [[NUW]] = { nounwind } !llvm.dbg.cu = !{!3} !llvm.module.flags = !{!30} @@ -93,7 +68,9 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone !14 = distinct !DILexicalBlock(line: 12, column: 0, file: !28, scope: !1) !15 = !DILocalVariable(name: "name", line: 17, arg: 1, scope: !16, file: !2, type: !6) ; CHECK: !DISubprogram(name: "add_name_internal" +; CHECK-SAME: type: ![[MD:[0-9]+]] !16 = distinct !DISubprogram(name: "add_name_internal", linkageName: "add_name_internal", line: 22, isLocal: true, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, file: !28, scope: !2, type: !17) +; CHECK: ![[MD]] = !DISubroutineType(cc: DW_CC_nocall, types: !{{[0-9]+}}) !17 = !DISubroutineType(types: !18) !18 = !{!6, !6, !9, !9, !19, !9} !19 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned char", size: 8, align: 8, encoding: DW_ATE_unsigned_char) diff --git a/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll b/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll index bf846c310a525..f932788c73e2a 100644 --- a/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll +++ b/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll @@ -51,7 +51,14 @@ entry: ret void } +; CHECK: define internal void @outlined_ir_func_0(ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]]) #1 { ; CHECK: entry_to_outline: +; CHECK-NEXT: store i32 2, ptr [[ARG0]], align 4 +; 
CHECK-NEXT: store i32 3, ptr [[ARG1]], align 4 +; CHECK-NEXT: store i32 4, ptr [[ARG2]], align 4 +; CHECK-NEXT: [[AL:%.*]] = load i32, ptr [[ARG0]], align 4 +; CHECK-NEXT: [[BL:%.*]] = load i32, ptr [[ARG1]], align 4 +; CHECK-NEXT: [[CL:%.*]] = load i32, ptr [[ARG2]], align 4 !0 = !DIFile(filename: "foo.c", directory: "/tmp") !1 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) diff --git a/llvm/test/Transforms/LoopRotate/dbgvalue.ll b/llvm/test/Transforms/LoopRotate/dbgvalue.ll index 331ca9efdd4a0..92cc886bc81c1 100644 --- a/llvm/test/Transforms/LoopRotate/dbgvalue.ll +++ b/llvm/test/Transforms/LoopRotate/dbgvalue.ll @@ -4,6 +4,8 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone +; CHECK: declare void @llvm.dbg.value(metadata, + ; This function rotates the exit conditon into the entry block, moving the ; dbg.values with it. Check that they resolve through the PHIs to the arguments ; only in the entry block. 
In the loop block, the dbg.values shift down below @@ -263,8 +265,6 @@ L0.latch: br label %L0, !dbg !77 } -; CHECK: declare void @llvm.dbg.value(metadata, - !llvm.module.flags = !{!20} !llvm.dbg.cu = !{!2} diff --git a/llvm/test/Transforms/LoopRotate/delete-dbg-values.ll b/llvm/test/Transforms/LoopRotate/delete-dbg-values.ll index 8c23c0276024c..bce5ed02b43bf 100644 --- a/llvm/test/Transforms/LoopRotate/delete-dbg-values.ll +++ b/llvm/test/Transforms/LoopRotate/delete-dbg-values.ll @@ -10,6 +10,8 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" +; CHECK: declare void @llvm.dbg.value(metadata, + ; CHECK-LABEL: define void @_ZNK4llvm5APInt4sextEj(ptr ; CHECK-LABEL: entry: ; CHECK: call void @llvm.dbg.value(metadata i32 0, metadata ![[SRC:[0-9]+]], @@ -20,8 +22,6 @@ target triple = "x86_64-unknown-linux-gnu" ; CHECK: call void @llvm.dbg.value(metadata i32 0, metadata ![[SINK]], ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 0, metadata ![[SRC]], -; CHECK: declare void @llvm.dbg.value(metadata, - declare void @llvm.dbg.value(metadata, metadata, metadata) define void @_ZNK4llvm5APInt4sextEj(ptr %agg.result) !dbg !5 { diff --git a/llvm/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll b/llvm/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll index 22d14f52e9664..e4c123e02540a 100644 --- a/llvm/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll +++ b/llvm/test/Transforms/ObjCARC/ensure-that-exception-unwind-path-is-visited.ll @@ -103,9 +103,10 @@ declare void @NSLog(ptr, ...) 
declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone ; CHECK: attributes #0 = { ssp uwtable } -; CHECK: attributes #1 = { nonlazybind } +; CHECK: attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #2 = { nonlazybind } ; CHECK: attributes [[NUW]] = { nounwind } -; CHECK: attributes #3 = { noinline ssp uwtable } +; CHECK: attributes #4 = { noinline ssp uwtable } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!33, !34, !35, !36, !61} diff --git a/llvm/test/Transforms/SROA/dbg-inline.ll b/llvm/test/Transforms/SROA/dbg-inline.ll index 3e7683ad2db77..454ca13230bfa 100644 --- a/llvm/test/Transforms/SROA/dbg-inline.ll +++ b/llvm/test/Transforms/SROA/dbg-inline.ll @@ -19,10 +19,10 @@ target triple = "x86_64-apple-macosx10.15.0" define i64 @_Z1g4pair(i64 %p.coerce0, i64 %p.coerce1) #0 !dbg !8 { ; CHECK-LABEL: @_Z1g4pair( ; CHECK-NEXT: entry: -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 [[P_COERCE0:%.*]], metadata [[META16:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG17:![0-9]+]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 [[P_COERCE0]], metadata [[META18:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG20:![0-9]+]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 [[P_COERCE1:%.*]], metadata [[META16]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG17]] -; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i64 [[P_COERCE1]], metadata [[META18]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG20]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE0:%.*]], metadata [[META16:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG17:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE0]], metadata [[META18:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG20:![0-9]+]] +; CHECK-NEXT: call void 
@llvm.dbg.value(metadata i64 [[P_COERCE1:%.*]], metadata [[META16]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG17]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE1]], metadata [[META18]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG20]] ; CHECK-NEXT: ret i64 [[P_COERCE0]], !dbg [[DBG22:![0-9]+]] ; entry: @@ -77,32 +77,32 @@ attributes #2 = { argmemonly nounwind willreturn } !26 = !DILocation(line: 10, column: 3, scope: !8) ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { noinline ssp uwtable } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ;. 
-; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], nameTableKind: None, sysroot: "/") -; CHECK: [[META1]] = !DIFile(filename: "/tmp/inlinesplit.cpp", directory: {{.*}}) -; CHECK: [[META2]] = !{} +; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 12.0.0 (git@github.com:llvm/llvm-project 5110fd0343c2d06c8ae538741fbef13ece5e68de)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None, sysroot: "/") +; CHECK: [[META1:![0-9]+]] = !DIFile(filename: "/tmp/inlinesplit.cpp", directory: "/Volumes/Data/llvm-project") +; CHECK: [[META2:![0-9]+]] = !{} ; CHECK: [[META3:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} ; CHECK: [[META4:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} ; CHECK: [[META5:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} ; CHECK: [[META6:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK: [[META7:![0-9]+]] = distinct !DISubprogram(name: "g", linkageName: "_Z1g4pair", scope: [[META8:![0-9]+]], file: [[META8]], line: 9, type: [[META9:![0-9]+]], scopeLine: 9, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) -; CHECK: [[META8]] = !DIFile(filename: "/tmp/inlinesplit.cpp", directory: "") -; CHECK: [[META9]] = !DISubroutineType(types: [[META10:![0-9]+]]) -; CHECK: [[META10]] = !{[[META11:![0-9]+]], [[META12:![0-9]+]]} -; CHECK: [[META11]] = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned) -; CHECK: [[META12]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "pair", file: [[META8]], line: 1, size: 128, flags: DIFlagTypePassByValue, elements: [[META13:![0-9]+]], identifier: "_ZTS4pair") -; CHECK: [[META13]] = !{[[META14:![0-9]+]], [[META15:![0-9]+]]} -; CHECK: [[META14]] = 
!DIDerivedType(tag: DW_TAG_member, name: "a", scope: [[META12]], file: [[META8]], line: 1, baseType: [[META11]], size: 64) -; CHECK: [[META15]] = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: [[META12]], file: [[META8]], line: 1, baseType: [[META11]], size: 64, offset: 64) -; CHECK: [[META16]] = !DILocalVariable(name: "p", arg: 1, scope: [[META7]], file: [[META8]], line: 9, type: [[META12]]) -; CHECK: [[DBG17]] = !DILocation(line: 0, scope: [[META7]]) -; CHECK: [[META18]] = !DILocalVariable(name: "p", arg: 1, scope: [[META19:![0-9]+]], file: [[META8]], line: 5, type: [[META12]]) -; CHECK: [[META19]] = distinct !DISubprogram(name: "f", linkageName: "_ZL1f4pair", scope: [[META8]], file: [[META8]], line: 5, type: [[META9]], scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) -; CHECK: [[DBG20]] = !DILocation(line: 0, scope: [[META19]], inlinedAt: [[META21:![0-9]+]]) -; CHECK: [[META21]] = distinct !DILocation(line: 10, column: 10, scope: [[META7]]) -; CHECK: [[DBG22]] = !DILocation(line: 10, column: 3, scope: [[META7]]) +; CHECK: [[META7:![0-9]+]] = distinct !DISubprogram(name: "g", linkageName: "_Z1g4pair", scope: !8, file: !8, line: 9, type: !9, scopeLine: 9, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +; CHECK: [[META8:![0-9]+]] = !DIFile(filename: "/tmp/inlinesplit.cpp", directory: "") +; CHECK: [[META9:![0-9]+]] = !DISubroutineType(types: !10) +; CHECK: [[META10:![0-9]+]] = !{!11, !12} +; CHECK: [[META11:![0-9]+]] = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned) +; CHECK: [[META12:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "pair", file: !8, line: 1, size: 128, flags: DIFlagTypePassByValue, elements: !13, identifier: "_ZTS4pair") +; CHECK: [[META13:![0-9]+]] = !{!14, !15} +; CHECK: [[META14:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !12, file: !8, line: 
1, baseType: !11, size: 64) +; CHECK: [[META15:![0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !12, file: !8, line: 1, baseType: !11, size: 64, offset: 64) +; CHECK: [[META16]] = !DILocalVariable(name: "p", arg: 1, scope: !7, file: !8, line: 9, type: !12) +; CHECK: [[DBG17]] = !DILocation(line: 0, scope: !7) +; CHECK: [[META18]] = !DILocalVariable(name: "p", arg: 1, scope: !19, file: !8, line: 5, type: !12) +; CHECK: [[META19:![0-9]+]] = distinct !DISubprogram(name: "f", linkageName: "_ZL1f4pair", scope: !8, file: !8, line: 5, type: !9, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !2) +; CHECK: [[DBG20]] = !DILocation(line: 0, scope: !19, inlinedAt: !21) +; CHECK: [[META21:![0-9]+]] = distinct !DILocation(line: 10, column: 10, scope: !7) +; CHECK: [[DBG22]] = !DILocation(line: 10, column: 3, scope: !7) ;. ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK-MODIFY-CFG: {{.*}} From 2c9b6c1b36b8185299de083c3058e0c1e7760442 Mon Sep 17 00:00:00 2001 From: "Dhruv Chawla (work)" Date: Tue, 27 Feb 2024 15:57:46 +0530 Subject: [PATCH 424/546] [AArch64][GlobalISel] Improve codegen for G_VECREDUCE_{SMIN,SMAX,UMIN,UMAX} for odd-sized vectors (#82740) i8 vectors do not have their sizes changed as I noticed regressions in some tests when that was done. This patch also adds support for most G_VECREDUCE_* operations to moreElementsVector in LegalizerHelper.cpp. The code for getting the "neutral" element is taken almost exactly as it is in SelectionDAG, with the exception that support for G_VECREDUCE_{FMAXIMUM,FMINIMUM} was not added. The code for SelectionDAG is located at SelectionDAG::getNeutralELement(). 
--- .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 4 + .../CodeGen/GlobalISel/LegalizerHelper.cpp | 68 +++++++ .../AArch64/GISel/AArch64LegalizerInfo.cpp | 7 + llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll | 191 +++++------------- .../AArch64/vecreduce-umax-legalization.ll | 27 +-- 5 files changed, 132 insertions(+), 165 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 2beb9919418fc..5bb3692f0a46b 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -281,6 +281,10 @@ class LegalizerHelper { MachineInstr &MI, LostDebugLocObserver &LocObserver); + MachineInstrBuilder + getNeutralElementForVecReduce(unsigned Opcode, MachineIRBuilder &MIRBuilder, + LLT Ty); + public: /// Return the alignment to use for a stack temporary object with the given /// type. diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 30f12bf5cca58..8079f853aef85 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -5216,6 +5216,43 @@ LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx, return Legalized; } +MachineInstrBuilder LegalizerHelper::getNeutralElementForVecReduce( + unsigned Opcode, MachineIRBuilder &MIRBuilder, LLT Ty) { + assert(Ty.isScalar() && "Expected scalar type to make neutral element for"); + + switch (Opcode) { + default: + llvm_unreachable( + "getNeutralElementForVecReduce called with invalid opcode!"); + case TargetOpcode::G_VECREDUCE_ADD: + case TargetOpcode::G_VECREDUCE_OR: + case TargetOpcode::G_VECREDUCE_XOR: + case TargetOpcode::G_VECREDUCE_UMAX: + return MIRBuilder.buildConstant(Ty, 0); + case TargetOpcode::G_VECREDUCE_MUL: + return MIRBuilder.buildConstant(Ty, 1); + case TargetOpcode::G_VECREDUCE_AND: + case TargetOpcode::G_VECREDUCE_UMIN: + return 
MIRBuilder.buildConstant( + Ty, APInt::getAllOnes(Ty.getScalarSizeInBits())); + case TargetOpcode::G_VECREDUCE_SMAX: + return MIRBuilder.buildConstant( + Ty, APInt::getSignedMinValue(Ty.getSizeInBits())); + case TargetOpcode::G_VECREDUCE_SMIN: + return MIRBuilder.buildConstant( + Ty, APInt::getSignedMaxValue(Ty.getSizeInBits())); + case TargetOpcode::G_VECREDUCE_FADD: + return MIRBuilder.buildFConstant(Ty, -0.0); + case TargetOpcode::G_VECREDUCE_FMUL: + return MIRBuilder.buildFConstant(Ty, 1.0); + case TargetOpcode::G_VECREDUCE_FMINIMUM: + case TargetOpcode::G_VECREDUCE_FMAXIMUM: + assert(false && "getNeutralElementForVecReduce unimplemented for " + "G_VECREDUCE_FMINIMUM and G_VECREDUCE_FMAXIMUM!"); + } + llvm_unreachable("switch expected to return!"); +} + LegalizerHelper::LegalizeResult LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, LLT MoreTy) { @@ -5420,6 +5457,37 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, Observer.changedInstr(MI); return Legalized; } + case TargetOpcode::G_VECREDUCE_FADD: + case TargetOpcode::G_VECREDUCE_FMUL: + case TargetOpcode::G_VECREDUCE_ADD: + case TargetOpcode::G_VECREDUCE_MUL: + case TargetOpcode::G_VECREDUCE_AND: + case TargetOpcode::G_VECREDUCE_OR: + case TargetOpcode::G_VECREDUCE_XOR: + case TargetOpcode::G_VECREDUCE_SMAX: + case TargetOpcode::G_VECREDUCE_SMIN: + case TargetOpcode::G_VECREDUCE_UMAX: + case TargetOpcode::G_VECREDUCE_UMIN: { + LLT OrigTy = MRI.getType(MI.getOperand(1).getReg()); + MachineOperand &MO = MI.getOperand(1); + auto NewVec = MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO); + auto NeutralElement = getNeutralElementForVecReduce( + MI.getOpcode(), MIRBuilder, MoreTy.getElementType()); + + LLT IdxTy(TLI.getVectorIdxTy(MIRBuilder.getDataLayout())); + for (size_t i = OrigTy.getNumElements(), e = MoreTy.getNumElements(); + i != e; i++) { + auto Idx = MIRBuilder.buildConstant(IdxTy, i); + NewVec = MIRBuilder.buildInsertVectorElement(MoreTy, NewVec, + 
NeutralElement, Idx); + } + + Observer.changingInstr(MI); + MO.setReg(NewVec.getReg(0)); + Observer.changedInstr(MI); + return Legalized; + } + default: return UnableToLegalize; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 60e046bc6cf40..a2e805e8cb56d 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1074,6 +1074,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {s16, v8s16}, {s32, v2s32}, {s32, v4s32}}) + .moreElementsIf( + [=](const LegalityQuery &Query) { + return Query.Types[1].isVector() && + Query.Types[1].getElementType() != s8 && + Query.Types[1].getNumElements() & 1; + }, + LegalizeMutations::moreElementsToNextPow2(1)) .clampMaxNumElements(1, s64, 2) .clampMaxNumElements(1, s32, 4) .clampMaxNumElements(1, s16, 8) diff --git a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll index 194fe5be40c2b..76790d128d066 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll @@ -595,30 +595,14 @@ entry: } define i16 @sminv_v3i16(<3 x i16> %a) { -; CHECK-SD-LABEL: sminv_v3i16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: mov w8, #32767 // =0x7fff -; CHECK-SD-NEXT: mov v0.h[3], w8 -; CHECK-SD-NEXT: sminv h0, v0.4h -; CHECK-SD-NEXT: fmov w0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sminv_v3i16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: smov w8, v0.h[0] -; CHECK-GI-NEXT: umov w9, v0.h[0] -; CHECK-GI-NEXT: umov w10, v0.h[1] -; CHECK-GI-NEXT: smov w11, v0.h[2] -; CHECK-GI-NEXT: umov w13, v0.h[2] -; CHECK-GI-NEXT: fmov w12, s1 -; CHECK-GI-NEXT: cmp w8, w12, sxth -; CHECK-GI-NEXT: csel w8, w9, w10, lt -; CHECK-GI-NEXT: cmp w11, 
w8, sxth -; CHECK-GI-NEXT: csel w0, w8, w13, gt -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sminv_v3i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov w8, #32767 // =0x7fff +; CHECK-NEXT: mov v0.h[3], w8 +; CHECK-NEXT: sminv h0, v0.4h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret entry: %arg1 = call i16 @llvm.vector.reduce.smin.v3i16(<3 x i16> %a) ret i16 %arg1 @@ -670,28 +654,13 @@ entry: } define i32 @sminv_v3i32(<3 x i32> %a) { -; CHECK-SD-LABEL: sminv_v3i32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov w8, #2147483647 // =0x7fffffff -; CHECK-SD-NEXT: mov v0.s[3], w8 -; CHECK-SD-NEXT: sminv s0, v0.4s -; CHECK-SD-NEXT: fmov w0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sminv_v3i32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov s2, v0.s[2] -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: cmp w8, w9 -; CHECK-GI-NEXT: fmov w9, s2 -; CHECK-GI-NEXT: fcsel s0, s0, s1, lt -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: cmp w8, w9 -; CHECK-GI-NEXT: fcsel s0, s0, s2, lt -; CHECK-GI-NEXT: fmov w0, s0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sminv_v3i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #2147483647 // =0x7fffffff +; CHECK-NEXT: mov v0.s[3], w8 +; CHECK-NEXT: sminv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret entry: %arg1 = call i32 @llvm.vector.reduce.smin.v3i32(<3 x i32> %a) ret i32 %arg1 @@ -972,17 +941,10 @@ define i16 @smaxv_v3i16(<3 x i16> %a) { ; CHECK-GI-LABEL: smaxv_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: smov w8, v0.h[0] -; CHECK-GI-NEXT: umov w9, v0.h[0] -; CHECK-GI-NEXT: umov w10, v0.h[1] -; CHECK-GI-NEXT: smov w11, v0.h[2] -; CHECK-GI-NEXT: umov w13, v0.h[2] -; CHECK-GI-NEXT: fmov w12, s1 -; CHECK-GI-NEXT: cmp w8, w12, sxth -; CHECK-GI-NEXT: csel w8, w9, w10, gt -; CHECK-GI-NEXT: cmp w11, w8, sxth -; 
CHECK-GI-NEXT: csel w0, w8, w13, lt +; CHECK-GI-NEXT: mov w8, #32768 // =0x8000 +; CHECK-GI-NEXT: mov v0.h[3], w8 +; CHECK-GI-NEXT: smaxv h0, v0.4h +; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret entry: %arg1 = call i16 @llvm.vector.reduce.smax.v3i16(<3 x i16> %a) @@ -1035,28 +997,13 @@ entry: } define i32 @smaxv_v3i32(<3 x i32> %a) { -; CHECK-SD-LABEL: smaxv_v3i32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov w8, #-2147483648 // =0x80000000 -; CHECK-SD-NEXT: mov v0.s[3], w8 -; CHECK-SD-NEXT: smaxv s0, v0.4s -; CHECK-SD-NEXT: fmov w0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: smaxv_v3i32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov s2, v0.s[2] -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: cmp w8, w9 -; CHECK-GI-NEXT: fmov w9, s2 -; CHECK-GI-NEXT: fcsel s0, s0, s1, gt -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: cmp w8, w9 -; CHECK-GI-NEXT: fcsel s0, s0, s2, gt -; CHECK-GI-NEXT: fmov w0, s0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: smaxv_v3i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #-2147483648 // =0x80000000 +; CHECK-NEXT: mov v0.s[3], w8 +; CHECK-NEXT: smaxv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret entry: %arg1 = call i32 @llvm.vector.reduce.smax.v3i32(<3 x i32> %a) ret i32 %arg1 @@ -1335,17 +1282,10 @@ define i16 @uminv_v3i16(<3 x i16> %a) { ; CHECK-GI-LABEL: uminv_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: umov w8, v0.h[0] -; CHECK-GI-NEXT: umov w9, v0.h[0] -; CHECK-GI-NEXT: umov w10, v0.h[1] -; CHECK-GI-NEXT: umov w11, v0.h[2] -; CHECK-GI-NEXT: umov w13, v0.h[2] -; CHECK-GI-NEXT: fmov w12, s1 -; CHECK-GI-NEXT: cmp w8, w12, uxth -; CHECK-GI-NEXT: csel w8, w9, w10, lo -; CHECK-GI-NEXT: cmp w11, w8, uxth -; CHECK-GI-NEXT: csel w0, w8, w13, hi +; CHECK-GI-NEXT: mov w8, #65535 // =0xffff +; CHECK-GI-NEXT: mov v0.h[3], w8 +; CHECK-GI-NEXT: uminv 
h0, v0.4h +; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret entry: %arg1 = call i16 @llvm.vector.reduce.umin.v3i16(<3 x i16> %a) @@ -1398,28 +1338,13 @@ entry: } define i32 @uminv_v3i32(<3 x i32> %a) { -; CHECK-SD-LABEL: uminv_v3i32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-SD-NEXT: mov v0.s[3], w8 -; CHECK-SD-NEXT: uminv s0, v0.4s -; CHECK-SD-NEXT: fmov w0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: uminv_v3i32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov s2, v0.s[2] -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: cmp w8, w9 -; CHECK-GI-NEXT: fmov w9, s2 -; CHECK-GI-NEXT: fcsel s0, s0, s1, lo -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: cmp w8, w9 -; CHECK-GI-NEXT: fcsel s0, s0, s2, lo -; CHECK-GI-NEXT: fmov w0, s0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: uminv_v3i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #-1 // =0xffffffff +; CHECK-NEXT: mov v0.s[3], w8 +; CHECK-NEXT: uminv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret entry: %arg1 = call i32 @llvm.vector.reduce.umin.v3i32(<3 x i32> %a) ret i32 %arg1 @@ -1697,17 +1622,10 @@ define i16 @umaxv_v3i16(<3 x i16> %a) { ; CHECK-GI-LABEL: umaxv_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: umov w8, v0.h[0] -; CHECK-GI-NEXT: umov w9, v0.h[0] -; CHECK-GI-NEXT: umov w10, v0.h[1] -; CHECK-GI-NEXT: umov w11, v0.h[2] -; CHECK-GI-NEXT: umov w13, v0.h[2] -; CHECK-GI-NEXT: fmov w12, s1 -; CHECK-GI-NEXT: cmp w8, w12, uxth -; CHECK-GI-NEXT: csel w8, w9, w10, hi -; CHECK-GI-NEXT: cmp w11, w8, uxth -; CHECK-GI-NEXT: csel w0, w8, w13, lo +; CHECK-GI-NEXT: mov w8, #0 // =0x0 +; CHECK-GI-NEXT: mov v0.h[3], w8 +; CHECK-GI-NEXT: umaxv h0, v0.4h +; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret entry: %arg1 = call i16 @llvm.vector.reduce.umax.v3i16(<3 x i16> %a) @@ -1760,27 +1678,12 @@ entry: } 
define i32 @umaxv_v3i32(<3 x i32> %a) { -; CHECK-SD-LABEL: umaxv_v3i32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov v0.s[3], wzr -; CHECK-SD-NEXT: umaxv s0, v0.4s -; CHECK-SD-NEXT: fmov w0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: umaxv_v3i32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov s2, v0.s[2] -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: cmp w8, w9 -; CHECK-GI-NEXT: fmov w9, s2 -; CHECK-GI-NEXT: fcsel s0, s0, s1, hi -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: cmp w8, w9 -; CHECK-GI-NEXT: fcsel s0, s0, s2, hi -; CHECK-GI-NEXT: fmov w0, s0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: umaxv_v3i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.s[3], wzr +; CHECK-NEXT: umaxv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret entry: %arg1 = call i32 @llvm.vector.reduce.umax.v3i32(<3 x i32> %a) ret i32 %arg1 diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll index 8988481708cfb..d71aed2d17506 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll @@ -187,27 +187,12 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind { } define i32 @test_v3i32(<3 x i32> %a) nounwind { -; CHECK-SD-LABEL: test_v3i32: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov v0.s[3], wzr -; CHECK-SD-NEXT: umaxv s0, v0.4s -; CHECK-SD-NEXT: fmov w0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_v3i32: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov s2, v0.s[2] -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: cmp w8, w9 -; CHECK-GI-NEXT: fmov w9, s2 -; CHECK-GI-NEXT: fcsel s0, s0, s1, hi -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: cmp w8, w9 -; CHECK-GI-NEXT: fcsel s0, s0, s2, hi -; CHECK-GI-NEXT: fmov w0, s0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_v3i32: +; CHECK: // %bb.0: +; 
CHECK-NEXT: mov v0.s[3], wzr +; CHECK-NEXT: umaxv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret %b = call i32 @llvm.vector.reduce.umax.v3i32(<3 x i32> %a) ret i32 %b } From ca0560d8c821fa93e34cf84362fd523b546ba426 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 27 Feb 2024 10:58:20 +0000 Subject: [PATCH 425/546] [AMDGPU] Add new aliases ds_subrev_u32/u64 for ds_rsub_u32/u64 (#83118) Note that the instructions have not been renamed and that there are no corresponding aliases for ds_rsub_rtn_u32/u64. This matches SP3 behavior. --- llvm/lib/Target/AMDGPU/DSInstructions.td | 4 ++++ llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 515b9476b25b7..074e13317ef89 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1259,6 +1259,10 @@ defm DS_PK_ADD_RTN_F16 : DS_Real_gfx12<0x0aa>; defm DS_PK_ADD_BF16 : DS_Real_gfx12<0x09b>; defm DS_PK_ADD_RTN_BF16 : DS_Real_gfx12<0x0ab>; +// New aliases added in GFX12 without renaming the instructions. +def : MnemonicAlias<"ds_subrev_u32", "ds_rsub_u32">, Requires<[isGFX12Plus]>; +def : MnemonicAlias<"ds_subrev_u64", "ds_rsub_u64">, Requires<[isGFX12Plus]>; + //===----------------------------------------------------------------------===// // GFX11. 
//===----------------------------------------------------------------------===// diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s index aa4b028160175..7076772484043 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s @@ -23,3 +23,9 @@ ds_min_rtn_f32 v5, v1, v2 ds_min_rtn_f64 v[5:6], v1, v[2:3] // GFX12: [0x00,0x00,0xc8,0xd9,0x01,0x02,0x00,0x05] + +ds_subrev_u32 v1, v2 +// GFX12: ds_rsub_u32 v1, v2 ; encoding: [0x00,0x00,0x08,0xd8,0x01,0x02,0x00,0x00] + +ds_subrev_u64 v1, v[2:3] +// GFX12: ds_rsub_u64 v1, v[2:3] ; encoding: [0x00,0x00,0x08,0xd9,0x01,0x02,0x00,0x00] From d273a1970e3703cc33ef70382a89e9075287d403 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 27 Feb 2024 11:53:23 +0000 Subject: [PATCH 426/546] [AMDGPU] Shorten mnemonic alias tests (#83121) Only test one example of each alias. Do not test error cases which are already tested in the normal (non-alias) tests. --- llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_alias.s | 380 +- llvm/test/MC/AMDGPU/gfx11_asm_mubuf_alias.s | 3480 +---------------- llvm/test/MC/AMDGPU/gfx11_asm_smem_alias.s | 417 -- llvm/test/MC/AMDGPU/gfx12_asm_sop2_alias.s | 186 - .../MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_alias.s | 411 +- .../MC/AMDGPU/gfx12_asm_vbuffer_mubuf_alias.s | 2199 +---------- llvm/test/MC/AMDGPU/gfx12_asm_vflat_alias.s | 9 - 7 files changed, 198 insertions(+), 6884 deletions(-) diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_alias.s b/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_alias.s index 187d18e9045a8..4a55a33928211 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_alias.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_alias.s @@ -1,403 +1,25 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefix=GFX11 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX11-ERR --implicit-check-not=error: %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 
-show-encoding %s | FileCheck --check-prefix=GFX11 %s tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_8_UNORM] offset:4095 // GFX11: encoding: [0xff,0x0f,0x0c,0xe8,0x00,0x04,0x02,0x03] -tbuffer_load_format_d16_x v255, off, s[8:11], s3, format:1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x0c,0xe8,0x00,0xff,0x02,0x03] - -tbuffer_load_format_d16_x v4, off, s[12:15], s3, format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_UNORM] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x0c,0xe8,0x00,0x04,0x03,0x03] - -tbuffer_load_format_d16_x v4, off, s[12:15], s101, format:[BUF_FMT_8_SNORM] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x14,0xe8,0x00,0x04,0x03,0x65] - -tbuffer_load_format_d16_x v4, off, s[12:15], m0, format:2 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x14,0xe8,0x00,0x04,0x03,0x7d] - -tbuffer_load_format_d16_x v4, off, s[8:11], 0, format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_SNORM] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x14,0xe8,0x00,0x04,0x02,0x80] - -tbuffer_load_format_d16_x v4, off, s[8:11], 61, format:[BUF_FMT_8_USCALED] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x1c,0xe8,0x00,0x04,0x02,0xbd] - -tbuffer_load_format_d16_x v4, off, ttmp[4:7], 61, format:3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x1c,0xe8,0x00,0x04,0x1c,0xbd] - -tbuffer_load_format_d16_x v4, v1, s[8:11], s3, format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_USCALED] offen offset:52 -// GFX11: encoding: [0x34,0x00,0x1c,0xe8,0x01,0x04,0x42,0x03] - -tbuffer_load_format_d16_x v4, v1, s[8:11], s3, format:[BUF_FMT_8_SSCALED] idxen offset:52 -// GFX11: encoding: [0x34,0x00,0x24,0xe8,0x01,0x04,0x82,0x03] - -tbuffer_load_format_d16_x v4, v[1:2], s[8:11], s0, format:4 idxen offen offset:52 -// GFX11: encoding: [0x34,0x00,0x24,0xe8,0x01,0x04,0xc2,0x00] - -tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_SSCALED] offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x24,0xe8,0x00,0x04,0x1c,0x03] - -tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3, 
format:[BUF_FMT_8_UINT] offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x2c,0xe8,0x00,0x04,0x1c,0x03] - -tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3, format:5 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x2c,0xe8,0x00,0x04,0x1c,0x03] - -tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_UINT] offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x2c,0xe8,0x00,0x04,0x1c,0x03] - tbuffer_load_format_d16_xy v4, off, s[8:11], s3, format:[BUF_FMT_8_SINT] offset:4095 // GFX11: encoding: [0xff,0x8f,0x34,0xe8,0x00,0x04,0x02,0x03] -tbuffer_load_format_d16_xy v255, off, s[8:11], s3, format:6 offset:4095 -// GFX11: encoding: [0xff,0x8f,0x34,0xe8,0x00,0xff,0x02,0x03] - -tbuffer_load_format_d16_xy v4, off, s[12:15], s3, format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_SINT] offset:4095 -// GFX11: encoding: [0xff,0x8f,0x34,0xe8,0x00,0x04,0x03,0x03] - -tbuffer_load_format_d16_xy v4, off, s[12:15], s101, format:[BUF_FMT_16_UNORM] offset:4095 -// GFX11: encoding: [0xff,0x8f,0x3c,0xe8,0x00,0x04,0x03,0x65] - -tbuffer_load_format_d16_xy v4, off, s[12:15], m0, format:7 offset:4095 -// GFX11: encoding: [0xff,0x8f,0x3c,0xe8,0x00,0x04,0x03,0x7d] - -tbuffer_load_format_d16_xy v4, off, s[8:11], 0, format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_UNORM] offset:4095 -// GFX11: encoding: [0xff,0x8f,0x3c,0xe8,0x00,0x04,0x02,0x80] - -tbuffer_load_format_d16_xy v4, off, s[8:11], 61, format:[BUF_FMT_16_SNORM] offset:4095 -// GFX11: encoding: [0xff,0x8f,0x44,0xe8,0x00,0x04,0x02,0xbd] - -tbuffer_load_format_d16_xy v4, off, ttmp[4:7], 61, format:8 offset:4095 -// GFX11: encoding: [0xff,0x8f,0x44,0xe8,0x00,0x04,0x1c,0xbd] - -tbuffer_load_format_d16_xy v4, v1, s[8:11], s3, format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SNORM] offen offset:52 -// GFX11: encoding: [0x34,0x80,0x44,0xe8,0x01,0x04,0x42,0x03] - -tbuffer_load_format_d16_xy v4, v1, s[8:11], s3, format:[BUF_FMT_16_USCALED] idxen offset:52 -// GFX11: encoding: [0x34,0x80,0x4c,0xe8,0x01,0x04,0x82,0x03] - 
-tbuffer_load_format_d16_xy v4, v[1:2], s[8:11], s0, format:9 idxen offen offset:52 -// GFX11: encoding: [0x34,0x80,0x4c,0xe8,0x01,0x04,0xc2,0x00] - -tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_USCALED] offset:4095 glc -// GFX11: encoding: [0xff,0xcf,0x4c,0xe8,0x00,0x04,0x1c,0x03] - -tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3, format:[BUF_FMT_16_SSCALED] offset:4095 slc -// GFX11: encoding: [0xff,0x9f,0x54,0xe8,0x00,0x04,0x1c,0x03] - -tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3, format:10 offset:4095 dlc -// GFX11: encoding: [0xff,0xaf,0x54,0xe8,0x00,0x04,0x1c,0x03] - -tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SSCALED] offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0xff,0x54,0xe8,0x00,0x04,0x1c,0x03] - tbuffer_load_format_d16_xyz v[4:5], off, s[8:11], s3, format:[BUF_FMT_16_UINT] offset:4095 // GFX11: encoding: [0xff,0x0f,0x5d,0xe8,0x00,0x04,0x02,0x03] -tbuffer_load_format_d16_xyz v[254:255], off, s[8:11], s3, format:11 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5d,0xe8,0x00,0xfe,0x02,0x03] - -tbuffer_load_format_d16_xyz v[4:5], off, s[12:15], s3, format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_UINT] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5d,0xe8,0x00,0x04,0x03,0x03] - -tbuffer_load_format_d16_xyz v[4:5], off, s[12:15], s101, format:[BUF_FMT_16_SINT] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x65,0xe8,0x00,0x04,0x03,0x65] - -tbuffer_load_format_d16_xyz v[4:5], off, s[12:15], m0, format:12 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x65,0xe8,0x00,0x04,0x03,0x7d] - -tbuffer_load_format_d16_xyz v[4:5], off, s[8:11], 0, format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SINT] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x65,0xe8,0x00,0x04,0x02,0x80] - -tbuffer_load_format_d16_xyz v[4:5], off, s[8:11], 61, format:[BUF_FMT_16_FLOAT] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x6d,0xe8,0x00,0x04,0x02,0xbd] - -tbuffer_load_format_d16_xyz v[4:5], off, 
ttmp[4:7], 61, format:13 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x6d,0xe8,0x00,0x04,0x1c,0xbd] - -tbuffer_load_format_d16_xyz v[4:5], v1, s[8:11], s3, format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_FLOAT] offen offset:52 -// GFX11: encoding: [0x34,0x00,0x6d,0xe8,0x01,0x04,0x42,0x03] - -tbuffer_load_format_d16_xyz v[4:5], v1, s[8:11], s3, format:[BUF_FMT_8_8_UNORM] idxen offset:52 -// GFX11: encoding: [0x34,0x00,0x75,0xe8,0x01,0x04,0x82,0x03] - -tbuffer_load_format_d16_xyz v[4:5], v[1:2], s[8:11], s0, format:14 idxen offen offset:52 -// GFX11: encoding: [0x34,0x00,0x75,0xe8,0x01,0x04,0xc2,0x00] - -tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_UNORM] offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x75,0xe8,0x00,0x04,0x1c,0x03] - -tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3, format:[BUF_FMT_8_8_SNORM] offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x7d,0xe8,0x00,0x04,0x1c,0x03] - -tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3, format:15 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x7d,0xe8,0x00,0x04,0x1c,0x03] - -tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SNORM] offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x7d,0xe8,0x00,0x04,0x1c,0x03] - tbuffer_load_format_d16_xyzw v[4:5], off, s[8:11], s3, format:[BUF_FMT_8_8_USCALED] offset:4095 // GFX11: encoding: [0xff,0x8f,0x85,0xe8,0x00,0x04,0x02,0x03] -tbuffer_load_format_d16_xyzw v[254:255], off, s[8:11], s3, format:16 offset:4095 -// GFX11: encoding: [0xff,0x8f,0x85,0xe8,0x00,0xfe,0x02,0x03] - -tbuffer_load_format_d16_xyzw v[4:5], off, s[12:15], s3, format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_USCALED] offset:4095 -// GFX11: encoding: [0xff,0x8f,0x85,0xe8,0x00,0x04,0x03,0x03] - -tbuffer_load_format_d16_xyzw v[4:5], off, s[12:15], s101, format:[BUF_FMT_8_8_SSCALED] offset:4095 -// GFX11: encoding: [0xff,0x8f,0x8d,0xe8,0x00,0x04,0x03,0x65] - -tbuffer_load_format_d16_xyzw v[4:5], 
off, s[12:15], m0, format:17 offset:4095 -// GFX11: encoding: [0xff,0x8f,0x8d,0xe8,0x00,0x04,0x03,0x7d] - -tbuffer_load_format_d16_xyzw v[4:5], off, s[8:11], 0, format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SSCALED] offset:4095 -// GFX11: encoding: [0xff,0x8f,0x8d,0xe8,0x00,0x04,0x02,0x80] - -tbuffer_load_format_d16_xyzw v[4:5], off, s[8:11], 61, format:[BUF_FMT_8_8_UINT] offset:4095 -// GFX11: encoding: [0xff,0x8f,0x95,0xe8,0x00,0x04,0x02,0xbd] - -tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], 61, format:18 offset:4095 -// GFX11: encoding: [0xff,0x8f,0x95,0xe8,0x00,0x04,0x1c,0xbd] - -tbuffer_load_format_d16_xyzw v[4:5], v1, s[8:11], s3, format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_UINT] offen offset:52 -// GFX11: encoding: [0x34,0x80,0x95,0xe8,0x01,0x04,0x42,0x03] - -tbuffer_load_format_d16_xyzw v[4:5], v1, s[8:11], s3, format:[BUF_FMT_8_8_SINT] idxen offset:52 -// GFX11: encoding: [0x34,0x80,0x9d,0xe8,0x01,0x04,0x82,0x03] - -tbuffer_load_format_d16_xyzw v[4:5], v[1:2], s[8:11], s0, format:19 idxen offen offset:52 -// GFX11: encoding: [0x34,0x80,0x9d,0xe8,0x01,0x04,0xc2,0x00] - -tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SINT] offset:4095 glc -// GFX11: encoding: [0xff,0xcf,0x9d,0xe8,0x00,0x04,0x1c,0x03] - -tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3, format:[BUF_FMT_32_UINT] offset:4095 slc -// GFX11: encoding: [0xff,0x9f,0xa5,0xe8,0x00,0x04,0x1c,0x03] - -tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3, format:20 offset:4095 dlc -// GFX11: encoding: [0xff,0xaf,0xa5,0xe8,0x00,0x04,0x1c,0x03] - -tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_32, BUF_NUM_FORMAT_UINT] offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0xff,0xa5,0xe8,0x00,0x04,0x1c,0x03] - tbuffer_store_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_2_10_10_10_SINT] offset:4095 // GFX11: encoding: [0xff,0x0f,0x4e,0xe9,0x00,0x04,0x02,0x03] -tbuffer_store_format_d16_x v255, off, 
s[8:11], s3, format:41 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x4e,0xe9,0x00,0xff,0x02,0x03] - -tbuffer_store_format_d16_x v4, off, s[12:15], s3, format:[BUF_DATA_FORMAT_2_10_10_10, BUF_NUM_FORMAT_SINT] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x4e,0xe9,0x00,0x04,0x03,0x03] - -tbuffer_store_format_d16_x v4, off, s[12:15], s101, format:[BUF_FMT_8_8_8_8_UNORM] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x56,0xe9,0x00,0x04,0x03,0x65] - -tbuffer_store_format_d16_x v4, off, s[12:15], m0, format:42 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x56,0xe9,0x00,0x04,0x03,0x7d] - -tbuffer_store_format_d16_x v4, off, s[8:11], 0, format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_UNORM] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x56,0xe9,0x00,0x04,0x02,0x80] - -tbuffer_store_format_d16_x v4, off, s[8:11], 61, format:[BUF_FMT_8_8_8_8_SNORM] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5e,0xe9,0x00,0x04,0x02,0xbd] - -tbuffer_store_format_d16_x v4, off, ttmp[4:7], 61, format:43 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5e,0xe9,0x00,0x04,0x1c,0xbd] - -tbuffer_store_format_d16_x v4, v1, s[8:11], s3, format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SNORM] offen offset:52 -// GFX11: encoding: [0x34,0x00,0x5e,0xe9,0x01,0x04,0x42,0x03] - -tbuffer_store_format_d16_x v4, v1, s[8:11], s3, format:[BUF_FMT_8_8_8_8_USCALED] idxen offset:52 -// GFX11: encoding: [0x34,0x00,0x66,0xe9,0x01,0x04,0x82,0x03] - -tbuffer_store_format_d16_x v4, v[1:2], s[8:11], s0, format:44 idxen offen offset:52 -// GFX11: encoding: [0x34,0x00,0x66,0xe9,0x01,0x04,0xc2,0x00] - -tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_USCALED] offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x66,0xe9,0x00,0x04,0x1c,0x03] - -tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3, format:[BUF_FMT_8_8_8_8_SSCALED] offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x6e,0xe9,0x00,0x04,0x1c,0x03] - -tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3, format:45 offset:4095 dlc -// 
GFX11: encoding: [0xff,0x2f,0x6e,0xe9,0x00,0x04,0x1c,0x03] - -tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SSCALED] offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x6e,0xe9,0x00,0x04,0x1c,0x03] - tbuffer_store_format_d16_xy v4, off, s[8:11], s3, format:[BUF_FMT_8_8_8_8_UINT] offset:4095 // GFX11: encoding: [0xff,0x8f,0x76,0xe9,0x00,0x04,0x02,0x03] -tbuffer_store_format_d16_xy v255, off, s[8:11], s3, format:46 offset:4095 -// GFX11: encoding: [0xff,0x8f,0x76,0xe9,0x00,0xff,0x02,0x03] - -tbuffer_store_format_d16_xy v4, off, s[12:15], s3, format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_UINT] offset:4095 -// GFX11: encoding: [0xff,0x8f,0x76,0xe9,0x00,0x04,0x03,0x03] - -tbuffer_store_format_d16_xy v4, off, s[12:15], s101, format:[BUF_FMT_8_8_8_8_SINT] offset:4095 -// GFX11: encoding: [0xff,0x8f,0x7e,0xe9,0x00,0x04,0x03,0x65] - -tbuffer_store_format_d16_xy v4, off, s[12:15], m0, format:47 offset:4095 -// GFX11: encoding: [0xff,0x8f,0x7e,0xe9,0x00,0x04,0x03,0x7d] - -tbuffer_store_format_d16_xy v4, off, s[8:11], 0, format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SINT] offset:4095 -// GFX11: encoding: [0xff,0x8f,0x7e,0xe9,0x00,0x04,0x02,0x80] - -tbuffer_store_format_d16_xy v4, off, s[8:11], 61, format:[BUF_FMT_32_32_UINT] offset:4095 -// GFX11: encoding: [0xff,0x8f,0x86,0xe9,0x00,0x04,0x02,0xbd] - -tbuffer_store_format_d16_xy v4, off, ttmp[4:7], 61, format:48 offset:4095 -// GFX11: encoding: [0xff,0x8f,0x86,0xe9,0x00,0x04,0x1c,0xbd] - -tbuffer_store_format_d16_xy v4, v1, s[8:11], s3, format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_UINT] offen offset:52 -// GFX11: encoding: [0x34,0x80,0x86,0xe9,0x01,0x04,0x42,0x03] - -tbuffer_store_format_d16_xy v4, v1, s[8:11], s3, format:[BUF_FMT_32_32_SINT] idxen offset:52 -// GFX11: encoding: [0x34,0x80,0x8e,0xe9,0x01,0x04,0x82,0x03] - -tbuffer_store_format_d16_xy v4, v[1:2], s[8:11], s0, format:49 idxen offen offset:52 -// GFX11: encoding: 
[0x34,0x80,0x8e,0xe9,0x01,0x04,0xc2,0x00] - -tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_SINT] offset:4095 glc -// GFX11: encoding: [0xff,0xcf,0x8e,0xe9,0x00,0x04,0x1c,0x03] - -tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3, format:[BUF_FMT_32_32_FLOAT] offset:4095 slc -// GFX11: encoding: [0xff,0x9f,0x96,0xe9,0x00,0x04,0x1c,0x03] - -tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3, format:50 offset:4095 dlc -// GFX11: encoding: [0xff,0xaf,0x96,0xe9,0x00,0x04,0x1c,0x03] - -tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_FLOAT] offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0xff,0x96,0xe9,0x00,0x04,0x1c,0x03] - tbuffer_store_format_d16_xyz v[4:5], off, s[8:11], s3, format:[BUF_FMT_16_16_16_16_UNORM] offset:4095 // GFX11: encoding: [0xff,0x0f,0x9f,0xe9,0x00,0x04,0x02,0x03] -tbuffer_store_format_d16_xyz v[254:255], off, s[8:11], s3, format:51 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x9f,0xe9,0x00,0xfe,0x02,0x03] - -tbuffer_store_format_d16_xyz v[4:5], off, s[12:15], s3, format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_UNORM] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x9f,0xe9,0x00,0x04,0x03,0x03] - -tbuffer_store_format_d16_xyz v[4:5], off, s[12:15], s101, format:[BUF_FMT_16_16_16_16_SNORM] offset:4095 -// GFX11: encoding: [0xff,0x0f,0xa7,0xe9,0x00,0x04,0x03,0x65] - -tbuffer_store_format_d16_xyz v[4:5], off, s[12:15], m0, format:52 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xa7,0xe9,0x00,0x04,0x03,0x7d] - -tbuffer_store_format_d16_xyz v[4:5], off, s[8:11], 0, format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_SNORM] offset:4095 -// GFX11: encoding: [0xff,0x0f,0xa7,0xe9,0x00,0x04,0x02,0x80] - -tbuffer_store_format_d16_xyz v[4:5], off, s[8:11], 61, format:[BUF_FMT_16_16_16_16_USCALED] offset:4095 -// GFX11: encoding: [0xff,0x0f,0xaf,0xe9,0x00,0x04,0x02,0xbd] - -tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], 61, format:53 offset:4095 -// 
GFX11: encoding: [0xff,0x0f,0xaf,0xe9,0x00,0x04,0x1c,0xbd] - -tbuffer_store_format_d16_xyz v[4:5], v1, s[8:11], s3, format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_USCALED] offen offset:52 -// GFX11: encoding: [0x34,0x00,0xaf,0xe9,0x01,0x04,0x42,0x03] - -tbuffer_store_format_d16_xyz v[4:5], v1, s[8:11], s3, format:[BUF_FMT_16_16_16_16_SSCALED] idxen offset:52 -// GFX11: encoding: [0x34,0x00,0xb7,0xe9,0x01,0x04,0x82,0x03] - -tbuffer_store_format_d16_xyz v[4:5], v[1:2], s[8:11], s0, format:54 idxen offen offset:52 -// GFX11: encoding: [0x34,0x00,0xb7,0xe9,0x01,0x04,0xc2,0x00] - -tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_SSCALED] offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xb7,0xe9,0x00,0x04,0x1c,0x03] - -tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3, format:[BUF_FMT_16_16_16_16_UINT] offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0xbf,0xe9,0x00,0x04,0x1c,0x03] - -tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3, format:55 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0xbf,0xe9,0x00,0x04,0x1c,0x03] - -tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_UINT] offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0xbf,0xe9,0x00,0x04,0x1c,0x03] - tbuffer_store_format_d16_xyzw v[4:5], off, s[8:11], s3, format:[BUF_FMT_16_16_16_16_SINT] offset:4095 // GFX11: encoding: [0xff,0x8f,0xc7,0xe9,0x00,0x04,0x02,0x03] - -tbuffer_store_format_d16_xyzw v[254:255], off, s[8:11], s3, format:56 offset:4095 -// GFX11: encoding: [0xff,0x8f,0xc7,0xe9,0x00,0xfe,0x02,0x03] - -tbuffer_store_format_d16_xyzw v[4:5], off, s[12:15], s3, format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_SINT] offset:4095 -// GFX11: encoding: [0xff,0x8f,0xc7,0xe9,0x00,0x04,0x03,0x03] - -tbuffer_store_format_d16_xyzw v[4:5], off, s[12:15], s101, format:[BUF_FMT_16_16_16_16_FLOAT] offset:4095 -// GFX11: encoding: [0xff,0x8f,0xcf,0xe9,0x00,0x04,0x03,0x65] - 
-tbuffer_store_format_d16_xyzw v[4:5], off, s[12:15], m0, format:57 offset:4095 -// GFX11: encoding: [0xff,0x8f,0xcf,0xe9,0x00,0x04,0x03,0x7d] - -tbuffer_store_format_d16_xyzw v[4:5], off, s[8:11], 0, format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_FLOAT] offset:4095 -// GFX11: encoding: [0xff,0x8f,0xcf,0xe9,0x00,0x04,0x02,0x80] - -tbuffer_store_format_d16_xyzw v[4:5], off, s[8:11], 61, format:[BUF_FMT_32_32_32_UINT] offset:4095 -// GFX11: encoding: [0xff,0x8f,0xd7,0xe9,0x00,0x04,0x02,0xbd] - -tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], 61, format:58 offset:4095 -// GFX11: encoding: [0xff,0x8f,0xd7,0xe9,0x00,0x04,0x1c,0xbd] - -tbuffer_store_format_d16_xyzw v[4:5], v1, s[8:11], s3, format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_UINT] offen offset:52 -// GFX11: encoding: [0x34,0x80,0xd7,0xe9,0x01,0x04,0x42,0x03] - -tbuffer_store_format_d16_xyzw v[4:5], v1, s[8:11], s3, format:[BUF_FMT_32_32_32_SINT] idxen offset:52 -// GFX11: encoding: [0x34,0x80,0xdf,0xe9,0x01,0x04,0x82,0x03] - -tbuffer_store_format_d16_xyzw v[4:5], v[1:2], s[8:11], s0, format:59 idxen offen offset:52 -// GFX11: encoding: [0x34,0x80,0xdf,0xe9,0x01,0x04,0xc2,0x00] - -tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_SINT] offset:4095 glc -// GFX11: encoding: [0xff,0xcf,0xdf,0xe9,0x00,0x04,0x1c,0x03] - -tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3, format:[BUF_FMT_32_32_32_FLOAT] offset:4095 slc -// GFX11: encoding: [0xff,0x9f,0xe7,0xe9,0x00,0x04,0x1c,0x03] - -tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3, format:60 offset:4095 dlc -// GFX11: encoding: [0xff,0xaf,0xe7,0xe9,0x00,0x04,0x1c,0x03] - -tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3, format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_FLOAT] offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0xff,0xe7,0xe9,0x00,0x04,0x1c,0x03] - -//Removed formats (compared to gfx10) - -tbuffer_load_format_d16_x v4, off, s[8:11], s3, 
format:[BUF_FMT_10_11_11_UNORM] offset:4095 -// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format - -tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_10_11_11_SNORM] offset:4095 -// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format - -tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_10_11_11_USCALED] offset:4095 -// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format - -tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_10_11_11_SSCALED] offset:4095 -// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format - -tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_10_11_11_UINT] offset:4095 -// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format - -tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_10_11_11_SINT] offset:4095 -// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format - -tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_11_11_10_UNORM] offset:4095 -// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format - -tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_11_11_10_SNORM] offset:4095 -// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format - -tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_11_11_10_USCALED] offset:4095 -// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format - -tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_11_11_10_SSCALED] offset:4095 -// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format - -tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_11_11_10_UINT] offset:4095 -// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format - -tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_10_10_10_2_USCALED] offset:4095 -// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format - -tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_10_10_10_2_SSCALED] offset:4095 -// 
GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: unsupported format diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_alias.s b/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_alias.s index 9cf7503e5f375..fd58215f89ad5 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_alias.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_alias.s @@ -1,3453 +1,187 @@ -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefix=GFX11 %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX11-ERR --implicit-check-not=error: %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefix=GFX11 %s buffer_load_dword v5, off, s[8:11], s3 offset:4095 // GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0x03] -buffer_load_dword v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0xff,0x02,0x03] - -buffer_load_dword v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_dword v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_dword v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_dword v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_dword v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_dword v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_dword v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_dword v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_dword v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_dword v5, v0, 
s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_dword v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x50,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dword v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x50,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dword v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x50,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dword v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x50,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dword v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x50,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dword v5, off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x50,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dword v5, off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x50,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dword v5, off, s[8:11], s3 offset:4095 lds -// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx2 v[254:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0xfe,0x02,0x03] - -buffer_load_dwordx2 v[5:6], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_dwordx2 v[5:6], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_dwordx2 v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_dwordx2 v[5:6], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_dwordx2 v[5:6], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_dwordx2 v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: 
encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_dwordx2 v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_dwordx2 v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_dwordx2 v[5:6], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_dwordx2 v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_dwordx2 v[5:6], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x54,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x54,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x54,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x54,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x54,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x54,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx2 v[5:7], off, s[8:11], s3 offset:4095 dlc tfe -// GFX11: encoding: [0xff,0x2f,0x54,0xe0,0x00,0x05,0x22,0x03] - -buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x54,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx3 v[253:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0xfd,0x02,0x03] - -buffer_load_dwordx3 v[5:7], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_dwordx3 v[5:7], off, s[96:99], s3 offset:4095 -// 
GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_dwordx3 v[5:7], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_dwordx3 v[5:7], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_dwordx3 v[5:7], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_dwordx3 v[5:7], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_dwordx3 v[5:7], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_dwordx3 v[5:7], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_dwordx3 v[5:7], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_dwordx3 v[5:7], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_dwordx3 v[5:7], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x58,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x58,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x58,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x58,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x58,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x58,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x58,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095 -// GFX11: 
encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx4 v[252:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0xfc,0x02,0x03] - -buffer_load_dwordx4 v[5:8], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_dwordx4 v[5:8], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_dwordx4 v[5:8], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_dwordx4 v[5:8], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_dwordx4 v[5:8], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_dwordx4 v[5:8], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_dwordx4 v[5:8], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_dwordx4 v[5:8], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_dwordx4 v[5:8], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_dwordx4 v[5:8], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_dwordx4 v[5:8], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x5c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x5c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x5c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x5c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: 
[0xff,0x1f,0x5c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x5c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x5c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_short_d16 v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_short_d16 v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0xff,0x02,0x03] - -buffer_load_short_d16 v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_short_d16 v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_short_d16 v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_short_d16 v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_short_d16 v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_short_d16 v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_short_d16 v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_short_d16 v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_short_d16 v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_short_d16 v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_short_d16 v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x80,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_short_d16 v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: 
[0x00,0x00,0x80,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_short_d16 v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x80,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_short_d16 v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x80,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_short_d16 v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x80,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_short_d16 v5, off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x80,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_short_d16 v5, off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x80,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_x v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_x v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0xff,0x02,0x03] - -buffer_load_format_d16_x v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_format_d16_x v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_format_d16_x v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_format_d16_x v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_format_d16_x v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_format_d16_x v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_format_d16_x v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_format_d16_x v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_format_d16_x v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: 
encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_format_d16_x v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_format_d16_x v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x20,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_x v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x20,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_x v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x20,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_x v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x20,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_x v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x20,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_x v5, off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x20,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_x v5, off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x20,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xy v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0xff,0x02,0x03] - -buffer_load_format_d16_xy v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_format_d16_xy v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_format_d16_xy v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_format_d16_xy v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_format_d16_xy v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_format_d16_xy v5, off, s[8:11], -1 offset:4095 
-// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_format_d16_xy v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_format_d16_xy v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_format_d16_xy v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_format_d16_xy v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_format_d16_xy v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x24,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x24,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x24,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x24,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x24,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x24,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x24,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xyz v[254:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0xfe,0x02,0x03] - -buffer_load_format_d16_xyz v[5:6], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_format_d16_xyz v[5:6], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x18,0x03] - 
-buffer_load_format_d16_xyz v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_format_d16_xyz v[5:6], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_format_d16_xyz v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x28,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x28,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x28,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x28,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x28,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x28,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x28,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xyzw 
v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xyzw v[254:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0xfe,0x02,0x03] - -buffer_load_format_d16_xyzw v[5:6], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_format_d16_xyzw v[5:6], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_format_d16_xyzw v[5:6], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_format_d16_xyzw v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x2c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x2c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x2c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 
offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x2c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x2c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x2c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x2c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_short_d16_hi v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0xff,0x02,0x03] - -buffer_load_short_d16_hi v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_short_d16_hi v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_short_d16_hi v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_short_d16_hi v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_short_d16_hi v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_short_d16_hi v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_short_d16_hi v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_short_d16_hi v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_short_d16_hi v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_short_d16_hi v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x42,0x03] - 
-buffer_load_short_d16_hi v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x8c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x8c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x8c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x8c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x8c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x8c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x8c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_hi_x v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0xff,0x02,0x03] - -buffer_load_format_d16_hi_x v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_format_d16_hi_x v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_format_d16_hi_x v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_format_d16_hi_x v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_format_d16_hi_x v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_format_d16_hi_x v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_format_d16_hi_x v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: 
[0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_format_d16_hi_x v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_format_d16_hi_x v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_format_d16_hi_x v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_format_d16_hi_x v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x98,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x98,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x98,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x98,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x98,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x98,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x98,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte_d16_hi v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0xff,0x02,0x03] - -buffer_load_sbyte_d16_hi v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_sbyte_d16_hi v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_sbyte_d16_hi v5, off, 
s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_sbyte_d16_hi v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_sbyte_d16_hi v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x88,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x88,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x88,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x88,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x88,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x88,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x88,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte_d16_hi v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0xff,0x02,0x03] - -buffer_load_ubyte_d16_hi v5, 
off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_ubyte_d16_hi v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_ubyte_d16_hi v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_ubyte_d16_hi v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x84,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x84,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x84,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x84,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x84,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x84,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte_d16_hi v5, off, 
s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x84,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte_d16 v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0xff,0x02,0x03] - -buffer_load_sbyte_d16 v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_sbyte_d16 v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_sbyte_d16 v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_sbyte_d16 v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_sbyte_d16 v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_sbyte_d16 v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_sbyte_d16 v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_sbyte_d16 v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_sbyte_d16 v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_sbyte_d16 v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_sbyte_d16 v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x7c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x7c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x7c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: 
[0xff,0x4f,0x7c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x7c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x7c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x7c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte_d16 v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0xff,0x02,0x03] - -buffer_load_ubyte_d16 v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_ubyte_d16 v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_ubyte_d16 v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_ubyte_d16 v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_ubyte_d16 v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_ubyte_d16 v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_ubyte_d16 v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_ubyte_d16 v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_ubyte_d16 v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_ubyte_d16 v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_ubyte_d16 v5, off, s[8:11], s3 -// GFX11: encoding: 
[0x00,0x00,0x78,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x78,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x78,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x78,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x78,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x78,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x78,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0xff,0x02,0x03] - -buffer_load_sbyte v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_sbyte v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_sbyte v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_sbyte v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_sbyte v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_sbyte v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_sbyte v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_sbyte v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_sbyte v5, v0, 
s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_sbyte v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_sbyte v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x44,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x44,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x44,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x44,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x44,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte v5, off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x44,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sbyte v5, off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x44,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sshort v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sshort v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0xff,0x02,0x03] - -buffer_load_sshort v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_sshort v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_sshort v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_sshort v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_sshort v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_sshort v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0xc1] - 
-buffer_load_sshort v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_sshort v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_sshort v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_sshort v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_sshort v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x4c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sshort v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x4c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sshort v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x4c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sshort v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x4c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sshort v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x4c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sshort v5, off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x4c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_sshort v5, off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x4c,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0xff,0x02,0x03] - -buffer_load_ubyte v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_ubyte v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_ubyte v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_ubyte v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: 
[0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_ubyte v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_ubyte v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_ubyte v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_ubyte v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_ubyte v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_ubyte v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_ubyte v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x40,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x40,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x40,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x40,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x40,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte v5, off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x40,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ubyte v5, off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x40,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ushort v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ushort v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0xff,0x02,0x03] - -buffer_load_ushort v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x03,0x03] - -buffer_load_ushort v5, off, s[96:99], s3 offset:4095 -// 
GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x18,0x03] - -buffer_load_ushort v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0x65] - -buffer_load_ushort v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0x7d] - -buffer_load_ushort v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0x80] - -buffer_load_ushort v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0xc1] - -buffer_load_ushort v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0xf0] - -buffer_load_ushort v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0xf7] - -buffer_load_ushort v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x82,0x03] - -buffer_load_ushort v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x42,0x03] - -buffer_load_ushort v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x48,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ushort v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x48,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ushort v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x48,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ushort v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x48,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ushort v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x48,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ushort v5, off, s[8:11], s3 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x48,0xe0,0x00,0x05,0x02,0x03] - -buffer_load_ushort v5, off, s[8:11], s3 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x48,0xe0,0x00,0x05,0x02,0x03] - - -buffer_store_byte v1, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_byte v255, 
off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0xff,0x03,0x04] - -buffer_store_byte v1, off, s[16:19], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x04,0x04] - -buffer_store_byte v1, off, s[96:99], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x18,0x04] - -buffer_store_byte v1, off, s[12:15], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x65] - -buffer_store_byte v1, off, s[12:15], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x7d] - -buffer_store_byte v1, off, s[12:15], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x80] - -buffer_store_byte v1, off, s[12:15], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0xc1] - -buffer_store_byte v1, off, s[12:15], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0xf0] - -buffer_store_byte v1, off, s[12:15], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0xf7] - -buffer_store_byte v1, v0, s[12:15], s4 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x83,0x04] - -buffer_store_byte v1, v0, s[12:15], s4 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x43,0x04] - -buffer_store_byte v1, off, s[12:15], s4 -// GFX11: encoding: [0x00,0x00,0x60,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_byte v1, off, s[12:15], s4 offset:0 -// GFX11: encoding: [0x00,0x00,0x60,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_byte v1, off, s[12:15], s4 offset:7 -// GFX11: encoding: [0x07,0x00,0x60,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_byte v1, off, s[12:15], s4 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x60,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_byte v1, off, s[12:15], s4 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x60,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_byte v1, off, s[12:15], s4 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x60,0xe0,0x00,0x01,0x03,0x04] - 
-buffer_store_byte v1, off, s[12:15], s4 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x60,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_short v1, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_short v255, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0xff,0x03,0x04] - -buffer_store_short v1, off, s[16:19], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x04,0x04] - -buffer_store_short v1, off, s[96:99], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x18,0x04] - -buffer_store_short v1, off, s[12:15], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0x65] - -buffer_store_short v1, off, s[12:15], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0x7d] - -buffer_store_short v1, off, s[12:15], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0x80] - -buffer_store_short v1, off, s[12:15], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0xc1] - -buffer_store_short v1, off, s[12:15], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0xf0] - -buffer_store_short v1, off, s[12:15], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0xf7] - -buffer_store_short v1, v0, s[12:15], s4 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x83,0x04] - -buffer_store_short v1, v0, s[12:15], s4 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x43,0x04] - -buffer_store_short v1, off, s[12:15], s4 -// GFX11: encoding: [0x00,0x00,0x64,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_short v1, off, s[12:15], s4 offset:0 -// GFX11: encoding: [0x00,0x00,0x64,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_short v1, off, s[12:15], s4 offset:7 -// GFX11: encoding: [0x07,0x00,0x64,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_short v1, off, s[12:15], s4 offset:4095 glc -// GFX11: encoding: 
[0xff,0x4f,0x64,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_short v1, off, s[12:15], s4 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x64,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_short v1, off, s[12:15], s4 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x64,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_short v1, off, s[12:15], s4 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x64,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dword v1, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dword v255, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0xff,0x03,0x04] - -buffer_store_dword v1, off, s[16:19], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x04,0x04] - -buffer_store_dword v1, off, s[96:99], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x18,0x04] - -buffer_store_dword v1, off, s[12:15], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x65] - -buffer_store_dword v1, off, s[12:15], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x7d] - -buffer_store_dword v1, off, s[12:15], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x80] - -buffer_store_dword v1, off, s[12:15], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0xc1] - -buffer_store_dword v1, off, s[12:15], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0xf0] - -buffer_store_dword v1, off, s[12:15], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0xf7] - -buffer_store_dword v1, v0, s[12:15], s4 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x83,0x04] - -buffer_store_dword v1, v0, s[12:15], s4 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x43,0x04] - -buffer_store_dword v1, off, s[12:15], s4 -// GFX11: encoding: [0x00,0x00,0x68,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dword v1, 
off, s[12:15], s4 offset:0 -// GFX11: encoding: [0x00,0x00,0x68,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dword v1, off, s[12:15], s4 offset:7 -// GFX11: encoding: [0x07,0x00,0x68,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dword v1, off, s[12:15], s4 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x68,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dword v1, off, s[12:15], s4 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x68,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dword v1, off, s[12:15], s4 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x68,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dword v1, off, s[12:15], s4 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x68,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx2 v[254:255], off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0xfe,0x03,0x04] - -buffer_store_dwordx2 v[1:2], off, s[16:19], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x04,0x04] - -buffer_store_dwordx2 v[1:2], off, s[96:99], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x18,0x04] - -buffer_store_dwordx2 v[1:2], off, s[12:15], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0x65] - -buffer_store_dwordx2 v[1:2], off, s[12:15], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0x7d] - -buffer_store_dwordx2 v[1:2], off, s[12:15], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0x80] - -buffer_store_dwordx2 v[1:2], off, s[12:15], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0xc1] - -buffer_store_dwordx2 v[1:2], off, s[12:15], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0xf0] - -buffer_store_dwordx2 v[1:2], off, s[12:15], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0xf7] - -buffer_store_dwordx2 v[1:2], v0, 
s[12:15], s4 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x83,0x04] - -buffer_store_dwordx2 v[1:2], v0, s[12:15], s4 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x43,0x04] - -buffer_store_dwordx2 v[1:2], off, s[12:15], s4 -// GFX11: encoding: [0x00,0x00,0x6c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:0 -// GFX11: encoding: [0x00,0x00,0x6c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:7 -// GFX11: encoding: [0x07,0x00,0x6c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x6c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x6c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x6c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x6c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx3 v[253:255], off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0xfd,0x03,0x04] - -buffer_store_dwordx3 v[1:3], off, s[16:19], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x04,0x04] - -buffer_store_dwordx3 v[1:3], off, s[96:99], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x18,0x04] - -buffer_store_dwordx3 v[1:3], off, s[12:15], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x65] - -buffer_store_dwordx3 v[1:3], off, s[12:15], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x7d] - -buffer_store_dwordx3 v[1:3], off, s[12:15], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x80] - 
-buffer_store_dwordx3 v[1:3], off, s[12:15], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0xc1] - -buffer_store_dwordx3 v[1:3], off, s[12:15], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0xf0] - -buffer_store_dwordx3 v[1:3], off, s[12:15], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0xf7] - -buffer_store_dwordx3 v[1:3], v0, s[12:15], s4 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x83,0x04] - -buffer_store_dwordx3 v[1:3], v0, s[12:15], s4 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x43,0x04] - -buffer_store_dwordx3 v[1:3], off, s[12:15], s4 -// GFX11: encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:0 -// GFX11: encoding: [0x00,0x00,0x70,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:7 -// GFX11: encoding: [0x07,0x00,0x70,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x70,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x70,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x70,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x70,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx4 v[252:255], off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0xfc,0x03,0x04] - -buffer_store_dwordx4 v[1:4], off, s[16:19], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x04,0x04] - -buffer_store_dwordx4 v[1:4], off, s[96:99], s4 offset:4095 -// GFX11: encoding: 
[0xff,0x0f,0x74,0xe0,0x00,0x01,0x18,0x04] - -buffer_store_dwordx4 v[1:4], off, s[12:15], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x65] - -buffer_store_dwordx4 v[1:4], off, s[12:15], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x7d] - -buffer_store_dwordx4 v[1:4], off, s[12:15], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x80] - -buffer_store_dwordx4 v[1:4], off, s[12:15], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0xc1] - -buffer_store_dwordx4 v[1:4], off, s[12:15], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0xf0] - -buffer_store_dwordx4 v[1:4], off, s[12:15], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0xf7] - -buffer_store_dwordx4 v[1:4], v0, s[12:15], s4 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x83,0x04] - -buffer_store_dwordx4 v[1:4], v0, s[12:15], s4 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x43,0x04] - -buffer_store_dwordx4 v[1:4], off, s[12:15], s4 -// GFX11: encoding: [0x00,0x00,0x74,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:0 -// GFX11: encoding: [0x00,0x00,0x74,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:7 -// GFX11: encoding: [0x07,0x00,0x74,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x74,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x74,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x74,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x74,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_x v1, off, s[12:15], s4 offset:4095 
-// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_x v255, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0xff,0x03,0x04] - -buffer_store_format_d16_x v1, off, s[16:19], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x04,0x04] - -buffer_store_format_d16_x v1, off, s[96:99], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x18,0x04] - -buffer_store_format_d16_x v1, off, s[12:15], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0x65] - -buffer_store_format_d16_x v1, off, s[12:15], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0x7d] - -buffer_store_format_d16_x v1, off, s[12:15], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0x80] - -buffer_store_format_d16_x v1, off, s[12:15], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0xc1] - -buffer_store_format_d16_x v1, off, s[12:15], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0xf0] - -buffer_store_format_d16_x v1, off, s[12:15], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0xf7] - -buffer_store_format_d16_x v1, v0, s[12:15], s4 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x83,0x04] - -buffer_store_format_d16_x v1, v0, s[12:15], s4 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x43,0x04] - -buffer_store_format_d16_x v1, off, s[12:15], s4 -// GFX11: encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_x v1, off, s[12:15], s4 offset:0 -// GFX11: encoding: [0x00,0x00,0x30,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_x v1, off, s[12:15], s4 offset:7 -// GFX11: encoding: [0x07,0x00,0x30,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_x v1, off, s[12:15], s4 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x30,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_x v1, off, 
s[12:15], s4 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x30,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_x v1, off, s[12:15], s4 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x30,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_x v1, off, s[12:15], s4 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x30,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xy v255, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0xff,0x03,0x04] - -buffer_store_format_d16_xy v1, off, s[16:19], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x04,0x04] - -buffer_store_format_d16_xy v1, off, s[96:99], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x18,0x04] - -buffer_store_format_d16_xy v1, off, s[12:15], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0x65] - -buffer_store_format_d16_xy v1, off, s[12:15], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0x7d] - -buffer_store_format_d16_xy v1, off, s[12:15], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0x80] - -buffer_store_format_d16_xy v1, off, s[12:15], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0xc1] - -buffer_store_format_d16_xy v1, off, s[12:15], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0xf0] - -buffer_store_format_d16_xy v1, off, s[12:15], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0xf7] - -buffer_store_format_d16_xy v1, v0, s[12:15], s4 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x83,0x04] - -buffer_store_format_d16_xy v1, v0, s[12:15], s4 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x43,0x04] - -buffer_store_format_d16_xy v1, off, s[12:15], s4 -// GFX11: encoding: 
[0x00,0x00,0x34,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:0 -// GFX11: encoding: [0x00,0x00,0x34,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:7 -// GFX11: encoding: [0x07,0x00,0x34,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x34,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x34,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x34,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x34,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xyz v[254:255], off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0xfe,0x03,0x04] - -buffer_store_format_d16_xyz v[1:2], off, s[16:19], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x04,0x04] - -buffer_store_format_d16_xyz v[1:2], off, s[96:99], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x18,0x04] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0x65] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0x7d] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0x80] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0xc1] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], 0.5 offset:4095 -// GFX11: encoding: 
[0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0xf0] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0xf7] - -buffer_store_format_d16_xyz v[1:2], v0, s[12:15], s4 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x83,0x04] - -buffer_store_format_d16_xyz v[1:2], v0, s[12:15], s4 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x43,0x04] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 -// GFX11: encoding: [0x00,0x00,0x38,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:0 -// GFX11: encoding: [0x00,0x00,0x38,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:7 -// GFX11: encoding: [0x07,0x00,0x38,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x38,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x38,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x38,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x38,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xyzw v[254:255], off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0xfe,0x03,0x04] - -buffer_store_format_d16_xyzw v[1:2], off, s[16:19], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x04,0x04] - -buffer_store_format_d16_xyzw v[1:2], off, s[96:99], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x18,0x04] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s101 offset:4095 -// GFX11: 
encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0x65] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0x7d] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0x80] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0xc1] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0xf0] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0xf7] - -buffer_store_format_d16_xyzw v[1:2], v0, s[12:15], s4 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x83,0x04] - -buffer_store_format_d16_xyzw v[1:2], v0, s[12:15], s4 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x43,0x04] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 -// GFX11: encoding: [0x00,0x00,0x3c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:0 -// GFX11: encoding: [0x00,0x00,0x3c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:7 -// GFX11: encoding: [0x07,0x00,0x3c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x3c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x3c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x3c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x3c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:4095 -// 
GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_byte_d16_hi v255, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0xff,0x03,0x04] - -buffer_store_byte_d16_hi v1, off, s[16:19], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x04,0x04] - -buffer_store_byte_d16_hi v1, off, s[96:99], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x18,0x04] - -buffer_store_byte_d16_hi v1, off, s[12:15], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0x65] - -buffer_store_byte_d16_hi v1, off, s[12:15], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0x7d] - -buffer_store_byte_d16_hi v1, off, s[12:15], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0x80] - -buffer_store_byte_d16_hi v1, off, s[12:15], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0xc1] - -buffer_store_byte_d16_hi v1, off, s[12:15], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0xf0] - -buffer_store_byte_d16_hi v1, off, s[12:15], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0xf7] - -buffer_store_byte_d16_hi v1, v0, s[12:15], s4 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x83,0x04] - -buffer_store_byte_d16_hi v1, v0, s[12:15], s4 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x43,0x04] - -buffer_store_byte_d16_hi v1, off, s[12:15], s4 -// GFX11: encoding: [0x00,0x00,0x90,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:0 -// GFX11: encoding: [0x00,0x00,0x90,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:7 -// GFX11: encoding: [0x07,0x00,0x90,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x90,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:4095 
slc -// GFX11: encoding: [0xff,0x1f,0x90,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x90,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x90,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_short_d16_hi v255, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0xff,0x03,0x04] - -buffer_store_short_d16_hi v1, off, s[16:19], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x04,0x04] - -buffer_store_short_d16_hi v1, off, s[96:99], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x18,0x04] - -buffer_store_short_d16_hi v1, off, s[12:15], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0x65] - -buffer_store_short_d16_hi v1, off, s[12:15], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0x7d] - -buffer_store_short_d16_hi v1, off, s[12:15], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0x80] - -buffer_store_short_d16_hi v1, off, s[12:15], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0xc1] - -buffer_store_short_d16_hi v1, off, s[12:15], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0xf0] - -buffer_store_short_d16_hi v1, off, s[12:15], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0xf7] - -buffer_store_short_d16_hi v1, v0, s[12:15], s4 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x83,0x04] - -buffer_store_short_d16_hi v1, v0, s[12:15], s4 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x43,0x04] - -buffer_store_short_d16_hi v1, off, s[12:15], s4 -// GFX11: encoding: [0x00,0x00,0x94,0xe0,0x00,0x01,0x03,0x04] - 
-buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:0 -// GFX11: encoding: [0x00,0x00,0x94,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:7 -// GFX11: encoding: [0x07,0x00,0x94,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x94,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x94,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x94,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x94,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_hi_x v255, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0xff,0x03,0x04] - -buffer_store_format_d16_hi_x v1, off, s[16:19], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x04,0x04] - -buffer_store_format_d16_hi_x v1, off, s[96:99], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x18,0x04] - -buffer_store_format_d16_hi_x v1, off, s[12:15], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0x65] - -buffer_store_format_d16_hi_x v1, off, s[12:15], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0x7d] - -buffer_store_format_d16_hi_x v1, off, s[12:15], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0x80] - -buffer_store_format_d16_hi_x v1, off, s[12:15], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0xc1] - -buffer_store_format_d16_hi_x v1, off, s[12:15], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0xf0] - -buffer_store_format_d16_hi_x v1, off, s[12:15], -4.0 
offset:4095 -// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0xf7] - -buffer_store_format_d16_hi_x v1, v0, s[12:15], s4 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x83,0x04] - -buffer_store_format_d16_hi_x v1, v0, s[12:15], s4 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x43,0x04] - -buffer_store_format_d16_hi_x v1, off, s[12:15], s4 -// GFX11: encoding: [0x00,0x00,0x9c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:0 -// GFX11: encoding: [0x00,0x00,0x9c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:7 -// GFX11: encoding: [0x07,0x00,0x9c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x9c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x9c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:4095 dlc -// GFX11: encoding: [0xff,0x2f,0x9c,0xe0,0x00,0x01,0x03,0x04] - -buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:4095 glc slc dlc -// GFX11: encoding: [0xff,0x7f,0x9c,0xe0,0x00,0x01,0x03,0x04] - -buffer_atomic_add v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_add v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0xff,0x02,0x03] - -buffer_atomic_add v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x03,0x03] - -buffer_atomic_add v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x18,0x03] - -buffer_atomic_add v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0x65] - -buffer_atomic_add v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0x7d] - -buffer_atomic_add v5, off, s[8:11], 
0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0x80] - -buffer_atomic_add v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0xc1] - -buffer_atomic_add v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0xf0] - -buffer_atomic_add v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0xf7] - -buffer_atomic_add v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x82,0x03] - -buffer_atomic_add v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x42,0x03] - -buffer_atomic_add v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0xd4,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_add v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0xd4,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_add v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0xd4,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_add v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xd4,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_add v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0xd4,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_add v5, off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0xd4,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_add_x2 v[254:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0xfe,0x02,0x03] - -buffer_atomic_add_x2 v[5:6], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x03,0x03] - -buffer_atomic_add_x2 v[5:6], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x18,0x03] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: 
[0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0x65] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0x7d] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0x80] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0xc1] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0xf0] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0xf7] - -buffer_atomic_add_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x82,0x03] - -buffer_atomic_add_x2 v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x42,0x03] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x0c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x0c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x0c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x0c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x0c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0x0c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_and v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_and v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0xff,0x02,0x03] - -buffer_atomic_and v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: 
[0xff,0x0f,0xf0,0xe0,0x00,0x05,0x03,0x03] - -buffer_atomic_and v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x18,0x03] - -buffer_atomic_and v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0x65] - -buffer_atomic_and v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0x7d] - -buffer_atomic_and v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0x80] - -buffer_atomic_and v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0xc1] - -buffer_atomic_and v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0xf0] - -buffer_atomic_and v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0xf7] - -buffer_atomic_and v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x82,0x03] - -buffer_atomic_and v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x42,0x03] - -buffer_atomic_and v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0xf0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_and v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0xf0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_and v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0xf0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_and v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xf0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_and v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0xf0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_and v5, off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0xf0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_and_x2 v[254:255], off, s[8:11], s3 offset:4095 
-// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0xfe,0x02,0x03] - -buffer_atomic_and_x2 v[5:6], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x03,0x03] - -buffer_atomic_and_x2 v[5:6], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x18,0x03] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0x65] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0x7d] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0x80] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0xc1] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0xf0] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0xf7] - -buffer_atomic_and_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x82,0x03] - -buffer_atomic_and_x2 v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x42,0x03] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x24,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x24,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x24,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x24,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x24,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc 
-// GFX11: encoding: [0xff,0x5f,0x24,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_cmpswap v[254:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0xfe,0x02,0x03] - -buffer_atomic_cmpswap v[5:6], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x03,0x03] - -buffer_atomic_cmpswap v[5:6], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x18,0x03] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0x65] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0x7d] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0x80] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0xc1] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0xf0] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0xf7] - -buffer_atomic_cmpswap v[5:6], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x82,0x03] - -buffer_atomic_cmpswap v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x42,0x03] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0xd0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0xd0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0xd0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 
offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xd0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0xd0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0xd0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_cmpswap_x2 v[252:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0xfc,0x02,0x03] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x03,0x03] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x18,0x03] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0x65] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0x7d] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0x80] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0xc1] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0xf0] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0xf7] - -buffer_atomic_cmpswap_x2 v[5:8], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x82,0x03] - -buffer_atomic_cmpswap_x2 v[5:8], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x42,0x03] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 -// GFX11: encoding: 
[0x00,0x00,0x08,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x08,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x08,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x08,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x08,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0x08,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fcmpswap v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fcmpswap v[254:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0xfe,0x02,0x03] - -buffer_atomic_fcmpswap v[5:6], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x03,0x03] - -buffer_atomic_fcmpswap v[5:6], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x18,0x03] - -buffer_atomic_fcmpswap v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0x65] - -buffer_atomic_fcmpswap v[5:6], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0x7d] - -buffer_atomic_fcmpswap v[5:6], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0x80] - -buffer_atomic_fcmpswap v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0xc1] - -buffer_atomic_fcmpswap v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0xf0] - -buffer_atomic_fcmpswap v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0xf7] - -buffer_atomic_fcmpswap v[5:6], v0, 
s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x82,0x03] - -buffer_atomic_fcmpswap v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x42,0x03] - -buffer_atomic_fcmpswap v[5:6], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x40,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fcmpswap v[5:6], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x40,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fcmpswap v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x40,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fcmpswap v[5:6], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x40,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fcmpswap v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x40,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fcmpswap v[5:6], off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0x40,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_csub v255, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0xff,0x02,0x03] - -buffer_atomic_csub v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xdc,0xe0,0x00,0xff,0x02,0x03] - -buffer_atomic_csub v5, off, s[12:15], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x03,0x03] - -buffer_atomic_csub v5, off, s[96:99], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x18,0x03] - -buffer_atomic_csub v5, off, s[8:11], s101 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x02,0x65] - -buffer_atomic_csub v5, off, s[8:11], m0 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x02,0x7d] - -buffer_atomic_csub v5, off, s[8:11], 0 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x02,0x80] - -buffer_atomic_csub v5, off, s[8:11], -1 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x02,0xc1] - -buffer_atomic_csub v5, off, s[8:11], 
0.5 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x02,0xf0] - -buffer_atomic_csub v5, off, s[8:11], -4.0 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x02,0xf7] - -buffer_atomic_csub v5, v0, s[8:11], s3 idxen offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x82,0x03] - -buffer_atomic_csub v5, v0, s[8:11], s3 offen offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x42,0x03] - -buffer_atomic_csub v5, off, s[8:11], s3 glc -// GFX11: encoding: [0x00,0x40,0xdc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_csub v5, off, s[8:11], s3 offset:0 glc -// GFX11: encoding: [0x00,0x40,0xdc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_csub v5, off, s[8:11], s3 offset:7 glc -// GFX11: encoding: [0x07,0x40,0xdc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_csub v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_csub v5, off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0xdc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_csub v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0xdc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_dec v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_dec v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0xff,0x02,0x03] - -buffer_atomic_dec v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x03,0x03] - -buffer_atomic_dec v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x18,0x03] - -buffer_atomic_dec v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0x65] - -buffer_atomic_dec v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0x7d] - -buffer_atomic_dec v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: 
[0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0x80] - -buffer_atomic_dec v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0xc1] - -buffer_atomic_dec v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0xf0] - -buffer_atomic_dec v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0xf7] - -buffer_atomic_dec v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x82,0x03] - -buffer_atomic_dec v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x42,0x03] - -buffer_atomic_dec v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x00,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_dec v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x00,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_dec v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x00,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_dec v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x00,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_dec v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x00,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_dec v5, off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0x00,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_dec_x2 v[254:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0xfe,0x02,0x03] - -buffer_atomic_dec_x2 v[5:6], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x03,0x03] - -buffer_atomic_dec_x2 v[5:6], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x18,0x03] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x65] - -buffer_atomic_dec_x2 v[5:6], 
off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x7d] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x80] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0xc1] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0xf0] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0xf7] - -buffer_atomic_dec_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x82,0x03] - -buffer_atomic_dec_x2 v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x42,0x03] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x34,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x34,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x34,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x34,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x34,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0x34,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_inc v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_inc v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0xff,0x02,0x03] - -buffer_atomic_inc v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x03,0x03] - -buffer_atomic_inc v5, off, s[96:99], s3 offset:4095 
-// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x18,0x03] - -buffer_atomic_inc v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0x65] - -buffer_atomic_inc v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0x7d] - -buffer_atomic_inc v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0x80] - -buffer_atomic_inc v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0xc1] - -buffer_atomic_inc v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0xf0] - -buffer_atomic_inc v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0xf7] - -buffer_atomic_inc v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x82,0x03] - -buffer_atomic_inc v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x42,0x03] - -buffer_atomic_inc v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0xfc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_inc v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0xfc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_inc v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0xfc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_inc v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xfc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_inc v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0xfc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_inc v5, off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0xfc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_inc_x2 v[254:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0xfe,0x02,0x03] - -buffer_atomic_inc_x2 v[5:6], 
off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x03,0x03] - -buffer_atomic_inc_x2 v[5:6], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x18,0x03] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0x65] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0x7d] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0x80] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0xc1] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0xf0] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0xf7] - -buffer_atomic_inc_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x82,0x03] - -buffer_atomic_inc_x2 v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x42,0x03] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x30,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x30,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x30,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x30,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x30,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0x30,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fmax v5, off, 
s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fmax v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0xff,0x02,0x03] - -buffer_atomic_fmax v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x03,0x03] - -buffer_atomic_fmax v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x18,0x03] - -buffer_atomic_fmax v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x65] - -buffer_atomic_fmax v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x7d] - -buffer_atomic_fmax v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x80] - -buffer_atomic_fmax v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0xc1] - -buffer_atomic_fmax v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0xf0] - -buffer_atomic_fmax v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0xf7] - -buffer_atomic_fmax v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x82,0x03] - -buffer_atomic_fmax v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x42,0x03] - -buffer_atomic_fmax v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x48,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fmax v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x48,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fmax v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x48,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fmax v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x48,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fmax v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x48,0xe1,0x00,0x05,0x02,0x03] - 
-buffer_atomic_fmax v5, off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0x48,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_smax v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_smax v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0xff,0x02,0x03] - -buffer_atomic_smax v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x03,0x03] - -buffer_atomic_smax v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x18,0x03] - -buffer_atomic_smax v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0x65] - -buffer_atomic_smax v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0x7d] - -buffer_atomic_smax v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0x80] - -buffer_atomic_smax v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0xc1] - -buffer_atomic_smax v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0xf0] - -buffer_atomic_smax v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0xf7] - -buffer_atomic_smax v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x82,0x03] - -buffer_atomic_smax v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x42,0x03] - -buffer_atomic_smax v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0xe8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_smax v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0xe8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_smax v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0xe8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_smax v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: 
[0xff,0x4f,0xe8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_smax v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0xe8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_smax v5, off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0xe8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_smax_x2 v[254:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0xfe,0x02,0x03] - -buffer_atomic_smax_x2 v[5:6], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x03,0x03] - -buffer_atomic_smax_x2 v[5:6], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x18,0x03] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0x65] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0x7d] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0x80] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0xc1] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0xf0] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0xf7] - -buffer_atomic_smax_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x82,0x03] - -buffer_atomic_smax_x2 v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x42,0x03] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x1c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX11: 
encoding: [0x00,0x00,0x1c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x1c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x1c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x1c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0x1c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_umax v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_umax v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0xff,0x02,0x03] - -buffer_atomic_umax v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x03,0x03] - -buffer_atomic_umax v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x18,0x03] - -buffer_atomic_umax v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0x65] - -buffer_atomic_umax v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0x7d] - -buffer_atomic_umax v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0x80] - -buffer_atomic_umax v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0xc1] - -buffer_atomic_umax v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0xf0] - -buffer_atomic_umax v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0xf7] - -buffer_atomic_umax v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x82,0x03] - -buffer_atomic_umax v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x42,0x03] 
- -buffer_atomic_umax v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0xec,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_umax v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0xec,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_umax v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0xec,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_umax v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xec,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_umax v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0xec,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_umax v5, off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0xec,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_umax_x2 v[254:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0xfe,0x02,0x03] - -buffer_atomic_umax_x2 v[5:6], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x03,0x03] - -buffer_atomic_umax_x2 v[5:6], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x18,0x03] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0x65] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0x7d] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0x80] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0xc1] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0xf0] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0xf7] - -buffer_atomic_umax_x2 v[5:6], 
v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x82,0x03] - -buffer_atomic_umax_x2 v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x42,0x03] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x20,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x20,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x20,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x20,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x20,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0x20,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fmin v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fmin v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0xff,0x02,0x03] - -buffer_atomic_fmin v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x03,0x03] - -buffer_atomic_fmin v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x18,0x03] - -buffer_atomic_fmin v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0x65] - -buffer_atomic_fmin v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0x7d] - -buffer_atomic_fmin v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0x80] - -buffer_atomic_fmin v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0xc1] - -buffer_atomic_fmin v5, off, s[8:11], 0.5 offset:4095 -// GFX11: 
encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0xf0] - -buffer_atomic_fmin v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0xf7] - -buffer_atomic_fmin v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x82,0x03] - -buffer_atomic_fmin v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x42,0x03] - -buffer_atomic_fmin v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x44,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fmin v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x44,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fmin v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x44,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fmin v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x44,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fmin v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x44,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_fmin v5, off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0x44,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_smin v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_smin v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0xff,0x02,0x03] - -buffer_atomic_smin v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x03,0x03] - -buffer_atomic_smin v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x18,0x03] - -buffer_atomic_smin v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0x65] - -buffer_atomic_smin v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0x7d] - -buffer_atomic_smin v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0x80] - -buffer_atomic_smin v5, off, s[8:11], -1 
offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0xc1] - -buffer_atomic_smin v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0xf0] - -buffer_atomic_smin v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0xf7] - -buffer_atomic_smin v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x82,0x03] - -buffer_atomic_smin v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x42,0x03] - -buffer_atomic_smin v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0xe0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_smin v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0xe0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_smin v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0xe0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_smin v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xe0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_smin v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0xe0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_smin v5, off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0xe0,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_smin_x2 v[254:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0xfe,0x02,0x03] +buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_smin_x2 v[5:6], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x03,0x03] +buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_smin_x2 v[5:6], off, s[96:99], s3 offset:4095 -// GFX11: encoding: 
[0xff,0x0f,0x14,0xe1,0x00,0x05,0x18,0x03] +buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_smin_x2 v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0x65] +buffer_load_short_d16 v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_smin_x2 v[5:6], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0x7d] +buffer_load_format_d16_x v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_smin_x2 v[5:6], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0x80] +buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_smin_x2 v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0xc1] +buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_smin_x2 v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0xf0] +buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_smin_x2 v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0xf7] +buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_smin_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x82,0x03] +buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_smin_x2 v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: 
[0xff,0x0f,0x14,0xe1,0x00,0x05,0x42,0x03] +buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x14,0xe1,0x00,0x05,0x02,0x03] +buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x14,0xe1,0x00,0x05,0x02,0x03] +buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x14,0xe1,0x00,0x05,0x02,0x03] +buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x14,0xe1,0x00,0x05,0x02,0x03] +buffer_load_sbyte v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x14,0xe1,0x00,0x05,0x02,0x03] +buffer_load_sshort v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0x14,0xe1,0x00,0x05,0x02,0x03] +buffer_load_ubyte v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_umin v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0x03] +buffer_load_ushort v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_umin v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0xff,0x02,0x03] +buffer_store_byte v1, off, s[12:15], s4 
offset:4095 +// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x04] -buffer_atomic_umin v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x03,0x03] +buffer_store_short v1, off, s[12:15], s4 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0x04] -buffer_atomic_umin v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x18,0x03] +buffer_store_dword v1, off, s[12:15], s4 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x04] -buffer_atomic_umin v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0x65] +buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0x04] -buffer_atomic_umin v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0x7d] +buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x04] -buffer_atomic_umin v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0x80] +buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x04] -buffer_atomic_umin v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0xc1] +buffer_store_format_d16_x v1, off, s[12:15], s4 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0x04] -buffer_atomic_umin v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0xf0] +buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0x04] -buffer_atomic_umin v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0xf7] +buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0x04] 
-buffer_atomic_umin v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x82,0x03] +buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0x04] -buffer_atomic_umin v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x42,0x03] +buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0x04] -buffer_atomic_umin v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0xe4,0xe0,0x00,0x05,0x02,0x03] +buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0x04] -buffer_atomic_umin v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0xe4,0xe0,0x00,0x05,0x02,0x03] +buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0x04] -buffer_atomic_umin v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0xe4,0xe0,0x00,0x05,0x02,0x03] +buffer_atomic_add v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_umin v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xe4,0xe0,0x00,0x05,0x02,0x03] +buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0x03] -buffer_atomic_umin v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0xe4,0xe0,0x00,0x05,0x02,0x03] +buffer_atomic_and v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_umin v5, off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0xe4,0xe0,0x00,0x05,0x02,0x03] +buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: 
[0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0x03] +buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[254:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0xfe,0x02,0x03] +buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x03,0x03] +buffer_atomic_fcmpswap v[5:6], off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x18,0x03] +buffer_atomic_csub v255, off, s[8:11], s3 offset:4095 glc +// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0xff,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0x65] +buffer_atomic_dec v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0x7d] +buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0x80] +buffer_atomic_inc v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0xc1] +buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: 
[0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0xf0] +buffer_atomic_fmax v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0xf7] +buffer_atomic_smax v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x82,0x03] +buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x42,0x03] +buffer_atomic_umax v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x18,0xe1,0x00,0x05,0x02,0x03] +buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x18,0xe1,0x00,0x05,0x02,0x03] +buffer_atomic_fmin v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x18,0xe1,0x00,0x05,0x02,0x03] +buffer_atomic_smin v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x18,0xe1,0x00,0x05,0x02,0x03] +buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x18,0xe1,0x00,0x05,0x02,0x03] +buffer_atomic_umin 
v5, off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0x18,0xe1,0x00,0x05,0x02,0x03] +buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:4095 +// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_or v5, off, s[8:11], s3 offset:4095 // GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_or v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0xff,0x02,0x03] - -buffer_atomic_or v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x03,0x03] - -buffer_atomic_or v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x18,0x03] - -buffer_atomic_or v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0x65] - -buffer_atomic_or v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0x7d] - -buffer_atomic_or v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0x80] - -buffer_atomic_or v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0xc1] - -buffer_atomic_or v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0xf0] - -buffer_atomic_or v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0xf7] - -buffer_atomic_or v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x82,0x03] - -buffer_atomic_or v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x42,0x03] - -buffer_atomic_or v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0xf4,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_or v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0xf4,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_or v5, off, 
s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0xf4,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_or v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xf4,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_or v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0xf4,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_or v5, off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0xf4,0xe0,0x00,0x05,0x02,0x03] - buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:4095 // GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0x03] -buffer_atomic_or_x2 v[254:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0xfe,0x02,0x03] - -buffer_atomic_or_x2 v[5:6], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x03,0x03] - -buffer_atomic_or_x2 v[5:6], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x18,0x03] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0x65] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0x7d] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0x80] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0xc1] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0xf0] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0xf7] - -buffer_atomic_or_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x82,0x03] - -buffer_atomic_or_x2 v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x42,0x03] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 -// GFX11: 
encoding: [0x00,0x00,0x28,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x28,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x28,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x28,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x28,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0x28,0xe1,0x00,0x05,0x02,0x03] - buffer_atomic_sub v5, off, s[8:11], s3 offset:4095 // GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_sub v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0xff,0x02,0x03] - -buffer_atomic_sub v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x03,0x03] - -buffer_atomic_sub v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x18,0x03] - -buffer_atomic_sub v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0x65] - -buffer_atomic_sub v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0x7d] - -buffer_atomic_sub v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0x80] - -buffer_atomic_sub v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0xc1] - -buffer_atomic_sub v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0xf0] - -buffer_atomic_sub v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0xf7] - -buffer_atomic_sub v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x82,0x03] - -buffer_atomic_sub v5, 
v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x42,0x03] - -buffer_atomic_sub v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0xd8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_sub v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0xd8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_sub v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0xd8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_sub v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xd8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_sub v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0xd8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_sub v5, off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0xd8,0xe0,0x00,0x05,0x02,0x03] - buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:4095 // GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0x03] -buffer_atomic_sub_x2 v[254:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0xfe,0x02,0x03] - -buffer_atomic_sub_x2 v[5:6], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x03,0x03] - -buffer_atomic_sub_x2 v[5:6], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x18,0x03] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0x65] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0x7d] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0x80] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0xc1] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0xf0] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: 
encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0xf7] - -buffer_atomic_sub_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x82,0x03] - -buffer_atomic_sub_x2 v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x42,0x03] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x10,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x10,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x10,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x10,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x10,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0x10,0xe1,0x00,0x05,0x02,0x03] - buffer_atomic_swap v5, off, s[8:11], s3 offset:4095 // GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_swap v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0xff,0x02,0x03] - -buffer_atomic_swap v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x03,0x03] - -buffer_atomic_swap v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x18,0x03] - -buffer_atomic_swap v5, off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0x65] - -buffer_atomic_swap v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0x7d] - -buffer_atomic_swap v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0x80] - -buffer_atomic_swap v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0xc1] - 
-buffer_atomic_swap v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0xf0] - -buffer_atomic_swap v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0xf7] - -buffer_atomic_swap v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x82,0x03] - -buffer_atomic_swap v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x42,0x03] - -buffer_atomic_swap v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0xcc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_swap v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0xcc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_swap v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0xcc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_swap v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xcc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_swap v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0xcc,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_swap v5, off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0xcc,0xe0,0x00,0x05,0x02,0x03] - buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:4095 // GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0x03] -buffer_atomic_swap_x2 v[254:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0xfe,0x02,0x03] - -buffer_atomic_swap_x2 v[5:6], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x03,0x03] - -buffer_atomic_swap_x2 v[5:6], off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x18,0x03] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0x65] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0x7d] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], 0 
offset:4095 -// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0x80] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0xc1] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0xf0] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0xf7] - -buffer_atomic_swap_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x82,0x03] - -buffer_atomic_swap_x2 v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x42,0x03] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x04,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x04,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x04,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x04,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x04,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0x04,0xe1,0x00,0x05,0x02,0x03] - buffer_atomic_xor v5, off, s[8:11], s3 offset:4095 // GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0x03] -buffer_atomic_xor v255, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0xff,0x02,0x03] - -buffer_atomic_xor v5, off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x03,0x03] - -buffer_atomic_xor v5, off, s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x18,0x03] - -buffer_atomic_xor v5, off, s[8:11], s101 offset:4095 -// GFX11: 
encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0x65] - -buffer_atomic_xor v5, off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0x7d] - -buffer_atomic_xor v5, off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0x80] - -buffer_atomic_xor v5, off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0xc1] - -buffer_atomic_xor v5, off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0xf0] - -buffer_atomic_xor v5, off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0xf7] - -buffer_atomic_xor v5, v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x82,0x03] - -buffer_atomic_xor v5, v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x42,0x03] - -buffer_atomic_xor v5, off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0xf8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_xor v5, off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0xf8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_xor v5, off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0xf8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_xor v5, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xf8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_xor v5, off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0xf8,0xe0,0x00,0x05,0x02,0x03] - -buffer_atomic_xor v5, off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0xf8,0xe0,0x00,0x05,0x02,0x03] - buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:4095 // GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_xor_x2 v[254:255], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0xfe,0x02,0x03] - -buffer_atomic_xor_x2 v[5:6], off, s[12:15], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x03,0x03] - -buffer_atomic_xor_x2 v[5:6], off, 
s[96:99], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x18,0x03] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], s101 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0x65] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], m0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0x7d] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], 0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0x80] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], -1 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0xc1] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], 0.5 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0xf0] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], -4.0 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0xf7] - -buffer_atomic_xor_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x82,0x03] - -buffer_atomic_xor_x2 v[5:6], v0, s[8:11], s3 offen offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x42,0x03] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 -// GFX11: encoding: [0x00,0x00,0x2c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX11: encoding: [0x00,0x00,0x2c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX11: encoding: [0x07,0x00,0x2c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0x2c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:4095 slc -// GFX11: encoding: [0xff,0x1f,0x2c,0xe1,0x00,0x05,0x02,0x03] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:4095 glc slc -// GFX11: encoding: [0xff,0x5f,0x2c,0xe1,0x00,0x05,0x02,0x03] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_smem_alias.s b/llvm/test/MC/AMDGPU/gfx11_asm_smem_alias.s index 9331931940906..97a55c6df75f5 100644 --- 
a/llvm/test/MC/AMDGPU/gfx11_asm_smem_alias.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_smem_alias.s @@ -7,446 +7,29 @@ s_load_dword s5, s[2:3], s0 // GFX11: s_load_b32 s5, s[2:3], s0 ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0x00] -s_load_dword s101, s[2:3], s0 -// GFX11: s_load_b32 s101, s[2:3], s0 ; encoding: [0x41,0x19,0x00,0xf4,0x00,0x00,0x00,0x00] - -s_load_dword vcc_lo, s[2:3], s0 -// GFX11: s_load_b32 vcc_lo, s[2:3], s0 ; encoding: [0x81,0x1a,0x00,0xf4,0x00,0x00,0x00,0x00] - -s_load_dword vcc_hi, s[2:3], s0 -// GFX11: s_load_b32 vcc_hi, s[2:3], s0 ; encoding: [0xc1,0x1a,0x00,0xf4,0x00,0x00,0x00,0x00] - -s_load_dword s5, s[4:5], s0 -// GFX11: s_load_b32 s5, s[4:5], s0 ; encoding: [0x42,0x01,0x00,0xf4,0x00,0x00,0x00,0x00] - -s_load_dword s5, s[100:101], s0 -// GFX11: s_load_b32 s5, s[100:101], s0 ; encoding: [0x72,0x01,0x00,0xf4,0x00,0x00,0x00,0x00] - -s_load_dword s5, vcc, s0 -// GFX11: s_load_b32 s5, vcc, s0 ; encoding: [0x75,0x01,0x00,0xf4,0x00,0x00,0x00,0x00] - -s_load_dword s5, s[2:3], s101 -// GFX11: s_load_b32 s5, s[2:3], s101 ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0xca] - -s_load_dword s5, s[2:3], vcc_lo -// GFX11: s_load_b32 s5, s[2:3], vcc_lo ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0xd4] - -s_load_dword s5, s[2:3], vcc_hi -// GFX11: s_load_b32 s5, s[2:3], vcc_hi ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0xd6] - -s_load_dword s5, s[2:3], m0 -// GFX11: s_load_b32 s5, s[2:3], m0 ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0xfa] - -s_load_dword s5, s[2:3], 0x0 -// GFX11: s_load_b32 s5, s[2:3], 0x0 ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0xf8] - -s_load_dword s5, s[2:3], s0 glc -// GFX11: s_load_b32 s5, s[2:3], s0 glc ; encoding: [0x41,0x41,0x00,0xf4,0x00,0x00,0x00,0x00] - -s_load_dword s5, s[2:3], s0 dlc -// GFX11: s_load_b32 s5, s[2:3], s0 dlc ; encoding: [0x41,0x21,0x00,0xf4,0x00,0x00,0x00,0x00] - -s_load_dword s5, s[2:3], s0 glc dlc -// GFX11: s_load_b32 s5, s[2:3], s0 glc dlc ; encoding: 
[0x41,0x61,0x00,0xf4,0x00,0x00,0x00,0x00] - -s_load_dword s5, s[2:3], 0x1234 glc dlc -// GFX11: s_load_b32 s5, s[2:3], 0x1234 glc dlc ; encoding: [0x41,0x61,0x00,0xf4,0x34,0x12,0x00,0xf8] - s_load_dwordx2 s[10:11], s[2:3], s0 // GFX11: s_load_b64 s[10:11], s[2:3], s0 ; encoding: [0x81,0x02,0x04,0xf4,0x00,0x00,0x00,0x00] -s_load_dwordx2 s[12:13], s[2:3], s0 -// GFX11: s_load_b64 s[12:13], s[2:3], s0 ; encoding: [0x01,0x03,0x04,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx2 s[100:101], s[2:3], s0 -// GFX11: s_load_b64 s[100:101], s[2:3], s0 ; encoding: [0x01,0x19,0x04,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx2 vcc, s[2:3], s0 -// GFX11: s_load_b64 vcc, s[2:3], s0 ; encoding: [0x81,0x1a,0x04,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx2 s[10:11], s[4:5], s0 -// GFX11: s_load_b64 s[10:11], s[4:5], s0 ; encoding: [0x82,0x02,0x04,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx2 s[10:11], s[100:101], s0 -// GFX11: s_load_b64 s[10:11], s[100:101], s0 ; encoding: [0xb2,0x02,0x04,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx2 s[10:11], vcc, s0 -// GFX11: s_load_b64 s[10:11], vcc, s0 ; encoding: [0xb5,0x02,0x04,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx2 s[10:11], s[2:3], s101 -// GFX11: s_load_b64 s[10:11], s[2:3], s101 ; encoding: [0x81,0x02,0x04,0xf4,0x00,0x00,0x00,0xca] - -s_load_dwordx2 s[10:11], s[2:3], vcc_lo -// GFX11: s_load_b64 s[10:11], s[2:3], vcc_lo ; encoding: [0x81,0x02,0x04,0xf4,0x00,0x00,0x00,0xd4] - -s_load_dwordx2 s[10:11], s[2:3], vcc_hi -// GFX11: s_load_b64 s[10:11], s[2:3], vcc_hi ; encoding: [0x81,0x02,0x04,0xf4,0x00,0x00,0x00,0xd6] - -s_load_dwordx2 s[10:11], s[2:3], m0 -// GFX11: s_load_b64 s[10:11], s[2:3], m0 ; encoding: [0x81,0x02,0x04,0xf4,0x00,0x00,0x00,0xfa] - -s_load_dwordx2 s[10:11], s[2:3], 0x0 -// GFX11: s_load_b64 s[10:11], s[2:3], 0x0 ; encoding: [0x81,0x02,0x04,0xf4,0x00,0x00,0x00,0xf8] - -s_load_dwordx2 s[10:11], s[2:3], s0 glc -// GFX11: s_load_b64 s[10:11], s[2:3], s0 glc ; encoding: [0x81,0x42,0x04,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx2 
s[10:11], s[2:3], s0 dlc -// GFX11: s_load_b64 s[10:11], s[2:3], s0 dlc ; encoding: [0x81,0x22,0x04,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx2 s[10:11], s[2:3], s0 glc dlc -// GFX11: s_load_b64 s[10:11], s[2:3], s0 glc dlc ; encoding: [0x81,0x62,0x04,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx2 s[10:11], s[2:3], 0x1234 glc dlc -// GFX11: s_load_b64 s[10:11], s[2:3], 0x1234 glc dlc ; encoding: [0x81,0x62,0x04,0xf4,0x34,0x12,0x00,0xf8] - s_load_dwordx4 s[20:23], s[2:3], s0 // GFX11: s_load_b128 s[20:23], s[2:3], s0 ; encoding: [0x01,0x05,0x08,0xf4,0x00,0x00,0x00,0x00] -s_load_dwordx4 s[24:27], s[2:3], s0 -// GFX11: s_load_b128 s[24:27], s[2:3], s0 ; encoding: [0x01,0x06,0x08,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx4 s[96:99], s[2:3], s0 -// GFX11: s_load_b128 s[96:99], s[2:3], s0 ; encoding: [0x01,0x18,0x08,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx4 s[20:23], s[4:5], s0 -// GFX11: s_load_b128 s[20:23], s[4:5], s0 ; encoding: [0x02,0x05,0x08,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx4 s[20:23], s[100:101], s0 -// GFX11: s_load_b128 s[20:23], s[100:101], s0 ; encoding: [0x32,0x05,0x08,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx4 s[20:23], vcc, s0 -// GFX11: s_load_b128 s[20:23], vcc, s0 ; encoding: [0x35,0x05,0x08,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx4 s[20:23], s[2:3], s101 -// GFX11: s_load_b128 s[20:23], s[2:3], s101 ; encoding: [0x01,0x05,0x08,0xf4,0x00,0x00,0x00,0xca] - -s_load_dwordx4 s[20:23], s[2:3], vcc_lo -// GFX11: s_load_b128 s[20:23], s[2:3], vcc_lo ; encoding: [0x01,0x05,0x08,0xf4,0x00,0x00,0x00,0xd4] - -s_load_dwordx4 s[20:23], s[2:3], vcc_hi -// GFX11: s_load_b128 s[20:23], s[2:3], vcc_hi ; encoding: [0x01,0x05,0x08,0xf4,0x00,0x00,0x00,0xd6] - -s_load_dwordx4 s[20:23], s[2:3], m0 -// GFX11: s_load_b128 s[20:23], s[2:3], m0 ; encoding: [0x01,0x05,0x08,0xf4,0x00,0x00,0x00,0xfa] - -s_load_dwordx4 s[20:23], s[2:3], 0x0 -// GFX11: s_load_b128 s[20:23], s[2:3], 0x0 ; encoding: [0x01,0x05,0x08,0xf4,0x00,0x00,0x00,0xf8] - -s_load_dwordx4 s[20:23], 
s[2:3], s0 glc -// GFX11: s_load_b128 s[20:23], s[2:3], s0 glc ; encoding: [0x01,0x45,0x08,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx4 s[20:23], s[2:3], s0 dlc -// GFX11: s_load_b128 s[20:23], s[2:3], s0 dlc ; encoding: [0x01,0x25,0x08,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx4 s[20:23], s[2:3], s0 glc dlc -// GFX11: s_load_b128 s[20:23], s[2:3], s0 glc dlc ; encoding: [0x01,0x65,0x08,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx4 s[20:23], s[2:3], 0x1234 glc dlc -// GFX11: s_load_b128 s[20:23], s[2:3], 0x1234 glc dlc ; encoding: [0x01,0x65,0x08,0xf4,0x34,0x12,0x00,0xf8] - s_load_dwordx8 s[20:27], s[2:3], s0 // GFX11: s_load_b256 s[20:27], s[2:3], s0 ; encoding: [0x01,0x05,0x0c,0xf4,0x00,0x00,0x00,0x00] -s_load_dwordx8 s[24:31], s[2:3], s0 -// GFX11: s_load_b256 s[24:31], s[2:3], s0 ; encoding: [0x01,0x06,0x0c,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx8 s[92:99], s[2:3], s0 -// GFX11: s_load_b256 s[92:99], s[2:3], s0 ; encoding: [0x01,0x17,0x0c,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx8 s[20:27], s[4:5], s0 -// GFX11: s_load_b256 s[20:27], s[4:5], s0 ; encoding: [0x02,0x05,0x0c,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx8 s[20:27], s[100:101], s0 -// GFX11: s_load_b256 s[20:27], s[100:101], s0 ; encoding: [0x32,0x05,0x0c,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx8 s[20:27], vcc, s0 -// GFX11: s_load_b256 s[20:27], vcc, s0 ; encoding: [0x35,0x05,0x0c,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx8 s[20:27], s[2:3], s101 -// GFX11: s_load_b256 s[20:27], s[2:3], s101 ; encoding: [0x01,0x05,0x0c,0xf4,0x00,0x00,0x00,0xca] - -s_load_dwordx8 s[20:27], s[2:3], vcc_lo -// GFX11: s_load_b256 s[20:27], s[2:3], vcc_lo ; encoding: [0x01,0x05,0x0c,0xf4,0x00,0x00,0x00,0xd4] - -s_load_dwordx8 s[20:27], s[2:3], vcc_hi -// GFX11: s_load_b256 s[20:27], s[2:3], vcc_hi ; encoding: [0x01,0x05,0x0c,0xf4,0x00,0x00,0x00,0xd6] - -s_load_dwordx8 s[20:27], s[2:3], m0 -// GFX11: s_load_b256 s[20:27], s[2:3], m0 ; encoding: [0x01,0x05,0x0c,0xf4,0x00,0x00,0x00,0xfa] - -s_load_dwordx8 s[20:27], s[2:3], 
0x0 -// GFX11: s_load_b256 s[20:27], s[2:3], 0x0 ; encoding: [0x01,0x05,0x0c,0xf4,0x00,0x00,0x00,0xf8] - -s_load_dwordx8 s[20:27], s[2:3], s0 glc -// GFX11: s_load_b256 s[20:27], s[2:3], s0 glc ; encoding: [0x01,0x45,0x0c,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx8 s[20:27], s[2:3], s0 dlc -// GFX11: s_load_b256 s[20:27], s[2:3], s0 dlc ; encoding: [0x01,0x25,0x0c,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx8 s[20:27], s[2:3], s0 glc dlc -// GFX11: s_load_b256 s[20:27], s[2:3], s0 glc dlc ; encoding: [0x01,0x65,0x0c,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx8 s[20:27], s[2:3], 0x1234 glc dlc -// GFX11: s_load_b256 s[20:27], s[2:3], 0x1234 glc dlc ; encoding: [0x01,0x65,0x0c,0xf4,0x34,0x12,0x00,0xf8] - s_load_dwordx16 s[20:35], s[2:3], s0 // GFX11: s_load_b512 s[20:35], s[2:3], s0 ; encoding: [0x01,0x05,0x10,0xf4,0x00,0x00,0x00,0x00] -s_load_dwordx16 s[24:39], s[2:3], s0 -// GFX11: s_load_b512 s[24:39], s[2:3], s0 ; encoding: [0x01,0x06,0x10,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx16 s[84:99], s[2:3], s0 -// GFX11: s_load_b512 s[84:99], s[2:3], s0 ; encoding: [0x01,0x15,0x10,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx16 s[20:35], s[4:5], s0 -// GFX11: s_load_b512 s[20:35], s[4:5], s0 ; encoding: [0x02,0x05,0x10,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx16 s[20:35], s[100:101], s0 -// GFX11: s_load_b512 s[20:35], s[100:101], s0 ; encoding: [0x32,0x05,0x10,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx16 s[20:35], vcc, s0 -// GFX11: s_load_b512 s[20:35], vcc, s0 ; encoding: [0x35,0x05,0x10,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx16 s[20:35], s[2:3], s101 -// GFX11: s_load_b512 s[20:35], s[2:3], s101 ; encoding: [0x01,0x05,0x10,0xf4,0x00,0x00,0x00,0xca] - -s_load_dwordx16 s[20:35], s[2:3], vcc_lo -// GFX11: s_load_b512 s[20:35], s[2:3], vcc_lo ; encoding: [0x01,0x05,0x10,0xf4,0x00,0x00,0x00,0xd4] - -s_load_dwordx16 s[20:35], s[2:3], vcc_hi -// GFX11: s_load_b512 s[20:35], s[2:3], vcc_hi ; encoding: [0x01,0x05,0x10,0xf4,0x00,0x00,0x00,0xd6] - -s_load_dwordx16 s[20:35], 
s[2:3], m0 -// GFX11: s_load_b512 s[20:35], s[2:3], m0 ; encoding: [0x01,0x05,0x10,0xf4,0x00,0x00,0x00,0xfa] - -s_load_dwordx16 s[20:35], s[2:3], 0x0 -// GFX11: s_load_b512 s[20:35], s[2:3], 0x0 ; encoding: [0x01,0x05,0x10,0xf4,0x00,0x00,0x00,0xf8] - -s_load_dwordx16 s[20:35], s[2:3], s0 glc -// GFX11: s_load_b512 s[20:35], s[2:3], s0 glc ; encoding: [0x01,0x45,0x10,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx16 s[20:35], s[2:3], s0 dlc -// GFX11: s_load_b512 s[20:35], s[2:3], s0 dlc ; encoding: [0x01,0x25,0x10,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx16 s[20:35], s[2:3], s0 glc dlc -// GFX11: s_load_b512 s[20:35], s[2:3], s0 glc dlc ; encoding: [0x01,0x65,0x10,0xf4,0x00,0x00,0x00,0x00] - -s_load_dwordx16 s[20:35], s[2:3], 0x1234 glc dlc -// GFX11: s_load_b512 s[20:35], s[2:3], 0x1234 glc dlc ; encoding: [0x01,0x65,0x10,0xf4,0x34,0x12,0x00,0xf8] - s_buffer_load_dword s5, s[4:7], s0 // GFX11: s_buffer_load_b32 s5, s[4:7], s0 ; encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0x00] -s_buffer_load_dword s101, s[4:7], s0 -// GFX11: s_buffer_load_b32 s101, s[4:7], s0 ; encoding: [0x42,0x19,0x20,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dword vcc_lo, s[4:7], s0 -// GFX11: s_buffer_load_b32 vcc_lo, s[4:7], s0 ; encoding: [0x82,0x1a,0x20,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dword vcc_hi, s[4:7], s0 -// GFX11: s_buffer_load_b32 vcc_hi, s[4:7], s0 ; encoding: [0xc2,0x1a,0x20,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dword s5, s[8:11], s0 -// GFX11: s_buffer_load_b32 s5, s[8:11], s0 ; encoding: [0x44,0x01,0x20,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dword s5, s[96:99], s0 -// GFX11: s_buffer_load_b32 s5, s[96:99], s0 ; encoding: [0x70,0x01,0x20,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dword s5, s[4:7], s101 -// GFX11: s_buffer_load_b32 s5, s[4:7], s101 ; encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0xca] - -s_buffer_load_dword s5, s[4:7], vcc_lo -// GFX11: s_buffer_load_b32 s5, s[4:7], vcc_lo ; encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0xd4] - 
-s_buffer_load_dword s5, s[4:7], vcc_hi -// GFX11: s_buffer_load_b32 s5, s[4:7], vcc_hi ; encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0xd6] - -s_buffer_load_dword s5, s[4:7], m0 -// GFX11: s_buffer_load_b32 s5, s[4:7], m0 ; encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0xfa] - -s_buffer_load_dword s5, s[4:7], 0x0 -// GFX11: s_buffer_load_b32 s5, s[4:7], 0x0 ; encoding: [0x42,0x01,0x20,0xf4,0x00,0x00,0x00,0xf8] - -s_buffer_load_dword s5, s[4:7], s0 glc -// GFX11: s_buffer_load_b32 s5, s[4:7], s0 glc ; encoding: [0x42,0x41,0x20,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dword s5, s[4:7], s0 dlc -// GFX11: s_buffer_load_b32 s5, s[4:7], s0 dlc ; encoding: [0x42,0x21,0x20,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dword s5, s[4:7], s0 glc dlc -// GFX11: s_buffer_load_b32 s5, s[4:7], s0 glc dlc ; encoding: [0x42,0x61,0x20,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dword s5, s[4:7], 0x1234 glc dlc -// GFX11: s_buffer_load_b32 s5, s[4:7], 0x1234 glc dlc ; encoding: [0x42,0x61,0x20,0xf4,0x34,0x12,0x00,0xf8] - s_buffer_load_dwordx2 s[10:11], s[4:7], s0 // GFX11: s_buffer_load_b64 s[10:11], s[4:7], s0 ; encoding: [0x82,0x02,0x24,0xf4,0x00,0x00,0x00,0x00] -s_buffer_load_dwordx2 s[12:13], s[4:7], s0 -// GFX11: s_buffer_load_b64 s[12:13], s[4:7], s0 ; encoding: [0x02,0x03,0x24,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx2 s[100:101], s[4:7], s0 -// GFX11: s_buffer_load_b64 s[100:101], s[4:7], s0 ; encoding: [0x02,0x19,0x24,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx2 vcc, s[4:7], s0 -// GFX11: s_buffer_load_b64 vcc, s[4:7], s0 ; encoding: [0x82,0x1a,0x24,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx2 s[10:11], s[8:11], s0 -// GFX11: s_buffer_load_b64 s[10:11], s[8:11], s0 ; encoding: [0x84,0x02,0x24,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx2 s[10:11], s[96:99], s0 -// GFX11: s_buffer_load_b64 s[10:11], s[96:99], s0 ; encoding: [0xb0,0x02,0x24,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx2 s[10:11], s[4:7], s101 -// GFX11: s_buffer_load_b64 s[10:11], 
s[4:7], s101 ; encoding: [0x82,0x02,0x24,0xf4,0x00,0x00,0x00,0xca] - -s_buffer_load_dwordx2 s[10:11], s[4:7], vcc_lo -// GFX11: s_buffer_load_b64 s[10:11], s[4:7], vcc_lo ; encoding: [0x82,0x02,0x24,0xf4,0x00,0x00,0x00,0xd4] - -s_buffer_load_dwordx2 s[10:11], s[4:7], vcc_hi -// GFX11: s_buffer_load_b64 s[10:11], s[4:7], vcc_hi ; encoding: [0x82,0x02,0x24,0xf4,0x00,0x00,0x00,0xd6] - -s_buffer_load_dwordx2 s[10:11], s[4:7], m0 -// GFX11: s_buffer_load_b64 s[10:11], s[4:7], m0 ; encoding: [0x82,0x02,0x24,0xf4,0x00,0x00,0x00,0xfa] - -s_buffer_load_dwordx2 s[10:11], s[4:7], 0x0 -// GFX11: s_buffer_load_b64 s[10:11], s[4:7], 0x0 ; encoding: [0x82,0x02,0x24,0xf4,0x00,0x00,0x00,0xf8] - -s_buffer_load_dwordx2 s[10:11], s[4:7], s0 glc -// GFX11: s_buffer_load_b64 s[10:11], s[4:7], s0 glc ; encoding: [0x82,0x42,0x24,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx2 s[10:11], s[4:7], s0 dlc -// GFX11: s_buffer_load_b64 s[10:11], s[4:7], s0 dlc ; encoding: [0x82,0x22,0x24,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx2 s[10:11], s[4:7], s0 glc dlc -// GFX11: s_buffer_load_b64 s[10:11], s[4:7], s0 glc dlc ; encoding: [0x82,0x62,0x24,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx2 s[10:11], s[4:7], 0x1234 glc dlc -// GFX11: s_buffer_load_b64 s[10:11], s[4:7], 0x1234 glc dlc ; encoding: [0x82,0x62,0x24,0xf4,0x34,0x12,0x00,0xf8] - s_buffer_load_dwordx4 s[20:23], s[4:7], s0 // GFX11: s_buffer_load_b128 s[20:23], s[4:7], s0 ; encoding: [0x02,0x05,0x28,0xf4,0x00,0x00,0x00,0x00] -s_buffer_load_dwordx4 s[24:27], s[4:7], s0 -// GFX11: s_buffer_load_b128 s[24:27], s[4:7], s0 ; encoding: [0x02,0x06,0x28,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx4 s[96:99], s[4:7], s0 -// GFX11: s_buffer_load_b128 s[96:99], s[4:7], s0 ; encoding: [0x02,0x18,0x28,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx4 s[20:23], s[8:11], s0 -// GFX11: s_buffer_load_b128 s[20:23], s[8:11], s0 ; encoding: [0x04,0x05,0x28,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx4 s[20:23], s[96:99], s0 -// 
GFX11: s_buffer_load_b128 s[20:23], s[96:99], s0 ; encoding: [0x30,0x05,0x28,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx4 s[20:23], s[4:7], s101 -// GFX11: s_buffer_load_b128 s[20:23], s[4:7], s101 ; encoding: [0x02,0x05,0x28,0xf4,0x00,0x00,0x00,0xca] - -s_buffer_load_dwordx4 s[20:23], s[4:7], vcc_lo -// GFX11: s_buffer_load_b128 s[20:23], s[4:7], vcc_lo ; encoding: [0x02,0x05,0x28,0xf4,0x00,0x00,0x00,0xd4] - -s_buffer_load_dwordx4 s[20:23], s[4:7], vcc_hi -// GFX11: s_buffer_load_b128 s[20:23], s[4:7], vcc_hi ; encoding: [0x02,0x05,0x28,0xf4,0x00,0x00,0x00,0xd6] - -s_buffer_load_dwordx4 s[20:23], s[4:7], m0 -// GFX11: s_buffer_load_b128 s[20:23], s[4:7], m0 ; encoding: [0x02,0x05,0x28,0xf4,0x00,0x00,0x00,0xfa] - -s_buffer_load_dwordx4 s[20:23], s[4:7], 0x0 -// GFX11: s_buffer_load_b128 s[20:23], s[4:7], 0x0 ; encoding: [0x02,0x05,0x28,0xf4,0x00,0x00,0x00,0xf8] - -s_buffer_load_dwordx4 s[20:23], s[4:7], s0 glc -// GFX11: s_buffer_load_b128 s[20:23], s[4:7], s0 glc ; encoding: [0x02,0x45,0x28,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx4 s[20:23], s[4:7], s0 dlc -// GFX11: s_buffer_load_b128 s[20:23], s[4:7], s0 dlc ; encoding: [0x02,0x25,0x28,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx4 s[20:23], s[4:7], s0 glc dlc -// GFX11: s_buffer_load_b128 s[20:23], s[4:7], s0 glc dlc ; encoding: [0x02,0x65,0x28,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx4 s[20:23], s[4:7], 0x1234 glc dlc -// GFX11: s_buffer_load_b128 s[20:23], s[4:7], 0x1234 glc dlc ; encoding: [0x02,0x65,0x28,0xf4,0x34,0x12,0x00,0xf8] - s_buffer_load_dwordx8 s[20:27], s[4:7], s0 // GFX11: s_buffer_load_b256 s[20:27], s[4:7], s0 ; encoding: [0x02,0x05,0x2c,0xf4,0x00,0x00,0x00,0x00] -s_buffer_load_dwordx8 s[24:31], s[4:7], s0 -// GFX11: s_buffer_load_b256 s[24:31], s[4:7], s0 ; encoding: [0x02,0x06,0x2c,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx8 s[92:99], s[4:7], s0 -// GFX11: s_buffer_load_b256 s[92:99], s[4:7], s0 ; encoding: [0x02,0x17,0x2c,0xf4,0x00,0x00,0x00,0x00] - 
-s_buffer_load_dwordx8 s[20:27], s[8:11], s0 -// GFX11: s_buffer_load_b256 s[20:27], s[8:11], s0 ; encoding: [0x04,0x05,0x2c,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx8 s[20:27], s[96:99], s0 -// GFX11: s_buffer_load_b256 s[20:27], s[96:99], s0 ; encoding: [0x30,0x05,0x2c,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx8 s[20:27], s[4:7], s101 -// GFX11: s_buffer_load_b256 s[20:27], s[4:7], s101 ; encoding: [0x02,0x05,0x2c,0xf4,0x00,0x00,0x00,0xca] - -s_buffer_load_dwordx8 s[20:27], s[4:7], vcc_lo -// GFX11: s_buffer_load_b256 s[20:27], s[4:7], vcc_lo ; encoding: [0x02,0x05,0x2c,0xf4,0x00,0x00,0x00,0xd4] - -s_buffer_load_dwordx8 s[20:27], s[4:7], vcc_hi -// GFX11: s_buffer_load_b256 s[20:27], s[4:7], vcc_hi ; encoding: [0x02,0x05,0x2c,0xf4,0x00,0x00,0x00,0xd6] - -s_buffer_load_dwordx8 s[20:27], s[4:7], m0 -// GFX11: s_buffer_load_b256 s[20:27], s[4:7], m0 ; encoding: [0x02,0x05,0x2c,0xf4,0x00,0x00,0x00,0xfa] - -s_buffer_load_dwordx8 s[20:27], s[4:7], 0x0 -// GFX11: s_buffer_load_b256 s[20:27], s[4:7], 0x0 ; encoding: [0x02,0x05,0x2c,0xf4,0x00,0x00,0x00,0xf8] - -s_buffer_load_dwordx8 s[20:27], s[4:7], s0 glc -// GFX11: s_buffer_load_b256 s[20:27], s[4:7], s0 glc ; encoding: [0x02,0x45,0x2c,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx8 s[20:27], s[4:7], s0 dlc -// GFX11: s_buffer_load_b256 s[20:27], s[4:7], s0 dlc ; encoding: [0x02,0x25,0x2c,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx8 s[20:27], s[4:7], s0 glc dlc -// GFX11: s_buffer_load_b256 s[20:27], s[4:7], s0 glc dlc ; encoding: [0x02,0x65,0x2c,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx8 s[20:27], s[4:7], 0x1234 glc dlc -// GFX11: s_buffer_load_b256 s[20:27], s[4:7], 0x1234 glc dlc ; encoding: [0x02,0x65,0x2c,0xf4,0x34,0x12,0x00,0xf8] - s_buffer_load_dwordx16 s[20:35], s[4:7], s0 // GFX11: s_buffer_load_b512 s[20:35], s[4:7], s0 ; encoding: [0x02,0x05,0x30,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx16 s[24:39], s[4:7], s0 -// GFX11: s_buffer_load_b512 s[24:39], s[4:7], s0 ; 
encoding: [0x02,0x06,0x30,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx16 s[84:99], s[4:7], s0 -// GFX11: s_buffer_load_b512 s[84:99], s[4:7], s0 ; encoding: [0x02,0x15,0x30,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx16 s[20:35], s[8:11], s0 -// GFX11: s_buffer_load_b512 s[20:35], s[8:11], s0 ; encoding: [0x04,0x05,0x30,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx16 s[20:35], s[96:99], s0 -// GFX11: s_buffer_load_b512 s[20:35], s[96:99], s0 ; encoding: [0x30,0x05,0x30,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx16 s[20:35], s[4:7], s101 -// GFX11: s_buffer_load_b512 s[20:35], s[4:7], s101 ; encoding: [0x02,0x05,0x30,0xf4,0x00,0x00,0x00,0xca] - -s_buffer_load_dwordx16 s[20:35], s[4:7], vcc_lo -// GFX11: s_buffer_load_b512 s[20:35], s[4:7], vcc_lo ; encoding: [0x02,0x05,0x30,0xf4,0x00,0x00,0x00,0xd4] - -s_buffer_load_dwordx16 s[20:35], s[4:7], vcc_hi -// GFX11: s_buffer_load_b512 s[20:35], s[4:7], vcc_hi ; encoding: [0x02,0x05,0x30,0xf4,0x00,0x00,0x00,0xd6] - -s_buffer_load_dwordx16 s[20:35], s[4:7], m0 -// GFX11: s_buffer_load_b512 s[20:35], s[4:7], m0 ; encoding: [0x02,0x05,0x30,0xf4,0x00,0x00,0x00,0xfa] - -s_buffer_load_dwordx16 s[20:35], s[4:7], 0x0 -// GFX11: s_buffer_load_b512 s[20:35], s[4:7], 0x0 ; encoding: [0x02,0x05,0x30,0xf4,0x00,0x00,0x00,0xf8] - -s_buffer_load_dwordx16 s[20:35], s[4:7], s0 glc -// GFX11: s_buffer_load_b512 s[20:35], s[4:7], s0 glc ; encoding: [0x02,0x45,0x30,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx16 s[20:35], s[4:7], s0 dlc -// GFX11: s_buffer_load_b512 s[20:35], s[4:7], s0 dlc ; encoding: [0x02,0x25,0x30,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx16 s[20:35], s[4:7], s0 glc dlc -// GFX11: s_buffer_load_b512 s[20:35], s[4:7], s0 glc dlc ; encoding: [0x02,0x65,0x30,0xf4,0x00,0x00,0x00,0x00] - -s_buffer_load_dwordx16 s[20:35], s[4:7], 0x1234 glc dlc -// GFX11: s_buffer_load_b512 s[20:35], s[4:7], 0x1234 glc dlc ; encoding: [0x02,0x65,0x30,0xf4,0x34,0x12,0x00,0xf8] diff --git 
a/llvm/test/MC/AMDGPU/gfx12_asm_sop2_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop2_alias.s index 86c3bdbaf8300..6efc561991c73 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sop2_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop2_alias.s @@ -27,197 +27,11 @@ s_subb_u32 s0, s1, s2 s_min_f32 s5, s1, s2 // GFX12: encoding: [0x01,0x02,0x05,0xa1] -s_min_f32 s105, s1, s2 -// GFX12: encoding: [0x01,0x02,0x69,0xa1] - -s_min_f32 s5, s105, s2 -// GFX12: encoding: [0x69,0x02,0x05,0xa1] - -s_min_f32 s5, s103, s2 -// GFX12: encoding: [0x67,0x02,0x05,0xa1] - -s_min_f32 s5, vcc_lo, s2 -// GFX12: encoding: [0x6a,0x02,0x05,0xa1] - -s_min_f32 s5, vcc_hi, s2 -// GFX12: encoding: [0x6b,0x02,0x05,0xa1] - -s_min_f32 s5, ttmp11, s2 -// GFX12: encoding: [0x77,0x02,0x05,0xa1] - -s_min_f32 s5, m0, s2 -// GFX12: encoding: [0x7d,0x02,0x05,0xa1] - -s_min_f32 s5, exec_lo, s2 -// GFX12: encoding: [0x7e,0x02,0x05,0xa1] - -s_min_f32 s5, exec_hi, s2 -// GFX12: encoding: [0x7f,0x02,0x05,0xa1] - -s_min_f32 s5, 0, s2 -// GFX12: encoding: [0x80,0x02,0x05,0xa1] - -s_min_f32 s5, -1, s2 -// GFX12: encoding: [0xc1,0x02,0x05,0xa1] - -s_min_f32 s5, 0.5, s2 -// GFX12: encoding: [0xf0,0x02,0x05,0xa1] - -s_min_f32 s5, -4.0, s2 -// GFX12: encoding: [0xf7,0x02,0x05,0xa1] - -s_min_f32 s5, 0xaf123456, s2 -// GFX12: encoding: [0xff,0x02,0x05,0xa1,0x56,0x34,0x12,0xaf] - -s_min_f32 s5, 0x3f717273, s2 -// GFX12: encoding: [0xff,0x02,0x05,0xa1,0x73,0x72,0x71,0x3f] - -s_min_f32 s5, s1, s105 -// GFX12: encoding: [0x01,0x69,0x05,0xa1] - s_max_f32 s5, s1, s2 // GFX12: encoding: [0x01,0x02,0x85,0xa1] -s_max_f32 s105, s1, s2 -// GFX12: encoding: [0x01,0x02,0xe9,0xa1] - -s_max_f32 s5, s105, s2 -// GFX12: encoding: [0x69,0x02,0x85,0xa1] - -s_max_f32 s5, s103, s2 -// GFX12: encoding: [0x67,0x02,0x85,0xa1] - -s_max_f32 s5, vcc_lo, s2 -// GFX12: encoding: [0x6a,0x02,0x85,0xa1] - -s_max_f32 s5, vcc_hi, s2 -// GFX12: encoding: [0x6b,0x02,0x85,0xa1] - -s_max_f32 s5, ttmp11, s2 -// GFX12: encoding: [0x77,0x02,0x85,0xa1] - -s_max_f32 s5, m0, 
s2 -// GFX12: encoding: [0x7d,0x02,0x85,0xa1] - -s_max_f32 s5, exec_lo, s2 -// GFX12: encoding: [0x7e,0x02,0x85,0xa1] - -s_max_f32 s5, exec_hi, s2 -// GFX12: encoding: [0x7f,0x02,0x85,0xa1] - -s_max_f32 s5, 0, s2 -// GFX12: encoding: [0x80,0x02,0x85,0xa1] - -s_max_f32 s5, -1, s2 -// GFX12: encoding: [0xc1,0x02,0x85,0xa1] - -s_max_f32 s5, 0.5, s2 -// GFX12: encoding: [0xf0,0x02,0x85,0xa1] - -s_max_f32 s5, -4.0, s2 -// GFX12: encoding: [0xf7,0x02,0x85,0xa1] - -s_max_f32 s5, 0xaf123456, s2 -// GFX12: encoding: [0xff,0x02,0x85,0xa1,0x56,0x34,0x12,0xaf] - -s_max_f32 s5, 0x3f717273, s2 -// GFX12: encoding: [0xff,0x02,0x85,0xa1,0x73,0x72,0x71,0x3f] - -s_max_f32 s5, s1, s105 -// GFX12: encoding: [0x01,0x69,0x85,0xa1] - s_max_f16 s5, s1, s2 // GFX12: encoding: [0x01,0x02,0x05,0xa6] -s_max_f16 s105, s1, s2 -// GFX12: encoding: [0x01,0x02,0x69,0xa6] - -s_max_f16 s5, s105, s2 -// GFX12: encoding: [0x69,0x02,0x05,0xa6] - -s_max_f16 s5, s101, s2 -// GFX12: encoding: [0x65,0x02,0x05,0xa6] - -s_max_f16 s5, vcc_lo, s2 -// GFX12: encoding: [0x6a,0x02,0x05,0xa6] - -s_max_f16 s5, vcc_hi, s2 -// GFX12: encoding: [0x6b,0x02,0x05,0xa6] - -s_max_f16 s5, m0, s2 -// GFX12: encoding: [0x7d,0x02,0x05,0xa6] - -s_max_f16 s5, exec_lo, s2 -// GFX12: encoding: [0x7e,0x02,0x05,0xa6] - -s_max_f16 s5, exec_hi, s2 -// GFX12: encoding: [0x7f,0x02,0x05,0xa6] - -s_max_f16 s5, 0, s2 -// GFX12: encoding: [0x80,0x02,0x05,0xa6] - -s_max_f16 s5, -1, s2 -// GFX12: encoding: [0xc1,0x02,0x05,0xa6] - -s_max_f16 s5, 0.5, s2 -// GFX12: encoding: [0xf0,0x02,0x05,0xa6] - -s_max_f16 s5, -4.0, s2 -// GFX12: encoding: [0xf7,0x02,0x05,0xa6] - -s_max_f16 s5, 0xfe0b, s2 -// GFX12: encoding: [0xff,0x02,0x05,0xa6,0x0b,0xfe,0x00,0x00] - -s_max_f16 s5, 0x3456, s2 -// GFX12: encoding: [0xff,0x02,0x05,0xa6,0x56,0x34,0x00,0x00] - -s_max_f16 s5, s1, s105 -// GFX12: encoding: [0x01,0x69,0x05,0xa6] - s_min_f16 s5, s1, s2 // GFX12: encoding: [0x01,0x02,0x85,0xa5] - -s_min_f16 s105, s1, s2 -// GFX12: encoding: [0x01,0x02,0xe9,0xa5] - 
-s_min_f16 s5, s105, s2 -// GFX12: encoding: [0x69,0x02,0x85,0xa5] - -s_min_f16 s5, s101, s2 -// GFX12: encoding: [0x65,0x02,0x85,0xa5] - -s_min_f16 s5, vcc_lo, s2 -// GFX12: encoding: [0x6a,0x02,0x85,0xa5] - -s_min_f16 s5, vcc_hi, s2 -// GFX12: encoding: [0x6b,0x02,0x85,0xa5] - -s_min_f16 s5, m0, s2 -// GFX12: encoding: [0x7d,0x02,0x85,0xa5] - -s_min_f16 s5, exec_lo, s2 -// GFX12: encoding: [0x7e,0x02,0x85,0xa5] - -s_min_f16 s5, exec_hi, s2 -// GFX12: encoding: [0x7f,0x02,0x85,0xa5] - -s_min_f16 s5, 0, s2 -// GFX12: encoding: [0x80,0x02,0x85,0xa5] - -s_min_f16 s5, -1, s2 -// GFX12: encoding: [0xc1,0x02,0x85,0xa5] - -s_min_f16 s5, 0.5, s2 -// GFX12: encoding: [0xf0,0x02,0x85,0xa5] - -s_min_f16 s5, -4.0, s2 -// GFX12: encoding: [0xf7,0x02,0x85,0xa5] - -s_min_f16 s5, 0xfe0b, s2 -// GFX12: encoding: [0xff,0x02,0x85,0xa5,0x0b,0xfe,0x00,0x00] - -s_min_f16 s5, 0x3456, s2 -// GFX12: encoding: [0xff,0x02,0x85,0xa5,0x56,0x34,0x00,0x00] - -s_min_f16 s5, s1, s105 -// GFX12: encoding: [0x01,0x69,0x85,0xa5] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_alias.s index 15d6269b6f595..ebe3af66bb0c3 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_alias.s @@ -1,434 +1,25 @@ -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR --implicit-check-not=error: %s +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s tbuffer_load_format_d16_x v4, off, s[8:11], s3 format:[BUF_FMT_8_UNORM] offset:8388607 // GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -tbuffer_load_format_d16_x v255, off, s[8:11], s3 format:1 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x22,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - 
-tbuffer_load_format_d16_x v4, off, s[12:15], s3 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_UNORM] offset:8388607 -// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_x v4, off, s[12:15], s101 format:[BUF_FMT_8_SNORM] offset:8388607 -// GFX12: encoding: [0x65,0x00,0x22,0xc4,0x04,0x18,0x00,0x01,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_x v4, off, s[12:15], m0 format:2 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x22,0xc4,0x04,0x18,0x00,0x01,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_x v4, off, s[8:11], s0 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_SNORM] offset:8388607 -// GFX12: encoding: [0x00,0x00,0x22,0xc4,0x04,0x10,0x00,0x01,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_x v4, off, s[8:11], s61 format:[BUF_FMT_8_USCALED] offset:8388607 -// GFX12: encoding: [0x3d,0x00,0x22,0xc4,0x04,0x10,0x80,0x01,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_x v4, off, ttmp[4:7], s61 format:3 offset:8388607 -// GFX12: encoding: [0x3d,0x00,0x22,0xc4,0x04,0xe0,0x80,0x01,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_x v4, v1, s[8:11], s3 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_USCALED] offen offset:52 -// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0x10,0x80,0x41,0x01,0x34,0x00,0x00] - -tbuffer_load_format_d16_x v4, v1, s[8:11], s3 format:[BUF_FMT_8_SSCALED] idxen offset:52 -// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0x10,0x00,0x82,0x01,0x34,0x00,0x00] - -tbuffer_load_format_d16_x v4, v[1:2], s[8:11], s0 format:4 idxen offen offset:52 -// GFX12: encoding: [0x00,0x00,0x22,0xc4,0x04,0x10,0x00,0xc2,0x01,0x34,0x00,0x00] - -tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_SSCALED] offset:8388607 th:TH_LOAD_NT_HT scope:SCOPE_DEV -// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0xe0,0x68,0x02,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_FMT_8_UINT] offset:8388607 th:TH_LOAD_BYPASS scope:SCOPE_SYS -// GFX12: encoding: 
[0x03,0x00,0x22,0xc4,0x04,0xe0,0xbc,0x02,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3 format:5 offset:8388607 scope:SCOPE_SE -// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0xe0,0x84,0x02,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_x v4, off, ttmp[4:7], 0 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_UINT] offset:8388607 -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode - -tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_UINT] offset:8388607 glc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_UINT] offset:8388607 slc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -tbuffer_load_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_UINT] offset:8388607 dlc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - tbuffer_load_format_d16_xy v4, off, s[8:11], s3 format:[BUF_FMT_8_SINT] offset:8388607 // GFX12: encoding: [0x03,0x40,0x22,0xc4,0x04,0x10,0x00,0x03,0x00,0xff,0xff,0x7f] -tbuffer_load_format_d16_xy v255, off, s[8:11], s3 format:6 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x22,0xc4,0xff,0x10,0x00,0x03,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xy v4, off, s[12:15], s3 format:[BUF_DATA_FORMAT_8, BUF_NUM_FORMAT_SINT] offset:8388607 -// GFX12: encoding: [0x03,0x40,0x22,0xc4,0x04,0x18,0x00,0x03,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xy v4, off, s[12:15], s101 format:[BUF_FMT_16_UNORM] offset:8388607 -// GFX12: encoding: [0x65,0x40,0x22,0xc4,0x04,0x18,0x80,0x03,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xy v4, off, s[12:15], m0 format:7 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x22,0xc4,0x04,0x18,0x80,0x03,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xy v4, off, s[8:11], s0 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_UNORM] 
offset:8388607 -// GFX12: encoding: [0x00,0x40,0x22,0xc4,0x04,0x10,0x80,0x03,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xy v4, off, s[8:11], s61 format:[BUF_FMT_16_SNORM] offset:8388607 -// GFX12: encoding: [0x3d,0x40,0x22,0xc4,0x04,0x10,0x00,0x04,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s61 format:8 offset:8388607 -// GFX12: encoding: [0x3d,0x40,0x22,0xc4,0x04,0xe0,0x00,0x04,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xy v4, v1, s[8:11], s3 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SNORM] offen offset:52 -// GFX12: encoding: [0x03,0x40,0x22,0xc4,0x04,0x10,0x00,0x44,0x01,0x34,0x00,0x00] - -tbuffer_load_format_d16_xy v4, v1, s[8:11], s3 format:[BUF_FMT_16_USCALED] idxen offset:52 -// GFX12: encoding: [0x03,0x40,0x22,0xc4,0x04,0x10,0x80,0x84,0x01,0x34,0x00,0x00] - -tbuffer_load_format_d16_xy v4, v[1:2], s[8:11], s0 format:9 idxen offen offset:52 -// GFX12: encoding: [0x00,0x40,0x22,0xc4,0x04,0x10,0x80,0xc4,0x01,0x34,0x00,0x00] - -tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_USCALED] offset:8388607 th:TH_LOAD_NT_HT scope:SCOPE_DEV -// GFX12: encoding: [0x03,0x40,0x22,0xc4,0x04,0xe0,0xe8,0x04,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_FMT_16_SSCALED] offset:8388607 th:TH_LOAD_BYPASS scope:SCOPE_SYS -// GFX12: encoding: [0x03,0x40,0x22,0xc4,0x04,0xe0,0x3c,0x05,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3 format:10 offset:8388607 scope:SCOPE_SE -// GFX12: encoding: [0x03,0x40,0x22,0xc4,0x04,0xe0,0x04,0x05,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xy v4, off, ttmp[4:7], 0 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SSCALED] offset:8388607 -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode - -tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SSCALED] offset:8388607 glc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid 
operand for instruction - -tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SSCALED] offset:8388607 slc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -tbuffer_load_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SSCALED] offset:8388607 dlc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - tbuffer_load_format_d16_xyz v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_UINT] offset:8388607 // GFX12: encoding: [0x03,0x80,0x22,0xc4,0x04,0x10,0x80,0x05,0x00,0xff,0xff,0x7f] -tbuffer_load_format_d16_xyz v[254:255], off, s[8:11], s3 format:11 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x22,0xc4,0xfe,0x10,0x80,0x05,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyz v[4:5], off, s[12:15], s3 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_UINT] offset:8388607 -// GFX12: encoding: [0x03,0x80,0x22,0xc4,0x04,0x18,0x80,0x05,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyz v[4:5], off, s[12:15], s101 format:[BUF_FMT_16_SINT] offset:8388607 -// GFX12: encoding: [0x65,0x80,0x22,0xc4,0x04,0x18,0x00,0x06,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyz v[4:5], off, s[12:15], m0 format:12 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x22,0xc4,0x04,0x18,0x00,0x06,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyz v[4:5], off, s[8:11], s0 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_SINT] offset:8388607 -// GFX12: encoding: [0x00,0x80,0x22,0xc4,0x04,0x10,0x00,0x06,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyz v[4:5], off, s[8:11], s61 format:[BUF_FMT_16_FLOAT] offset:8388607 -// GFX12: encoding: [0x3d,0x80,0x22,0xc4,0x04,0x10,0x80,0x06,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s61 format:13 offset:8388607 -// GFX12: encoding: [0x3d,0x80,0x22,0xc4,0x04,0xe0,0x80,0x06,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyz v[4:5], v1, s[8:11], s3 format:[BUF_DATA_FORMAT_16, BUF_NUM_FORMAT_FLOAT] offen offset:52 -// 
GFX12: encoding: [0x03,0x80,0x22,0xc4,0x04,0x10,0x80,0x46,0x01,0x34,0x00,0x00] - -tbuffer_load_format_d16_xyz v[4:5], v1, s[8:11], s3 format:[BUF_FMT_8_8_UNORM] idxen offset:52 -// GFX12: encoding: [0x03,0x80,0x22,0xc4,0x04,0x10,0x00,0x87,0x01,0x34,0x00,0x00] - -tbuffer_load_format_d16_xyz v[4:5], v[1:2], s[8:11], s0 format:14 idxen offen offset:52 -// GFX12: encoding: [0x00,0x80,0x22,0xc4,0x04,0x10,0x00,0xc7,0x01,0x34,0x00,0x00] - -tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_UNORM] offset:8388607 th:TH_LOAD_NT_HT scope:SCOPE_DEV -// GFX12: encoding: [0x03,0x80,0x22,0xc4,0x04,0xe0,0x68,0x07,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_FMT_8_8_SNORM] offset:8388607 th:TH_LOAD_BYPASS scope:SCOPE_SYS -// GFX12: encoding: [0x03,0x80,0x22,0xc4,0x04,0xe0,0xbc,0x07,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:15 offset:8388607 scope:SCOPE_SE -// GFX12: encoding: [0x03,0x80,0x22,0xc4,0x04,0xe0,0x84,0x07,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], 0 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SNORM] offset:8388607 -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode - -tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SNORM] offset:8388607 glc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SNORM] offset:8388607 slc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -tbuffer_load_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SNORM] offset:8388607 dlc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - tbuffer_load_format_d16_xyzw v[4:5], off, s[8:11], s3 format:[BUF_FMT_8_8_USCALED] 
offset:8388607 // GFX12: encoding: [0x03,0xc0,0x22,0xc4,0x04,0x10,0x00,0x08,0x00,0xff,0xff,0x7f] -tbuffer_load_format_d16_xyzw v[254:255], off, s[8:11], s3 format:16 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x22,0xc4,0xfe,0x10,0x00,0x08,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyzw v[4:5], off, s[12:15], s3 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_USCALED] offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x22,0xc4,0x04,0x18,0x00,0x08,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyzw v[4:5], off, s[12:15], s101 format:[BUF_FMT_8_8_SSCALED] offset:8388607 -// GFX12: encoding: [0x65,0xc0,0x22,0xc4,0x04,0x18,0x80,0x08,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyzw v[4:5], off, s[12:15], m0 format:17 offset:8388607 -// GFX12: encoding: [0x7d,0xc0,0x22,0xc4,0x04,0x18,0x80,0x08,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyzw v[4:5], off, s[8:11], s0 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SSCALED] offset:8388607 -// GFX12: encoding: [0x00,0xc0,0x22,0xc4,0x04,0x10,0x80,0x08,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyzw v[4:5], off, s[8:11], s61 format:[BUF_FMT_8_8_UINT] offset:8388607 -// GFX12: encoding: [0x3d,0xc0,0x22,0xc4,0x04,0x10,0x00,0x09,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s61 format:18 offset:8388607 -// GFX12: encoding: [0x3d,0xc0,0x22,0xc4,0x04,0xe0,0x00,0x09,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyzw v[4:5], v1, s[8:11], s3 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_UINT] offen offset:52 -// GFX12: encoding: [0x03,0xc0,0x22,0xc4,0x04,0x10,0x00,0x49,0x01,0x34,0x00,0x00] - -tbuffer_load_format_d16_xyzw v[4:5], v1, s[8:11], s3 format:[BUF_FMT_8_8_SINT] idxen offset:52 -// GFX12: encoding: [0x03,0xc0,0x22,0xc4,0x04,0x10,0x80,0x89,0x01,0x34,0x00,0x00] - -tbuffer_load_format_d16_xyzw v[4:5], v[1:2], s[8:11], s0 format:19 idxen offen offset:52 -// GFX12: encoding: [0x00,0xc0,0x22,0xc4,0x04,0x10,0x80,0xc9,0x01,0x34,0x00,0x00] - -tbuffer_load_format_d16_xyzw v[4:5], off, 
ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8, BUF_NUM_FORMAT_SINT] offset:8388607 th:TH_LOAD_NT_HT scope:SCOPE_DEV -// GFX12: encoding: [0x03,0xc0,0x22,0xc4,0x04,0xe0,0xe8,0x09,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_FMT_32_UINT] offset:8388607 th:TH_LOAD_BYPASS scope:SCOPE_SYS -// GFX12: encoding: [0x03,0xc0,0x22,0xc4,0x04,0xe0,0x3c,0x0a,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:20 offset:8388607 scope:SCOPE_SE -// GFX12: encoding: [0x03,0xc0,0x22,0xc4,0x04,0xe0,0x04,0x0a,0x00,0xff,0xff,0x7f] - -tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], 0 format:[BUF_DATA_FORMAT_32, BUF_NUM_FORMAT_UINT] offset:8388607 -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode - -tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32, BUF_NUM_FORMAT_UINT] offset:8388607 glc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32, BUF_NUM_FORMAT_UINT] offset:8388607 slc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -tbuffer_load_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32, BUF_NUM_FORMAT_UINT] offset:8388607 dlc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - tbuffer_store_format_d16_x v4, off, s[8:11], s3 format:[BUF_FMT_2_10_10_10_SINT] offset:8388607 // GFX12: encoding: [0x03,0x00,0x23,0xc4,0x04,0x10,0x80,0x14,0x00,0xff,0xff,0x7f] -tbuffer_store_format_d16_x v255, off, s[8:11], s3 format:41 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x23,0xc4,0xff,0x10,0x80,0x14,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_x v4, off, s[12:15], s3 format:[BUF_DATA_FORMAT_2_10_10_10, BUF_NUM_FORMAT_SINT] offset:8388607 -// GFX12: encoding: [0x03,0x00,0x23,0xc4,0x04,0x18,0x80,0x14,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_x 
v4, off, s[12:15], s101 format:[BUF_FMT_8_8_8_8_UNORM] offset:8388607 -// GFX12: encoding: [0x65,0x00,0x23,0xc4,0x04,0x18,0x00,0x15,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_x v4, off, s[12:15], m0 format:42 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x23,0xc4,0x04,0x18,0x00,0x15,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_x v4, off, s[8:11], s0 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_UNORM] offset:8388607 -// GFX12: encoding: [0x00,0x00,0x23,0xc4,0x04,0x10,0x00,0x15,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_x v4, off, s[8:11], s61 format:[BUF_FMT_8_8_8_8_SNORM] offset:8388607 -// GFX12: encoding: [0x3d,0x00,0x23,0xc4,0x04,0x10,0x80,0x15,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_x v4, off, ttmp[4:7], s61 format:43 offset:8388607 -// GFX12: encoding: [0x3d,0x00,0x23,0xc4,0x04,0xe0,0x80,0x15,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_x v4, v1, s[8:11], s3 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SNORM] offen offset:52 -// GFX12: encoding: [0x03,0x00,0x23,0xc4,0x04,0x10,0x80,0x55,0x01,0x34,0x00,0x00] - -tbuffer_store_format_d16_x v4, v1, s[8:11], s3 format:[BUF_FMT_8_8_8_8_USCALED] idxen offset:52 -// GFX12: encoding: [0x03,0x00,0x23,0xc4,0x04,0x10,0x00,0x96,0x01,0x34,0x00,0x00] - -tbuffer_store_format_d16_x v4, v[1:2], s[8:11], s0 format:44 idxen offen offset:52 -// GFX12: encoding: [0x00,0x00,0x23,0xc4,0x04,0x10,0x00,0xd6,0x01,0x34,0x00,0x00] - -tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_USCALED] offset:8388607 th:TH_STORE_NT_HT scope:SCOPE_DEV -// GFX12: encoding: [0x03,0x00,0x23,0xc4,0x04,0xe0,0x68,0x16,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_FMT_8_8_8_8_SSCALED] offset:8388607 th:TH_STORE_BYPASS scope:SCOPE_SYS -// GFX12: encoding: [0x03,0x00,0x23,0xc4,0x04,0xe0,0xbc,0x16,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3 format:45 offset:8388607 scope:SCOPE_SE -// GFX12: encoding: 
[0x03,0x00,0x23,0xc4,0x04,0xe0,0x84,0x16,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_x v4, off, ttmp[4:7], 0 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SSCALED] offset:8388607 -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode - -tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SSCALED] offset:8388607 glc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SSCALED] offset:8388607 slc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -tbuffer_store_format_d16_x v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SSCALED] offset:8388607 dlc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - tbuffer_store_format_d16_xy v4, off, s[8:11], s3 format:[BUF_FMT_8_8_8_8_UINT] offset:8388607 // GFX12: encoding: [0x03,0x40,0x23,0xc4,0x04,0x10,0x00,0x17,0x00,0xff,0xff,0x7f] -tbuffer_store_format_d16_xy v255, off, s[8:11], s3 format:46 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x23,0xc4,0xff,0x10,0x00,0x17,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xy v4, off, s[12:15], s3 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_UINT] offset:8388607 -// GFX12: encoding: [0x03,0x40,0x23,0xc4,0x04,0x18,0x00,0x17,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xy v4, off, s[12:15], s101 format:[BUF_FMT_8_8_8_8_SINT] offset:8388607 -// GFX12: encoding: [0x65,0x40,0x23,0xc4,0x04,0x18,0x80,0x17,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xy v4, off, s[12:15], m0 format:47 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x23,0xc4,0x04,0x18,0x80,0x17,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xy v4, off, s[8:11], s0 format:[BUF_DATA_FORMAT_8_8_8_8, BUF_NUM_FORMAT_SINT] offset:8388607 -// GFX12: encoding: [0x00,0x40,0x23,0xc4,0x04,0x10,0x80,0x17,0x00,0xff,0xff,0x7f] - 
-tbuffer_store_format_d16_xy v4, off, s[8:11], s61 format:[BUF_FMT_32_32_UINT] offset:8388607 -// GFX12: encoding: [0x3d,0x40,0x23,0xc4,0x04,0x10,0x00,0x18,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s61 format:48 offset:8388607 -// GFX12: encoding: [0x3d,0x40,0x23,0xc4,0x04,0xe0,0x00,0x18,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xy v4, v1, s[8:11], s3 format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_UINT] offen offset:52 -// GFX12: encoding: [0x03,0x40,0x23,0xc4,0x04,0x10,0x00,0x58,0x01,0x34,0x00,0x00] - -tbuffer_store_format_d16_xy v4, v1, s[8:11], s3 format:[BUF_FMT_32_32_SINT] idxen offset:52 -// GFX12: encoding: [0x03,0x40,0x23,0xc4,0x04,0x10,0x80,0x98,0x01,0x34,0x00,0x00] - -tbuffer_store_format_d16_xy v4, v[1:2], s[8:11], s0 format:49 idxen offen offset:52 -// GFX12: encoding: [0x00,0x40,0x23,0xc4,0x04,0x10,0x80,0xd8,0x01,0x34,0x00,0x00] - -tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_SINT] offset:8388607 th:TH_STORE_NT_HT scope:SCOPE_DEV -// GFX12: encoding: [0x03,0x40,0x23,0xc4,0x04,0xe0,0xe8,0x18,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_FMT_32_32_FLOAT] offset:8388607 th:TH_STORE_BYPASS scope:SCOPE_SYS -// GFX12: encoding: [0x03,0x40,0x23,0xc4,0x04,0xe0,0x3c,0x19,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3 format:50 offset:8388607 scope:SCOPE_SE -// GFX12: encoding: [0x03,0x40,0x23,0xc4,0x04,0xe0,0x04,0x19,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xy v4, off, ttmp[4:7], 0 format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_FLOAT] offset:8388607 -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode - -tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_FLOAT] offset:8388607 glc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3 
format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_FLOAT] offset:8388607 slc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -tbuffer_store_format_d16_xy v4, off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32_32, BUF_NUM_FORMAT_FLOAT] offset:8388607 dlc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - tbuffer_store_format_d16_xyz v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_16_16_16_UNORM] offset:8388607 // GFX12: encoding: [0x03,0x80,0x23,0xc4,0x04,0x10,0x80,0x19,0x00,0xff,0xff,0x7f] -tbuffer_store_format_d16_xyz v[254:255], off, s[8:11], s3 format:51 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x23,0xc4,0xfe,0x10,0x80,0x19,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyz v[4:5], off, s[12:15], s3 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_UNORM] offset:8388607 -// GFX12: encoding: [0x03,0x80,0x23,0xc4,0x04,0x18,0x80,0x19,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyz v[4:5], off, s[12:15], s101 format:[BUF_FMT_16_16_16_16_SNORM] offset:8388607 -// GFX12: encoding: [0x65,0x80,0x23,0xc4,0x04,0x18,0x00,0x1a,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyz v[4:5], off, s[12:15], m0 format:52 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x23,0xc4,0x04,0x18,0x00,0x1a,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyz v[4:5], off, s[8:11], s0 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_SNORM] offset:8388607 -// GFX12: encoding: [0x00,0x80,0x23,0xc4,0x04,0x10,0x00,0x1a,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyz v[4:5], off, s[8:11], s61 format:[BUF_FMT_16_16_16_16_USCALED] offset:8388607 -// GFX12: encoding: [0x3d,0x80,0x23,0xc4,0x04,0x10,0x80,0x1a,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s61 format:53 offset:8388607 -// GFX12: encoding: [0x3d,0x80,0x23,0xc4,0x04,0xe0,0x80,0x1a,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyz v[4:5], v1, s[8:11], s3 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_USCALED] offen offset:52 -// 
GFX12: encoding: [0x03,0x80,0x23,0xc4,0x04,0x10,0x80,0x5a,0x01,0x34,0x00,0x00] - -tbuffer_store_format_d16_xyz v[4:5], v1, s[8:11], s3 format:[BUF_FMT_16_16_16_16_SSCALED] idxen offset:52 -// GFX12: encoding: [0x03,0x80,0x23,0xc4,0x04,0x10,0x00,0x9b,0x01,0x34,0x00,0x00] - -tbuffer_store_format_d16_xyz v[4:5], v[1:2], s[8:11], s0 format:54 idxen offen offset:52 -// GFX12: encoding: [0x00,0x80,0x23,0xc4,0x04,0x10,0x00,0xdb,0x01,0x34,0x00,0x00] - -tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_SSCALED] offset:8388607 th:TH_STORE_NT_HT scope:SCOPE_DEV -// GFX12: encoding: [0x03,0x80,0x23,0xc4,0x04,0xe0,0x68,0x1b,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_FMT_16_16_16_16_UINT] offset:8388607 th:TH_STORE_BYPASS scope:SCOPE_SYS -// GFX12: encoding: [0x03,0x80,0x23,0xc4,0x04,0xe0,0xbc,0x1b,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:55 offset:8388607 scope:SCOPE_SE -// GFX12: encoding: [0x03,0x80,0x23,0xc4,0x04,0xe0,0x84,0x1b,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], 0 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_UINT] offset:8388607 -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode - -tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_UINT] offset:8388607 glc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_UINT] offset:8388607 slc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -tbuffer_store_format_d16_xyz v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_UINT] offset:8388607 dlc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - 
tbuffer_store_format_d16_xyzw v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_16_16_16_SINT] offset:8388607 // GFX12: encoding: [0x03,0xc0,0x23,0xc4,0x04,0x10,0x00,0x1c,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyzw v[254:255], off, s[8:11], s3 format:56 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x23,0xc4,0xfe,0x10,0x00,0x1c,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyzw v[4:5], off, s[12:15], s3 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_SINT] offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x23,0xc4,0x04,0x18,0x00,0x1c,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyzw v[4:5], off, s[12:15], s101 format:[BUF_FMT_16_16_16_16_FLOAT] offset:8388607 -// GFX12: encoding: [0x65,0xc0,0x23,0xc4,0x04,0x18,0x80,0x1c,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyzw v[4:5], off, s[12:15], m0 format:57 offset:8388607 -// GFX12: encoding: [0x7d,0xc0,0x23,0xc4,0x04,0x18,0x80,0x1c,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyzw v[4:5], off, s[8:11], s0 format:[BUF_DATA_FORMAT_16_16_16_16, BUF_NUM_FORMAT_FLOAT] offset:8388607 -// GFX12: encoding: [0x00,0xc0,0x23,0xc4,0x04,0x10,0x80,0x1c,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyzw v[4:5], off, s[8:11], s61 format:[BUF_FMT_32_32_32_UINT] offset:8388607 -// GFX12: encoding: [0x3d,0xc0,0x23,0xc4,0x04,0x10,0x00,0x1d,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s61 format:58 offset:8388607 -// GFX12: encoding: [0x3d,0xc0,0x23,0xc4,0x04,0xe0,0x00,0x1d,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyzw v[4:5], v1, s[8:11], s3 format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_UINT] offen offset:52 -// GFX12: encoding: [0x03,0xc0,0x23,0xc4,0x04,0x10,0x00,0x5d,0x01,0x34,0x00,0x00] - -tbuffer_store_format_d16_xyzw v[4:5], v1, s[8:11], s3 format:[BUF_FMT_32_32_32_SINT] idxen offset:52 -// GFX12: encoding: [0x03,0xc0,0x23,0xc4,0x04,0x10,0x80,0x9d,0x01,0x34,0x00,0x00] - -tbuffer_store_format_d16_xyzw v[4:5], v[1:2], s[8:11], s0 format:59 idxen offen offset:52 -// 
GFX12: encoding: [0x00,0xc0,0x23,0xc4,0x04,0x10,0x80,0xdd,0x01,0x34,0x00,0x00] - -tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_SINT] offset:8388607 th:TH_STORE_NT_HT scope:SCOPE_DEV -// GFX12: encoding: [0x03,0xc0,0x23,0xc4,0x04,0xe0,0xe8,0x1d,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_FMT_32_32_32_FLOAT] offset:8388607 th:TH_STORE_BYPASS scope:SCOPE_SYS -// GFX12: encoding: [0x03,0xc0,0x23,0xc4,0x04,0xe0,0x3c,0x1e,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:60 offset:8388607 scope:SCOPE_SE -// GFX12: encoding: [0x03,0xc0,0x23,0xc4,0x04,0xe0,0x04,0x1e,0x00,0xff,0xff,0x7f] - -tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], 0 format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_FLOAT] offset:8388607 -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode - -tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_FLOAT] offset:8388607 glc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_FLOAT] offset:8388607 slc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -tbuffer_store_format_d16_xyzw v[4:5], off, ttmp[4:7], s3 format:[BUF_DATA_FORMAT_32_32_32, BUF_NUM_FORMAT_FLOAT] offset:8388607 dlc -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_alias.s index 6a8a4ef567b96..42073707a55aa 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_alias.s @@ -1,2214 +1,193 @@ -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s 
-// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR --implicit-check-not=error: %s +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s buffer_load_dword v5, off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_dword v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x05,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dword v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dword v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dword v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x00,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dword v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dword v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_load_dword v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_load_dword v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_dword v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_dword v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_load_dword v5, off, s[8:11], s3 offset:8388607 lds -// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:8388607 // GFX12: encoding: 
[0x03,0x40,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_dwordx2 v[254:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x05,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx2 v[5:6], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x05,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx2 v[5:6], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x05,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx2 v[5:6], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x40,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx2 v[5:6], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx2 v[5:6], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x05,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx2 v[5:6], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x05,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx2 v[5:6], off, s[8:11], s3 -// GFX12: encoding: [0x03,0x40,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x40,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x40,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_dwordx3 v[253:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x05,0xc4,0xfd,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx3 v[5:7], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx3 v[5:7], off, s[96:99], s3 
offset:8388607 -// GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx3 v[5:7], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x80,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx3 v[5:7], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx3 v[5:7], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx3 v[5:7], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx3 v[5:7], off, s[8:11], s3 -// GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0xc0,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_dwordx4 v[252:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0xfc,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx4 v[5:8], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx4 v[5:8], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx4 v[5:8], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0xc0,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx4 v[5:8], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0xc0,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - 
-buffer_load_dwordx4 v[5:8], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx4 v[5:8], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_load_dwordx4 v[5:8], off, s[8:11], s3 -// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0xc0,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - buffer_load_short_d16 v5, off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0x00,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_short_d16 v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x08,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_short_d16 v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x08,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_short_d16 v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x08,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_short_d16 v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x00,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_short_d16 v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_short_d16 v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x08,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_load_short_d16 v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x08,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_load_short_d16 v5, off, s[8:11], s3 -// GFX12: encoding: 
[0x03,0x00,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_short_d16 v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x00,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_short_d16 v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x00,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - buffer_load_format_d16_x v5, off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_format_d16_x v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x02,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_x v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_x v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_x v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x00,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_x v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_x v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_x v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_x v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_format_d16_x v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_format_d16_x v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - buffer_load_format_d16_xy v5, off, s[8:11], s3 
offset:8388607 // GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_format_d16_xy v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x02,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xy v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xy v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xy v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x40,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xy v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xy v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xy v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xy v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0x80,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_format_d16_xyz v[254:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x02,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xyz v[5:6], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: 
[0x03,0x80,0x02,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xyz v[5:6], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x02,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x80,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xyz v[5:6], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x02,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xyz v[5:6], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x02,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 -// GFX12: encoding: [0x03,0x80,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x80,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x80,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_format_d16_xyzw v[254:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xyzw v[5:6], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xyzw v[5:6], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: 
[0x65,0xc0,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0xc0,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xyzw v[5:6], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xyzw v[5:6], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 -// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0xc0,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_short_d16_hi v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_short_d16_hi v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_short_d16_hi v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_short_d16_hi v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0xc0,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_short_d16_hi v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0xc0,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_short_d16_hi v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: 
[0x03,0xc0,0x08,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_load_short_d16_hi v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_load_short_d16_hi v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_format_d16_hi_x v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x09,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_hi_x v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_hi_x v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_hi_x v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x80,0x09,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_hi_x v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x09,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_hi_x v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_hi_x v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_load_format_d16_hi_x v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - 
-buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_sbyte_d16_hi v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x08,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte_d16_hi v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte_d16_hi v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x80,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte_d16_hi v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte_d16_hi v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:8388607 // GFX12: encoding: 
[0x03,0x40,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ubyte_d16_hi v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x08,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_ubyte_d16_hi v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x08,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_ubyte_d16_hi v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x08,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x40,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_ubyte_d16_hi v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x08,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_load_ubyte_d16_hi v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x08,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x40,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x40,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x40,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_sbyte_d16 v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte_d16 v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte_d16 v5, off, s[96:99], s3 
offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte_d16 v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0xc0,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte_d16 v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0xc0,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte_d16 v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte_d16 v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte_d16 v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ubyte_d16 v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x07,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_ubyte_d16 v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_ubyte_d16 v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_ubyte_d16 v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x80,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_ubyte_d16 v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_ubyte_d16 v5, v0, s[8:11], 
s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_load_ubyte_d16 v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_load_ubyte_d16 v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - buffer_load_sbyte v5, off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0x40,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_sbyte v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x04,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x04,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x04,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x40,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x04,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x04,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_load_sbyte v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x40,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_sbyte v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: 
[0x03,0x40,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_load_sbyte v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x40,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - buffer_load_sshort v5, off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_sshort v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_load_ubyte v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_sshort v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_load_ushort v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_sshort v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_store_byte v1, off, s[12:15], s4 offset:8388607 +// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_sshort v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0xc0,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_store_short v1, off, s[12:15], s4 offset:8388607 +// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_sshort v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0xc0,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_store_dword v1, off, s[12:15], s4 offset:8388607 +// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_sshort v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] +buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:8388607 +// GFX12: encoding: 
[0x04,0xc0,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_sshort v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] +buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:8388607 +// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_sshort v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] +buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:8388607 +// GFX12: encoding: [0x04,0x40,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_sshort v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] +buffer_store_format_d16_x v1, off, s[12:15], s4 offset:8388607 +// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_sshort v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] +buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:8388607 +// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ubyte v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:8388607 +// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ubyte v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x04,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:8388607 +// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ubyte v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:8388607 +// GFX12: encoding: 
[0x04,0x00,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ubyte v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:8388607 +// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ubyte v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x00,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:8388607 +// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ubyte v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_atomic_add v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ubyte v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] +buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ubyte v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] +buffer_atomic_and v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ubyte v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] +buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ubyte v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] +buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:8388607 +// GFX12: encoding: 
[0x03,0x00,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ubyte v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] +buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ushort v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_atomic_csub v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ushort v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x04,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_atomic_csub_u32 v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ushort v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_atomic_dec v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ushort v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ushort v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x80,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_atomic_inc v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ushort v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:8388607 +// GFX12: encoding: 
[0x03,0x00,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ushort v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] +buffer_atomic_fmax v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ushort v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] +buffer_atomic_max_f32 v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ushort v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] +buffer_atomic_smax v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ushort v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] +buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_load_ushort v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] +buffer_atomic_umax v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_store_byte v1, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_atomic_fmin v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_store_byte v255, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: 
[0x04,0x00,0x06,0xc4,0xff,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_store_byte v1, off, s[16:19], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_atomic_smin v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_store_byte v1, off, s[96:99], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_store_byte v1, off, s[12:15], s101 offset:8388607 -// GFX12: encoding: [0x65,0x00,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_atomic_umin v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_store_byte v1, off, s[12:15], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_store_byte v1, v0, s[12:15], s4 idxen offset:8388607 -// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f] +buffer_atomic_or v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_store_byte v1, v0, s[12:15], s4 offen offset:8388607 -// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f] +buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_store_byte v1, off, s[12:15], s4 -// GFX12: encoding: 
[0x04,0x00,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] +buffer_atomic_sub v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_store_byte v1, off, s[12:15], s4 offset:0 -// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] +buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_store_byte v1, off, s[12:15], s4 offset:7 -// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_store_short v1, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_short v255, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x06,0xc4,0xff,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_short v1, off, s[16:19], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_short v1, off, s[96:99], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_short v1, off, s[12:15], s101 offset:8388607 -// GFX12: encoding: [0x65,0x40,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_short v1, off, s[12:15], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_short v1, v0, s[12:15], s4 idxen offset:8388607 -// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_store_short v1, v0, s[12:15], s4 offen offset:8388607 -// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_store_short v1, off, s[12:15], s4 -// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_short v1, off, s[12:15], s4 offset:0 -// GFX12: encoding: 
[0x04,0x40,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_short v1, off, s[12:15], s4 offset:7 -// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_store_dword v1, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dword v255, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x80,0x06,0xc4,0xff,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dword v1, off, s[16:19], s4 offset:8388607 -// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dword v1, off, s[96:99], s4 offset:8388607 -// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dword v1, off, s[12:15], s101 offset:8388607 -// GFX12: encoding: [0x65,0x80,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dword v1, off, s[12:15], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dword v1, v0, s[12:15], s4 idxen offset:8388607 -// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_store_dword v1, v0, s[12:15], s4 offen offset:8388607 -// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_store_dword v1, off, s[12:15], s4 -// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_dword v1, off, s[12:15], s4 offset:0 -// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_dword v1, off, s[12:15], s4 offset:7 -// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx2 v[254:255], off, s[12:15], s4 offset:8388607 -// GFX12: encoding: 
[0x04,0xc0,0x06,0xc4,0xfe,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx2 v[1:2], off, s[16:19], s4 offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx2 v[1:2], off, s[96:99], s4 offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx2 v[1:2], off, s[12:15], s101 offset:8388607 -// GFX12: encoding: [0x65,0xc0,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx2 v[1:2], off, s[12:15], m0 offset:8388607 -// GFX12: encoding: [0x7d,0xc0,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx2 v[1:2], v0, s[12:15], s4 idxen offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx2 v[1:2], v0, s[12:15], s4 offen offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx2 v[1:2], off, s[12:15], s4 -// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:0 -// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:7 -// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx3 v[253:255], off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x07,0xc4,0xfd,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx3 v[1:3], off, s[16:19], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx3 v[1:3], off, s[96:99], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx3 
v[1:3], off, s[12:15], s101 offset:8388607 -// GFX12: encoding: [0x65,0x00,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx3 v[1:3], off, s[12:15], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx3 v[1:3], v0, s[12:15], s4 idxen offset:8388607 -// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx3 v[1:3], v0, s[12:15], s4 offen offset:8388607 -// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx3 v[1:3], off, s[12:15], s4 -// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:0 -// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:7 -// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx4 v[252:255], off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x07,0xc4,0xfc,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx4 v[1:4], off, s[16:19], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x07,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx4 v[1:4], off, s[96:99], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x07,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx4 v[1:4], off, s[12:15], s101 offset:8388607 -// GFX12: encoding: [0x65,0x40,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx4 v[1:4], off, s[12:15], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx4 v[1:4], v0, s[12:15], s4 idxen offset:8388607 -// GFX12: encoding: 
[0x04,0x40,0x07,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx4 v[1:4], v0, s[12:15], s4 offen offset:8388607 -// GFX12: encoding: [0x04,0x40,0x07,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_store_dwordx4 v[1:4], off, s[12:15], s4 -// GFX12: encoding: [0x04,0x40,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:0 -// GFX12: encoding: [0x04,0x40,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:7 -// GFX12: encoding: [0x04,0x40,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_store_format_d16_x v1, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_x v255, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x03,0xc4,0xff,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_x v1, off, s[16:19], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_x v1, off, s[96:99], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_x v1, off, s[12:15], s101 offset:8388607 -// GFX12: encoding: [0x65,0x00,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_x v1, off, s[12:15], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_x v1, v0, s[12:15], s4 idxen offset:8388607 -// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_x v1, v0, s[12:15], s4 offen offset:8388607 -// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_x v1, off, s[12:15], s4 -// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - 
-buffer_store_format_d16_x v1, off, s[12:15], s4 offset:0 -// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_format_d16_x v1, off, s[12:15], s4 offset:7 -// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xy v255, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x03,0xc4,0xff,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xy v1, off, s[16:19], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xy v1, off, s[96:99], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xy v1, off, s[12:15], s101 offset:8388607 -// GFX12: encoding: [0x65,0x40,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xy v1, off, s[12:15], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xy v1, v0, s[12:15], s4 idxen offset:8388607 -// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xy v1, v0, s[12:15], s4 offen offset:8388607 -// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xy v1, off, s[12:15], s4 -// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:0 -// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:7 -// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:8388607 -// 
GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xyz v[254:255], off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x80,0x03,0xc4,0xfe,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xyz v[1:2], off, s[16:19], s4 offset:8388607 -// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xyz v[1:2], off, s[96:99], s4 offset:8388607 -// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], s101 offset:8388607 -// GFX12: encoding: [0x65,0x80,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xyz v[1:2], v0, s[12:15], s4 idxen offset:8388607 -// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xyz v[1:2], v0, s[12:15], s4 offen offset:8388607 -// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 -// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:0 -// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:7 -// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xyzw v[254:255], off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0xfe,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xyzw v[1:2], off, s[16:19], s4 
offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xyzw v[1:2], off, s[96:99], s4 offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s101 offset:8388607 -// GFX12: encoding: [0x65,0xc0,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], m0 offset:8388607 -// GFX12: encoding: [0x7d,0xc0,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xyzw v[1:2], v0, s[12:15], s4 idxen offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xyzw v[1:2], v0, s[12:15], s4 offen offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 -// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:0 -// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:7 -// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_byte_d16_hi v255, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x09,0xc4,0xff,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_byte_d16_hi v1, off, s[16:19], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_byte_d16_hi v1, off, s[96:99], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_byte_d16_hi v1, off, s[12:15], s101 offset:8388607 -// 
GFX12: encoding: [0x65,0x00,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_byte_d16_hi v1, off, s[12:15], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_byte_d16_hi v1, v0, s[12:15], s4 idxen offset:8388607 -// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_store_byte_d16_hi v1, v0, s[12:15], s4 offen offset:8388607 -// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_store_byte_d16_hi v1, off, s[12:15], s4 -// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:0 -// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:7 -// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_short_d16_hi v255, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x09,0xc4,0xff,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_short_d16_hi v1, off, s[16:19], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_short_d16_hi v1, off, s[96:99], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_short_d16_hi v1, off, s[12:15], s101 offset:8388607 -// GFX12: encoding: [0x65,0x40,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_short_d16_hi v1, off, s[12:15], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_short_d16_hi v1, v0, s[12:15], s4 idxen offset:8388607 -// GFX12: encoding: 
[0x04,0x40,0x09,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_store_short_d16_hi v1, v0, s[12:15], s4 offen offset:8388607 -// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_store_short_d16_hi v1, off, s[12:15], s4 -// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:0 -// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:7 -// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_hi_x v255, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0xff,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_hi_x v1, off, s[16:19], s4 offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0x20,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_hi_x v1, off, s[96:99], s4 offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_hi_x v1, off, s[12:15], s101 offset:8388607 -// GFX12: encoding: [0x65,0xc0,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_hi_x v1, off, s[12:15], m0 offset:8388607 -// GFX12: encoding: [0x7d,0xc0,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_hi_x v1, v0, s[12:15], s4 idxen offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0x18,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_hi_x v1, v0, s[12:15], s4 offen offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0x18,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_store_format_d16_hi_x v1, off, s[12:15], s4 -// GFX12: encoding: 
[0x04,0xc0,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:0 -// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:7 -// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_add v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_add v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_add v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_add v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_add v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x40,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_add v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_add v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_add v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_add v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_add v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_add v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_add v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: 
[0x03,0x40,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_add_x2 v[254:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_add_x2 v[5:6], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_add_x2 v[5:6], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0xc0,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0xc0,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_add_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_add_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 -// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_and v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_and 
v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_and v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_and v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_and v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x00,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_and v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_and v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_and v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_and v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_and v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_and v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_and v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_and_x2 v[254:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x12,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_and_x2 v[5:6], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_and_x2 
v[5:6], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x40,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_and_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_and_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 -// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap v[254:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap v[5:6], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap v[5:6], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: 
[0x65,0x00,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap v[5:6], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap v[5:6], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 -// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap_x2 v[252:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x10,0xc4,0xfc,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x80,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: 
[0x7d,0x80,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap_x2 v[5:8], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap_x2 v[5:8], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 -// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub v255, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0xff,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub v5, off, s[12:15], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x18,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub v5, off, s[96:99], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0xc0,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub v5, off, s[8:11], s101 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x65,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub v5, off, s[8:11], m0 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x7d,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub v5, v0, s[8:11], s3 idxen offset:8388607 
th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub v5, v0, s[8:11], s3 offen offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub v5, off, s[8:11], s3 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_csub v5, off, s[8:11], s3 offset:0 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_csub v5, off, s[8:11], s3 offset:7 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_csub v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub_u32 v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub_u32 v255, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0xff,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub_u32 v5, off, s[12:15], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x18,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub_u32 v5, off, s[96:99], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0xc0,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub_u32 v5, off, s[8:11], s101 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x65,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub_u32 v5, off, s[8:11], m0 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x7d,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub_u32 v5, v0, s[8:11], s3 idxen offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: 
[0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub_u32 v5, v0, s[8:11], s3 offen offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_csub_u32 v5, off, s[8:11], s3 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_csub_u32 v5, off, s[8:11], s3 offset:0 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_csub_u32 v5, off, s[8:11], s3 offset:7 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_csub_u32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x10,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: 
[0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_dec v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_dec v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_dec v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec_x2 v[254:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x13,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec_x2 v[5:6], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec_x2 v[5:6], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x40,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 -// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:0 
-// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_inc v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_inc v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_inc v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN 
-// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc_x2 v[254:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x13,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc_x2 v[5:6], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc_x2 v[5:6], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x00,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 -// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmax v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] 
- -buffer_atomic_fmax v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmax v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmax v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmax v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmax v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmax v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmax v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmax v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_fmax v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_fmax v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_fmax v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_max_f32 v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_max_f32 v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_max_f32 v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - 
-buffer_atomic_max_f32 v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_max_f32 v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_max_f32 v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_max_f32 v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_max_f32 v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_max_f32 v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_max_f32 v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_max_f32 v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_max_f32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: 
[0x65,0x80,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_smax v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_smax v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_smax v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax_x2 v[254:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax_x2 v[5:6], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax_x2 v[5:6], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0xc0,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0xc0,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax_x2 v[5:6], v0, s[8:11], s3 
idxen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 -// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - 
-buffer_atomic_umax v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_umax v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_umax v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_umax v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax_x2 v[254:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x12,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax_x2 v[5:6], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax_x2 v[5:6], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x00,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 -// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX12: encoding: 
[0x03,0x00,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmin v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmin v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmin v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmin v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmin v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmin v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmin v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmin v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_fmin v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_fmin v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_fmin v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_fmin v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// 
GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_min_f32 v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_min_f32 v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_min_f32 v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_min_f32 v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_min_f32 v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_min_f32 v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_min_f32 v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_min_f32 v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin v255, off, s[8:11], 
s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x00,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_smin v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_smin v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_smin v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin_x2 v[254:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x11,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin_x2 v[5:6], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin_x2 v[5:6], 
off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin_x2 v[5:6], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x40,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin_x2 v[5:6], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 -// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: 
[0x65,0x40,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_umin v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_umin v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_umin v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin_x2 v[254:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x11,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin_x2 v[5:6], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin_x2 v[5:6], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin_x2 v[5:6], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x80,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin_x2 v[5:6], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin_x2 v[5:6], v0, s[8:11], s3 
idxen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 -// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_or v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_or v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_or v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_or v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_or v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x40,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_or v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_or v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_or v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_or v5, off, 
s[8:11], s3 -// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_or v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_or v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_or v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_or_x2 v[254:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x12,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_or_x2 v[5:6], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_or_x2 v[5:6], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x80,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_or_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_or_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 -// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_or_x2 
v[5:6], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x80,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_sub v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_sub v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_sub v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub_x2 
v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub_x2 v[254:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x11,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub_x2 v[5:6], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub_x2 v[5:6], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x00,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x00,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 -// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_swap v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_swap v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: 
[0x03,0xc0,0x0c,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_swap v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_swap v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_swap v5, off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_swap v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_swap v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_swap v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_swap v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_swap v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_swap v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_swap v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] +buffer_atomic_swap v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_atomic_swap_x2 v[254:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x10,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_swap_x2 v[5:6], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: 
[0x03,0x40,0x10,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_swap_x2 v[5:6], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0x40,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x40,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_swap_x2 v[5:6], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_swap_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 -// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - buffer_atomic_xor v5, off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] -buffer_atomic_xor v255, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_xor v5, off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_xor v5, off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_xor v5, off, s[8:11], 
s101 offset:8388607 -// GFX12: encoding: [0x65,0x80,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_xor v5, off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0x80,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_xor v5, v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_xor v5, v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_xor v5, off, s[8:11], s3 -// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_xor v5, off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_xor v5, off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_xor v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] - buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_xor_x2 v[254:255], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0xfe,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_xor_x2 v[5:6], off, s[12:15], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_xor_x2 v[5:6], off, s[96:99], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], s101 offset:8388607 -// GFX12: encoding: [0x65,0xc0,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], m0 offset:8388607 -// GFX12: encoding: [0x7d,0xc0,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] - -buffer_atomic_xor_x2 
v[5:6], v0, s[8:11], s3 idxen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] - -buffer_atomic_xor_x2 v[5:6], v0, s[8:11], s3 offen offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 -// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:0 -// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:7 -// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] - -buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN -// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vflat_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_vflat_alias.s index 3633f8815c00a..0728b43dddd92 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vflat_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vflat_alias.s @@ -3,21 +3,12 @@ global_atomic_csub v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN // GFX12: global_atomic_sub_clamp_u32 {{.*}} encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] -global_atomic_csub v0, v2, s[0:1] offset:64 -// GFX12: global_atomic_sub_clamp_u32 {{.*}} encoding: [0x00,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] - global_atomic_csub_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN // GFX12: global_atomic_sub_clamp_u32 {{.*}} encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] -global_atomic_csub_u32 v0, v2, s[0:1] offset:64 -// GFX12: global_atomic_sub_clamp_u32 {{.*}} encoding: [0x00,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] - flat_atomic_csub_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN // GFX12: flat_atomic_sub_clamp_u32 {{.*}} encoding: 
[0x7c,0xc0,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] -flat_atomic_csub_u32 v[0:1], v2 offset:64 -// GFX12: flat_atomic_sub_clamp_u32 {{.*}} encoding: [0x7c,0xc0,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] - flat_atomic_fmax v[0:1], v2 offset:64 // GFX12: flat_atomic_max_num_f32 {{.*}} encoding: [0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] From 56ad6d19397c4286fa412f5070fd1a563b6c43e4 Mon Sep 17 00:00:00 2001 From: michaelselehov Date: Tue, 27 Feb 2024 13:31:29 +0100 Subject: [PATCH 427/546] [MachineLICM] Hoist COPY instruction only when user can be hoisted (#81735) befa925acac8fd6a9266e introduced preliminary hoisting of COPY instructions when the user of the COPY is inside the same loop. That optimization appeared to be too aggressive and hoisted too many COPY's greatly increasing register pressure causing performance regressions for AMDGPU target. This is intended to fix the regression by hoisting COPY instruction only if either: - User of COPY can be hoisted (other args are invariant) or - Hoisting COPY doesn't bring high register pressure --- llvm/include/llvm/CodeGen/MachineLoopInfo.h | 5 +- llvm/lib/CodeGen/MachineLICM.cpp | 19 +++- llvm/lib/CodeGen/MachineLoopInfo.cpp | 6 +- .../CodeGen/AMDGPU/copy-hoist-no-spills.ll | 94 +++++++++++++++++++ 4 files changed, 118 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll diff --git a/llvm/include/llvm/CodeGen/MachineLoopInfo.h b/llvm/include/llvm/CodeGen/MachineLoopInfo.h index ae075bee1daf7..445c9b1c3bc00 100644 --- a/llvm/include/llvm/CodeGen/MachineLoopInfo.h +++ b/llvm/include/llvm/CodeGen/MachineLoopInfo.h @@ -79,7 +79,10 @@ class MachineLoop : public LoopBase { /// I.e., all virtual register operands are defined outside of the loop, /// physical registers aren't accessed explicitly, and there are no side /// effects that aren't captured by the operands or other flags. 
- bool isLoopInvariant(MachineInstr &I) const; + /// ExcludeReg can be used to exclude the given register from the check + /// i.e. when we're considering hoisting it's definition but not hoisted it + /// yet + bool isLoopInvariant(MachineInstr &I, const Register ExcludeReg = 0) const; void dump() const; diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index efc19f8fdbf8c..997f6eb085129 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -1264,13 +1264,24 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI, // If we have a COPY with other uses in the loop, hoist to allow the users to // also be hoisted. + Register DefReg; if (MI.isCopy() && MI.getOperand(0).isReg() && - MI.getOperand(0).getReg().isVirtual() && MI.getOperand(1).isReg() && - MI.getOperand(1).getReg().isVirtual() && + (DefReg = MI.getOperand(0).getReg()).isVirtual() && + MI.getOperand(1).isReg() && MI.getOperand(1).getReg().isVirtual() && IsLoopInvariantInst(MI, CurLoop) && any_of(MRI->use_nodbg_instructions(MI.getOperand(0).getReg()), - [&CurLoop](MachineInstr &UseMI) { - return CurLoop->contains(&UseMI); + [&CurLoop, this, DefReg, Cost](MachineInstr &UseMI) { + if (!CurLoop->contains(&UseMI)) + return false; + + // COPY is a cheap instruction, but if moving it won't cause high + // RP we're fine to hoist it even if the user can't be hoisted + // later Otherwise we want to check the user if it's hoistable + if (CanCauseHighRegPressure(Cost, false) && + !CurLoop->isLoopInvariant(UseMI, DefReg)) + return false; + + return true; })) return true; diff --git a/llvm/lib/CodeGen/MachineLoopInfo.cpp b/llvm/lib/CodeGen/MachineLoopInfo.cpp index bdbc57099aa8d..1492c8c366fb4 100644 --- a/llvm/lib/CodeGen/MachineLoopInfo.cpp +++ b/llvm/lib/CodeGen/MachineLoopInfo.cpp @@ -198,7 +198,8 @@ MDNode *MachineLoop::getLoopID() const { return LoopID; } -bool MachineLoop::isLoopInvariant(MachineInstr &I) const { +bool 
MachineLoop::isLoopInvariant(MachineInstr &I, + const Register ExcludeReg) const { MachineFunction *MF = I.getParent()->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const TargetSubtargetInfo &ST = MF->getSubtarget(); @@ -213,6 +214,9 @@ bool MachineLoop::isLoopInvariant(MachineInstr &I) const { Register Reg = MO.getReg(); if (Reg == 0) continue; + if (ExcludeReg == Reg) + continue; + // An instruction that uses or defines a physical register can't e.g. be // hoisted, so mark this as not invariant. if (Reg.isPhysical()) { diff --git a/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll b/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll new file mode 100644 index 0000000000000..4ae122bb3546a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/copy-hoist-no-spills.ll @@ -0,0 +1,94 @@ +; NOTE: There must be no spill reload inside the loop starting with LBB0_1: +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck %s +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p9:192:256:256:32" +target triple = "amdgcn-amd-amdhsa" + +define amdgpu_kernel void @foo(ptr %.sroa.1.0.copyload, ptr %0, ptr %1, ptr %2, ptr %3, ptr %4, ptr %5, ptr %6, ptr %7, ptr %8, ptr %9, ptr %10, ptr %11, ptr %12, ptr %13, ptr %14, ptr %15, ptr %16, ptr %17, ptr %18, ptr %19, ptr %20, ptr %21, ptr %22, ptr %23, ptr %24, ptr %25, ptr %26, ptr %27, ptr %28, ptr %29, ptr %30, ptr %31, ptr %32, ptr %33, double %34, double %35, double %36, float %37, float %38, float %39, float %40, ptr %41) { +; CHECK-LABEL: foo: +; CHECK-LABEL: .LBB0_1 +; CHECK-NOT: buffer_load_dword {{v[0-9]+}} + +.lr.ph: + %.pre = load double, ptr null, align 8 + br label %42 + +42: ; preds = %42, %.lr.ph + %.0.i4402 = phi i32 [ 1, %.lr.ph ], [ 0, %42 ] + %43 = zext i32 %.0.i4402 to i64 + %44 = load double, ptr %2, align 8 + %45 = load 
double, ptr %4, align 8 + %46 = load double, ptr %7, align 8 + %47 = load double, ptr %13, align 8 + %48 = load double, ptr %15, align 8 + %49 = load double, ptr %17, align 8 + %50 = load double, ptr %19, align 8 + %51 = load double, ptr %18, align 8 + %52 = load double, ptr %27, align 8 + %53 = load double, ptr %23, align 8 + %54 = load double, ptr %31, align 8 + %55 = load double, ptr %33, align 8 + %56 = load double, ptr %25, align 8 + %57 = load double, ptr %16, align 8 + %58 = fpext float %40 to double + %59 = fmul double %52, %58 + %60 = fadd double %59, %51 + %61 = fsub double %60, %48 + %62 = fmul double 0.000000e+00, %36 + %63 = fsub double %61, %62 + %64 = fadd double %49, %63 + %65 = fptrunc double %64 to float + %66 = fsub double 0.000000e+00, %34 + %67 = fpext float %39 to double + %68 = fmul double %53, %67 + %69 = fsub double %66, %68 + %70 = fadd double %50, %69 + %71 = fptrunc double %70 to float + store float 0.000000e+00, ptr %30, align 4 + store float 0.000000e+00, ptr %26, align 4 + %72 = getelementptr float, ptr %41, i64 %43 + store float %38, ptr %72, align 4 + store float %65, ptr %29, align 4 + store float %71, ptr %14, align 4 + store float %39, ptr %3, align 4 + store float %39, ptr %11, align 4 + %73 = fsub double %46, %44 + %74 = fptrunc double %73 to float + %75 = fsub double %47, %45 + %76 = fptrunc double %75 to float + %77 = fadd float %74, %76 + %78 = fpext float %37 to double + %79 = fmul contract double %56, 0.000000e+00 + %80 = fsub contract double %34, %79 + %81 = fpext float %77 to double + %82 = fmul double %.pre, %81 + %83 = fsub double %80, %82 + %84 = fpext float %38 to double + %85 = fmul double %57, %84 + %86 = fsub double %83, %85 + %87 = fptrunc double %86 to float + %88 = fmul double %34, 0.000000e+00 + %89 = fmul double %54, %78 + %90 = fadd double %89, %88 + %91 = fsub double %90, %55 + %92 = fmul double 0.000000e+00, %35 + %93 = fsub double %91, %92 + %94 = fmul double %34, %34 + %95 = fadd double %93, %94 + %96 = 
fptrunc double %95 to float + store float %87, ptr %1, align 4 + store float %37, ptr %21, align 4 + store float %96, ptr %0, align 4 + store float 0.000000e+00, ptr %9, align 4 + store float 0.000000e+00, ptr %32, align 4 + store float 0.000000e+00, ptr %20, align 4 + store float 0.000000e+00, ptr %22, align 4 + store float 0.000000e+00, ptr %5, align 4 + store float 0.000000e+00, ptr %28, align 4 + store float 0.000000e+00, ptr %12, align 4 + store float 0.000000e+00, ptr %6, align 4 + store float 0.000000e+00, ptr %8, align 4 + store float 0.000000e+00, ptr %.sroa.1.0.copyload, align 4 + store float %37, ptr %10, align 4 + store float 0.000000e+00, ptr %24, align 4 + br label %42 +} From 2e4643a53e8877d6a237b18cb2ad3f814899881f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 27 Feb 2024 13:30:34 +0530 Subject: [PATCH 428/546] AMDGPU: Regenerate baseline test checks --- .../AMDGPU/amdgpu-simplify-libcall-sincos.ll | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll index 4061857789ed6..5c56276eeb0f1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll @@ -991,7 +991,7 @@ define void @sincos_f32_preserve_fpmath_0(float %x, ptr addrspace(1) nocapture w ; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) -; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]), !fpmath !5 +; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]), !fpmath [[META5:![0-9]+]] ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr 
addrspace(5) [[__SINCOS_]], align 4 ; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4 ; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4 @@ -1010,7 +1010,7 @@ define void @sincos_f32_preserve_fpmath_1(float %x, ptr addrspace(1) nocapture w ; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) -; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]), !fpmath !6 +; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]), !fpmath [[META6:![0-9]+]] ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 ; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4 ; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4 @@ -1051,9 +1051,9 @@ define void @sincos_f32_debuginfo(float %x, ptr addrspace(1) nocapture writeonly ; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5), !dbg [[DBG14:![0-9]+]] ; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]), !dbg [[DBG14]] ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4, !dbg [[DBG14]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata float [[TMP0]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG15:![0-9]+]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata float [[TMP0]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG15:![0-9]+]] ; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4, !dbg [[DBG16:![0-9]+]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata float [[TMP1]], metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg 
[[DBG17:![0-9]+]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata float [[TMP1]], metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG17:![0-9]+]] ; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4, !dbg [[DBG18:![0-9]+]] ; CHECK-NEXT: ret void, !dbg [[DBG19:![0-9]+]] ; @@ -1072,9 +1072,9 @@ define float @sin_sincos_private_f32(float %x, ptr addrspace(1) %sin_out, ptr ad ; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COS_TMP:%.*]] = alloca float, align 4, addrspace(5) -; CHECK-NEXT: [[SIN0:%.*]] = tail call nnan ninf nsz contract float @_Z3sinf(float [[X]]), !fpmath !5 +; CHECK-NEXT: [[SIN0:%.*]] = tail call nnan ninf nsz contract float @_Z3sinf(float [[X]]), !fpmath [[META5]] ; CHECK-NEXT: store float [[SIN0]], ptr addrspace(1) [[SIN_OUT]], align 4 -; CHECK-NEXT: [[SIN1:%.*]] = call nnan contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[COS_TMP]]), !fpmath !6 +; CHECK-NEXT: [[SIN1:%.*]] = call nnan contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[COS_TMP]]), !fpmath [[META6]] ; CHECK-NEXT: [[COS1:%.*]] = load float, ptr addrspace(5) [[COS_TMP]], align 4 ; CHECK-NEXT: store float [[COS1]], ptr addrspace(1) [[COS_OUT]], align 4 ; CHECK-NEXT: ret float [[SIN1]] @@ -1094,10 +1094,10 @@ define float @sin_sincos_generic_f32(float %x, ptr addrspace(1) %sin_out, ptr ad ; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COS_TMP:%.*]] = alloca float, align 4, addrspace(5) -; CHECK-NEXT: [[SIN0:%.*]] = tail call nsz contract float @_Z3sinf(float [[X]]), !fpmath !5 +; CHECK-NEXT: [[SIN0:%.*]] = tail call nsz contract float @_Z3sinf(float [[X]]), !fpmath [[META5]] ; CHECK-NEXT: store 
float [[SIN0]], ptr addrspace(1) [[SIN_OUT]], align 4 ; CHECK-NEXT: [[COS_TMP_CAST:%.*]] = addrspacecast ptr addrspace(5) [[COS_TMP]] to ptr -; CHECK-NEXT: [[SIN1:%.*]] = call ninf nsz contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[COS_TMP_CAST]]), !fpmath !6 +; CHECK-NEXT: [[SIN1:%.*]] = call ninf nsz contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[COS_TMP_CAST]]), !fpmath [[META6]] ; CHECK-NEXT: [[COS1:%.*]] = load float, ptr addrspace(5) [[COS_TMP]], align 4 ; CHECK-NEXT: store float [[COS1]], ptr addrspace(1) [[COS_OUT]], align 4 ; CHECK-NEXT: ret float [[SIN1]] From ca66f7469fe71d78586eebe22cd0acab066cf65f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 27 Feb 2024 18:37:01 +0530 Subject: [PATCH 429/546] AMDGPU: Merge tests for llvm.amdgcn.dispatch.id --- .../GlobalISel/llvm.amdgcn.dispatch.id.ll | 20 ------------------- .../CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll | 3 ++- 2 files changed, 2 insertions(+), 21 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll deleted file mode 100644 index fc22d5ec66f07..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s - -declare i64 @llvm.amdgcn.dispatch.id() #1 - -; GCN-LABEL: {{^}}dispatch_id: -; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6 -; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7 -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[LO]]:[[HI]]] -; GCN: .amdhsa_user_sgpr_dispatch_id 1 -define amdgpu_kernel void @dispatch_id(ptr addrspace(1) %out) #0 { - %tmp0 = call i64 @llvm.amdgcn.dispatch.id() - store i64 %tmp0, ptr addrspace(1) %out - ret void -} - -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } - -!llvm.module.flags = 
!{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll index 4756571aaa591..04ac24948b8be 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s declare i64 @llvm.amdgcn.dispatch.id() #1 From a28a7d41ef1a60795719fa3e6e2f7dc3b7fc3d27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Mon, 26 Feb 2024 18:23:56 +0100 Subject: [PATCH 430/546] [clang][Interp][NFC] Remove leftover comments --- clang/lib/AST/Interp/InterpBuiltin.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp index 760219e0ffa9f..4ba518e5f0619 100644 --- a/clang/lib/AST/Interp/InterpBuiltin.cpp +++ b/clang/lib/AST/Interp/InterpBuiltin.cpp @@ -493,7 +493,6 @@ static bool interp__builtin_bitreverse(InterpState &S, CodePtr OpPC, const CallExpr *Call) { PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); APSInt Val = peekToAPSInt(S.Stk, ArgT); - // pushAPSInt(S, APSInt(Val.reverseBits(), /*IsUnsigned=*/true)); pushInteger(S, Val.reverseBits(), Call->getType()); return true; } @@ -552,7 +551,6 @@ static bool interp__builtin_rotate(InterpState &S, CodePtr OpPC, Result = APSInt(Value.rotl(Amount.urem(Value.getBitWidth())), /*IsUnsigned=*/true); - // pushAPSInt(S, Result); pushInteger(S, Result, Call->getType()); return true; } @@ -785,7 +783,6 @@ static bool interp__builtin_carryop(InterpState &S, CodePtr OpPC, CarryOutPtr.initialize(); assert(Call->getType() == Call->getArg(0)->getType()); - // pushAPSInt(S, Result); 
pushInteger(S, Result, Call->getType()); return true; } From 19cec9ca1206c4707064cc2fc2344de75dfbd8c9 Mon Sep 17 00:00:00 2001 From: Hirofumi Nakamura Date: Tue, 27 Feb 2024 22:31:23 +0900 Subject: [PATCH 431/546] [clang-format] Add AlignConsecutiveTableGenDefinitions option. (#83008) To align TableGen consecutive definitions. --- clang/docs/ClangFormatStyleOptions.rst | 140 ++++++++++++++++++ clang/include/clang/Format/Format.h | 12 ++ clang/lib/Format/Format.cpp | 3 + clang/lib/Format/WhitespaceManager.cpp | 9 +- clang/lib/Format/WhitespaceManager.h | 3 + clang/unittests/Format/FormatTestTableGen.cpp | 14 ++ 6 files changed, 180 insertions(+), 1 deletion(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index d509bb8076797..df399a229d8d4 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -1095,6 +1095,146 @@ the configuration (without a prefix: ``Auto``). bbb >>= 2; +.. _AlignConsecutiveTableGenDefinitionColons: + +**AlignConsecutiveTableGenDefinitionColons** (``AlignConsecutiveStyle``) :versionbadge:`clang-format 19` :ref:`¶ ` + Style of aligning consecutive TableGen definition colons. + This aligns the inheritance colons of consecutive definitions. + + .. code-block:: c++ + + def Def : Parent {} + def DefDef : Parent {} + def DefDefDef : Parent {} + + Nested configuration flags: + + Alignment options. + + They can also be read as a whole for compatibility. The choices are: + - None + - Consecutive + - AcrossEmptyLines + - AcrossComments + - AcrossEmptyLinesAndComments + + For example, to align across empty lines and not across comments, either + of these work. + + .. code-block:: c++ + + AlignConsecutiveMacros: AcrossEmptyLines + + AlignConsecutiveMacros: + Enabled: true + AcrossEmptyLines: true + AcrossComments: false + + * ``bool Enabled`` Whether aligning is enabled. + + .. 
code-block:: c++ + + #define SHORT_NAME 42 + #define LONGER_NAME 0x007f + #define EVEN_LONGER_NAME (2) + #define foo(x) (x * x) + #define bar(y, z) (y + z) + + int a = 1; + int somelongname = 2; + double c = 3; + + int aaaa : 1; + int b : 12; + int ccc : 8; + + int aaaa = 12; + float b = 23; + std::string ccc; + + * ``bool AcrossEmptyLines`` Whether to align across empty lines. + + .. code-block:: c++ + + true: + int a = 1; + int somelongname = 2; + double c = 3; + + int d = 3; + + false: + int a = 1; + int somelongname = 2; + double c = 3; + + int d = 3; + + * ``bool AcrossComments`` Whether to align across comments. + + .. code-block:: c++ + + true: + int d = 3; + /* A comment. */ + double e = 4; + + false: + int d = 3; + /* A comment. */ + double e = 4; + + * ``bool AlignCompound`` Only for ``AlignConsecutiveAssignments``. Whether compound assignments + like ``+=`` are aligned along with ``=``. + + .. code-block:: c++ + + true: + a &= 2; + bbb = 2; + + false: + a &= 2; + bbb = 2; + + * ``bool AlignFunctionPointers`` Only for ``AlignConsecutiveDeclarations``. Whether function pointers are + aligned. + + .. code-block:: c++ + + true: + unsigned i; + int &r; + int *p; + int (*f)(); + + false: + unsigned i; + int &r; + int *p; + int (*f)(); + + * ``bool PadOperators`` Only for ``AlignConsecutiveAssignments``. Whether short assignment + operators are left-padded to the same length as long ones in order to + put all assignment operators to the right of the left hand side. + + .. code-block:: c++ + + true: + a >>= 2; + bbb = 2; + + a = 2; + bbb >>= 2; + + false: + a >>= 2; + bbb = 2; + + a = 2; + bbb >>= 2; + + .. 
_AlignEscapedNewlines: **AlignEscapedNewlines** (``EscapedNewlineAlignmentStyle``) :versionbadge:`clang-format 5` :ref:`¶ ` diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index 449ce9e53be14..613f1fd168465 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -424,6 +424,16 @@ struct FormatStyle { /// \version 19 AlignConsecutiveStyle AlignConsecutiveTableGenCondOperatorColons; + /// Style of aligning consecutive TableGen definition colons. + /// This aligns the inheritance colons of consecutive definitions. + /// \code + /// def Def : Parent {} + /// def DefDef : Parent {} + /// def DefDefDef : Parent {} + /// \endcode + /// \version 19 + AlignConsecutiveStyle AlignConsecutiveTableGenDefinitionColons; + /// Different styles for aligning escaped newlines. enum EscapedNewlineAlignmentStyle : int8_t { /// Don't align escaped newlines. @@ -4817,6 +4827,8 @@ struct FormatStyle { R.AlignConsecutiveShortCaseStatements && AlignConsecutiveTableGenCondOperatorColons == R.AlignConsecutiveTableGenCondOperatorColons && + AlignConsecutiveTableGenDefinitionColons == + R.AlignConsecutiveTableGenDefinitionColons && AlignEscapedNewlines == R.AlignEscapedNewlines && AlignOperands == R.AlignOperands && AlignTrailingComments == R.AlignTrailingComments && diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 13588ff705f54..e64ba7eebc1ce 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -917,6 +917,8 @@ template <> struct MappingTraits { Style.AlignConsecutiveShortCaseStatements); IO.mapOptional("AlignConsecutiveTableGenCondOperatorColons", Style.AlignConsecutiveTableGenCondOperatorColons); + IO.mapOptional("AlignConsecutiveTableGenDefinitionColons", + Style.AlignConsecutiveTableGenDefinitionColons); IO.mapOptional("AlignEscapedNewlines", Style.AlignEscapedNewlines); IO.mapOptional("AlignOperands", Style.AlignOperands); 
IO.mapOptional("AlignTrailingComments", Style.AlignTrailingComments); @@ -1423,6 +1425,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.AlignConsecutiveMacros = {}; LLVMStyle.AlignConsecutiveShortCaseStatements = {}; LLVMStyle.AlignConsecutiveTableGenCondOperatorColons = {}; + LLVMStyle.AlignConsecutiveTableGenDefinitionColons = {}; LLVMStyle.AlignEscapedNewlines = FormatStyle::ENAS_Right; LLVMStyle.AlignOperands = FormatStyle::OAS_Align; LLVMStyle.AlignTrailingComments = {}; diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index dd9d5847a10dc..753be25bfd675 100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -111,8 +111,10 @@ const tooling::Replacements &WhitespaceManager::generateReplacements() { alignConsecutiveDeclarations(); alignConsecutiveBitFields(); alignConsecutiveAssignments(); - if (Style.isTableGen()) + if (Style.isTableGen()) { alignConsecutiveTableGenCondOperatorColons(); + alignConsecutiveTableGenDefinitions(); + } alignChainedConditionals(); alignTrailingComments(); alignEscapedNewlines(); @@ -984,6 +986,11 @@ void WhitespaceManager::alignConsecutiveTableGenCondOperatorColons() { TT_TableGenCondOperatorColon); } +void WhitespaceManager::alignConsecutiveTableGenDefinitions() { + alignConsecutiveColons(Style.AlignConsecutiveTableGenDefinitionColons, + TT_InheritanceColon); +} + void WhitespaceManager::alignConsecutiveDeclarations() { if (!Style.AlignConsecutiveDeclarations.Enabled) return; diff --git a/clang/lib/Format/WhitespaceManager.h b/clang/lib/Format/WhitespaceManager.h index c604cdb6f185a..9942e0f35738f 100644 --- a/clang/lib/Format/WhitespaceManager.h +++ b/clang/lib/Format/WhitespaceManager.h @@ -243,6 +243,9 @@ class WhitespaceManager { /// Align consecutive TableGen cond operator colon over all \c Changes. 
void alignConsecutiveTableGenCondOperatorColons(); + /// Align consecutive TableGen definitions over all \c Changes. + void alignConsecutiveTableGenDefinitions(); + /// Align trailing comments over all \c Changes. void alignTrailingComments(); diff --git a/clang/unittests/Format/FormatTestTableGen.cpp b/clang/unittests/Format/FormatTestTableGen.cpp index 6c110beabca40..76b871e2e1a52 100644 --- a/clang/unittests/Format/FormatTestTableGen.cpp +++ b/clang/unittests/Format/FormatTestTableGen.cpp @@ -346,5 +346,19 @@ TEST_F(FormatTestTableGen, CondOperatorAlignment) { Style); } +TEST_F(FormatTestTableGen, DefAlignment) { + FormatStyle Style = getGoogleStyle(FormatStyle::LK_TableGen); + Style.ColumnLimit = 60; + verifyFormat("def Def : Parent {}\n" + "def DefDef : Parent {}\n" + "def DefDefDef : Parent {}\n", + Style); + Style.AlignConsecutiveTableGenDefinitionColons.Enabled = true; + verifyFormat("def Def : Parent {}\n" + "def DefDef : Parent {}\n" + "def DefDefDef : Parent {}\n", + Style); +} + } // namespace format } // end namespace clang From d6ff986dd276e06c1e290759bc934431ca3ec0f4 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Tue, 13 Feb 2024 18:39:53 +0000 Subject: [PATCH 432/546] [LLVM][tests/CodeGen/AArch64] Convert instances of ConstantExpr based splats to use splat(). This is mostly NFC but some output does change due to consistently inserting into poison rather than undef and using i64 as the index type for inserts. 
--- .../complex-deinterleaving-splat-scalable.ll | 8 +- .../AArch64/dag-combine-concat-vectors.ll | 2 +- .../fold-int-pow2-with-fmul-or-fdiv.ll | 6 +- .../AArch64/sve-calling-convention-mixed.ll | 2 +- llvm/test/CodeGen/AArch64/sve-expand-div.ll | 16 +- .../CodeGen/AArch64/sve-fp-int-min-max.ll | 6 +- .../AArch64/sve-gather-scatter-dag-combine.ll | 12 +- llvm/test/CodeGen/AArch64/sve-hadd.ll | 162 +++++++++--------- .../test/CodeGen/AArch64/sve-int-arith-imm.ll | 10 +- llvm/test/CodeGen/AArch64/sve-int-arith.ll | 36 ++-- llvm/test/CodeGen/AArch64/sve-knownbits.ll | 4 +- .../test/CodeGen/AArch64/sve-pred-selectop.ll | 12 +- .../CodeGen/AArch64/sve-pred-selectop3.ll | 70 ++++---- llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll | 16 +- llvm/test/CodeGen/AArch64/sve-splat-sext.ll | 4 +- .../CodeGen/AArch64/sve-srem-combine-loop.ll | 2 +- .../AArch64/sve2-intrinsics-combine-rshrnb.ll | 64 +++---- 17 files changed, 216 insertions(+), 216 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll index 2627f2a4fb5ec..742a7099559f7 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll @@ -40,11 +40,11 @@ entry: %7 = fmul fast %2, %0 %8 = fmul fast %3, %1 %9 = fsub fast %7, %8 - %10 = fmul fast %9, shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) - %11 = fmul fast %6, shufflevector ( insertelement ( poison, double 1.100000e+01, i64 0), poison, zeroinitializer) + %10 = fmul fast %9, splat (double 3.000000e+00) + %11 = fmul fast %6, splat (double 1.100000e+01) %12 = fadd fast %10, %11 - %13 = fmul fast %9, shufflevector ( insertelement ( poison, double 1.100000e+01, i64 0), poison, zeroinitializer) - %14 = fmul fast %6, shufflevector ( insertelement ( poison, double 3.000000e+00, i64 0), poison, zeroinitializer) + %13 = fmul fast %9, 
splat (double 1.100000e+01) + %14 = fmul fast %6, splat (double 3.000000e+00) %15 = fsub fast %13, %14 %interleaved.vec = tail call @llvm.experimental.vector.interleave2.nxv4f64( %15, %12) ret %interleaved.vec diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll index 4e50cb11be71f..4a2e85c715f7a 100644 --- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll +++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll @@ -63,7 +63,7 @@ define fastcc i8 @allocno_reload_assign() { br label %1 1: ; preds = %1, %0 - call void @llvm.masked.scatter.nxv16i8.nxv16p0( zeroinitializer, zeroinitializer, i32 0, xor ( shufflevector ( icmp eq ( insertelement ( poison, ptr null, i64 0), zeroinitializer), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer))) + call void @llvm.masked.scatter.nxv16i8.nxv16p0( zeroinitializer, zeroinitializer, i32 0, xor ( shufflevector ( icmp eq ( insertelement ( poison, ptr null, i64 0), zeroinitializer), poison, zeroinitializer), splat (i1 true))) br label %1 } diff --git a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll index ad062da5491da..03e64f8b785b0 100644 --- a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll @@ -611,9 +611,9 @@ define @fdiv_pow2_nx4xfloat( %i) "target- ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: fdivr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret - %p2 = shl shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), %i + %p2 = shl splat (i32 1), %i %p2_f = uitofp %p2 to - %r = fdiv shufflevector ( insertelement ( poison, float 9.000000e+00, i64 0), poison, zeroinitializer), %p2_f + %r = fdiv splat (float 9.000000e+00), %p2_f ret %r } @@ -626,6 +626,6 @@ define @scalable2( %0) "target-features" ; CHECK-NEXT: fdivr 
z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %2 = uitofp %0 to - %3 = fdiv shufflevector ( insertelement ( poison, double 1.000000e+00, i64 0), poison, zeroinitializer), %2 + %3 = fdiv splat (double 1.000000e+00), %2 ret %3 } diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll index 3965af6a9066d..56b023086ea24 100644 --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -724,7 +724,7 @@ define void @verify_all_operands_are_initialised() { ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret - call void @func_f8_and_v0_passed_via_memory(float 0.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, shufflevector ( insertelement ( poison, float 9.000000e+00, i64 0), poison, zeroinitializer)) + call void @func_f8_and_v0_passed_via_memory(float 0.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, splat (float 9.000000e+00)) ret void } diff --git a/llvm/test/CodeGen/AArch64/sve-expand-div.ll b/llvm/test/CodeGen/AArch64/sve-expand-div.ll index 5469c29f1aa7e..fe5cdc9387728 100644 --- a/llvm/test/CodeGen/AArch64/sve-expand-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-expand-div.ll @@ -16,7 +16,7 @@ define @sdiv_i8( %a) #0 { ; CHECK-NEXT: lsr z1.b, z0.b, #7 ; CHECK-NEXT: add z0.b, z0.b, z1.b ; CHECK-NEXT: ret - %div = sdiv %a, shufflevector ( insertelement ( undef, i8 3, i32 0), undef, zeroinitializer) + %div = sdiv %a, splat (i8 3) ret %div } @@ -30,7 +30,7 @@ define @sdiv_i16( %a) #0 { ; CHECK-NEXT: lsr z1.h, z0.h, #15 ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: ret - %div = sdiv %a, shufflevector ( insertelement ( undef, i16 3, i32 0), undef, zeroinitializer) + %div = sdiv %a, splat (i16 3) ret %div } @@ -45,7 +45,7 @@ define @sdiv_i32( %a) #0 { ; CHECK-NEXT: lsr z1.s, z0.s, #31 ; 
CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: ret - %div = sdiv %a, shufflevector ( insertelement ( undef, i32 3, i32 0), undef, zeroinitializer) + %div = sdiv %a, splat (i32 3) ret %div } @@ -60,7 +60,7 @@ define @sdiv_i64( %a) #0 { ; CHECK-NEXT: lsr z1.d, z0.d, #63 ; CHECK-NEXT: add z0.d, z0.d, z1.d ; CHECK-NEXT: ret - %div = sdiv %a, shufflevector ( insertelement ( undef, i64 3, i32 0), undef, zeroinitializer) + %div = sdiv %a, splat (i64 3) ret %div } @@ -76,7 +76,7 @@ define @udiv_i8( %a) #0 { ; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: lsr z0.b, z0.b, #1 ; CHECK-NEXT: ret - %div = udiv %a, shufflevector ( insertelement ( undef, i8 3, i32 0), undef, zeroinitializer) + %div = udiv %a, splat (i8 3) ret %div } @@ -89,7 +89,7 @@ define @udiv_i16( %a) #0 { ; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: lsr z0.h, z0.h, #1 ; CHECK-NEXT: ret - %div = udiv %a, shufflevector ( insertelement ( undef, i16 3, i32 0), undef, zeroinitializer) + %div = udiv %a, splat (i16 3) ret %div } @@ -103,7 +103,7 @@ define @udiv_i32( %a) #0 { ; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: lsr z0.s, z0.s, #1 ; CHECK-NEXT: ret - %div = udiv %a, shufflevector ( insertelement ( undef, i32 3, i32 0), undef, zeroinitializer) + %div = udiv %a, splat (i32 3) ret %div } @@ -117,7 +117,7 @@ define @udiv_i64( %a) #0 { ; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: lsr z0.d, z0.d, #1 ; CHECK-NEXT: ret - %div = udiv %a, shufflevector ( insertelement ( undef, i64 3, i32 0), undef, zeroinitializer) + %div = udiv %a, splat (i64 3) ret %div } diff --git a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll index f6059d715a052..bdaea0ecf144a 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll @@ -33,11 +33,11 @@ define i64 @scalable_int_min_max(ptr %arg, ptr %arg1, %i37, < entry: %i56 = getelementptr inbounds float, ptr %arg, i64 0 %i57 = load , ptr 
%i56, align 4 - %i58 = fmul %i57, shufflevector ( insertelement ( poison, float 0x401D41D420000000, i64 0), poison, zeroinitializer) - %i59 = fadd %i58, shufflevector ( insertelement ( poison, float 1.023500e+03, i64 0), poison, zeroinitializer) + %i58 = fmul %i57, splat (float 0x401D41D420000000) + %i59 = fadd %i58, splat (float 1.023500e+03) %i60 = fptosi %i59 to %i61 = tail call @llvm.smax.nxv2i32( %i60, zeroinitializer) - %i62 = tail call @llvm.smin.nxv2i32( %i61, shufflevector ( insertelement ( poison, i32 1023, i64 0), poison, zeroinitializer)) + %i62 = tail call @llvm.smin.nxv2i32( %i61, splat (i32 1023)) %i63 = icmp ne %i62, zeroinitializer %i64 = getelementptr float, ptr %arg1, i64 0 %i65 = tail call @llvm.masked.load.nxv2f32.p0(ptr %i64, i32 4, %i63, poison) diff --git a/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll b/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll index f7a963186c139..a40d550852798 100644 --- a/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll +++ b/llvm/test/CodeGen/AArch64/sve-gather-scatter-dag-combine.ll @@ -95,7 +95,7 @@ define @narrow_i64_gather_index_i8_zext(ptr %out, ptr %in, , ptr %2, align 1 %3 = zext %wide.load to %4 = getelementptr inbounds i8, ptr %in, %3 - %wide.masked.gather = call @llvm.masked.gather.nxv16i8.nxv16p0( %4, i32 1, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) + %wide.masked.gather = call @llvm.masked.gather.nxv16i8.nxv16p0( %4, i32 1, splat (i1 true), undef) ret %wide.masked.gather } @@ -121,7 +121,7 @@ define @narrow_i64_gather_index_i8_sext(ptr %out, ptr %in, , ptr %2, align 1 %3 = sext %wide.load to %4 = getelementptr inbounds i8, ptr %in, %3 - %wide.masked.gather = call @llvm.masked.gather.nxv16i8.nxv16p0( %4, i32 1, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) + %wide.masked.gather = call @llvm.masked.gather.nxv16i8.nxv16p0( %4, i32 1, splat (i1 true), undef) ret 
%wide.masked.gather } @@ -141,7 +141,7 @@ define @narrow_i64_gather_index_i16_zext(ptr %out, ptr %in, < %wide.load = load , ptr %2, align 1 %3 = zext %wide.load to %4 = getelementptr inbounds i16, ptr %in, %3 - %wide.masked.gather = call @llvm.masked.gather.nxv8i16.nxv8p0( %4, i32 1, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) + %wide.masked.gather = call @llvm.masked.gather.nxv8i16.nxv8p0( %4, i32 1, splat (i1 true), undef) ret %wide.masked.gather } @@ -161,7 +161,7 @@ define @narrow_i64_gather_index_i16_sext(ptr %out, ptr %in, < %wide.load = load , ptr %2, align 1 %3 = sext %wide.load to %4 = getelementptr inbounds i16, ptr %in, %3 - %wide.masked.gather = call @llvm.masked.gather.nxv8i16.nxv8p0( %4, i32 1, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) + %wide.masked.gather = call @llvm.masked.gather.nxv8i16.nxv8p0( %4, i32 1, splat (i1 true), undef) ret %wide.masked.gather } @@ -177,7 +177,7 @@ define @no_narrow_i64_gather_index_i32(ptr %out, ptr %in, , ptr %2, align 1 %3 = zext %wide.load to %4 = getelementptr inbounds i32, ptr %in, %3 - %wide.masked.gather = call @llvm.masked.gather.nxv4i32.nxv4p0( %4, i32 1, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) + %wide.masked.gather = call @llvm.masked.gather.nxv4i32.nxv4p0( %4, i32 1, splat (i1 true), undef) ret %wide.masked.gather } @@ -192,7 +192,7 @@ define @no_narrow_i64_gather_index_i64(ptr %out, ptr %in, , ptr %2, align 1 %3 = getelementptr inbounds i64, ptr %in, %wide.load - %wide.masked.gather = call @llvm.masked.gather.nxv2i64.nxv2p0( %3, i32 1, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) + %wide.masked.gather = call @llvm.masked.gather.nxv2i64.nxv2p0( %3, i32 1, splat (i1 true), undef) ret %wide.masked.gather } diff --git a/llvm/test/CodeGen/AArch64/sve-hadd.ll b/llvm/test/CodeGen/AArch64/sve-hadd.ll index 
7936094af1c0a..c73370d50287b 100644 --- a/llvm/test/CodeGen/AArch64/sve-hadd.ll +++ b/llvm/test/CodeGen/AArch64/sve-hadd.ll @@ -22,7 +22,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = ashr %m, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %s = ashr %m, splat (i128 1) %s2 = trunc %s to ret %s2 } @@ -47,7 +47,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i128 1) %s2 = trunc %s to ret %s2 } @@ -72,7 +72,7 @@ entry: %s0s = zext %s0 to %s1s = zext %s1 to %m = add nuw nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i128 1) %s2 = trunc %s to ret %s2 } @@ -97,7 +97,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = ashr %m, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %s = ashr %m, splat (i64 1) %s2 = trunc %s to ret %s2 } @@ -114,7 +114,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i64 1) %s2 = trunc %s to ret %s2 } @@ -138,7 +138,7 @@ entry: %s0s = zext %s0 to %s1s = zext %s1 to %m = add nuw nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i64 1) %s2 = trunc %s to ret %s2 } @@ -163,7 +163,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = ashr %m, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %s = ashr %m, splat (i64 1) %s2 = trunc %s to ret %s2 } @@ -188,7 +188,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, 
zeroinitializer) + %s = lshr %m, splat (i64 1) %s2 = trunc %s to ret %s2 } @@ -213,7 +213,7 @@ entry: %s0s = zext %s0 to %s1s = zext %s1 to %m = add nuw nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i64 1) %s2 = trunc %s to ret %s2 } @@ -239,7 +239,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = ashr %m, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = ashr %m, splat (i32 1) %s2 = trunc %s to ret %s2 } @@ -258,7 +258,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i32 1) %s2 = trunc %s to ret %s2 } @@ -283,7 +283,7 @@ entry: %s0s = zext %s0 to %s1s = zext %s1 to %m = add nuw nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i32 1) %s2 = trunc %s to ret %s2 } @@ -309,7 +309,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = ashr %m, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = ashr %m, splat (i32 1) %s2 = trunc %s to ret %s2 } @@ -327,7 +327,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i32 1) %s2 = trunc %s to ret %s2 } @@ -352,7 +352,7 @@ entry: %s0s = zext %s0 to %s1s = zext %s1 to %m = add nuw nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i32 1) %s2 = trunc %s to ret %s2 } @@ -377,7 +377,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = ashr %m, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = ashr %m, splat (i32 
1) %s2 = trunc %s to ret %s2 } @@ -402,7 +402,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i32 1) %s2 = trunc %s to ret %s2 } @@ -427,7 +427,7 @@ entry: %s0s = zext %s0 to %s1s = zext %s1 to %m = add nuw nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i32 1) %s2 = trunc %s to ret %s2 } @@ -453,7 +453,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = ashr %m, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = ashr %m, splat (i16 1) %s2 = trunc %s to ret %s2 } @@ -472,7 +472,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i16 1) %s2 = trunc %s to ret %s2 } @@ -497,7 +497,7 @@ entry: %s0s = zext %s0 to %s1s = zext %s1 to %m = add nuw nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i16 1) %s2 = trunc %s to ret %s2 } @@ -523,7 +523,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = ashr %m, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = ashr %m, splat (i16 1) %s2 = trunc %s to ret %s2 } @@ -541,7 +541,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i16 1) %s2 = trunc %s to ret %s2 } @@ -566,7 +566,7 @@ entry: %s0s = zext %s0 to %s1s = zext %s1 to %m = add nuw nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i16 1) %s2 = trunc %s to ret %s2 } @@ -591,7 
+591,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = ashr %m, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = ashr %m, splat (i16 1) %s2 = trunc %s to ret %s2 } @@ -616,7 +616,7 @@ entry: %s0s = sext %s0 to %s1s = sext %s1 to %m = add nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i16 1) %s2 = trunc %s to ret %s2 } @@ -641,7 +641,7 @@ entry: %s0s = zext %s0 to %s1s = zext %s1 to %m = add nuw nsw %s0s, %s1s - %s = lshr %m, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = lshr %m, splat (i16 1) %s2 = trunc %s to ret %s2 } @@ -665,9 +665,9 @@ define @rhadds_v2i64( %s0, %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i128 1) %add2 = add %add, %s1s - %s = ashr %add2, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %s = ashr %add2, splat (i128 1) %result = trunc %s to ret %result } @@ -691,9 +691,9 @@ define @rhadds_v2i64_lsh( %s0, %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i128 1) %add2 = add %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i128 1) %result = trunc %s to ret %result } @@ -717,9 +717,9 @@ define @rhaddu_v2i64( %s0, %s0 to %s1s = zext %s1 to - %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %add = add nuw nsw %s0s, splat (i128 1) %add2 = add nuw nsw %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i128 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i128 1) %result = trunc %s to ret %result } @@ -746,9 +746,9 @@ define @rhadds_v2i32( %s0, 
%s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i64 1) %add2 = add %add, %s1s - %s = ashr %add2, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %s = ashr %add2, splat (i64 1) %result = trunc %s to ret %result } @@ -767,9 +767,9 @@ define @rhadds_v2i32_lsh( %s0, %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i64 1) %add2 = add %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i64 1) %result = trunc %s to ret %result } @@ -795,9 +795,9 @@ define @rhaddu_v2i32( %s0, %s0 to %s1s = zext %s1 to - %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %add = add nuw nsw %s0s, splat (i64 1) %add2 = add nuw nsw %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i64 1) %result = trunc %s to ret %result } @@ -821,9 +821,9 @@ define @rhadds_v4i32( %s0, %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i64 1) %add2 = add %add, %s1s - %s = ashr %add2, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %s = ashr %add2, splat (i64 1) %result = trunc %s to ret %result } @@ -847,9 +847,9 @@ define @rhadds_v4i32_lsh( %s0, %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i64 1) %add2 = add %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i64 1) %result = trunc %s to ret %result } @@ -873,9 +873,9 @@ define 
@rhaddu_v4i32( %s0, %s0 to %s1s = zext %s1 to - %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %add = add nuw nsw %s0s, splat (i64 1) %add2 = add nuw nsw %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i64 1) %result = trunc %s to ret %result } @@ -894,9 +894,9 @@ define @rhadds_v2i16( %s0, %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i32 1) %add2 = add %add, %s1s - %s = ashr %add2, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = ashr %add2, splat (i32 1) %result = trunc %s to ret %result } @@ -916,9 +916,9 @@ define @rhadds_v2i16_lsh( %s0, %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i32 1) %add2 = add %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i32 1) %result = trunc %s to ret %result } @@ -944,9 +944,9 @@ define @rhaddu_v2i16( %s0, %s0 to %s1s = zext %s1 to - %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %add = add nuw nsw %s0s, splat (i32 1) %add2 = add nuw nsw %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i32 1) %result = trunc %s to ret %result } @@ -973,9 +973,9 @@ define @rhadds_v4i16( %s0, %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i32 1) %add2 = add %add, %s1s - %s = ashr %add2, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = ashr %add2, splat (i32 1) %result = trunc %s to ret 
%result } @@ -994,9 +994,9 @@ define @rhadds_v4i16_lsh( %s0, %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i32 1) %add2 = add %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i32 1) %result = trunc %s to ret %result } @@ -1022,9 +1022,9 @@ define @rhaddu_v4i16( %s0, %s0 to %s1s = zext %s1 to - %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %add = add nuw nsw %s0s, splat (i32 1) %add2 = add nuw nsw %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i32 1) %result = trunc %s to ret %result } @@ -1048,9 +1048,9 @@ define @rhadds_v8i16( %s0, %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i32 1) %add2 = add %add, %s1s - %s = ashr %add2, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = ashr %add2, splat (i32 1) %result = trunc %s to ret %result } @@ -1074,9 +1074,9 @@ define @rhadds_v8i16_lsh( %s0, %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i32 1) %add2 = add %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i32 1) %result = trunc %s to ret %result } @@ -1100,9 +1100,9 @@ define @rhaddu_v8i16( %s0, %s0 to %s1s = zext %s1 to - %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %add = add nuw nsw %s0s, splat (i32 1) %add2 = add nuw nsw %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %s = lshr 
%add2, splat (i32 1) %result = trunc %s to ret %result } @@ -1121,9 +1121,9 @@ define @rhadds_v4i8( %s0, % entry: %s0s = sext %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i16 1) %add2 = add %add, %s1s - %s = ashr %add2, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = ashr %add2, splat (i16 1) %result = trunc %s to ret %result } @@ -1143,9 +1143,9 @@ define @rhadds_v4i8_lsh( %s0, %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i16 1) %add2 = add %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i16 1) %result = trunc %s to ret %result } @@ -1171,9 +1171,9 @@ define @rhaddu_v4i8( %s0, % entry: %s0s = zext %s0 to %s1s = zext %s1 to - %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %add = add nuw nsw %s0s, splat (i16 1) %add2 = add nuw nsw %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i16 1) %result = trunc %s to ret %result } @@ -1200,9 +1200,9 @@ define @rhadds_v8i8( %s0, % entry: %s0s = sext %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i16 1) %add2 = add %add, %s1s - %s = ashr %add2, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = ashr %add2, splat (i16 1) %result = trunc %s to ret %result } @@ -1221,9 +1221,9 @@ define @rhadds_v8i8_lsh( %s0, %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i16 1) %add2 = add %add, %s1s - %s = lshr %add2, 
shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i16 1) %result = trunc %s to ret %result } @@ -1249,9 +1249,9 @@ define @rhaddu_v8i8( %s0, % entry: %s0s = zext %s0 to %s1s = zext %s1 to - %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %add = add nuw nsw %s0s, splat (i16 1) %add2 = add nuw nsw %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i16 1) %result = trunc %s to ret %result } @@ -1275,9 +1275,9 @@ define @rhadds_v16i8( %s0, %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i16 1) %add2 = add %add, %s1s - %s = ashr %add2, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = ashr %add2, splat (i16 1) %result = trunc %s to ret %result } @@ -1301,9 +1301,9 @@ define @rhadds_v16i8_lsh( %s0, %s0 to %s1s = sext %s1 to - %add = add %s0s, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %add = add %s0s, splat (i16 1) %add2 = add %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i16 1) %result = trunc %s to ret %result } @@ -1327,9 +1327,9 @@ define @rhaddu_v16i8( %s0, %s0 to %s1s = zext %s1 to - %add = add nuw nsw %s0s, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %add = add nuw nsw %s0s, splat (i16 1) %add2 = add nuw nsw %add, %s1s - %s = lshr %add2, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %s = lshr %add2, splat (i16 1) %result = trunc %s to ret %result } diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll index 3ccbd58847401..c0ddceb42e1d0 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll @@ -771,7 +771,7 @@ define @sdiv_const( %a) #0 { ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret entry: - %div = sdiv %a, shufflevector ( insertelement ( undef, i32 3, i32 0), undef, zeroinitializer) + %div = sdiv %a, splat (i32 3) ret %div } @@ -783,7 +783,7 @@ define @udiv_const( %a) #0 { ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret entry: - %div = udiv %a, shufflevector ( insertelement ( undef, i32 3, i32 0), undef, zeroinitializer) + %div = udiv %a, splat (i32 3) ret %div } @@ -795,9 +795,9 @@ define @uqsub( %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: uqsub z0.h, z0.h, #32768 // =0x8000 ; CHECK-NEXT: ret - %cmp = icmp slt %a, shufflevector ( insertelement ( undef, i16 0, i32 0), undef, zeroinitializer) - %sub = xor %a, shufflevector ( insertelement ( undef, i16 -32768, i32 0), undef, zeroinitializer) - %sel = select %cmp, %sub, shufflevector ( insertelement ( undef, i16 0, i32 0), undef, zeroinitializer) + %cmp = icmp slt %a, zeroinitializer + %sub = xor %a, splat (i16 -32768) + %sel = select %cmp, %sub, zeroinitializer ret %sel } diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-int-arith.ll index cb2b2f34ca5ef..fc2672f8c80a8 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-arith.ll @@ -538,7 +538,7 @@ define @mls_i64( %a, %b, ; CHECK-NEXT: ret { %1 = mul %a, %b - %2 = add %1, shufflevector ( insertelement ( poison, i64 4294967295, i64 0), poison, zeroinitializer) + %2 = add %1, splat (i64 4294967295) ret %2 } @@ -551,7 +551,7 @@ define @muladd_i64_negativeAddend( %a, %a, %b - %2 = add %1, shufflevector ( insertelement ( poison, i64 -4294967295, i64 0), poison, zeroinitializer) + %2 = add %1, splat (i64 -4294967295) ret %2 } @@ -565,7 +565,7 @@ define @muladd_i32_positiveAddend( %a, %a, %b - %2 = add %1, shufflevector ( insertelement ( poison, i32 65536, 
i32 0), poison, zeroinitializer) + %2 = add %1, splat (i32 65536) ret %2 } @@ -578,7 +578,7 @@ define @muladd_i32_negativeAddend( %a, %a, %b - %2 = add %1, shufflevector ( insertelement ( poison, i32 -65536, i32 0), poison, zeroinitializer) + %2 = add %1, splat (i32 -65536) ret %2 } @@ -591,7 +591,7 @@ define @muladd_i16_positiveAddend( %a, %a, %b - %2 = add %1, shufflevector ( insertelement ( poison, i16 255, i16 0), poison, zeroinitializer) + %2 = add %1, splat (i16 255) ret %2 } @@ -604,7 +604,7 @@ define @muladd_i16_negativeAddend( %a, %a, %b - %2 = add %1, shufflevector ( insertelement ( poison, i16 -255, i16 0), poison, zeroinitializer) + %2 = add %1, splat (i16 -255) ret %2 } @@ -617,7 +617,7 @@ define @muladd_i8_positiveAddend( %a, %a, %b - %2 = add %1, shufflevector ( insertelement ( poison, i8 15, i8 0), poison, zeroinitializer) + %2 = add %1, splat (i8 15) ret %2 } @@ -630,7 +630,7 @@ define @muladd_i8_negativeAddend( %a, %a, %b - %2 = add %1, shufflevector ( insertelement ( poison, i8 -15, i8 0), poison, zeroinitializer) + %2 = add %1, splat (i8 -15) ret %2 } @@ -644,7 +644,7 @@ define @mulsub_i64_positiveAddend( %a, %a, %b - %2 = sub %1, shufflevector ( insertelement ( poison, i64 4294967295, i64 0), poison, zeroinitializer) + %2 = sub %1, splat (i64 4294967295) ret %2 } @@ -658,7 +658,7 @@ define @mulsub_i64_negativeAddend( %a, %a, %b - %2 = sub %1, shufflevector ( insertelement ( poison, i64 -4294967295, i64 0), poison, zeroinitializer) + %2 = sub %1, splat (i64 -4294967295) ret %2 } @@ -673,7 +673,7 @@ define @mulsub_i32_positiveAddend( %a, %a, %b - %2 = sub %1, shufflevector ( insertelement ( poison, i32 65536, i32 0), poison, zeroinitializer) + %2 = sub %1, splat (i32 65536) ret %2 } @@ -687,7 +687,7 @@ define @mulsub_i32_negativeAddend( %a, %a, %b - %2 = sub %1, shufflevector ( insertelement ( poison, i32 -65536, i32 0), poison, zeroinitializer) + %2 = sub %1, splat (i32 -65536) ret %2 } @@ -700,7 +700,7 @@ define @mulsub_i16_positiveAddend( %a, 
%a, %b - %2 = sub %1, shufflevector ( insertelement ( poison, i16 255, i16 0), poison, zeroinitializer) + %2 = sub %1, splat (i16 255) ret %2 } @@ -714,7 +714,7 @@ define @mulsub_i16_negativeAddend( %a, %a, %b - %2 = sub %1, shufflevector ( insertelement ( poison, i16 -255, i16 0), poison, zeroinitializer) + %2 = sub %1, splat (i16 -255) ret %2 } @@ -727,7 +727,7 @@ define @mulsub_i8_positiveAddend( %a, %a, %b - %2 = sub %1, shufflevector ( insertelement ( poison, i8 15, i8 0), poison, zeroinitializer) + %2 = sub %1, splat (i8 15) ret %2 } @@ -740,7 +740,7 @@ define @mulsub_i8_negativeAddend( %a, %a, %b - %2 = sub %1, shufflevector ( insertelement ( poison, i8 -15, i8 0), poison, zeroinitializer) + %2 = sub %1, splat (i8 -15) ret %2 } @@ -757,7 +757,7 @@ define @multiple_fused_ops( %a, %a, %b - %2 = add %1, shufflevector ( insertelement ( poison, i16 200, i16 0), poison, zeroinitializer) + %2 = add %1, splat (i16 200) %3 = mul %2, %a %4 = sub %3, %b ret %4 @@ -805,7 +805,7 @@ vector.body: ; preds = %vector.body, %for.b %3 = getelementptr inbounds i32, ptr %src2, i64 %index %wide.masked.load12 = tail call @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, %active.lane.mask, poison) %4 = mul nsw %wide.masked.load12, %wide.masked.load - %5 = add nsw %4, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + %5 = add nsw %4, splat (i32 1) %6 = getelementptr inbounds i32, ptr %dst, i64 %index tail call void @llvm.masked.store.nxv4i32.p0( %5, ptr %6, i32 4, %active.lane.mask) %index.next = add i64 %index, %1 diff --git a/llvm/test/CodeGen/AArch64/sve-knownbits.ll b/llvm/test/CodeGen/AArch64/sve-knownbits.ll index 2346d65995470..ac391948d6bd4 100644 --- a/llvm/test/CodeGen/AArch64/sve-knownbits.ll +++ b/llvm/test/CodeGen/AArch64/sve-knownbits.ll @@ -6,8 +6,8 @@ define @test_knownzero( %x) { ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: ret - %a1 = shl %x, shufflevector ( insertelement ( poison, i16 8, i32 0), poison, 
zeroinitializer) - %a2 = and %a1, shufflevector ( insertelement ( poison, i16 8, i32 0), poison, zeroinitializer) + %a1 = shl %x, splat (i16 8) + %a2 = and %a1, splat (i16 8) ret %a2 } diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll index 539f443de18a1..600e9c4805ff7 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll @@ -281,7 +281,7 @@ define @andnot_v4i32( %z, %z, zeroinitializer - %y1 = xor %y, shufflevector ( insertelement ( poison, i32 -1, i32 0), poison, zeroinitializer) + %y1 = xor %y, splat (i32 -1) %a = and %x, %y1 %b = select %c, %a, %z ret %b @@ -297,7 +297,7 @@ define @andnot_v8i16( %z, %z, zeroinitializer - %y1 = xor %y, shufflevector ( insertelement ( poison, i16 -1, i32 0), poison, zeroinitializer) + %y1 = xor %y, splat (i16 -1) %a = and %x, %y1 %b = select %c, %a, %z ret %b @@ -313,7 +313,7 @@ define @andnot_v16i8( %z, %z, zeroinitializer - %y1 = xor %y, shufflevector ( insertelement ( poison, i8 -1, i32 0), poison, zeroinitializer) + %y1 = xor %y, splat (i8 -1) %a = and %x, %y1 %b = select %c, %a, %z ret %b @@ -331,7 +331,7 @@ define @ornot_v4i32( %z, ; CHECK-NEXT: ret entry: %c = icmp eq %z, zeroinitializer - %y1 = xor %y, shufflevector ( insertelement ( poison, i32 -1, i32 0), poison, zeroinitializer) + %y1 = xor %y, splat (i32 -1) %a = or %x, %y1 %b = select %c, %a, %z ret %b @@ -349,7 +349,7 @@ define @ornot_v8i16( %z, ; CHECK-NEXT: ret entry: %c = icmp eq %z, zeroinitializer - %y1 = xor %y, shufflevector ( insertelement ( poison, i16 -1, i32 0), poison, zeroinitializer) + %y1 = xor %y, splat (i16 -1) %a = or %x, %y1 %b = select %c, %a, %z ret %b @@ -367,7 +367,7 @@ define @ornot_v16i8( %z, ; CHECK-NEXT: ret entry: %c = icmp eq %z, zeroinitializer - %y1 = xor %y, shufflevector ( insertelement ( poison, i8 -1, i32 0), poison, zeroinitializer) + %y1 = xor %y, splat (i8 -1) %a = or %x, %y1 %b = select %c, %a, %z ret %b diff 
--git a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll index 4413dcd89f482..0f09f7dac2982 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll @@ -122,7 +122,7 @@ define @mul_nxv2i64_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (i64 1) %b = mul %a, %x ret %b } @@ -136,7 +136,7 @@ define @mul_nxv4i32_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (i32 1) %b = mul %a, %x ret %b } @@ -150,7 +150,7 @@ define @mul_nxv8i16_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (i16 1) %b = mul %a, %x ret %b } @@ -164,7 +164,7 @@ define @mul_nxv16i8_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (i8 1) %b = mul %a, %x ret %b } @@ -178,7 +178,7 @@ define @and_nxv2i64_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, i64 -1, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (i64 -1) %b = and %a, %x ret %b } @@ -192,7 +192,7 @@ define @and_nxv4i32_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, i32 -1, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (i32 -1) %b = and %a, %x ret %b } @@ -206,7 +206,7 @@ define @and_nxv8i16_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, i16 -1, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (i16 -1) %b = and %a, %x ret %b } @@ -220,7 +220,7 @@ define @and_nxv16i8_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, 
i8 -1, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (i8 -1) %b = and %a, %x ret %b } @@ -647,7 +647,7 @@ define @fadd_nxv4f32_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (float -0.000000e+00) %b = fadd %a, %x ret %b } @@ -662,7 +662,7 @@ define @fadd_nxv8f16_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, half 0xH8000, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (half 0xH8000) %b = fadd %a, %x ret %b } @@ -677,7 +677,7 @@ define @fadd_nxv2f64_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, double -0.000000e+00, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (double -0.000000e+00) %b = fadd %a, %x ret %b } @@ -737,7 +737,7 @@ define @fmul_nxv4f32_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, float 1.000000e+00, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (float 1.000000e+00) %b = fmul %a, %x ret %b } @@ -752,7 +752,7 @@ define @fmul_nxv8f16_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, half 0xH3C00, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (half 0xH3C00) %b = fmul %a, %x ret %b } @@ -767,7 +767,7 @@ define @fmul_nxv2f64_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, double 1.000000e+00, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (double 1.000000e+00) %b = fmul %a, %x ret %b } @@ -783,7 +783,7 @@ define @fdiv_nxv4f32_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, float 1.000000e+00, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (float 1.000000e+00) %b = fdiv %x, %a ret %b } @@ -799,7 +799,7 @@ define @fdiv_nxv8f16_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( 
insertelement ( poison, half 0xH3C00, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (half 0xH3C00) %b = fdiv %x, %a ret %b } @@ -815,7 +815,7 @@ define @fdiv_nxv2f64_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, double 1.000000e+00, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (double 1.000000e+00) %b = fdiv %x, %a ret %b } @@ -831,7 +831,7 @@ define @fma_nxv4f32_x( %x, %n, zeroinitializer %m = fmul fast %y, %z - %a = select %c, %m, shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) + %a = select %c, %m, splat (float -0.000000e+00) %b = fadd fast %a, %x ret %b } @@ -847,7 +847,7 @@ define @fma_nxv8f16_x( %x, %n, zeroinitializer %m = fmul fast %y, %z - %a = select %c, %m, shufflevector ( insertelement ( poison, half 0xH8000, i64 0), poison, zeroinitializer) + %a = select %c, %m, splat (half 0xH8000) %b = fadd fast %a, %x ret %b } @@ -863,7 +863,7 @@ define @fma_nxv2f64_x( %x, %n, zeroinitializer %m = fmul fast %y, %z - %a = select %c, %m, shufflevector ( insertelement ( poison, double -0.000000e+00, i64 0), poison, zeroinitializer) + %a = select %c, %m, splat (double -0.000000e+00) %b = fadd fast %a, %x ret %b } @@ -998,7 +998,7 @@ define @mul_nxv2i64_y( %x, %n, zeroinitializer - %a = select %c, %x, shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) + %a = select %c, %x, splat (i64 1) %b = mul %a, %y ret %b } @@ -1013,7 +1013,7 @@ define @mul_nxv4i32_y( %x, %n, zeroinitializer - %a = select %c, %x, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + %a = select %c, %x, splat (i32 1) %b = mul %a, %y ret %b } @@ -1028,7 +1028,7 @@ define @mul_nxv8i16_y( %x, %n, zeroinitializer - %a = select %c, %x, shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) + %a = select %c, %x, splat (i16 1) %b = mul %a, %y ret %b } @@ -1043,7 +1043,7 @@ define @mul_nxv16i8_y( %x, %n, 
zeroinitializer - %a = select %c, %x, shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) + %a = select %c, %x, splat (i8 1) %b = mul %a, %y ret %b } @@ -1058,7 +1058,7 @@ define @and_nxv2i64_y( %x, %n, zeroinitializer - %a = select %c, %x, shufflevector ( insertelement ( poison, i64 -1, i64 0), poison, zeroinitializer) + %a = select %c, %x, splat (i64 -1) %b = and %a, %y ret %b } @@ -1073,7 +1073,7 @@ define @and_nxv4i32_y( %x, %n, zeroinitializer - %a = select %c, %x, shufflevector ( insertelement ( poison, i32 -1, i64 0), poison, zeroinitializer) + %a = select %c, %x, splat (i32 -1) %b = and %a, %y ret %b } @@ -1088,7 +1088,7 @@ define @and_nxv8i16_y( %x, %n, zeroinitializer - %a = select %c, %x, shufflevector ( insertelement ( poison, i16 -1, i64 0), poison, zeroinitializer) + %a = select %c, %x, splat (i16 -1) %b = and %a, %y ret %b } @@ -1103,7 +1103,7 @@ define @and_nxv16i8_y( %x, %n, zeroinitializer - %a = select %c, %x, shufflevector ( insertelement ( poison, i8 -1, i64 0), poison, zeroinitializer) + %a = select %c, %x, splat (i8 -1) %b = and %a, %y ret %b } @@ -1547,7 +1547,7 @@ define @fadd_nxv4f32_y( %x, %n, zeroinitializer - %a = select %c, %x, shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) + %a = select %c, %x, splat (float -0.000000e+00) %b = fadd %a, %y ret %b } @@ -1563,7 +1563,7 @@ define @fadd_nxv8f16_y( %x, %n, zeroinitializer - %a = select %c, %x, shufflevector ( insertelement ( poison, half 0xH8000, i64 0), poison, zeroinitializer) + %a = select %c, %x, splat (half 0xH8000) %b = fadd %a, %y ret %b } @@ -1579,7 +1579,7 @@ define @fadd_nxv2f64_y( %x, %n, zeroinitializer - %a = select %c, %x, shufflevector ( insertelement ( poison, double -0.000000e+00, i64 0), poison, zeroinitializer) + %a = select %c, %x, splat (double -0.000000e+00) %b = fadd %a, %y ret %b } @@ -1643,7 +1643,7 @@ define @fmul_nxv4f32_y( %x, %n, zeroinitializer - %a = select %c, %x, shufflevector ( 
insertelement ( poison, float 1.000000e+00, i64 0), poison, zeroinitializer) + %a = select %c, %x, splat (float 1.000000e+00) %b = fmul %a, %y ret %b } @@ -1659,7 +1659,7 @@ define @fmul_nxv8f16_y( %x, %n, zeroinitializer - %a = select %c, %x, shufflevector ( insertelement ( poison, half 0xH3C00, i64 0), poison, zeroinitializer) + %a = select %c, %x, splat (half 0xH3C00) %b = fmul %a, %y ret %b } @@ -1675,7 +1675,7 @@ define @fmul_nxv2f64_y( %x, %n, zeroinitializer - %a = select %c, %x, shufflevector ( insertelement ( poison, double 1.000000e+00, i64 0), poison, zeroinitializer) + %a = select %c, %x, splat (double 1.000000e+00) %b = fmul %a, %y ret %b } @@ -1840,7 +1840,7 @@ define @mul_nxv4i32_multiuse_x( %x, %n, zeroinitializer - %a = select %c, %y, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + %a = select %c, %y, splat (i32 1) store %a, ptr %p %b = mul %a, %x ret %b diff --git a/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll index d06c13e47bef1..4607f225f81ea 100644 --- a/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll +++ b/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll @@ -9,7 +9,7 @@ define @sdiv_i8( %a) #0 { ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #4 ; CHECK-NEXT: ret - %out = sdiv %a, shufflevector ( insertelement ( poison, i8 16, i32 0), poison, zeroinitializer) + %out = sdiv %a, splat (i8 16) ret %out } @@ -20,7 +20,7 @@ define @sdiv_i8_neg( %a) #0 { ; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #6 ; CHECK-NEXT: subr z0.b, z0.b, #0 // =0x0 ; CHECK-NEXT: ret - %out = sdiv %a, shufflevector ( insertelement ( poison, i8 -64, i32 0), poison, zeroinitializer) + %out = sdiv %a, splat (i8 -64) ret %out } @@ -30,7 +30,7 @@ define @sdiv_i16( %a) #0 { ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #10 ; CHECK-NEXT: ret - %out = sdiv %a, shufflevector ( insertelement ( poison, i16 1024, i32 0), poison, zeroinitializer) + %out = sdiv %a, splat (i16 1024) ret %out } @@ 
-41,7 +41,7 @@ define @sdiv_i16_neg( %a) #0 { ; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #12 ; CHECK-NEXT: subr z0.h, z0.h, #0 // =0x0 ; CHECK-NEXT: ret - %out = sdiv %a, shufflevector ( insertelement ( poison, i16 -4096, i32 0), poison, zeroinitializer) + %out = sdiv %a, splat (i16 -4096) ret %out } @@ -51,7 +51,7 @@ define @sdiv_i32( %a) #0 { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #23 ; CHECK-NEXT: ret - %out = sdiv %a, shufflevector ( insertelement ( poison, i32 8388608, i32 0), poison, zeroinitializer) + %out = sdiv %a, splat (i32 8388608) ret %out } @@ -62,7 +62,7 @@ define @sdiv_i32_neg( %a) #0 { ; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #25 ; CHECK-NEXT: subr z0.s, z0.s, #0 // =0x0 ; CHECK-NEXT: ret - %out = sdiv %a, shufflevector ( insertelement ( poison, i32 -33554432, i32 0), poison, zeroinitializer) + %out = sdiv %a, splat (i32 -33554432) ret %out } @@ -72,7 +72,7 @@ define @sdiv_i64( %a) #0 { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #53 ; CHECK-NEXT: ret - %out = sdiv %a, shufflevector ( insertelement ( poison, i64 9007199254740992, i32 0), poison, zeroinitializer) + %out = sdiv %a, splat (i64 9007199254740992) ret %out } @@ -83,7 +83,7 @@ define @sdiv_i64_neg( %a) #0 { ; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #55 ; CHECK-NEXT: subr z0.d, z0.d, #0 // =0x0 ; CHECK-NEXT: ret - %out = sdiv %a, shufflevector ( insertelement ( poison, i64 -36028797018963968, i32 0), poison, zeroinitializer) + %out = sdiv %a, splat (i64 -36028797018963968) ret %out } diff --git a/llvm/test/CodeGen/AArch64/sve-splat-sext.ll b/llvm/test/CodeGen/AArch64/sve-splat-sext.ll index fb71ab1d5eeb8..f689aa6469a25 100644 --- a/llvm/test/CodeGen/AArch64/sve-splat-sext.ll +++ b/llvm/test/CodeGen/AArch64/sve-splat-sext.ll @@ -8,8 +8,8 @@ define @sext_splat_v8i16_128() { ; CHECK-NEXT: ret %i = insertelement poison, i16 128, i32 0 %s = shufflevector %i, poison, zeroinitializer - %a = shl %s, shufflevector ( insertelement( undef, i16 8, i32 0), undef, 
zeroinitializer) - %b = ashr %a, shufflevector ( insertelement( undef, i16 8, i32 0), undef, zeroinitializer) + %a = shl %s, splat (i16 8) + %b = ashr %a, splat (i16 8) ret %b } diff --git a/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll b/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll index d001ae9f77121..9c3d4b1e5a810 100644 --- a/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll +++ b/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll @@ -12,7 +12,7 @@ define @srem_combine_loop( %a) #0 { ; CHECK-NEXT: asrd z1.s, p0/m, z1.s, #1 ; CHECK-NEXT: mls z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ret - %rem = srem %a, shufflevector ( insertelement ( poison, i32 2, i32 0), poison, zeroinitializer) + %rem = srem %a, splat (i32 2) ret %rem } diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll index 0afd11d098a00..02b9562b8f52b 100644 --- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll @@ -10,8 +10,8 @@ define void @add_lshr_rshrnb_b_6(ptr %ptr, ptr %dst, i64 %index){ ; CHECK-NEXT: st1b { z0.h }, p0, [x1, x2] ; CHECK-NEXT: ret %load = load , ptr %ptr, align 2 - %1 = add %load, trunc ( shufflevector ( insertelement ( poison, i32 32, i64 0), poison, zeroinitializer) to ) - %2 = lshr %1, trunc ( shufflevector ( insertelement ( poison, i32 6, i64 0), poison, zeroinitializer) to ) + %1 = add %load, trunc ( splat (i32 32) to ) + %2 = lshr %1, trunc ( splat (i32 6) to ) %3 = trunc %2 to %4 = getelementptr inbounds i8, ptr %dst, i64 %index store %3, ptr %4, align 1 @@ -28,8 +28,8 @@ define void @neg_add_lshr_rshrnb_b_6(ptr %ptr, ptr %dst, i64 %index){ ; CHECK-NEXT: st1b { z0.h }, p0, [x1, x2] ; CHECK-NEXT: ret %load = load , ptr %ptr, align 2 - %1 = add %load, trunc ( shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) to ) - %2 = lshr %1, trunc ( shufflevector ( insertelement ( poison, i32 6, 
i64 0), poison, zeroinitializer) to ) + %1 = add %load, trunc ( splat (i32 1) to ) + %2 = lshr %1, trunc ( splat (i32 6) to ) %3 = trunc %2 to %4 = getelementptr inbounds i8, ptr %dst, i64 %index store %3, ptr %4, align 1 @@ -45,8 +45,8 @@ define void @add_lshr_rshrnb_h_7(ptr %ptr, ptr %dst, i64 %index){ ; CHECK-NEXT: st1b { z0.h }, p0, [x1, x2] ; CHECK-NEXT: ret %load = load , ptr %ptr, align 2 - %1 = add %load, trunc ( shufflevector ( insertelement ( poison, i32 64, i64 0), poison, zeroinitializer) to ) - %2 = lshr %1, trunc ( shufflevector ( insertelement ( poison, i32 7, i64 0), poison, zeroinitializer) to ) + %1 = add %load, trunc ( splat (i32 64) to ) + %2 = lshr %1, trunc ( splat (i32 7) to ) %3 = trunc %2 to %4 = getelementptr inbounds i8, ptr %dst, i64 %index store %3, ptr %4, align 1 @@ -62,8 +62,8 @@ define void @add_lshr_rshrn_h_6(ptr %ptr, ptr %dst, i64 %index){ ; CHECK-NEXT: st1h { z0.s }, p0, [x1, x2, lsl #1] ; CHECK-NEXT: ret %load = load , ptr %ptr, align 2 - %1 = add %load, trunc ( shufflevector ( insertelement ( poison, i64 32, i64 0), poison, zeroinitializer) to ) - %2 = lshr %1, trunc ( shufflevector ( insertelement ( poison, i64 6, i64 0), poison, zeroinitializer) to ) + %1 = add %load, trunc ( splat (i64 32) to ) + %2 = lshr %1, trunc ( splat (i64 6) to ) %3 = trunc %2 to %4 = getelementptr inbounds i16, ptr %dst, i64 %index store %3, ptr %4, align 1 @@ -79,8 +79,8 @@ define void @add_lshr_rshrnb_h_2(ptr %ptr, ptr %dst, i64 %index){ ; CHECK-NEXT: st1h { z0.s }, p0, [x1, x2, lsl #1] ; CHECK-NEXT: ret %load = load , ptr %ptr, align 2 - %1 = add %load, trunc ( shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) to ) - %2 = lshr %1, trunc ( shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) to ) + %1 = add %load, trunc ( splat (i64 2) to ) + %2 = lshr %1, trunc ( splat (i64 2) to ) %3 = trunc %2 to %4 = getelementptr inbounds i16, ptr %dst, i64 %index store %3, ptr %4, align 1 @@ -92,8 
+92,8 @@ define void @neg_add_lshr_rshrnb_h_0(ptr %ptr, ptr %dst, i64 %index){ ; CHECK: // %bb.0: ; CHECK-NEXT: ret %load = load , ptr %ptr, align 2 - %1 = add %load, trunc ( shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) to ) - %2 = lshr %1, trunc ( shufflevector ( insertelement ( poison, i64 -1, i64 0), poison, zeroinitializer) to ) + %1 = add %load, trunc ( splat (i64 1) to ) + %2 = lshr %1, trunc ( splat (i64 -1) to ) %3 = trunc %2 to %4 = getelementptr inbounds i16, ptr %dst, i64 %index store %3, ptr %4, align 1 @@ -109,8 +109,8 @@ define void @neg_zero_shift(ptr %ptr, ptr %dst, i64 %index){ ; CHECK-NEXT: st1h { z0.s }, p0, [x1, x2, lsl #1] ; CHECK-NEXT: ret %load = load , ptr %ptr, align 2 - %1 = add %load, trunc ( shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) to ) - %2 = lshr %1, trunc ( shufflevector ( insertelement ( poison, i64 0, i64 0), poison, zeroinitializer) to ) + %1 = add %load, trunc ( splat (i64 1) to ) + %2 = lshr %1, trunc ( splat (i64 0) to ) %3 = trunc %2 to %4 = getelementptr inbounds i16, ptr %dst, i64 %index store %3, ptr %4, align 1 @@ -128,8 +128,8 @@ define void @wide_add_shift_add_rshrnb_b(ptr %dest, i64 %index, %arg1, shufflevector ( insertelement ( poison, i16 32, i64 0), poison, zeroinitializer) - %2 = lshr %1, shufflevector ( insertelement ( poison, i16 6, i64 0), poison, zeroinitializer) + %1 = add %arg1, splat (i16 32) + %2 = lshr %1, splat (i16 6) %3 = getelementptr inbounds i8, ptr %dest, i64 %index %load = load , ptr %3, align 2 %4 = trunc %2 to @@ -149,8 +149,8 @@ define void @wide_add_shift_add_rshrnb_h(ptr %dest, i64 %index, %arg1, shufflevector ( insertelement ( poison, i32 32, i64 0), poison, zeroinitializer) - %2 = lshr %1, shufflevector ( insertelement ( poison, i32 6, i64 0), poison, zeroinitializer) + %1 = add %arg1, splat (i32 32) + %2 = lshr %1, splat (i32 6) %3 = getelementptr inbounds i16, ptr %dest, i64 %index %load = load , ptr %3, align 2 %4 = 
trunc %2 to @@ -170,8 +170,8 @@ define void @wide_add_shift_add_rshrnb_d(ptr %dest, i64 %index, %arg1, shufflevector ( insertelement ( poison, i64 2147483648, i64 0), poison, zeroinitializer) - %2 = lshr %1, shufflevector ( insertelement ( poison, i64 32, i64 0), poison, zeroinitializer) + %1 = add %arg1, splat (i64 2147483648) + %2 = lshr %1, splat (i64 32) %3 = getelementptr inbounds i32, ptr %dest, i64 %index %load = load , ptr %3, align 4 %4 = trunc %2 to @@ -195,8 +195,8 @@ define void @neg_wide_add_shift_add_rshrnb_d(ptr %dest, i64 %index, %arg1, shufflevector ( insertelement ( poison, i64 140737488355328, i64 0), poison, zeroinitializer) - %2 = lshr %1, shufflevector ( insertelement ( poison, i64 48, i64 0), poison, zeroinitializer) + %1 = add %arg1, splat (i64 140737488355328) + %2 = lshr %1, splat (i64 48) %3 = getelementptr inbounds i32, ptr %dest, i64 %index %load = load , ptr %3, align 4 %4 = trunc %2 to @@ -216,7 +216,7 @@ define void @neg_trunc_lsr_add_op1_not_splat(ptr %ptr, ptr %dst, i64 %index, , ptr %ptr, align 2 %1 = add %load, %add_op1 - %2 = lshr %1, shufflevector ( insertelement ( poison, i16 6, i64 0), poison, zeroinitializer) + %2 = lshr %1, splat (i16 6) %3 = trunc %2 to %4 = getelementptr inbounds i8, ptr %dst, i64 %index store %3, ptr %4, align 1 @@ -233,7 +233,7 @@ define void @neg_trunc_lsr_op1_not_splat(ptr %ptr, ptr %dst, i64 %index, , ptr %ptr, align 2 - %1 = add %load, shufflevector ( insertelement ( poison, i16 32, i64 0), poison, zeroinitializer) + %1 = add %load, splat (i16 32) %2 = lshr %1, %lshr_op1 %3 = trunc %2 to %4 = getelementptr inbounds i8, ptr %dst, i64 %index @@ -253,8 +253,8 @@ define void @neg_add_has_two_uses(ptr %ptr, ptr %dst, ptr %dst2, i64 %index){ ; CHECK-NEXT: st1b { z0.h }, p0, [x1, x3] ; CHECK-NEXT: ret %load = load , ptr %ptr, align 2 - %1 = add %load, trunc ( shufflevector ( insertelement ( poison, i32 32, i64 0), poison, zeroinitializer) to ) - %2 = lshr %1, trunc ( shufflevector ( insertelement ( poison, 
i32 6, i64 0), poison, zeroinitializer) to ) + %1 = add %load, trunc ( splat (i32 32) to ) + %2 = lshr %1, trunc ( splat (i32 6) to ) %3 = add %1, %1 %4 = getelementptr inbounds i16, ptr %dst2, i64 %index %5 = trunc %2 to @@ -273,8 +273,8 @@ define void @add_lshr_rshrnb_s(ptr %ptr, ptr %dst, i64 %index){ ; CHECK-NEXT: st1w { z0.d }, p0, [x1, x2, lsl #2] ; CHECK-NEXT: ret %load = load , ptr %ptr, align 2 - %1 = add %load, shufflevector ( insertelement ( poison, i64 32, i64 0), poison, zeroinitializer) - %2 = lshr %1, shufflevector ( insertelement ( poison, i64 6, i64 0), poison, zeroinitializer) + %1 = add %load, splat (i64 32) + %2 = lshr %1, splat (i64 6) %3 = trunc %2 to %4 = getelementptr inbounds i32, ptr %dst, i64 %index store %3, ptr %4, align 1 @@ -291,8 +291,8 @@ define void @neg_add_lshr_rshrnb_s(ptr %ptr, ptr %dst, i64 %index){ ; CHECK-NEXT: st1h { z0.d }, p0, [x1, x2, lsl #1] ; CHECK-NEXT: ret %load = load , ptr %ptr, align 2 - %1 = add %load, shufflevector ( insertelement ( poison, i64 32, i64 0), poison, zeroinitializer) - %2 = lshr %1, shufflevector ( insertelement ( poison, i64 6, i64 0), poison, zeroinitializer) + %1 = add %load, splat (i64 32) + %2 = lshr %1, splat (i64 6) %3 = trunc %2 to %4 = getelementptr inbounds i16, ptr %dst, i64 %index store %3, ptr %4, align 1 @@ -307,8 +307,8 @@ define void @masked_store_rshrnb(ptr %ptr, ptr %dst, i64 %index, @llvm.masked.load.nxv8i16.p0(ptr %ptr, i32 2, %mask, poison) - %1 = add %wide.masked.load, trunc ( shufflevector ( insertelement ( poison, i32 32, i64 0), poison, zeroinitializer) to ) - %2 = lshr %1, trunc ( shufflevector ( insertelement ( poison, i32 6, i64 0), poison, zeroinitializer) to ) + %1 = add %wide.masked.load, trunc ( splat (i32 32) to ) + %2 = lshr %1, trunc ( splat (i32 6) to ) %3 = trunc %2 to %4 = getelementptr inbounds i8, ptr %dst, i64 %index tail call void @llvm.masked.store.nxv8i8.p0( %3, ptr %4, i32 1, %mask) From dbb65dd330cc1696d7ca3dedc7aa9fa12c55a075 Mon Sep 17 00:00:00 2001 
From: Paul Walker Date: Wed, 14 Feb 2024 14:28:23 +0000 Subject: [PATCH 433/546] [LLVM][tests/CodeGen/RISCV] Convert instances of ConstantExpr based splats to use splat(). This is mostly NFC but some output does change due to consistently inserting into poison rather than undef and using i64 as the index type for inserts. --- llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll | 16 ++-- .../CodeGen/RISCV/rvv/mscatter-combine.ll | 6 +- llvm/test/CodeGen/RISCV/rvv/pr61561.ll | 10 +-- llvm/test/CodeGen/RISCV/rvv/pr63459.ll | 2 +- llvm/test/CodeGen/RISCV/rvv/stepvector.ll | 10 +-- .../CodeGen/RISCV/rvv/strided-load-store.ll | 38 ++++---- .../RISCV/rvv/undef-earlyclobber-chain.ll | 12 +-- llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll | 88 +++++++++---------- llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll | 6 +- .../test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll | 6 +- llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll | 26 +++--- .../test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll | 6 +- llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll | 26 +++--- llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll | 6 +- llvm/test/CodeGen/RISCV/rvv/vfwadd-vp.ll | 10 +-- llvm/test/CodeGen/RISCV/rvv/vmadd-sdnode.ll | 4 +- llvm/test/CodeGen/RISCV/rvv/vnmsub-sdnode.ll | 4 +- llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll | 88 +++++++++---------- llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll | 6 +- llvm/test/CodeGen/RISCV/rvv/vsext-vp-mask.ll | 6 +- llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll | 14 +-- .../test/CodeGen/RISCV/rvv/vsitofp-vp-mask.ll | 6 +- llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll | 26 +++--- llvm/test/CodeGen/RISCV/rvv/vtrunc-vp-mask.ll | 6 +- llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll | 12 +-- .../test/CodeGen/RISCV/rvv/vuitofp-vp-mask.ll | 6 +- llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll | 26 +++--- .../CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll | 12 +-- llvm/test/CodeGen/RISCV/rvv/vwadd-vp.ll | 12 +-- llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll | 6 +- llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll | 6 +- .../CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll | 10 
+-- llvm/test/CodeGen/RISCV/rvv/vzext-vp-mask.ll | 6 +- llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll | 14 +-- 34 files changed, 269 insertions(+), 269 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll index 3a73f1729dedd..c310274d68508 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll @@ -606,7 +606,7 @@ define @ctpop_nxv16i32_ult_two( %va) { ; CHECK-ZVBB-NEXT: vmsleu.vi v0, v8, 1 ; CHECK-ZVBB-NEXT: ret %a = call @llvm.ctpop.nxv16i32( %va) - %cmp = icmp ult %a, shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) + %cmp = icmp ult %a, splat (i32 2) ret %cmp } @@ -626,7 +626,7 @@ define @ctpop_nxv16i32_ugt_one( %va) { ; CHECK-ZVBB-NEXT: vmsgtu.vi v0, v8, 1 ; CHECK-ZVBB-NEXT: ret %a = call @llvm.ctpop.nxv16i32( %va) - %cmp = icmp ugt %a, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + %cmp = icmp ugt %a, splat (i32 1) ret %cmp } @@ -646,7 +646,7 @@ define @ctpop_nxv16i32_eq_one( %va) { ; CHECK-ZVBB-NEXT: vmseq.vi v0, v8, 1 ; CHECK-ZVBB-NEXT: ret %a = call @llvm.ctpop.nxv16i32( %va) - %cmp = icmp eq %a, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + %cmp = icmp eq %a, splat (i32 1) ret %cmp } @@ -666,7 +666,7 @@ define @ctpop_nxv16i32_ne_one( %va) { ; CHECK-ZVBB-NEXT: vmsne.vi v0, v8, 1 ; CHECK-ZVBB-NEXT: ret %a = call @llvm.ctpop.nxv16i32( %va) - %cmp = icmp ne %a, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + %cmp = icmp ne %a, splat (i32 1) ret %cmp } @@ -1020,7 +1020,7 @@ define @ctpop_nxv8i64_ult_two( %va) { ; CHECK-ZVBB-NEXT: vmsleu.vi v0, v8, 1 ; CHECK-ZVBB-NEXT: ret %a = call @llvm.ctpop.nxv8i64( %va) - %cmp = icmp ult %a, shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) + %cmp = icmp ult %a, splat (i64 2) ret %cmp } @@ -1040,7 +1040,7 @@ define @ctpop_nxv8i64_ugt_one( %va) { ; 
CHECK-ZVBB-NEXT: vmsgtu.vi v0, v8, 1 ; CHECK-ZVBB-NEXT: ret %a = call @llvm.ctpop.nxv8i64( %va) - %cmp = icmp ugt %a, shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) + %cmp = icmp ugt %a, splat (i64 1) ret %cmp } @@ -1060,7 +1060,7 @@ define @ctpop_nxv8i64_eq_one( %va) { ; CHECK-ZVBB-NEXT: vmseq.vi v0, v8, 1 ; CHECK-ZVBB-NEXT: ret %a = call @llvm.ctpop.nxv8i64( %va) - %cmp = icmp eq %a, shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) + %cmp = icmp eq %a, splat (i64 1) ret %cmp } @@ -1080,7 +1080,7 @@ define @ctpop_nxv8i64_ne_one( %va) { ; CHECK-ZVBB-NEXT: vmsne.vi v0, v8, 1 ; CHECK-ZVBB-NEXT: ret %a = call @llvm.ctpop.nxv8i64( %va) - %cmp = icmp ne %a, shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) + %cmp = icmp ne %a, splat (i64 1) ret %cmp } diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll index df944fada7964..c26532d355957 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll @@ -58,7 +58,7 @@ define void @strided_store_zero_start(i64 %n, ptr %p) { ; RV64-NEXT: ret %step = tail call @llvm.experimental.stepvector.nxv1i64() %gep = getelementptr inbounds %struct, ptr %p, %step, i32 6 - tail call void @llvm.masked.scatter.nxv1i64.nxv1p0( zeroinitializer, %gep, i32 8, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) + tail call void @llvm.masked.scatter.nxv1i64.nxv1p0( zeroinitializer, %gep, i32 8, splat (i1 true)) ret void } @@ -93,7 +93,7 @@ define void @strided_store_offset_start(i64 %n, ptr %p) { %.splat = shufflevector %.splatinsert, poison, zeroinitializer %add = add %step, %.splat %gep = getelementptr inbounds %struct, ptr %p, %add, i32 6 - tail call void @llvm.masked.scatter.nxv1i64.nxv1p0( zeroinitializer, %gep, i32 8, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) + tail 
call void @llvm.masked.scatter.nxv1i64.nxv1p0( zeroinitializer, %gep, i32 8, splat (i1 true)) ret void } @@ -118,7 +118,7 @@ define void @stride_one_store(i64 %n, ptr %p) { ; RV64-NEXT: ret %step = tail call @llvm.experimental.stepvector.nxv1i64() %gep = getelementptr inbounds i64, ptr %p, %step - tail call void @llvm.masked.scatter.nxv1i64.nxv1p0( zeroinitializer, %gep, i32 8, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) + tail call void @llvm.masked.scatter.nxv1i64.nxv1p0( zeroinitializer, %gep, i32 8, splat (i1 true)) ret void } diff --git a/llvm/test/CodeGen/RISCV/rvv/pr61561.ll b/llvm/test/CodeGen/RISCV/rvv/pr61561.ll index f27edd3611665..c5fd6943e51be 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr61561.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr61561.ll @@ -23,11 +23,11 @@ define @foo(ptr %p) { ; CHECK-NEXT: ret %i13 = load , ptr %p, align 2 %i14 = zext %i13 to - %i15 = shl nuw nsw %i14, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) - %i16 = and %i15, shufflevector ( insertelement ( poison, i32 248, i64 0), poison, zeroinitializer) - %i17 = mul nuw nsw %i16, shufflevector ( insertelement ( poison, i32 3735, i64 0), poison, zeroinitializer) - %i18 = add nuw nsw %i17, shufflevector ( insertelement ( poison, i32 16384, i64 0), poison, zeroinitializer) - %i21 = lshr %i18, shufflevector ( insertelement ( poison, i32 15, i64 0), poison, zeroinitializer) + %i15 = shl nuw nsw %i14, splat (i32 3) + %i16 = and %i15, splat (i32 248) + %i17 = mul nuw nsw %i16, splat (i32 3735) + %i18 = add nuw nsw %i17, splat (i32 16384) + %i21 = lshr %i18, splat (i32 15) %i22 = trunc %i21 to ret %i22 } diff --git a/llvm/test/CodeGen/RISCV/rvv/pr63459.ll b/llvm/test/CodeGen/RISCV/rvv/pr63459.ll index c871e2992a5ef..5ef8e18bb2641 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pr63459.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pr63459.ll @@ -14,7 +14,7 @@ define void @snork(ptr %arg, %arg1) { ; CHECK-NEXT: ret bb: %getelementptr = 
getelementptr inbounds , ptr %arg, %arg1 - tail call void @llvm.vp.scatter.nxv2i32.nxv2p0( shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer), align 4 %getelementptr, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 4) + tail call void @llvm.vp.scatter.nxv2i32.nxv2p0( splat (i32 1), align 4 %getelementptr, splat (i1 true), i32 4) ret void } diff --git a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll index 2d65c9d178b78..8f02ca6535810 100644 --- a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll @@ -743,7 +743,7 @@ define @hi_bits_known_zero() vscale_range(2, 4) { ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %step = call @llvm.experimental.stepvector.nxv2i64() - %and = and %step, shufflevector( insertelement( poison, i64 u0xfffffffffffffff8, i32 0), poison, zeroinitializer) + %and = and %step, splat (i64 u0xfffffffffffffff8) ret %and } @@ -758,8 +758,8 @@ define @hi_bits_known_zero_overflow() vscale_range(2, 4) { ; CHECK-NEXT: vand.vi v8, v8, -8 ; CHECK-NEXT: ret %step = call @llvm.experimental.stepvector.nxv2i64() - %step.mul = mul %step, shufflevector( insertelement( poison, i64 u0xffffffffffffffff, i32 0), poison, zeroinitializer) - %and = and %step.mul, shufflevector( insertelement( poison, i64 u0xfffffffffffffff8, i32 0), poison, zeroinitializer) + %step.mul = mul %step, splat (i64 u0xffffffffffffffff) + %and = and %step.mul, splat (i64 u0xfffffffffffffff8) ret %and } @@ -771,7 +771,7 @@ define @lo_bits_known_zero() { ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: ret %step = call @llvm.experimental.stepvector.nxv2i64() - %step.mul = mul %step, shufflevector( insertelement( poison, i64 8, i32 0), poison, zeroinitializer) - %and = and %step.mul, shufflevector( insertelement( poison, i64 7, i32 0), poison, zeroinitializer) + %step.mul = mul %step, splat (i64 8) + %and = and %step.mul, splat (i64 7) ret %and } diff --git 
a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll index 54e5d39e24854..6b584cfb22a52 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll @@ -16,7 +16,7 @@ define @gather(ptr %a, i32 %len) { ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]], i32 3 -; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.riscv.masked.strided.load.nxv1i64.p0.i64( undef, ptr [[TMP1]], i64 16, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.riscv.masked.strided.load.nxv1i64.p0.i64( undef, ptr [[TMP1]], i64 16, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[ACCUM_NEXT]] = add [[ACCUM]], [[GATHER]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP0]] ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], [[TMP0]] @@ -38,7 +38,7 @@ vector.body: ; preds = %vector.body, %vecto %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] %accum = phi [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ] %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind, i32 3 - %gather = call @llvm.masked.gather.nxv1i64.nxv1p0( %2, i32 8, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) + %gather = call @llvm.masked.gather.nxv1i64.nxv1p0( %2, i32 8, splat (i1 true), undef) %accum.next = add %accum, %gather %index.next = add nuw i64 %index, %0 %vec.ind.next = add %vec.ind, %.splat @@ -59,7 +59,7 @@ define @gather_disjoint_or(ptr %a, i64 %len) { ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 1, [[VECTOR_PH]] ], [ 
[[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACCUM:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]] -; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.riscv.masked.strided.load.nxv1i64.p0.i64( poison, ptr [[TMP0]], i64 16, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[GATHER:%.*]] = call @llvm.riscv.masked.strided.load.nxv1i64.p0.i64( poison, ptr [[TMP0]], i64 16, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[ACCUM_NEXT]] = add [[ACCUM]], [[GATHER]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[VSCALE]] ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 2 @@ -71,7 +71,7 @@ define @gather_disjoint_or(ptr %a, i64 %len) { vector.ph: %vscale = call i64 @llvm.vscale.i64() %step = tail call @llvm.experimental.stepvector.nxv1i64() - %step.mul2 = shl %step, shufflevector( insertelement( poison, i64 1, i32 0), poison, zeroinitializer) + %step.mul2 = shl %step, splat (i64 1) br label %vector.body vector.body: ; preds = %vector.body, %vector.ph @@ -80,19 +80,19 @@ vector.body: ; preds = %vector.body, %vecto %accum = phi [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ] - %vec.ind.or = or disjoint %vec.ind, shufflevector( insertelement( poison, i64 1, i64 0), poison, zeroinitializer) + %vec.ind.or = or disjoint %vec.ind, splat (i64 1) %gep = getelementptr i64, ptr %a, %vec.ind.or %gather = call @llvm.masked.gather.nxv1i64.nxv1p0( %gep, i32 8, - shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), + splat (i1 true), poison ) %accum.next = add %accum, %gather %index.next = add nuw i64 %index, %vscale - %vec.ind.next = add %vec.ind, shufflevector( insertelement( poison, i64 2, i64 0), poison, zeroinitializer) + %vec.ind.next = add %vec.ind, splat (i64 2) %exit = icmp ne i64 
%index.next, %len br i1 %exit, label %for.cond.cleanup, label %vector.body @@ -111,7 +111,7 @@ define void @scatter(ptr %a, i32 %len) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]], i32 3 -; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.nxv1i64.p0.i64( zeroinitializer, ptr [[TMP1]], i64 16, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.nxv1i64.p0.i64( zeroinitializer, ptr [[TMP1]], i64 16, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP0]] ; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]] @@ -131,7 +131,7 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] %vec.ind = phi [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ] %2 = getelementptr inbounds %struct.foo, ptr %a, %vec.ind, i32 3 - tail call void @llvm.masked.scatter.nxv1i64.nxv1p0( zeroinitializer, %2, i32 8, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) + tail call void @llvm.masked.scatter.nxv1i64.nxv1p0( zeroinitializer, %2, i32 8, splat (i1 true)) %index.next = add nuw i64 %index, %0 %vec.ind.next = add %vec.ind, %.splat %3 = icmp ne i64 %index.next, %wide.trip.count @@ -155,7 +155,7 @@ define @gather_loopless(ptr %p, i64 %stride) { %x = call @llvm.masked.gather.nxv1i64.nxv1p0( %ptrs, i32 8, - shufflevector ( insertelement ( poison, i1 1, i64 0), poison, zeroinitializer), + splat (i1 1), poison ) ret %x @@ -175,7 +175,7 @@ define 
@straightline_offset_add(ptr %p, i64 %offset) { %x = call @llvm.masked.gather.nxv1i64.nxv1p0( %ptrs, i32 8, - shufflevector ( insertelement ( poison, i1 1, i64 0), poison, zeroinitializer), + splat (i1 1), poison ) ret %x @@ -188,13 +188,13 @@ define @straightline_offset_disjoint_or(ptr %p, i64 %offset) ; CHECK-NEXT: ret [[X]] ; %step = call @llvm.experimental.stepvector.nxv1i64() - %step.shl = shl %step, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) - %offsetv = or disjoint %step.shl, shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) + %step.shl = shl %step, splat (i64 1) + %offsetv = or disjoint %step.shl, splat (i64 1) %ptrs = getelementptr i32, ptr %p, %offsetv %x = call @llvm.masked.gather.nxv1i64.nxv1p0( %ptrs, i32 8, - shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), + splat (i1 true), poison ) ret %x @@ -213,7 +213,7 @@ define @straightline_offset_shl(ptr %p) { %x = call @llvm.masked.gather.nxv1i64.nxv1p0( %ptrs, i32 8, - shufflevector ( insertelement ( poison, i1 1, i64 0), poison, zeroinitializer), + splat (i1 1), poison ) ret %x @@ -237,7 +237,7 @@ define @neg_shl_is_not_commutative(ptr %p) { %x = call @llvm.masked.gather.nxv1i64.nxv1p0( %ptrs, i32 8, - shufflevector ( insertelement ( poison, i1 1, i64 0), poison, zeroinitializer), + splat (i1 1), poison ) ret %x @@ -258,7 +258,7 @@ define @straightline_offset_shl_nonc(ptr %p, i64 %shift) { %x = call @llvm.masked.gather.nxv1i64.nxv1p0( %ptrs, i32 8, - shufflevector ( insertelement ( poison, i1 1, i64 0), poison, zeroinitializer), + splat (i1 1), poison ) ret %x @@ -279,7 +279,7 @@ define void @scatter_loopless( %x, ptr %p, i64 %stride) { %x, %ptrs, i32 8, - shufflevector ( insertelement ( poison, i1 1, i64 0), poison, zeroinitializer) + splat (i1 1) ) ret void } @@ -296,7 +296,7 @@ define void @constant_stride( %x, ptr %p, i64 %stride) { %x, %ptrs, i32 8, - shufflevector ( insertelement ( poison, i1 1, i64 
0), poison, zeroinitializer) + splat (i1 1) ) ret void } diff --git a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll index 0901c261af1ac..f41a3ec72aed7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll +++ b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll @@ -79,18 +79,18 @@ start: Cond1: ; preds = %start %v15 = tail call @llvm.experimental.stepvector.nxv1i16() %v17 = tail call @llvm.vector.insert.nxv8i16.nxv1i16( poison, %v15, i64 0) - %vs12.i.i.i = add %v15, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %vs12.i.i.i = add %v15, splat (i16 1) %v18 = tail call @llvm.vector.insert.nxv8i16.nxv1i16( poison, %vs12.i.i.i, i64 0) - %vs16.i.i.i = add %v15, shufflevector ( insertelement ( poison, i16 3, i32 0), poison, zeroinitializer) + %vs16.i.i.i = add %v15, splat (i16 3) %v20 = tail call @llvm.vector.insert.nxv8i16.nxv1i16( poison, %vs16.i.i.i, i64 0) br label %UseSR Cond2: ; preds = %start %v15.2 = tail call @llvm.experimental.stepvector.nxv1i16() %v17.2 = tail call @llvm.vector.insert.nxv8i16.nxv1i16( poison, %v15.2, i64 1) - %vs12.i.i.i.2 = add %v15.2, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + %vs12.i.i.i.2 = add %v15.2, splat (i16 1) %v18.2 = tail call @llvm.vector.insert.nxv8i16.nxv1i16( poison, %vs12.i.i.i.2, i64 1) - %vs16.i.i.i.2 = add %v15.2, shufflevector ( insertelement ( poison, i16 3, i32 0), poison, zeroinitializer) + %vs16.i.i.i.2 = add %v15.2, splat (i16 3) %v20.2 = tail call @llvm.vector.insert.nxv8i16.nxv1i16( poison, %vs16.i.i.i.2, i64 1) br label %UseSR @@ -132,9 +132,9 @@ define internal void @SubRegLivenessUndef() { loopIR.preheader.i.i: %v15 = tail call @llvm.experimental.stepvector.nxv1i16() %v17 = tail call @llvm.vector.insert.nxv8i16.nxv1i16( poison, %v15, i64 0) - %vs12.i.i.i = add %v15, shufflevector ( insertelement ( poison, i16 1, i32 0), poison, zeroinitializer) + 
%vs12.i.i.i = add %v15, splat (i16 1) %v18 = tail call @llvm.vector.insert.nxv8i16.nxv1i16( poison, %vs12.i.i.i, i64 0) - %vs16.i.i.i = add %v15, shufflevector ( insertelement ( poison, i16 3, i32 0), poison, zeroinitializer) + %vs16.i.i.i = add %v15, splat (i16 3) %v20 = tail call @llvm.vector.insert.nxv8i16.nxv1i16( poison, %vs16.i.i.i, i64 0) br label %loopIR3.i.i diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll index f076c3c621cdb..95866543828fc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vandn-vp.ll @@ -20,7 +20,7 @@ define @vandn_vv_vp_nxv1i8( %a, @llvm.vp.xor.nxv1i8( %a, shufflevector( insertelement( poison, i8 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv1i8( %a, splat (i8 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv1i8( %not.a, %b, %mask, i32 %evl) ret %x } @@ -38,7 +38,7 @@ define @vandn_vv_vp_swapped_nxv1i8( %a, @llvm.vp.xor.nxv1i8( %a, shufflevector( insertelement( poison, i8 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv1i8( %a, splat (i8 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv1i8( %b, %not.a, %mask, i32 %evl) ret %x } @@ -79,7 +79,7 @@ define @vandn_vv_vp_nxv2i8( %a, @llvm.vp.xor.nxv2i8( %a, shufflevector( insertelement( poison, i8 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv2i8( %a, splat (i8 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv2i8( %not.a, %b, %mask, i32 %evl) ret %x } @@ -97,7 +97,7 @@ define @vandn_vv_vp_swapped_nxv2i8( %a, @llvm.vp.xor.nxv2i8( %a, shufflevector( insertelement( poison, i8 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv2i8( %a, splat (i8 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv2i8( %b, %not.a, %mask, i32 %evl) ret %x } @@ -138,7 +138,7 @@ define @vandn_vv_vp_nxv4i8( %a, @llvm.vp.xor.nxv4i8( %a, shufflevector( insertelement( poison, i8 -1, i32 0), 
poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv4i8( %a, splat (i8 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv4i8( %not.a, %b, %mask, i32 %evl) ret %x } @@ -156,7 +156,7 @@ define @vandn_vv_vp_swapped_nxv4i8( %a, @llvm.vp.xor.nxv4i8( %a, shufflevector( insertelement( poison, i8 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv4i8( %a, splat (i8 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv4i8( %b, %not.a, %mask, i32 %evl) ret %x } @@ -197,7 +197,7 @@ define @vandn_vv_vp_nxv8i8( %a, @llvm.vp.xor.nxv8i8( %a, shufflevector( insertelement( poison, i8 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv8i8( %a, splat (i8 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv8i8( %not.a, %b, %mask, i32 %evl) ret %x } @@ -215,7 +215,7 @@ define @vandn_vv_vp_swapped_nxv8i8( %a, @llvm.vp.xor.nxv8i8( %a, shufflevector( insertelement( poison, i8 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv8i8( %a, splat (i8 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv8i8( %b, %not.a, %mask, i32 %evl) ret %x } @@ -256,7 +256,7 @@ define @vandn_vv_vp_nxv16i8( %a, @llvm.vp.xor.nxv16i8( %a, shufflevector( insertelement( poison, i8 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv16i8( %a, splat (i8 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv16i8( %not.a, %b, %mask, i32 %evl) ret %x } @@ -274,7 +274,7 @@ define @vandn_vv_vp_swapped_nxv16i8( %a, @llvm.vp.xor.nxv16i8( %a, shufflevector( insertelement( poison, i8 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv16i8( %a, splat (i8 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv16i8( %b, %not.a, %mask, i32 %evl) ret %x } @@ -315,7 +315,7 @@ define @vandn_vv_vp_nxv32i8( %a, @llvm.vp.xor.nxv32i8( %a, shufflevector( insertelement( poison, i8 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call 
@llvm.vp.xor.nxv32i8( %a, splat (i8 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv32i8( %not.a, %b, %mask, i32 %evl) ret %x } @@ -333,7 +333,7 @@ define @vandn_vv_vp_swapped_nxv32i8( %a, @llvm.vp.xor.nxv32i8( %a, shufflevector( insertelement( poison, i8 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv32i8( %a, splat (i8 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv32i8( %b, %not.a, %mask, i32 %evl) ret %x } @@ -374,7 +374,7 @@ define @vandn_vv_vp_nxv64i8( %a, @llvm.vp.xor.nxv64i8( %a, shufflevector( insertelement( poison, i8 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv64i8( %a, splat (i8 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv64i8( %not.a, %b, %mask, i32 %evl) ret %x } @@ -392,7 +392,7 @@ define @vandn_vv_vp_swapped_nxv64i8( %a, @llvm.vp.xor.nxv64i8( %a, shufflevector( insertelement( poison, i8 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv64i8( %a, splat (i8 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv64i8( %b, %not.a, %mask, i32 %evl) ret %x } @@ -433,7 +433,7 @@ define @vandn_vv_vp_nxv1i16( %a, @llvm.vp.xor.nxv1i16( %a, shufflevector( insertelement( poison, i16 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv1i16( %a, splat (i16 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv1i16( %not.a, %b, %mask, i32 %evl) ret %x } @@ -451,7 +451,7 @@ define @vandn_vv_vp_swapped_nxv1i16( %a, @llvm.vp.xor.nxv1i16( %a, shufflevector( insertelement( poison, i16 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv1i16( %a, splat (i16 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv1i16( %b, %not.a, %mask, i32 %evl) ret %x } @@ -492,7 +492,7 @@ define @vandn_vv_vp_nxv2i16( %a, @llvm.vp.xor.nxv2i16( %a, shufflevector( insertelement( poison, i16 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv2i16( %a, splat (i16 -1), %mask, i32 
%evl) %x = call @llvm.vp.and.nxv2i16( %not.a, %b, %mask, i32 %evl) ret %x } @@ -510,7 +510,7 @@ define @vandn_vv_vp_swapped_nxv2i16( %a, @llvm.vp.xor.nxv2i16( %a, shufflevector( insertelement( poison, i16 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv2i16( %a, splat (i16 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv2i16( %b, %not.a, %mask, i32 %evl) ret %x } @@ -551,7 +551,7 @@ define @vandn_vv_vp_nxv4i16( %a, @llvm.vp.xor.nxv4i16( %a, shufflevector( insertelement( poison, i16 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv4i16( %a, splat (i16 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv4i16( %not.a, %b, %mask, i32 %evl) ret %x } @@ -569,7 +569,7 @@ define @vandn_vv_vp_swapped_nxv4i16( %a, @llvm.vp.xor.nxv4i16( %a, shufflevector( insertelement( poison, i16 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv4i16( %a, splat (i16 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv4i16( %b, %not.a, %mask, i32 %evl) ret %x } @@ -610,7 +610,7 @@ define @vandn_vv_vp_nxv8i16( %a, @llvm.vp.xor.nxv8i16( %a, shufflevector( insertelement( poison, i16 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv8i16( %a, splat (i16 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv8i16( %not.a, %b, %mask, i32 %evl) ret %x } @@ -628,7 +628,7 @@ define @vandn_vv_vp_swapped_nxv8i16( %a, @llvm.vp.xor.nxv8i16( %a, shufflevector( insertelement( poison, i16 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv8i16( %a, splat (i16 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv8i16( %b, %not.a, %mask, i32 %evl) ret %x } @@ -669,7 +669,7 @@ define @vandn_vv_vp_nxv16i16( %a, @llvm.vp.xor.nxv16i16( %a, shufflevector( insertelement( poison, i16 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv16i16( %a, splat (i16 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv16i16( %not.a, 
%b, %mask, i32 %evl) ret %x } @@ -687,7 +687,7 @@ define @vandn_vv_vp_swapped_nxv16i16( %a, ; CHECK-ZVKB-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-ZVKB-NEXT: vandn.vv v8, v12, v8, v0.t ; CHECK-ZVKB-NEXT: ret - %not.a = call @llvm.vp.xor.nxv16i16( %a, shufflevector( insertelement( poison, i16 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv16i16( %a, splat (i16 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv16i16( %b, %not.a, %mask, i32 %evl) ret %x } @@ -728,7 +728,7 @@ define @vandn_vv_vp_nxv32i16( %a, @llvm.vp.xor.nxv32i16( %a, shufflevector( insertelement( poison, i16 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv32i16( %a, splat (i16 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv32i16( %not.a, %b, %mask, i32 %evl) ret %x } @@ -746,7 +746,7 @@ define @vandn_vv_vp_swapped_nxv32i16( %a, ; CHECK-ZVKB-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-ZVKB-NEXT: vandn.vv v8, v16, v8, v0.t ; CHECK-ZVKB-NEXT: ret - %not.a = call @llvm.vp.xor.nxv32i16( %a, shufflevector( insertelement( poison, i16 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv32i16( %a, splat (i16 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv32i16( %b, %not.a, %mask, i32 %evl) ret %x } @@ -787,7 +787,7 @@ define @vandn_vv_vp_nxv1i32( %a, @llvm.vp.xor.nxv1i32( %a, shufflevector( insertelement( poison, i32 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv1i32( %a, splat (i32 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv1i32( %not.a, %b, %mask, i32 %evl) ret %x } @@ -805,7 +805,7 @@ define @vandn_vv_vp_swapped_nxv1i32( %a, @llvm.vp.xor.nxv1i32( %a, shufflevector( insertelement( poison, i32 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv1i32( %a, splat (i32 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv1i32( %b, %not.a, %mask, i32 %evl) ret %x } @@ -846,7 +846,7 @@ define @vandn_vv_vp_nxv2i32( %a, 
@llvm.vp.xor.nxv2i32( %a, shufflevector( insertelement( poison, i32 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv2i32( %a, splat (i32 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv2i32( %not.a, %b, %mask, i32 %evl) ret %x } @@ -864,7 +864,7 @@ define @vandn_vv_vp_swapped_nxv2i32( %a, @llvm.vp.xor.nxv2i32( %a, shufflevector( insertelement( poison, i32 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv2i32( %a, splat (i32 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv2i32( %b, %not.a, %mask, i32 %evl) ret %x } @@ -905,7 +905,7 @@ define @vandn_vv_vp_nxv4i32( %a, @llvm.vp.xor.nxv4i32( %a, shufflevector( insertelement( poison, i32 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv4i32( %a, splat (i32 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv4i32( %not.a, %b, %mask, i32 %evl) ret %x } @@ -923,7 +923,7 @@ define @vandn_vv_vp_swapped_nxv4i32( %a, @llvm.vp.xor.nxv4i32( %a, shufflevector( insertelement( poison, i32 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv4i32( %a, splat (i32 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv4i32( %b, %not.a, %mask, i32 %evl) ret %x } @@ -964,7 +964,7 @@ define @vandn_vv_vp_nxv8i32( %a, @llvm.vp.xor.nxv8i32( %a, shufflevector( insertelement( poison, i32 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv8i32( %a, splat (i32 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv8i32( %not.a, %b, %mask, i32 %evl) ret %x } @@ -982,7 +982,7 @@ define @vandn_vv_vp_swapped_nxv8i32( %a, @llvm.vp.xor.nxv8i32( %a, shufflevector( insertelement( poison, i32 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv8i32( %a, splat (i32 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv8i32( %b, %not.a, %mask, i32 %evl) ret %x } @@ -1023,7 +1023,7 @@ define @vandn_vv_vp_nxv16i32( %a, @llvm.vp.xor.nxv16i32( %a, shufflevector( 
insertelement( poison, i32 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv16i32( %a, splat (i32 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv16i32( %not.a, %b, %mask, i32 %evl) ret %x } @@ -1041,7 +1041,7 @@ define @vandn_vv_vp_swapped_nxv16i32( %a, ; CHECK-ZVKB-NEXT: vsetvli zero, a0, e32, m8, ta, ma ; CHECK-ZVKB-NEXT: vandn.vv v8, v16, v8, v0.t ; CHECK-ZVKB-NEXT: ret - %not.a = call @llvm.vp.xor.nxv16i32( %a, shufflevector( insertelement( poison, i32 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv16i32( %a, splat (i32 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv16i32( %b, %not.a, %mask, i32 %evl) ret %x } @@ -1082,7 +1082,7 @@ define @vandn_vv_vp_nxv1i64( %a, @llvm.vp.xor.nxv1i64( %a, shufflevector( insertelement( poison, i64 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv1i64( %a, splat (i64 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv1i64( %not.a, %b, %mask, i32 %evl) ret %x } @@ -1100,7 +1100,7 @@ define @vandn_vv_vp_swapped_nxv1i64( %a, @llvm.vp.xor.nxv1i64( %a, shufflevector( insertelement( poison, i64 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv1i64( %a, splat (i64 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv1i64( %b, %not.a, %mask, i32 %evl) ret %x } @@ -1173,7 +1173,7 @@ define @vandn_vv_vp_nxv2i64( %a, @llvm.vp.xor.nxv2i64( %a, shufflevector( insertelement( poison, i64 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv2i64( %a, splat (i64 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv2i64( %not.a, %b, %mask, i32 %evl) ret %x } @@ -1191,7 +1191,7 @@ define @vandn_vv_vp_swapped_nxv2i64( %a, @llvm.vp.xor.nxv2i64( %a, shufflevector( insertelement( poison, i64 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv2i64( %a, splat (i64 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv2i64( %b, %not.a, %mask, i32 
%evl) ret %x } @@ -1264,7 +1264,7 @@ define @vandn_vv_vp_nxv4i64( %a, @llvm.vp.xor.nxv4i64( %a, shufflevector( insertelement( poison, i64 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv4i64( %a, splat (i64 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv4i64( %not.a, %b, %mask, i32 %evl) ret %x } @@ -1282,7 +1282,7 @@ define @vandn_vv_vp_swapped_nxv4i64( %a, @llvm.vp.xor.nxv4i64( %a, shufflevector( insertelement( poison, i64 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv4i64( %a, splat (i64 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv4i64( %b, %not.a, %mask, i32 %evl) ret %x } @@ -1355,7 +1355,7 @@ define @vandn_vv_vp_nxv8i64( %a, @llvm.vp.xor.nxv8i64( %a, shufflevector( insertelement( poison, i64 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv8i64( %a, splat (i64 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv8i64( %not.a, %b, %mask, i32 %evl) ret %x } @@ -1373,7 +1373,7 @@ define @vandn_vv_vp_swapped_nxv8i64( %a, @llvm.vp.xor.nxv8i64( %a, shufflevector( insertelement( poison, i64 -1, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %not.a = call @llvm.vp.xor.nxv8i64( %a, splat (i64 -1), %mask, i32 %evl) %x = call @llvm.vp.and.nxv8i64( %b, %not.a, %mask, i32 %evl) ret %x } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll index 4440ea56ba901..5cfa98916a2de 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll @@ -24,7 +24,7 @@ define @vfpext_nxv2f16_nxv2f32_unmasked( ; CHECK-NEXT: vfwcvt.f.f.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret - %v = call @llvm.vp.fpext.nxv2f32.nxv2f16( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.fpext.nxv2f32.nxv2f16( %a, splat (i1 true), i32 %vl) ret %v } @@ -50,7 +50,7 @@ define @vfpext_nxv2f16_nxv2f64_unmasked( 
@llvm.vp.fpext.nxv2f64.nxv2f16( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.fpext.nxv2f64.nxv2f16( %a, splat (i1 true), i32 %vl) ret %v } @@ -74,7 +74,7 @@ define @vfpext_nxv2f32_nxv2f64_unmasked( @llvm.vp.fpext.nxv2f64.nxv2f32( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.fpext.nxv2f64.nxv2f32( %a, splat (i1 true), i32 %vl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll index 9061c38975e28..e5048eaf9d0c2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp-mask.ll @@ -42,7 +42,7 @@ define @vfptosi_nxv2i1_nxv2f16_unmasked( %v ; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v9 ; ZVFHMIN-NEXT: vmsne.vi v0, v8, 0 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.fptosi.nxv2i1.nxv2f16( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptosi.nxv2i1.nxv2f16( %va, splat (i1 true), i32 %evl) ret %v } @@ -66,7 +66,7 @@ define @vfptosi_nxv2i1_nxv2f32_unmasked( % ; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret - %v = call @llvm.vp.fptosi.nxv2i1.nxv2f32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptosi.nxv2i1.nxv2f32( %va, splat (i1 true), i32 %evl) ret %v } @@ -91,6 +91,6 @@ define @vfptosi_nxv2i1_nxv2f64_unmasked( ; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret - %v = call @llvm.vp.fptosi.nxv2i1.nxv2f64( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptosi.nxv2i1.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll index 9e7d6f92d84e9..15c4bf255e6dc 
100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll @@ -67,7 +67,7 @@ define @vfptosi_nxv2i8_nxv2f16_unmasked( %v ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; ZVFHMIN-NEXT: vnsrl.wi v8, v8, 0 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.fptosi.nxv2i8.nxv2f16( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptosi.nxv2i8.nxv2f16( %va, splat (i1 true), i32 %evl) ret %v } @@ -105,7 +105,7 @@ define @vfptosi_nxv2i16_nxv2f16_unmasked( ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v8, v9 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.fptosi.nxv2i16.nxv2f16( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptosi.nxv2i16.nxv2f16( %va, splat (i1 true), i32 %evl) ret %v } @@ -145,7 +145,7 @@ define @vfptosi_nxv2i32_nxv2f16_unmasked( ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v8, v9 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.fptosi.nxv2i32.nxv2f16( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptosi.nxv2i32.nxv2f16( %va, splat (i1 true), i32 %evl) ret %v } @@ -187,7 +187,7 @@ define @vfptosi_nxv2i64_nxv2f16_unmasked( ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.rtz.x.f.v v8, v10 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.fptosi.nxv2i64.nxv2f16( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptosi.nxv2i64.nxv2f16( %va, splat (i1 true), i32 %evl) ret %v } @@ -213,7 +213,7 @@ define @vfptosi_nxv2i8_nxv2f32_unmasked( % ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v9, 0 ; CHECK-NEXT: ret - %v = call @llvm.vp.fptosi.nxv2i8.nxv2f32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) 
+ %v = call @llvm.vp.fptosi.nxv2i8.nxv2f32( %va, splat (i1 true), i32 %evl) ret %v } @@ -237,7 +237,7 @@ define @vfptosi_nxv2i16_nxv2f32_unmasked( ; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret - %v = call @llvm.vp.fptosi.nxv2i16.nxv2f32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptosi.nxv2i16.nxv2f32( %va, splat (i1 true), i32 %evl) ret %v } @@ -259,7 +259,7 @@ define @vfptosi_nxv2i32_nxv2f32_unmasked( ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8 ; CHECK-NEXT: ret - %v = call @llvm.vp.fptosi.nxv2i32.nxv2f32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptosi.nxv2i32.nxv2f32( %va, splat (i1 true), i32 %evl) ret %v } @@ -283,7 +283,7 @@ define @vfptosi_nxv2i64_nxv2f32_unmasked( ; CHECK-NEXT: vfwcvt.rtz.x.f.v v10, v8 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret - %v = call @llvm.vp.fptosi.nxv2i64.nxv2f32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptosi.nxv2i64.nxv2f32( %va, splat (i1 true), i32 %evl) ret %v } @@ -313,7 +313,7 @@ define @vfptosi_nxv2i8_nxv2f64_unmasked( ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret - %v = call @llvm.vp.fptosi.nxv2i8.nxv2f64( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptosi.nxv2i8.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -339,7 +339,7 @@ define @vfptosi_nxv2i16_nxv2f64_unmasked( @llvm.vp.fptosi.nxv2i16.nxv2f64( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptosi.nxv2i16.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -363,7 +363,7 @@ define @vfptosi_nxv2i32_nxv2f64_unmasked( @llvm.vp.fptosi.nxv2i32.nxv2f64( %va, 
shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptosi.nxv2i32.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -385,7 +385,7 @@ define @vfptosi_nxv2i64_nxv2f64_unmasked( @llvm.vp.fptosi.nxv2i64.nxv2f64( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptosi.nxv2i64.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -466,6 +466,6 @@ define @vfptosi_nxv32i32_nxv32f32_unmasked( @llvm.vp.fptosi.nxv32i32.nxv32f32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptosi.nxv32i32.nxv32f32( %va, splat (i1 true), i32 %evl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll index 6646171fcd15e..4b609d07c1e7a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp-mask.ll @@ -42,7 +42,7 @@ define @vfptoui_nxv2i1_nxv2f16_unmasked( %v ; ZVFHMIN-NEXT: vfcvt.rtz.xu.f.v v8, v9 ; ZVFHMIN-NEXT: vmsne.vi v0, v8, 0 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.fptoui.nxv2i1.nxv2f16( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptoui.nxv2i1.nxv2f16( %va, splat (i1 true), i32 %evl) ret %v } @@ -66,7 +66,7 @@ define @vfptoui_nxv2i1_nxv2f32_unmasked( % ; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret - %v = call @llvm.vp.fptoui.nxv2i1.nxv2f32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptoui.nxv2i1.nxv2f32( %va, splat (i1 true), i32 %evl) ret %v } @@ -91,6 +91,6 @@ define @vfptoui_nxv2i1_nxv2f64_unmasked( ; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret - %v = call @llvm.vp.fptoui.nxv2i1.nxv2f64( %va, shufflevector ( insertelement ( undef, i1 true, i32 
0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptoui.nxv2i1.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll index 486efbe66a6fe..a2591e7dc35f0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll @@ -67,7 +67,7 @@ define @vfptoui_nxv2i8_nxv2f16_unmasked( %v ; ZVFHMIN-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; ZVFHMIN-NEXT: vnsrl.wi v8, v8, 0 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.fptoui.nxv2i8.nxv2f16( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptoui.nxv2i8.nxv2f16( %va, splat (i1 true), i32 %evl) ret %v } @@ -105,7 +105,7 @@ define @vfptoui_nxv2i16_nxv2f16_unmasked( ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v8, v9 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.fptoui.nxv2i16.nxv2f16( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptoui.nxv2i16.nxv2f16( %va, splat (i1 true), i32 %evl) ret %v } @@ -145,7 +145,7 @@ define @vfptoui_nxv2i32_nxv2f16_unmasked( ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfcvt.rtz.xu.f.v v8, v9 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.fptoui.nxv2i32.nxv2f16( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptoui.nxv2i32.nxv2f16( %va, splat (i1 true), i32 %evl) ret %v } @@ -187,7 +187,7 @@ define @vfptoui_nxv2i64_nxv2f16_unmasked( ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.rtz.xu.f.v v8, v10 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.fptoui.nxv2i64.nxv2f16( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptoui.nxv2i64.nxv2f16( %va, splat (i1 true), i32 %evl) ret %v } @@ -213,7 +213,7 @@ define 
@vfptoui_nxv2i8_nxv2f32_unmasked( % ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v9, 0 ; CHECK-NEXT: ret - %v = call @llvm.vp.fptoui.nxv2i8.nxv2f32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptoui.nxv2i8.nxv2f32( %va, splat (i1 true), i32 %evl) ret %v } @@ -237,7 +237,7 @@ define @vfptoui_nxv2i16_nxv2f32_unmasked( ; CHECK-NEXT: vfncvt.rtz.xu.f.w v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret - %v = call @llvm.vp.fptoui.nxv2i16.nxv2f32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptoui.nxv2i16.nxv2f32( %va, splat (i1 true), i32 %evl) ret %v } @@ -259,7 +259,7 @@ define @vfptoui_nxv2i32_nxv2f32_unmasked( ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8 ; CHECK-NEXT: ret - %v = call @llvm.vp.fptoui.nxv2i32.nxv2f32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptoui.nxv2i32.nxv2f32( %va, splat (i1 true), i32 %evl) ret %v } @@ -283,7 +283,7 @@ define @vfptoui_nxv2i64_nxv2f32_unmasked( ; CHECK-NEXT: vfwcvt.rtz.xu.f.v v10, v8 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret - %v = call @llvm.vp.fptoui.nxv2i64.nxv2f32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptoui.nxv2i64.nxv2f32( %va, splat (i1 true), i32 %evl) ret %v } @@ -313,7 +313,7 @@ define @vfptoui_nxv2i8_nxv2f64_unmasked( ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret - %v = call @llvm.vp.fptoui.nxv2i8.nxv2f64( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptoui.nxv2i8.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -339,7 +339,7 @@ define @vfptoui_nxv2i16_nxv2f64_unmasked( @llvm.vp.fptoui.nxv2i16.nxv2f64( %va, 
shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptoui.nxv2i16.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -363,7 +363,7 @@ define @vfptoui_nxv2i32_nxv2f64_unmasked( @llvm.vp.fptoui.nxv2i32.nxv2f64( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptoui.nxv2i32.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -385,7 +385,7 @@ define @vfptoui_nxv2i64_nxv2f64_unmasked( @llvm.vp.fptoui.nxv2i64.nxv2f64( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptoui.nxv2i64.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -466,6 +466,6 @@ define @vfptoui_nxv32i32_nxv32f32_unmasked( @llvm.vp.fptoui.nxv32i32.nxv32f32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.fptoui.nxv32i32.nxv32f32( %va, splat (i1 true), i32 %evl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll index c6554561be339..4e84a31d71b51 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll @@ -24,7 +24,7 @@ define @vfptrunc_nxv2f16_nxv2f32_unmasked( @llvm.vp.fptrunc.nxv2f16.nxv2f32( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.fptrunc.nxv2f16.nxv2f32( %a, splat (i1 true), i32 %vl) ret %v } @@ -50,7 +50,7 @@ define @vfptrunc_nxv2f16_nxv2f64_unmasked( @llvm.vp.fptrunc.nxv2f16.nxv2f64( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.fptrunc.nxv2f16.nxv2f64( %a, splat (i1 true), i32 %vl) ret %v } @@ -74,7 +74,7 @@ define @vfptrunc_nxv2f32_nxv2f64_unmasked( @llvm.vp.fptrunc.nxv2f64.nxv2f32( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = 
call @llvm.vp.fptrunc.nxv2f64.nxv2f32( %a, splat (i1 true), i32 %vl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwadd-vp.ll index 64887da78cb7f..1ef0ed858d80a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwadd-vp.ll @@ -22,8 +22,8 @@ define @vfwadd_same_operand( %arg, i32 s ; ZVFHMIN-NEXT: vfadd.vv v8, v9, v9 ; ZVFHMIN-NEXT: ret bb: - %tmp = call @llvm.vp.fpext.nxv2f32.nxv2f16( %arg, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %vl) - %tmp2 = call @llvm.vp.fadd.nxv2f32( %tmp, %tmp, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %vl) + %tmp = call @llvm.vp.fpext.nxv2f32.nxv2f16( %arg, splat (i1 true), i32 %vl) + %tmp2 = call @llvm.vp.fadd.nxv2f32( %tmp, %tmp, splat (i1 true), i32 %vl) ret %tmp2 } @@ -48,9 +48,9 @@ define @vfwadd_tu( %arg, @llvm.vp.fpext.nxv2f32.nxv2f16( %arg, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %arg2) - %tmp3 = call @llvm.vp.fadd.nxv2f32( %arg1, %tmp, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %arg2) - %tmp4 = call @llvm.vp.merge.nxv2f32( shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), %tmp3, %arg1, i32 %arg2) + %tmp = call @llvm.vp.fpext.nxv2f32.nxv2f16( %arg, splat (i1 true), i32 %arg2) + %tmp3 = call @llvm.vp.fadd.nxv2f32( %arg1, %tmp, splat (i1 true), i32 %arg2) + %tmp4 = call @llvm.vp.merge.nxv2f32( splat (i1 true), %tmp3, %arg1, i32 %arg2) ret %tmp4 } diff --git a/llvm/test/CodeGen/RISCV/rvv/vmadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmadd-sdnode.ll index 07536407ace8d..e1988c058fac3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmadd-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmadd-sdnode.ll @@ -597,7 +597,7 @@ define @combine_mul_add_imm1( %a, %a, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + %x = add %a, 
splat (i32 1) %y = mul %x, %b ret %y } @@ -608,7 +608,7 @@ define @combine_mul_add_imm1_2( %a, %a, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + %x = add %a, splat (i32 1) %y = mul %b, %x ret %y } diff --git a/llvm/test/CodeGen/RISCV/rvv/vnmsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vnmsub-sdnode.ll index a3c896ecca22a..186ffb64e5902 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vnmsub-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vnmsub-sdnode.ll @@ -597,7 +597,7 @@ define @combine_mul_sub_imm1( %a, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), %a + %x = sub splat (i32 1), %a %y = mul %x, %b ret %y } @@ -608,7 +608,7 @@ define @combine_mul_sub_imm1_2( %a, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), %a + %x = sub splat (i32 1), %a %y = mul %b, %x ret %y } diff --git a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll index b8a091b242591..16abf2bd28acc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll @@ -66,7 +66,7 @@ define @vror_vi_nxv1i8( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv1i8( %a, %a, shufflevector( insertelement( poison, i8 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv1i8( %a, %a, splat (i8 1)) ret %x } @@ -84,7 +84,7 @@ define @vror_vi_rotl_nxv1i8( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, mf8, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 7 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv1i8( %a, %a, shufflevector( insertelement( poison, i8 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv1i8( %a, %a, splat (i8 1)) ret %x } @@ -150,7 +150,7 @@ define @vror_vi_nxv2i8( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv2i8( %a, %a, 
shufflevector( insertelement( poison, i8 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv2i8( %a, %a, splat (i8 1)) ret %x } @@ -168,7 +168,7 @@ define @vror_vi_rotl_nxv2i8( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, mf4, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 7 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv2i8( %a, %a, shufflevector( insertelement( poison, i8 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv2i8( %a, %a, splat (i8 1)) ret %x } @@ -234,7 +234,7 @@ define @vror_vi_nxv4i8( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv4i8( %a, %a, shufflevector( insertelement( poison, i8 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv4i8( %a, %a, splat (i8 1)) ret %x } @@ -252,7 +252,7 @@ define @vror_vi_rotl_nxv4i8( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, mf2, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 7 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv4i8( %a, %a, shufflevector( insertelement( poison, i8 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv4i8( %a, %a, splat (i8 1)) ret %x } @@ -318,7 +318,7 @@ define @vror_vi_nxv8i8( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv8i8( %a, %a, shufflevector( insertelement( poison, i8 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv8i8( %a, %a, splat (i8 1)) ret %x } @@ -336,7 +336,7 @@ define @vror_vi_rotl_nxv8i8( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, m1, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 7 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv8i8( %a, %a, shufflevector( insertelement( poison, i8 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv8i8( %a, %a, splat (i8 1)) ret %x } @@ -402,7 +402,7 @@ define @vror_vi_nxv16i8( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, 
v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv16i8( %a, %a, shufflevector( insertelement( poison, i8 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv16i8( %a, %a, splat (i8 1)) ret %x } @@ -420,7 +420,7 @@ define @vror_vi_rotl_nxv16i8( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, m2, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 7 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv16i8( %a, %a, shufflevector( insertelement( poison, i8 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv16i8( %a, %a, splat (i8 1)) ret %x } @@ -486,7 +486,7 @@ define @vror_vi_nxv32i8( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv32i8( %a, %a, shufflevector( insertelement( poison, i8 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv32i8( %a, %a, splat (i8 1)) ret %x } @@ -504,7 +504,7 @@ define @vror_vi_rotl_nxv32i8( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 7 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv32i8( %a, %a, shufflevector( insertelement( poison, i8 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv32i8( %a, %a, splat (i8 1)) ret %x } @@ -570,7 +570,7 @@ define @vror_vi_nxv64i8( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv64i8( %a, %a, shufflevector( insertelement( poison, i8 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv64i8( %a, %a, splat (i8 1)) ret %x } @@ -588,7 +588,7 @@ define @vror_vi_rotl_nxv64i8( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 7 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv64i8( %a, %a, shufflevector( insertelement( poison, i8 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv64i8( %a, %a, splat (i8 1)) ret %x } @@ -654,7 +654,7 @@ define @vror_vi_nxv1i16( %a) { ; 
CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv1i16( %a, %a, shufflevector( insertelement( poison, i16 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv1i16( %a, %a, splat (i16 1)) ret %x } @@ -672,7 +672,7 @@ define @vror_vi_rotl_nxv1i16( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 15 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv1i16( %a, %a, shufflevector( insertelement( poison, i16 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv1i16( %a, %a, splat (i16 1)) ret %x } @@ -738,7 +738,7 @@ define @vror_vi_nxv2i16( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv2i16( %a, %a, shufflevector( insertelement( poison, i16 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv2i16( %a, %a, splat (i16 1)) ret %x } @@ -756,7 +756,7 @@ define @vror_vi_rotl_nxv2i16( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 15 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv2i16( %a, %a, shufflevector( insertelement( poison, i16 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv2i16( %a, %a, splat (i16 1)) ret %x } @@ -822,7 +822,7 @@ define @vror_vi_nxv4i16( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv4i16( %a, %a, shufflevector( insertelement( poison, i16 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv4i16( %a, %a, splat (i16 1)) ret %x } @@ -840,7 +840,7 @@ define @vror_vi_rotl_nxv4i16( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 15 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv4i16( %a, %a, shufflevector( insertelement( poison, i16 1, i32 0), poison, zeroinitializer)) + %x = call 
@llvm.fshl.nxv4i16( %a, %a, splat (i16 1)) ret %x } @@ -906,7 +906,7 @@ define @vror_vi_nxv8i16( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv8i16( %a, %a, shufflevector( insertelement( poison, i16 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv8i16( %a, %a, splat (i16 1)) ret %x } @@ -924,7 +924,7 @@ define @vror_vi_rotl_nxv8i16( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 15 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv8i16( %a, %a, shufflevector( insertelement( poison, i16 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv8i16( %a, %a, splat (i16 1)) ret %x } @@ -990,7 +990,7 @@ define @vror_vi_nxv16i16( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv16i16( %a, %a, shufflevector( insertelement( poison, i16 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv16i16( %a, %a, splat (i16 1)) ret %x } @@ -1008,7 +1008,7 @@ define @vror_vi_rotl_nxv16i16( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 15 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv16i16( %a, %a, shufflevector( insertelement( poison, i16 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv16i16( %a, %a, splat (i16 1)) ret %x } @@ -1074,7 +1074,7 @@ define @vror_vi_nxv32i16( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv32i16( %a, %a, shufflevector( insertelement( poison, i16 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv32i16( %a, %a, splat (i16 1)) ret %x } @@ -1092,7 +1092,7 @@ define @vror_vi_rotl_nxv32i16( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 15 ; CHECK-ZVKB-NEXT: ret - %x = call 
@llvm.fshl.nxv32i16( %a, %a, shufflevector( insertelement( poison, i16 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv32i16( %a, %a, splat (i16 1)) ret %x } @@ -1171,7 +1171,7 @@ define @vror_vi_nxv1i32( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv1i32( %a, %a, shufflevector( insertelement( poison, i32 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv1i32( %a, %a, splat (i32 1)) ret %x } @@ -1189,7 +1189,7 @@ define @vror_vi_rotl_nxv1i32( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 31 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv1i32( %a, %a, shufflevector( insertelement( poison, i32 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv1i32( %a, %a, splat (i32 1)) ret %x } @@ -1268,7 +1268,7 @@ define @vror_vi_nxv2i32( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv2i32( %a, %a, shufflevector( insertelement( poison, i32 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv2i32( %a, %a, splat (i32 1)) ret %x } @@ -1286,7 +1286,7 @@ define @vror_vi_rotl_nxv2i32( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 31 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv2i32( %a, %a, shufflevector( insertelement( poison, i32 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv2i32( %a, %a, splat (i32 1)) ret %x } @@ -1365,7 +1365,7 @@ define @vror_vi_nxv4i32( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv4i32( %a, %a, shufflevector( insertelement( poison, i32 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv4i32( %a, %a, splat (i32 1)) ret %x } @@ -1383,7 +1383,7 @@ define @vror_vi_rotl_nxv4i32( %a) { ; 
CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 31 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv4i32( %a, %a, shufflevector( insertelement( poison, i32 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv4i32( %a, %a, splat (i32 1)) ret %x } @@ -1462,7 +1462,7 @@ define @vror_vi_nxv8i32( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv8i32( %a, %a, shufflevector( insertelement( poison, i32 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv8i32( %a, %a, splat (i32 1)) ret %x } @@ -1480,7 +1480,7 @@ define @vror_vi_rotl_nxv8i32( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, m4, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 31 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv8i32( %a, %a, shufflevector( insertelement( poison, i32 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv8i32( %a, %a, splat (i32 1)) ret %x } @@ -1559,7 +1559,7 @@ define @vror_vi_nxv16i32( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv16i32( %a, %a, shufflevector( insertelement( poison, i32 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv16i32( %a, %a, splat (i32 1)) ret %x } @@ -1577,7 +1577,7 @@ define @vror_vi_rotl_nxv16i32( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e32, m8, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 31 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv16i32( %a, %a, shufflevector( insertelement( poison, i32 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv16i32( %a, %a, splat (i32 1)) ret %x } @@ -1657,7 +1657,7 @@ define @vror_vi_nxv1i64( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv1i64( %a, %a, shufflevector( insertelement( poison, i64 1, i32 0), poison, zeroinitializer)) + %x 
= call @llvm.fshr.nxv1i64( %a, %a, splat (i64 1)) ret %x } @@ -1676,7 +1676,7 @@ define @vror_vi_rotl_nxv1i64( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 63 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv1i64( %a, %a, shufflevector( insertelement( poison, i64 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv1i64( %a, %a, splat (i64 1)) ret %x } @@ -1756,7 +1756,7 @@ define @vror_vi_nxv2i64( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv2i64( %a, %a, shufflevector( insertelement( poison, i64 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv2i64( %a, %a, splat (i64 1)) ret %x } @@ -1775,7 +1775,7 @@ define @vror_vi_rotl_nxv2i64( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 63 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv2i64( %a, %a, shufflevector( insertelement( poison, i64 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv2i64( %a, %a, splat (i64 1)) ret %x } @@ -1855,7 +1855,7 @@ define @vror_vi_nxv4i64( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshr.nxv4i64( %a, %a, shufflevector( insertelement( poison, i64 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv4i64( %a, %a, splat (i64 1)) ret %x } @@ -1874,7 +1874,7 @@ define @vror_vi_rotl_nxv4i64( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 63 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv4i64( %a, %a, shufflevector( insertelement( poison, i64 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv4i64( %a, %a, splat (i64 1)) ret %x } @@ -1954,7 +1954,7 @@ define @vror_vi_nxv8i64( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 1 ; CHECK-ZVKB-NEXT: ret - %x = call 
@llvm.fshr.nxv8i64( %a, %a, shufflevector( insertelement( poison, i64 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshr.nxv8i64( %a, %a, splat (i64 1)) ret %x } @@ -1973,6 +1973,6 @@ define @vror_vi_rotl_nxv8i64( %a) { ; CHECK-ZVKB-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; CHECK-ZVKB-NEXT: vror.vi v8, v8, 63 ; CHECK-ZVKB-NEXT: ret - %x = call @llvm.fshl.nxv8i64( %a, %a, shufflevector( insertelement( poison, i64 1, i32 0), poison, zeroinitializer)) + %x = call @llvm.fshl.nxv8i64( %a, %a, splat (i64 1)) ret %x } diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll index 1b568bf8801b1..0d52dd794fd56 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll @@ -762,7 +762,7 @@ define @select_one( %x, %y, ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vmorn.mm v0, v8, v0 ; CHECK-NEXT: ret - %a = call @llvm.vp.select.nxv2i1( %x, %y, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %a = call @llvm.vp.select.nxv2i1( %x, %y, splat (i1 true), i32 %evl) ret %a } @@ -782,7 +782,7 @@ define @select_x_one( %x, % ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vmorn.mm v0, v8, v0 ; CHECK-NEXT: ret - %a = call @llvm.vp.select.nxv2i1( %x, %y, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %a = call @llvm.vp.select.nxv2i1( %x, %y, splat (i1 true), i32 %evl) ret %a } @@ -802,7 +802,7 @@ define @select_one_x( %x, % ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vmor.mm v0, v0, v8 ; CHECK-NEXT: ret - %a = call @llvm.vp.select.nxv2i1( %x, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), %y, i32 %evl) + %a = call @llvm.vp.select.nxv2i1( %x, splat (i1 true), %y, i32 %evl) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/vsext-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vsext-vp-mask.ll index 66f9e8dc9c5fc..04aed5d81db99 100644 
--- a/llvm/test/CodeGen/RISCV/rvv/vsext-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsext-vp-mask.ll @@ -22,7 +22,7 @@ define @vsext_nxv2i1_nxv2i16_unmasked( %a, i ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, -1, v0 ; CHECK-NEXT: ret - %v = call @llvm.vp.sext.nxv2i16.nxv2i1( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.sext.nxv2i16.nxv2i1( %a, splat (i1 true), i32 %vl) ret %v } @@ -46,7 +46,7 @@ define @vsext_nxv2i1_nxv2i32_unmasked( %a, i ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, -1, v0 ; CHECK-NEXT: ret - %v = call @llvm.vp.sext.nxv2i32.nxv2i1( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.sext.nxv2i32.nxv2i1( %a, splat (i1 true), i32 %vl) ret %v } @@ -70,6 +70,6 @@ define @vsext_nxv2i1_nxv2i64_unmasked( %a, i ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, -1, v0 ; CHECK-NEXT: ret - %v = call @llvm.vp.sext.nxv2i64.nxv2i1( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.sext.nxv2i64.nxv2i1( %a, splat (i1 true), i32 %vl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll index 8aaa74c8e21fc..834e7dd85aea0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll @@ -22,7 +22,7 @@ define @vsext_nxv2i8_nxv2i16_unmasked( %a, i ; CHECK-NEXT: vsext.vf2 v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret - %v = call @llvm.vp.sext.nxv2i16.nxv2i8( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.sext.nxv2i16.nxv2i8( %a, splat (i1 true), i32 %vl) ret %v } @@ -46,7 +46,7 @@ define @vsext_nxv2i8_nxv2i32_unmasked( %a, i ; CHECK-NEXT: vsext.vf4 v9, v8 ; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret - %v = call @llvm.vp.sext.nxv2i32.nxv2i8( %a, shufflevector ( 
insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.sext.nxv2i32.nxv2i8( %a, splat (i1 true), i32 %vl) ret %v } @@ -70,7 +70,7 @@ define @vsext_nxv2i8_nxv2i64_unmasked( %a, i ; CHECK-NEXT: vsext.vf8 v10, v8 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret - %v = call @llvm.vp.sext.nxv2i64.nxv2i8( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.sext.nxv2i64.nxv2i8( %a, splat (i1 true), i32 %vl) ret %v } @@ -94,7 +94,7 @@ define @vsext_nxv2i16_nxv2i32_unmasked( %a, ; CHECK-NEXT: vsext.vf2 v9, v8 ; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret - %v = call @llvm.vp.sext.nxv2i32.nxv2i16( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.sext.nxv2i32.nxv2i16( %a, splat (i1 true), i32 %vl) ret %v } @@ -118,7 +118,7 @@ define @vsext_nxv2i16_nxv2i64_unmasked( %a, ; CHECK-NEXT: vsext.vf4 v10, v8 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret - %v = call @llvm.vp.sext.nxv2i64.nxv2i16( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.sext.nxv2i64.nxv2i16( %a, splat (i1 true), i32 %vl) ret %v } @@ -142,7 +142,7 @@ define @vsext_nxv2i32_nxv2i64_unmasked( %a, ; CHECK-NEXT: vsext.vf2 v10, v8 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret - %v = call @llvm.vp.sext.nxv2i64.nxv2i32( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.sext.nxv2i64.nxv2i32( %a, splat (i1 true), i32 %vl) ret %v } @@ -195,6 +195,6 @@ define @vsext_nxv32i8_nxv32i32_unmasked( % ; CHECK-NEXT: vsext.vf4 v24, v8 ; CHECK-NEXT: vmv.v.v v8, v24 ; CHECK-NEXT: ret - %v = call @llvm.vp.sext.nxv32i32.nxv32i8( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.sext.nxv32i32.nxv32i8( %a, splat (i1 true), i32 %vl) ret %v } diff --git 
a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp-mask.ll index 43451f446b373..6e09ceefb7292 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp-mask.ll @@ -25,7 +25,7 @@ define @vsitofp_nxv2f16_nxv2i1_unmasked( %v ; CHECK-NEXT: vmerge.vim v8, v8, -1, v0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8 ; CHECK-NEXT: ret - %v = call @llvm.vp.sitofp.nxv2f16.nxv2i1( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.sitofp.nxv2f16.nxv2i1( %va, splat (i1 true), i32 %evl) ret %v } @@ -52,7 +52,7 @@ define @vsitofp_nxv2f32_nxv2i1_unmasked( % ; CHECK-NEXT: vmerge.vim v8, v8, -1, v0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8 ; CHECK-NEXT: ret - %v = call @llvm.vp.sitofp.nxv2f32.nxv2i1( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.sitofp.nxv2f32.nxv2i1( %va, splat (i1 true), i32 %evl) ret %v } @@ -79,6 +79,6 @@ define @vsitofp_nxv2f64_nxv2i1_unmasked( ; CHECK-NEXT: vmerge.vim v8, v8, -1, v0 ; CHECK-NEXT: vfcvt.f.x.v v8, v8 ; CHECK-NEXT: ret - %v = call @llvm.vp.sitofp.nxv2f64.nxv2i1( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.sitofp.nxv2f64.nxv2i1( %va, splat (i1 true), i32 %evl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll index 62848ea2279a3..016a43784733d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll @@ -69,7 +69,7 @@ define @vsitofp_nxv2f16_nxv2i8_unmasked( %v ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.sitofp.nxv2f16.nxv2i8( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.sitofp.nxv2f16.nxv2i8( %va, splat (i1 true), i32 %evl) 
ret %v } @@ -107,7 +107,7 @@ define @vsitofp_nxv2f16_nxv2i16_unmasked( ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.sitofp.nxv2f16.nxv2i16( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.sitofp.nxv2f16.nxv2i16( %va, splat (i1 true), i32 %evl) ret %v } @@ -147,7 +147,7 @@ define @vsitofp_nxv2f16_nxv2i32_unmasked( ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.sitofp.nxv2f16.nxv2i32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.sitofp.nxv2f16.nxv2i32( %va, splat (i1 true), i32 %evl) ret %v } @@ -189,7 +189,7 @@ define @vsitofp_nxv2f16_nxv2i64_unmasked( ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.sitofp.nxv2f16.nxv2i64( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.sitofp.nxv2f16.nxv2i64( %va, splat (i1 true), i32 %evl) ret %v } @@ -213,7 +213,7 @@ define @vsitofp_nxv2f32_nxv2i8_unmasked( % ; CHECK-NEXT: vsext.vf2 v9, v8 ; CHECK-NEXT: vfwcvt.f.x.v v8, v9 ; CHECK-NEXT: ret - %v = call @llvm.vp.sitofp.nxv2f32.nxv2i8( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.sitofp.nxv2f32.nxv2i8( %va, splat (i1 true), i32 %evl) ret %v } @@ -237,7 +237,7 @@ define @vsitofp_nxv2f32_nxv2i16_unmasked( ; CHECK-NEXT: vfwcvt.f.x.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret - %v = call @llvm.vp.sitofp.nxv2f32.nxv2i16( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.sitofp.nxv2f32.nxv2i16( %va, splat (i1 true), i32 %evl) ret %v } @@ -259,7 +259,7 @@ define 
@vsitofp_nxv2f32_nxv2i32_unmasked( ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.f.x.v v8, v8 ; CHECK-NEXT: ret - %v = call @llvm.vp.sitofp.nxv2f32.nxv2i32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.sitofp.nxv2f32.nxv2i32( %va, splat (i1 true), i32 %evl) ret %v } @@ -283,7 +283,7 @@ define @vsitofp_nxv2f32_nxv2i64_unmasked( ; CHECK-NEXT: vfncvt.f.x.w v10, v8 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret - %v = call @llvm.vp.sitofp.nxv2f32.nxv2i64( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.sitofp.nxv2f32.nxv2i64( %va, splat (i1 true), i32 %evl) ret %v } @@ -307,7 +307,7 @@ define @vsitofp_nxv2f64_nxv2i8_unmasked( ; CHECK-NEXT: vsext.vf4 v10, v8 ; CHECK-NEXT: vfwcvt.f.x.v v8, v10 ; CHECK-NEXT: ret - %v = call @llvm.vp.sitofp.nxv2f64.nxv2i8( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.sitofp.nxv2f64.nxv2i8( %va, splat (i1 true), i32 %evl) ret %v } @@ -331,7 +331,7 @@ define @vsitofp_nxv2f64_nxv2i16_unmasked( @llvm.vp.sitofp.nxv2f64.nxv2i16( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.sitofp.nxv2f64.nxv2i16( %va, splat (i1 true), i32 %evl) ret %v } @@ -355,7 +355,7 @@ define @vsitofp_nxv2f64_nxv2i32_unmasked( @llvm.vp.sitofp.nxv2f64.nxv2i32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.sitofp.nxv2f64.nxv2i32( %va, splat (i1 true), i32 %evl) ret %v } @@ -377,7 +377,7 @@ define @vsitofp_nxv2f64_nxv2i64_unmasked( @llvm.vp.sitofp.nxv2f64.nxv2i64( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.sitofp.nxv2f64.nxv2i64( %va, splat (i1 true), i32 %evl) ret %v } @@ -486,6 +486,6 @@ define @vsitofp_nxv32f32_nxv32i32_unmasked( 
@llvm.vp.sitofp.nxv32f32.nxv32i32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.sitofp.nxv32f32.nxv32i32( %va, splat (i1 true), i32 %evl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp-mask.ll index 4eb80e36001aa..ad8097631acd3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp-mask.ll @@ -22,7 +22,7 @@ define @vtrunc_nxv2i1_nxv2i16_unmasked( %a, ; CHECK-NEXT: vand.vi v8, v8, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret - %v = call @llvm.vp.trunc.nxv2i1.nxv2i16( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.trunc.nxv2i1.nxv2i16( %a, splat (i1 true), i32 %vl) ret %v } @@ -46,7 +46,7 @@ define @vtrunc_nxv2i1_nxv2i32_unmasked( %a, ; CHECK-NEXT: vand.vi v8, v8, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret - %v = call @llvm.vp.trunc.nxv2i1.nxv2i32( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.trunc.nxv2i1.nxv2i32( %a, splat (i1 true), i32 %vl) ret %v } @@ -71,6 +71,6 @@ define @vtrunc_nxv2i1_nxv2i64_unmasked( %a, ; CHECK-NEXT: vand.vi v8, v8, 1 ; CHECK-NEXT: vmsne.vi v0, v8, 0 ; CHECK-NEXT: ret - %v = call @llvm.vp.trunc.nxv2i1.nxv2i64( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.trunc.nxv2i1.nxv2i64( %a, splat (i1 true), i32 %vl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll index a624a42b3873b..a7b4d6616b7b5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll @@ -44,7 +44,7 @@ define @vtrunc_nxv2i8_nxv2i16_unmasked( %a, ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret - %v = call 
@llvm.vp.trunc.nxv2i8.nxv2i16( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.trunc.nxv2i8.nxv2i16( %a, splat (i1 true), i32 %vl) ret %v } @@ -70,7 +70,7 @@ define @vtrunc_nxv2i8_nxv2i32_unmasked( %a, ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret - %v = call @llvm.vp.trunc.nxv2i8.nxv2i32( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.trunc.nxv2i8.nxv2i32( %a, splat (i1 true), i32 %vl) ret %v } @@ -100,7 +100,7 @@ define @vtrunc_nxv2i8_nxv2i64_unmasked( %a, ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret - %v = call @llvm.vp.trunc.nxv2i8.nxv2i64( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.trunc.nxv2i8.nxv2i64( %a, splat (i1 true), i32 %vl) ret %v } @@ -122,7 +122,7 @@ define @vtrunc_nxv2i16_nxv2i32_unmasked( %a ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret - %v = call @llvm.vp.trunc.nxv2i16.nxv2i32( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.trunc.nxv2i16.nxv2i32( %a, splat (i1 true), i32 %vl) ret %v } @@ -148,7 +148,7 @@ define @vtrunc_nxv2i16_nxv2i64_unmasked( %a ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-NEXT: ret - %v = call @llvm.vp.trunc.nxv2i16.nxv2i64( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.trunc.nxv2i16.nxv2i64( %a, splat (i1 true), i32 %vl) ret %v } @@ -205,7 +205,7 @@ define @vtrunc_nxv2i32_nxv2i64_unmasked( %a ; CHECK-NEXT: vnsrl.wi v10, v8, 0 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret - %v = call @llvm.vp.trunc.nxv2i32.nxv2i64( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), 
undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.trunc.nxv2i32.nxv2i64( %a, splat (i1 true), i32 %vl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp-mask.ll index 128bb80971ac7..cf4bb161ea75b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp-mask.ll @@ -25,7 +25,7 @@ define @vuitofp_nxv2f16_nxv2i1_unmasked( %v ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-NEXT: ret - %v = call @llvm.vp.uitofp.nxv2f16.nxv2i1( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.uitofp.nxv2f16.nxv2i1( %va, splat (i1 true), i32 %evl) ret %v } @@ -52,7 +52,7 @@ define @vuitofp_nxv2f32_nxv2i1_unmasked( % ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-NEXT: ret - %v = call @llvm.vp.uitofp.nxv2f32.nxv2i1( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.uitofp.nxv2f32.nxv2i1( %va, splat (i1 true), i32 %evl) ret %v } @@ -79,6 +79,6 @@ define @vuitofp_nxv2f64_nxv2i1_unmasked( ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-NEXT: ret - %v = call @llvm.vp.uitofp.nxv2f64.nxv2i1( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.uitofp.nxv2f64.nxv2i1( %va, splat (i1 true), i32 %evl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll index 8ca27484d69fc..668d9373b81d3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll @@ -69,7 +69,7 @@ define @vuitofp_nxv2f16_nxv2i8_unmasked( %v ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.uitofp.nxv2f16.nxv2i8( %va, shufflevector ( insertelement ( undef, 
i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.uitofp.nxv2f16.nxv2i8( %va, splat (i1 true), i32 %evl) ret %v } @@ -107,7 +107,7 @@ define @vuitofp_nxv2f16_nxv2i16_unmasked( ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.uitofp.nxv2f16.nxv2i16( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.uitofp.nxv2f16.nxv2i16( %va, splat (i1 true), i32 %evl) ret %v } @@ -147,7 +147,7 @@ define @vuitofp_nxv2f16_nxv2i32_unmasked( ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.uitofp.nxv2f16.nxv2i32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.uitofp.nxv2f16.nxv2i32( %va, splat (i1 true), i32 %evl) ret %v } @@ -189,7 +189,7 @@ define @vuitofp_nxv2f16_nxv2i64_unmasked( ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 ; ZVFHMIN-NEXT: ret - %v = call @llvm.vp.uitofp.nxv2f16.nxv2i64( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.uitofp.nxv2f16.nxv2i64( %va, splat (i1 true), i32 %evl) ret %v } @@ -213,7 +213,7 @@ define @vuitofp_nxv2f32_nxv2i8_unmasked( % ; CHECK-NEXT: vzext.vf2 v9, v8 ; CHECK-NEXT: vfwcvt.f.xu.v v8, v9 ; CHECK-NEXT: ret - %v = call @llvm.vp.uitofp.nxv2f32.nxv2i8( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.uitofp.nxv2f32.nxv2i8( %va, splat (i1 true), i32 %evl) ret %v } @@ -237,7 +237,7 @@ define @vuitofp_nxv2f32_nxv2i16_unmasked( ; CHECK-NEXT: vfwcvt.f.xu.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret - %v = call @llvm.vp.uitofp.nxv2f32.nxv2i16( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call 
@llvm.vp.uitofp.nxv2f32.nxv2i16( %va, splat (i1 true), i32 %evl) ret %v } @@ -259,7 +259,7 @@ define @vuitofp_nxv2f32_nxv2i32_unmasked( ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vfcvt.f.xu.v v8, v8 ; CHECK-NEXT: ret - %v = call @llvm.vp.uitofp.nxv2f32.nxv2i32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.uitofp.nxv2f32.nxv2i32( %va, splat (i1 true), i32 %evl) ret %v } @@ -283,7 +283,7 @@ define @vuitofp_nxv2f32_nxv2i64_unmasked( ; CHECK-NEXT: vfncvt.f.xu.w v10, v8 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret - %v = call @llvm.vp.uitofp.nxv2f32.nxv2i64( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.uitofp.nxv2f32.nxv2i64( %va, splat (i1 true), i32 %evl) ret %v } @@ -307,7 +307,7 @@ define @vuitofp_nxv2f64_nxv2i8_unmasked( ; CHECK-NEXT: vzext.vf4 v10, v8 ; CHECK-NEXT: vfwcvt.f.xu.v v8, v10 ; CHECK-NEXT: ret - %v = call @llvm.vp.uitofp.nxv2f64.nxv2i8( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.uitofp.nxv2f64.nxv2i8( %va, splat (i1 true), i32 %evl) ret %v } @@ -331,7 +331,7 @@ define @vuitofp_nxv2f64_nxv2i16_unmasked( @llvm.vp.uitofp.nxv2f64.nxv2i16( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.uitofp.nxv2f64.nxv2i16( %va, splat (i1 true), i32 %evl) ret %v } @@ -355,7 +355,7 @@ define @vuitofp_nxv2f64_nxv2i32_unmasked( @llvm.vp.uitofp.nxv2f64.nxv2i32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.uitofp.nxv2f64.nxv2i32( %va, splat (i1 true), i32 %evl) ret %v } @@ -377,7 +377,7 @@ define @vuitofp_nxv2f64_nxv2i64_unmasked( @llvm.vp.uitofp.nxv2f64.nxv2i64( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.uitofp.nxv2f64.nxv2i64( 
%va, splat (i1 true), i32 %evl) ret %v } @@ -486,6 +486,6 @@ define @vuitofp_nxv32f32_nxv32i32_unmasked( @llvm.vp.uitofp.nxv32f32.nxv32i32( %va, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %evl) + %v = call @llvm.vp.uitofp.nxv32f32.nxv32i32( %va, splat (i1 true), i32 %evl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll index ad7ad991e082c..02af09f028fc1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-mask-sdnode.ll @@ -12,7 +12,7 @@ define @vwadd_wv_mask_v8i32( %x, %x, shufflevector ( insertelement ( poison, i32 42, i64 0), poison, zeroinitializer) + %mask = icmp slt %x, splat (i32 42) %a = select %mask, %x, zeroinitializer %sa = sext %a to %ret = add %sa, %y @@ -29,7 +29,7 @@ define @vwaddu_wv_mask_v8i32( %x, %x, shufflevector ( insertelement ( poison, i32 42, i64 0), poison, zeroinitializer) + %mask = icmp slt %x, splat (i32 42) %a = select %mask, %x, zeroinitializer %sa = zext %a to %ret = add %sa, %y @@ -47,7 +47,7 @@ define @vwaddu_vv_mask_v8i32( %x, %x, shufflevector ( insertelement ( poison, i32 42, i64 0), poison, zeroinitializer) + %mask = icmp slt %x, splat (i32 42) %a = select %mask, %x, zeroinitializer %sa = zext %a to %sy = zext %y to @@ -65,7 +65,7 @@ define @vwadd_wv_mask_v8i32_commutative( %x ; CHECK-NEXT: vwadd.wv v16, v16, v8, v0.t ; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: ret - %mask = icmp slt %x, shufflevector ( insertelement ( poison, i32 42, i64 0), poison, zeroinitializer) + %mask = icmp slt %x, splat (i32 42) %a = select %mask, %x, zeroinitializer %sa = sext %a to %ret = add %y, %sa @@ -82,8 +82,8 @@ define @vwadd_wv_mask_v8i32_nonzero( %x, %x, shufflevector ( insertelement ( poison, i32 42, i64 0), poison, zeroinitializer) - %a = select %mask, %x, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + %mask = icmp slt %x, splat (i32 42) + %a = 
select %mask, %x, splat (i32 1) %sa = sext %a to %ret = add %sa, %y ret %ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-vp.ll index c3ffee6969d70..a0b7726d3cb5e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-vp.ll @@ -13,9 +13,9 @@ define @vwadd_tu( %arg, % ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret bb: - %tmp = call @llvm.vp.sext.nxv2i32.nxv2i8( %arg, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %arg2) - %tmp3 = call @llvm.vp.add.nxv2i32( %arg1, %tmp, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %arg2) - %tmp4 = call @llvm.vp.merge.nxv2i32( shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), %tmp3, %arg1, i32 %arg2) + %tmp = call @llvm.vp.sext.nxv2i32.nxv2i8( %arg, splat (i1 true), i32 %arg2) + %tmp3 = call @llvm.vp.add.nxv2i32( %arg1, %tmp, splat (i1 true), i32 %arg2) + %tmp4 = call @llvm.vp.merge.nxv2i32( splat (i1 true), %tmp3, %arg1, i32 %arg2) ret %tmp4 } @@ -31,9 +31,9 @@ define @vwaddu_tu( %arg, ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret bb: - %tmp = call @llvm.vp.zext.nxv2i32.nxv2i8( %arg, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %arg2) - %tmp3 = call @llvm.vp.add.nxv2i32( %arg1, %tmp, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), i32 %arg2) - %tmp4 = call @llvm.vp.merge.nxv2i32( shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), %tmp3, %arg1, i32 %arg2) + %tmp = call @llvm.vp.zext.nxv2i32.nxv2i8( %arg, splat (i1 true), i32 %arg2) + %tmp3 = call @llvm.vp.add.nxv2i32( %arg1, %tmp, splat (i1 true), i32 %arg2) + %tmp4 = call @llvm.vp.merge.nxv2i32( splat (i1 true), %tmp3, %arg1, i32 %arg2) ret %tmp4 } diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll index 
8a0af38f724c4..770bb566c764c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwsll-sdnode.ll @@ -236,7 +236,7 @@ define @vwsll_vi_nxv2i64( %a) { ; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %x = zext %a to - %z = shl %x, shufflevector( insertelement( poison, i64 2, i32 0), poison, zeroinitializer) + %z = shl %x, splat (i64 2) ret %z } @@ -444,7 +444,7 @@ define @vwsll_vi_nxv4i32( %a) { ; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %x = zext %a to - %z = shl %x, shufflevector( insertelement( poison, i32 2, i32 0), poison, zeroinitializer) + %z = shl %x, splat (i32 2) ret %z } @@ -624,6 +624,6 @@ define @vwsll_vi_nxv8i16( %a) { ; CHECK-ZVBB-NEXT: vmv2r.v v8, v10 ; CHECK-ZVBB-NEXT: ret %x = zext %a to - %z = shl %x, shufflevector( insertelement( poison, i16 2, i32 0), poison, zeroinitializer) + %z = shl %x, splat (i16 2) ret %z } diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll index e7ac8ee175641..bb3076b3a945e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwsll-vp.ll @@ -263,7 +263,7 @@ define @vwsll_vi_nxv2i64( %a, %a to - %z = call @llvm.vp.shl.nxv2i64( %x, shufflevector( insertelement( poison, i64 2, i32 0), poison, zeroinitializer), %m, i32 %vl) + %z = call @llvm.vp.shl.nxv2i64( %x, splat (i64 2), %m, i32 %vl) ret %z } @@ -497,7 +497,7 @@ define @vwsll_vi_nxv4i32( %a, %a to - %z = call @llvm.vp.shl.nxv4i32( %x, shufflevector( insertelement( poison, i32 2, i32 0), poison, zeroinitializer), %m, i32 %vl) + %z = call @llvm.vp.shl.nxv4i32( %x, splat (i32 2), %m, i32 %vl) ret %z } @@ -703,6 +703,6 @@ define @vwsll_vi_nxv8i16( %a, %a to - %z = call @llvm.vp.shl.nxv8i16( %x, shufflevector( insertelement( poison, i16 2, i32 0), poison, zeroinitializer), %m, i32 %vl) + %z = call @llvm.vp.shl.nxv8i16( %x, splat (i16 2), %m, i32 %vl) ret %z } diff --git a/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll 
b/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll index 0cc0063c1d41c..04ece9d94880c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vwsub-mask-sdnode.ll @@ -12,7 +12,7 @@ define @vwsub_wv_mask_v8i32( %x, %x, shufflevector ( insertelement ( poison, i32 42, i64 0), poison, zeroinitializer) + %mask = icmp slt %x, splat (i32 42) %a = select %mask, %x, zeroinitializer %sa = sext %a to %ret = sub %y, %sa @@ -29,7 +29,7 @@ define @vwsubu_wv_mask_v8i32( %x, %x, shufflevector ( insertelement ( poison, i32 42, i64 0), poison, zeroinitializer) + %mask = icmp slt %x, splat (i32 42) %a = select %mask, %x, zeroinitializer %sa = zext %a to %ret = sub %y, %sa @@ -47,7 +47,7 @@ define @vwsubu_vv_mask_v8i32( %x, %x, shufflevector ( insertelement ( poison, i32 42, i64 0), poison, zeroinitializer) + %mask = icmp slt %x, splat (i32 42) %a = select %mask, %x, zeroinitializer %sa = zext %a to %sy = zext %y to @@ -65,8 +65,8 @@ define @vwsub_wv_mask_v8i32_nonzero( %x, %x, shufflevector ( insertelement ( poison, i32 42, i64 0), poison, zeroinitializer) - %a = select %mask, %x, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + %mask = icmp slt %x, splat (i32 42) + %a = select %mask, %x, splat (i32 1) %sa = sext %a to %ret = sub %y, %sa ret %ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vzext-vp-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vzext-vp-mask.ll index 41668d806ec7e..e14236c0258c4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vzext-vp-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vzext-vp-mask.ll @@ -22,7 +22,7 @@ define @vzext_nxv2i1_nxv2i16_unmasked( %a, i ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: ret - %v = call @llvm.vp.zext.nxv2i16.nxv2i1( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.zext.nxv2i16.nxv2i1( %a, splat (i1 true), i32 %vl) ret %v } @@ -46,7 +46,7 @@ define @vzext_nxv2i1_nxv2i32_unmasked( %a, i ; 
CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: ret - %v = call @llvm.vp.zext.nxv2i32.nxv2i1( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.zext.nxv2i32.nxv2i1( %a, splat (i1 true), i32 %vl) ret %v } @@ -70,6 +70,6 @@ define @vzext_nxv2i1_nxv2i64_unmasked( %a, i ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: ret - %v = call @llvm.vp.zext.nxv2i64.nxv2i1( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.zext.nxv2i64.nxv2i1( %a, splat (i1 true), i32 %vl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll index 365c221c9b9fc..400f89b1ef77d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll @@ -22,7 +22,7 @@ define @vzext_nxv2i8_nxv2i16_unmasked( %a, i ; CHECK-NEXT: vzext.vf2 v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret - %v = call @llvm.vp.zext.nxv2i16.nxv2i8( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.zext.nxv2i16.nxv2i8( %a, splat (i1 true), i32 %vl) ret %v } @@ -46,7 +46,7 @@ define @vzext_nxv2i8_nxv2i32_unmasked( %a, i ; CHECK-NEXT: vzext.vf4 v9, v8 ; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret - %v = call @llvm.vp.zext.nxv2i32.nxv2i8( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.zext.nxv2i32.nxv2i8( %a, splat (i1 true), i32 %vl) ret %v } @@ -70,7 +70,7 @@ define @vzext_nxv2i8_nxv2i64_unmasked( %a, i ; CHECK-NEXT: vzext.vf8 v10, v8 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret - %v = call @llvm.vp.zext.nxv2i64.nxv2i8( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.zext.nxv2i64.nxv2i8( %a, splat (i1 true), i32 %vl) ret %v } @@ -94,7 +94,7 @@ define 
@vzext_nxv2i16_nxv2i32_unmasked( %a, ; CHECK-NEXT: vzext.vf2 v9, v8 ; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret - %v = call @llvm.vp.zext.nxv2i32.nxv2i16( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.zext.nxv2i32.nxv2i16( %a, splat (i1 true), i32 %vl) ret %v } @@ -118,7 +118,7 @@ define @vzext_nxv2i16_nxv2i64_unmasked( %a, ; CHECK-NEXT: vzext.vf4 v10, v8 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret - %v = call @llvm.vp.zext.nxv2i64.nxv2i16( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.zext.nxv2i64.nxv2i16( %a, splat (i1 true), i32 %vl) ret %v } @@ -142,7 +142,7 @@ define @vzext_nxv2i32_nxv2i64_unmasked( %a, ; CHECK-NEXT: vzext.vf2 v10, v8 ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret - %v = call @llvm.vp.zext.nxv2i64.nxv2i32( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.zext.nxv2i64.nxv2i32( %a, splat (i1 true), i32 %vl) ret %v } @@ -195,6 +195,6 @@ define @vzext_nxv32i8_nxv32i32_unmasked( % ; CHECK-NEXT: vzext.vf4 v24, v8 ; CHECK-NEXT: vmv.v.v v8, v24 ; CHECK-NEXT: ret - %v = call @llvm.vp.zext.nxv32i32.nxv32i8( %a, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), i32 %vl) + %v = call @llvm.vp.zext.nxv32i32.nxv32i8( %a, splat (i1 true), i32 %vl) ret %v } From fd07b8f809eb64af9b29331ff6b94904b3159f84 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Thu, 15 Feb 2024 10:55:52 +0000 Subject: [PATCH 434/546] [LLVM][tests/Transforms/InstCombine] Convert instances of ConstantExpr based splats to use splat(). This is mostly NFC but some output does change due to consistently inserting into poison rather than undef and using i64 as the index type for inserts. 
--- .../AArch64/VectorUtils_heuristics.ll | 4 ++-- .../InstCombine/AArch64/sve-intrinsic-sdiv.ll | 18 +++++++------- llvm/test/Transforms/InstCombine/add.ll | 8 +++---- llvm/test/Transforms/InstCombine/bitcast.ll | 2 +- llvm/test/Transforms/InstCombine/div.ll | 6 ++--- llvm/test/Transforms/InstCombine/fdiv.ll | 2 +- llvm/test/Transforms/InstCombine/fmul.ll | 6 ++--- llvm/test/Transforms/InstCombine/icmp-vec.ll | 4 ++-- .../test/Transforms/InstCombine/intrinsics.ll | 4 ++-- .../InstCombine/load-store-forward.ll | 24 +++++++++---------- .../Transforms/InstCombine/logical-select.ll | 6 ++--- .../InstCombine/masked_intrinsics.ll | 6 ++--- .../Transforms/InstCombine/mul-masked-bits.ll | 2 +- .../Transforms/InstCombine/rem-mul-shl.ll | 8 +++---- llvm/test/Transforms/InstCombine/select.ll | 16 ++++++------- llvm/test/Transforms/InstCombine/shift.ll | 12 +++++----- llvm/test/Transforms/InstCombine/sub.ll | 4 ++-- .../Transforms/InstCombine/udiv-simplify.ll | 6 ++--- .../InstCombine/vec_shuffle-inseltpoison.ll | 4 ++-- .../Transforms/InstCombine/vec_shuffle.ll | 4 ++-- 20 files changed, 73 insertions(+), 73 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/AArch64/VectorUtils_heuristics.ll b/llvm/test/Transforms/InstCombine/AArch64/VectorUtils_heuristics.ll index b98036693ba07..cf5fc5ed7da04 100644 --- a/llvm/test/Transforms/InstCombine/AArch64/VectorUtils_heuristics.ll +++ b/llvm/test/Transforms/InstCombine/AArch64/VectorUtils_heuristics.ll @@ -10,9 +10,9 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK: unreachable define void @novel_algorithm() { entry: - %a = call @llvm.masked.load.nxv16i8.p0(ptr undef, i32 1, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), undef) + %a = call @llvm.masked.load.nxv16i8.p0(ptr undef, i32 1, splat (i1 true), undef) %b = add undef, %a - call void @llvm.masked.store.nxv16i8.p0( %b, ptr undef, i32 1, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer)) + call void 
@llvm.masked.store.nxv16i8.p0( %b, ptr undef, i32 1, splat (i1 true)) unreachable } diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-sdiv.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-sdiv.ll index 6d91fd6bb850a..68d871355e2a2 100644 --- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-sdiv.ll +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-sdiv.ll @@ -8,7 +8,7 @@ define @sdiv_i32( %a, %pg ; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.asrd.nxv4i32( [[PG:%.*]], [[A:%.*]], i32 23) ; CHECK-NEXT: ret [[TMP1]] ; - %out = call @llvm.aarch64.sve.sdiv.nxv4i32( %pg, %a, shufflevector ( insertelement ( poison, i32 8388608, i32 0), poison, zeroinitializer)) + %out = call @llvm.aarch64.sve.sdiv.nxv4i32( %pg, %a, splat (i32 8388608)) ret %out } @@ -18,7 +18,7 @@ define @sdiv_i32_neg( %a, ; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.neg.nxv4i32( [[TMP1]], [[PG]], [[TMP1]]) ; CHECK-NEXT: ret [[TMP2]] ; - %out = call @llvm.aarch64.sve.sdiv.nxv4i32( %pg, %a, shufflevector ( insertelement ( poison, i32 -8388608, i32 0), poison, zeroinitializer)) + %out = call @llvm.aarch64.sve.sdiv.nxv4i32( %pg, %a, splat (i32 -8388608)) ret %out } @@ -27,7 +27,7 @@ define @sdiv_i64( %a, %pg ; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.asrd.nxv2i64( [[PG:%.*]], [[A:%.*]], i32 23) ; CHECK-NEXT: ret [[TMP1]] ; - %out = call @llvm.aarch64.sve.sdiv.nxv2i64( %pg, %a, shufflevector ( insertelement ( poison, i64 8388608, i64 0), poison, zeroinitializer)) + %out = call @llvm.aarch64.sve.sdiv.nxv2i64( %pg, %a, splat (i64 8388608)) ret %out } @@ -37,25 +37,25 @@ define @sdiv_i64_neg( %a, ; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.aarch64.sve.neg.nxv2i64( [[TMP1]], [[PG]], [[TMP1]]) ; CHECK-NEXT: ret [[TMP2]] ; - %out = call @llvm.aarch64.sve.sdiv.nxv2i64( %pg, %a, shufflevector ( insertelement ( poison, i64 -8388608, i64 0), poison, zeroinitializer)) + %out = call @llvm.aarch64.sve.sdiv.nxv2i64( %pg, %a, splat (i64 -8388608)) ret %out } 
define @sdiv_i32_not_base2( %a, %pg) #0 { ; CHECK-LABEL: @sdiv_i32_not_base2( -; CHECK-NEXT: [[OUT:%.*]] = call @llvm.aarch64.sve.sdiv.nxv4i32( [[PG:%.*]], [[A:%.*]], shufflevector ( insertelement ( poison, i32 8388607, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[OUT:%.*]] = call @llvm.aarch64.sve.sdiv.nxv4i32( [[PG:%.*]], [[A:%.*]], shufflevector ( insertelement ( poison, i32 8388607, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[OUT]] ; - %out = call @llvm.aarch64.sve.sdiv.nxv4i32( %pg, %a, shufflevector ( insertelement ( poison, i32 8388607, i32 0), poison, zeroinitializer)) + %out = call @llvm.aarch64.sve.sdiv.nxv4i32( %pg, %a, splat (i32 8388607)) ret %out } define @sdiv_i32_not_base2_neg( %a, %pg) #0 { ; CHECK-LABEL: @sdiv_i32_not_base2_neg( -; CHECK-NEXT: [[OUT:%.*]] = call @llvm.aarch64.sve.sdiv.nxv4i32( [[PG:%.*]], [[A:%.*]], shufflevector ( insertelement ( poison, i32 -8388607, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: [[OUT:%.*]] = call @llvm.aarch64.sve.sdiv.nxv4i32( [[PG:%.*]], [[A:%.*]], shufflevector ( insertelement ( poison, i32 -8388607, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: ret [[OUT]] ; - %out = call @llvm.aarch64.sve.sdiv.nxv4i32( %pg, %a, shufflevector ( insertelement ( poison, i32 -8388607, i32 0), poison, zeroinitializer)) + %out = call @llvm.aarch64.sve.sdiv.nxv4i32( %pg, %a, splat (i32 -8388607)) ret %out } @@ -64,7 +64,7 @@ define @sdiv_i32_not_zero( %a, @llvm.aarch64.sve.sdiv.nxv4i32( [[PG:%.*]], [[A:%.*]], zeroinitializer) ; CHECK-NEXT: ret [[OUT]] ; - %out = call @llvm.aarch64.sve.sdiv.nxv4i32( %pg, %a, shufflevector ( insertelement ( poison, i32 0, i32 0), poison, zeroinitializer)) + %out = call @llvm.aarch64.sve.sdiv.nxv4i32( %pg, %a, splat (i32 0)) ret %out } diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll index 6242fc6f528a4..522dcf8db27f4 100644 --- a/llvm/test/Transforms/InstCombine/add.ll +++ b/llvm/test/Transforms/InstCombine/add.ll @@ -2574,12 
+2574,12 @@ define i16 @add_sub_zext_constant(i8 %x) { define @add_to_or_scalable( %in) { ; CHECK-LABEL: @add_to_or_scalable( -; CHECK-NEXT: [[SHL:%.*]] = shl [[IN:%.*]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[ADD:%.*]] = or disjoint [[SHL]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[SHL:%.*]] = shl [[IN:%.*]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[ADD:%.*]] = or disjoint [[SHL]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[ADD]] ; - %shl = shl %in, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) - %add = add %shl, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %shl = shl %in, splat (i32 1) + %add = add %shl, splat (i32 1) ret %add } diff --git a/llvm/test/Transforms/InstCombine/bitcast.ll b/llvm/test/Transforms/InstCombine/bitcast.ll index 58bd81297b0dd..176b432ea0b5c 100644 --- a/llvm/test/Transforms/InstCombine/bitcast.ll +++ b/llvm/test/Transforms/InstCombine/bitcast.ll @@ -577,7 +577,7 @@ define @ScalableAll111( %in) { ; CHECK-LABEL: @ScalableAll111( ; CHECK-NEXT: ret [[IN:%.*]] ; - %out = and %in, bitcast ( shufflevector ( insertelement ( undef, i16 -1, i32 0), undef, zeroinitializer) to ) + %out = and %in, bitcast ( splat (i16 -1) to ) ret %out } diff --git a/llvm/test/Transforms/InstCombine/div.ll b/llvm/test/Transforms/InstCombine/div.ll index 80847dd588ea9..1309dee817cf6 100644 --- a/llvm/test/Transforms/InstCombine/div.ll +++ b/llvm/test/Transforms/InstCombine/div.ll @@ -1065,7 +1065,7 @@ define @sdiv_by_negconst_nxv2i8( %x) { ; CHECK-NEXT: [[DIV_NEG:%.*]] = sdiv [[X:%.*]], shufflevector ( insertelement ( poison, i8 108, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[DIV_NEG]] ; - %div = sdiv %x, shufflevector ( insertelement ( poison, i8 -108, i32 0), 
poison, zeroinitializer) + %div = sdiv %x, splat (i8 -108) %sub = sub zeroinitializer, %div ret %sub } @@ -1083,11 +1083,11 @@ define <2 x i8> @sdiv_by_minSigned_v2i8(<2 x i8> %x) { define @sdiv_by_minSigned_nxv2i8( %x) { ; CHECK-LABEL: @sdiv_by_minSigned_nxv2i8( -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq [[X:%.*]], shufflevector ( insertelement ( poison, i8 -128, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq [[X:%.*]], shufflevector ( insertelement ( poison, i8 -128, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[DIV_NEG:%.*]] = sext [[TMP1]] to ; CHECK-NEXT: ret [[DIV_NEG]] ; - %div = sdiv %x, shufflevector ( insertelement ( poison, i8 -128, i32 0), poison, zeroinitializer) + %div = sdiv %x, splat (i8 -128) %sub = sub zeroinitializer, %div ret %sub } diff --git a/llvm/test/Transforms/InstCombine/fdiv.ll b/llvm/test/Transforms/InstCombine/fdiv.ll index 448f9f8d7dd6e..a0710c2bb0484 100644 --- a/llvm/test/Transforms/InstCombine/fdiv.ll +++ b/llvm/test/Transforms/InstCombine/fdiv.ll @@ -90,7 +90,7 @@ define @exact_inverse_scalable_splat( % ; CHECK-NEXT: [[DIV:%.*]] = fmul [[X:%.*]], shufflevector ( insertelement ( poison, float 2.500000e-01, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[DIV]] ; - %div = fdiv %x, shufflevector ( insertelement ( undef, float 4.0, i64 0), undef, zeroinitializer) + %div = fdiv %x, splat (float 4.0) ret %div } diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll index fe99d7284487b..7e7373e6ef5bd 100644 --- a/llvm/test/Transforms/InstCombine/fmul.ll +++ b/llvm/test/Transforms/InstCombine/fmul.ll @@ -794,12 +794,12 @@ define <2 x float> @fmul_fadd_distribute_vec(<2 x float> %x) { define @fmul_fadd_distribute_scalablevec( %x) { ; CHECK-LABEL: @fmul_fadd_distribute_scalablevec( -; CHECK-NEXT: [[TMP1:%.*]] = fmul reassoc [[X:%.*]], shufflevector ( insertelement ( undef, float 6.000000e+03, i32 0), undef, zeroinitializer) +; CHECK-NEXT: [[TMP1:%.*]] = fmul reassoc 
[[X:%.*]], shufflevector ( insertelement ( poison, float 6.000000e+03, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[T3:%.*]] = fadd reassoc [[TMP1]], shufflevector ( insertelement ( poison, float 1.200000e+07, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[T3]] ; - %t1 = fadd shufflevector ( insertelement ( undef, float 2.0e+3, i32 0), undef, zeroinitializer), %x - %t3 = fmul reassoc %t1, shufflevector ( insertelement ( undef, float 6.0e+3, i32 0), undef, zeroinitializer) + %t1 = fadd splat (float 2.0e+3), %x + %t3 = fmul reassoc %t1, splat (float 6.0e+3) ret %t3 diff --git a/llvm/test/Transforms/InstCombine/icmp-vec.ll b/llvm/test/Transforms/InstCombine/icmp-vec.ll index 38e30334342b1..63dd535c527ef 100644 --- a/llvm/test/Transforms/InstCombine/icmp-vec.ll +++ b/llvm/test/Transforms/InstCombine/icmp-vec.ll @@ -394,11 +394,11 @@ define <2 x i1> @icmp_logical_or_vec(<2 x i64> %x, <2 x i64> %y, <2 x i1> %false define @icmp_logical_or_scalablevec( %x, %y, %falseval) { ; CHECK-LABEL: @icmp_logical_or_scalablevec( ; CHECK-NEXT: [[CMP_NE:%.*]] = icmp ne [[X:%.*]], zeroinitializer -; CHECK-NEXT: [[SEL:%.*]] = select [[CMP_NE]], shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), [[FALSEVAL:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = select [[CMP_NE]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[FALSEVAL:%.*]] ; CHECK-NEXT: ret [[SEL]] ; %cmp.ne = icmp ne %x, zeroinitializer - %sel = select %cmp.ne, shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer), %falseval + %sel = select %cmp.ne, splat (i1 true), %falseval ret %sel } diff --git a/llvm/test/Transforms/InstCombine/intrinsics.ll b/llvm/test/Transforms/InstCombine/intrinsics.ll index 3ac7c728e1984..d90b0ebd400c7 100644 --- a/llvm/test/Transforms/InstCombine/intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/intrinsics.ll @@ -129,9 +129,9 @@ define @cttz_knownbits_scalable_vec( %arg) { ; CHECK-LABEL: 
@cttz_knownbits_scalable_vec( ; CHECK-NEXT: ret zeroinitializer ; - %or = or %arg, shufflevector ( insertelement ( poison, i32 4, i32 0), poison, zeroinitializer) + %or = or %arg, splat (i32 4) %cnt = call @llvm.cttz.nxv1i32( %or, i1 true) nounwind readnone - %res = icmp eq %cnt, shufflevector ( insertelement ( poison, i32 4, i32 0), poison, zeroinitializer) + %res = icmp eq %cnt, splat (i32 4) ret %res } diff --git a/llvm/test/Transforms/InstCombine/load-store-forward.ll b/llvm/test/Transforms/InstCombine/load-store-forward.ll index aa593b95b3601..dbc68044c11a0 100644 --- a/llvm/test/Transforms/InstCombine/load-store-forward.ll +++ b/llvm/test/Transforms/InstCombine/load-store-forward.ll @@ -108,7 +108,7 @@ define i32 @load_i32_store_nxv4i32(ptr %a) { ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: - store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), ptr %a, align 16 + store splat (i32 1), ptr %a, align 16 %0 = load i32, ptr %a, align 4 ret i32 %0 } @@ -116,12 +116,12 @@ entry: define i64 @load_i64_store_nxv8i8(ptr %a) { ; CHECK-LABEL: @load_i64_store_nxv8i8( ; CHECK-NEXT: entry: -; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer), ptr [[A:%.*]], align 16 +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[A:%.*]], align 16 ; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[A]], align 8 ; CHECK-NEXT: ret i64 [[LOAD]] ; entry: - store shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer), ptr %a, align 16 + store splat (i8 1), ptr %a, align 16 %load = load i64, ptr %a, align 8 ret i64 %load } @@ -134,7 +134,7 @@ define i64 @load_i64_store_nxv4i32(ptr %a) { ; CHECK-NEXT: ret i64 [[LOAD]] ; entry: - store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), ptr %a, align 16 + store splat (i32 1), ptr %a, align 16 %load = load i64, ptr %a, align 8 ret i64 %load } @@ -147,7 +147,7 @@ define i8 
@load_i8_store_nxv4i32(ptr %a) { ; CHECK-NEXT: ret i8 [[LOAD]] ; entry: - store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), ptr %a, align 16 + store splat (i32 1), ptr %a, align 16 %load = load i8, ptr %a, align 1 ret i8 %load } @@ -160,7 +160,7 @@ define float @load_f32_store_nxv4f32(ptr %a) { ; CHECK-NEXT: ret float [[TMP0]] ; entry: - store shufflevector ( insertelement ( poison, float 1.0, i64 0), poison, zeroinitializer), ptr %a, align 16 + store splat (float 1.0), ptr %a, align 16 %0 = load float, ptr %a, align 4 ret float %0 } @@ -173,7 +173,7 @@ define i32 @load_i32_store_nxv4f32(ptr %a) { ; CHECK-NEXT: ret i32 [[LOAD]] ; entry: - store shufflevector ( insertelement ( poison, float 1.0, i64 0), poison, zeroinitializer), ptr %a, align 16 + store splat (float 1.0), ptr %a, align 16 %load = load i32, ptr %a, align 4 ret i32 %load } @@ -186,7 +186,7 @@ define <4 x i32> @load_v4i32_store_nxv4i32(ptr %a) { ; CHECK-NEXT: ret <4 x i32> [[TMP0]] ; entry: - store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), ptr %a, align 16 + store splat (i32 1), ptr %a, align 16 %0 = load <4 x i32>, ptr %a, align 16 ret <4 x i32> %0 } @@ -199,7 +199,7 @@ define <4 x i16> @load_v4i16_store_nxv4i32(ptr %a) { ; CHECK-NEXT: ret <4 x i16> [[TMP0]] ; entry: - store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), ptr %a, align 16 + store splat (i32 1), ptr %a, align 16 %0 = load <4 x i16>, ptr %a, align 16 ret <4 x i16> %0 } @@ -208,12 +208,12 @@ entry: define i64 @load_i64_store_nxv4i8(ptr %a) { ; CHECK-LABEL: @load_i64_store_nxv4i8( ; CHECK-NEXT: entry: -; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer), ptr [[A:%.*]], align 16 +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[A:%.*]], align 16 ; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[A]], align 8 ; CHECK-NEXT: ret i64 
[[LOAD]] ; entry: - store shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer), ptr %a, align 16 + store splat (i8 1), ptr %a, align 16 %load = load i64, ptr %a, align 8 ret i64 %load } @@ -228,7 +228,7 @@ define @load_nxv4i8_store_nxv4i32(ptr %a) { ; CHECK-NEXT: ret [[TMP0]] ; entry: - store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), ptr %a, align 16 + store splat (i32 1), ptr %a, align 16 %0 = load , ptr %a, align 16 ret %0 } diff --git a/llvm/test/Transforms/InstCombine/logical-select.ll b/llvm/test/Transforms/InstCombine/logical-select.ll index af1a3e1455e49..f0ea09c088474 100644 --- a/llvm/test/Transforms/InstCombine/logical-select.ll +++ b/llvm/test/Transforms/InstCombine/logical-select.ll @@ -480,7 +480,7 @@ define @vec_of_bools_scalable( %a, [[A:%.*]], [[C:%.*]], [[D:%.*]] ; CHECK-NEXT: ret [[R]] ; - %b = xor %a, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) + %b = xor %a, splat (i1 true) %t11 = and %a, %c %t12 = and %b, %d %r = or %t11, %t12 @@ -513,7 +513,7 @@ define @vec_of_casted_bools_scalable( %a, < ; CHECK-NEXT: ret [[OR]] ; %scond = sext %cond to - %notcond = xor %cond, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) + %notcond = xor %cond, splat (i1 true) %snotcond = sext %notcond to %bc1 = bitcast %scond to %bc2 = bitcast %snotcond to @@ -750,7 +750,7 @@ define @bitcast_vec_cond_scalable( %cond, < ; %s = sext %cond to %t9 = bitcast %s to - %nott9 = xor %t9, shufflevector ( insertelement ( poison, i64 -1, i32 0), poison, zeroinitializer) + %nott9 = xor %t9, splat (i64 -1) %t11 = and %nott9, %c %t12 = and %t9, %d %r = or %t11, %t12 diff --git a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll index 2704905f7a358..15ffc881b5731 100644 --- a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll @@ 
-300,7 +300,7 @@ entry: %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer %broadcast.value = insertelement poison, i16 %val, i32 0 %broadcast.splatvalue = shufflevector %broadcast.value, poison, zeroinitializer - call void @llvm.masked.scatter.nxv4i16.nxv4p0( %broadcast.splatvalue, %broadcast.splat, i32 2, shufflevector ( insertelement ( zeroinitializer , i1 true, i32 0), zeroinitializer, zeroinitializer)) + call void @llvm.masked.scatter.nxv4i16.nxv4p0( %broadcast.splatvalue, %broadcast.splat, i32 2, splat (i1 true)) ret void } @@ -336,7 +336,7 @@ entry: %broadcast.splatinsert = insertelement poison, ptr %dst, i32 0 %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer %wide.load = load , ptr %src, align 2 - call void @llvm.masked.scatter.nxv4i16.nxv4p0( %wide.load, %broadcast.splat, i32 2, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) + call void @llvm.masked.scatter.nxv4i16.nxv4p0( %wide.load, %broadcast.splat, i32 2, splat (i1 true)) ret void } @@ -389,7 +389,7 @@ define @gather_nxv2i64_uniform_ptrs_all_active_mask(ptr %src) ; %broadcast.splatinsert = insertelement poison, ptr %src, i32 0 %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer - %res = call @llvm.masked.gather.nxv2i64( %broadcast.splat, i32 8, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer), undef) + %res = call @llvm.masked.gather.nxv2i64( %broadcast.splat, i32 8, splat (i1 true), undef) ret %res } diff --git a/llvm/test/Transforms/InstCombine/mul-masked-bits.ll b/llvm/test/Transforms/InstCombine/mul-masked-bits.ll index bc0bee8fc6039..da7cc2db09781 100644 --- a/llvm/test/Transforms/InstCombine/mul-masked-bits.ll +++ b/llvm/test/Transforms/InstCombine/mul-masked-bits.ll @@ -86,7 +86,7 @@ define @combine_mul_self_demandedbits_vector2( %x %2 = mul %1, %1 - %3 = and %2, shufflevector ( insertelement ( poison, i32 -3, i32 0), poison, 
zeroinitializer) + %3 = and %2, splat (i32 -3) ret %3 } diff --git a/llvm/test/Transforms/InstCombine/rem-mul-shl.ll b/llvm/test/Transforms/InstCombine/rem-mul-shl.ll index 75f4d3f4fb07d..9e2df157c2c85 100644 --- a/llvm/test/Transforms/InstCombine/rem-mul-shl.ll +++ b/llvm/test/Transforms/InstCombine/rem-mul-shl.ll @@ -35,8 +35,8 @@ define @urem_XY_XZ_with_CY_rem_CZ_eq_0_scalable( zeroinitializer ; - %BO0 = mul nuw %X, shufflevector( insertelement( poison, i8 15, i64 0) , poison, zeroinitializer) - %BO1 = mul %X, shufflevector( insertelement( poison, i8 5, i64 0) , poison, zeroinitializer) + %BO0 = mul nuw %X, splat (i8 15) + %BO1 = mul %X, splat (i8 5) %r = urem %BO0, %BO1 ret %r } @@ -253,8 +253,8 @@ define @srem_XY_XZ_with_CY_rem_CZ_eq_0_scalable( zeroinitializer ; - %BO0 = mul nsw %X, shufflevector( insertelement( poison, i8 15, i64 0) , poison, zeroinitializer) - %BO1 = mul %X, shufflevector( insertelement( poison, i8 5, i64 0) , poison, zeroinitializer) + %BO0 = mul nsw %X, splat (i8 15) + %BO1 = mul %X, splat (i8 5) %r = srem %BO0, %BO1 ret %r } diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index c5f1b77c6d740..82baf05977dbb 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -3400,11 +3400,11 @@ define i32 @select_cond_not_cond_cond2(i1 %cond) { ; scalable vector splat ConstantExprs. 
define @and_constant_select_svec( %x, %cond) { ; CHECK-LABEL: @and_constant_select_svec( -; CHECK-NEXT: [[A:%.*]] = and [[X:%.*]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[A:%.*]] = and [[X:%.*]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[B:%.*]] = select [[COND:%.*]], [[A]], [[X]] ; CHECK-NEXT: ret [[B]] ; - %a = and %x, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) + %a = and %x, splat (i32 1) %b = select %cond, %a, %x ret %b } @@ -3412,23 +3412,23 @@ define @and_constant_select_svec( %x, @scalable_sign_bits( %x) { ; CHECK-LABEL: @scalable_sign_bits( ; CHECK-NEXT: [[A:%.*]] = sext [[X:%.*]] to -; CHECK-NEXT: [[B:%.*]] = shl nsw [[A]], shufflevector ( insertelement ( poison, i32 16, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[B:%.*]] = shl nsw [[A]], shufflevector ( insertelement ( poison, i32 16, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[B]] ; %a = sext %x to - %b = shl %a, shufflevector ( insertelement ( poison, i32 16, i32 0), poison, zeroinitializer) + %b = shl %a, splat (i32 16) ret %b } define @scalable_non_zero( %x) { ; CHECK-LABEL: @scalable_non_zero( -; CHECK-NEXT: [[A:%.*]] = or [[X:%.*]], shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[A:%.*]] = or [[X:%.*]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[CMP:%.*]] = icmp ule [[A]], shufflevector ( insertelement ( poison, i32 56, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[CMP]] ; - %a = or %x, shufflevector ( insertelement ( poison, i32 1, i32 0), poison, zeroinitializer) - %b = add %a, shufflevector ( insertelement ( poison, i32 -1, i32 0), poison, zeroinitializer) - %cmp = icmp ult %b, shufflevector ( insertelement ( poison, i32 56, i32 0), poison, zeroinitializer) + %a = or %x, splat (i32 1) + %b = add %a, splat (i32 -1) + %cmp = icmp 
ult %b, splat (i32 56) ret %cmp } diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll index d783adbe93863..62f32c2868371 100644 --- a/llvm/test/Transforms/InstCombine/shift.ll +++ b/llvm/test/Transforms/InstCombine/shift.ll @@ -1361,11 +1361,11 @@ define <2 x i8> @ashr_demanded_bits_splat(<2 x i8> %x) { define @ashr_demanded_bits_splat2( %x) { ; CHECK-LABEL: @ashr_demanded_bits_splat2( -; CHECK-NEXT: [[SHR:%.*]] = ashr [[X:%.*]], shufflevector ( insertelement ( poison, i8 7, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[SHR:%.*]] = ashr [[X:%.*]], shufflevector ( insertelement ( poison, i8 7, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[SHR]] ; - %and = and %x, shufflevector ( insertelement ( poison, i8 128, i32 0), poison, zeroinitializer) - %shr = ashr %and, shufflevector ( insertelement ( poison, i8 7, i32 0), poison, zeroinitializer) + %and = and %x, splat (i8 128) + %shr = ashr %and, splat (i8 7) ret %shr } @@ -1381,11 +1381,11 @@ define <2 x i8> @lshr_demanded_bits_splat(<2 x i8> %x) { define @lshr_demanded_bits_splat2( %x) { ; CHECK-LABEL: @lshr_demanded_bits_splat2( -; CHECK-NEXT: [[SHR:%.*]] = lshr [[X:%.*]], shufflevector ( insertelement ( poison, i8 7, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[SHR:%.*]] = lshr [[X:%.*]], shufflevector ( insertelement ( poison, i8 7, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[SHR]] ; - %and = and %x, shufflevector ( insertelement ( poison, i8 128, i32 0), poison, zeroinitializer) - %shr = lshr %and, shufflevector ( insertelement ( poison, i8 7, i32 0), poison, zeroinitializer) + %and = and %x, splat (i8 128) + %shr = lshr %and, splat (i8 7) ret %shr } diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll index 494cdf62c7975..76cd7ab5c10cd 100644 --- a/llvm/test/Transforms/InstCombine/sub.ll +++ b/llvm/test/Transforms/InstCombine/sub.ll @@ -844,7 +844,7 @@ define @test44scalablevec( %x) { ; CHECK-NEXT: 
[[SUB:%.*]] = add nsw [[X:%.*]], shufflevector ( insertelement ( poison, i32 -32768, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[SUB]] ; - %sub = sub nsw %x, shufflevector ( insertelement ( undef, i32 32768, i32 0), undef, zeroinitializer) + %sub = sub nsw %x, splat (i32 32768) ret %sub } @@ -864,7 +864,7 @@ define @test44scalablevecminval( %x) { ; CHECK-NEXT: [[SUB:%.*]] = add [[X:%.*]], shufflevector ( insertelement ( poison, i16 -32768, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[SUB]] ; - %sub = sub nsw %x, shufflevector ( insertelement ( undef, i16 -32768, i32 0), undef, zeroinitializer) + %sub = sub nsw %x, splat (i16 -32768) ret %sub } diff --git a/llvm/test/Transforms/InstCombine/udiv-simplify.ll b/llvm/test/Transforms/InstCombine/udiv-simplify.ll index 41a4e642b4e83..bd6e5efc05f18 100644 --- a/llvm/test/Transforms/InstCombine/udiv-simplify.ll +++ b/llvm/test/Transforms/InstCombine/udiv-simplify.ll @@ -157,11 +157,11 @@ define i8 @udiv_exact_demanded_low_bits_clear(i8 %a) { define @udiv_demanded3( %a) { ; CHECK-LABEL: @udiv_demanded3( -; CHECK-NEXT: [[U:%.*]] = udiv [[A:%.*]], shufflevector ( insertelement ( poison, i32 12, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[U:%.*]] = udiv [[A:%.*]], shufflevector ( insertelement ( poison, i32 12, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[U]] ; - %o = or %a, shufflevector ( insertelement ( poison, i32 3, i32 0), poison, zeroinitializer) - %u = udiv %o, shufflevector ( insertelement ( poison, i32 12, i32 0), poison, zeroinitializer) + %o = or %a, splat (i32 3) + %u = udiv %o, splat (i32 12) ret %u } diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll index 81f0a966dbc96..ef085d3e7b50b 100644 --- a/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/vec_shuffle-inseltpoison.ll @@ -1468,14 +1468,14 @@ define <4 x i32> @splat_assoc_add(<4 x i32> %x, <4 x 
i32> %y) { define @vsplat_assoc_add( %x, %y) { ; CHECK-LABEL: @vsplat_assoc_add( -; CHECK-NEXT: [[TMP1:%.*]] = add [[X:%.*]], shufflevector ( insertelement ( undef, i32 317426, i32 0), undef, zeroinitializer) +; CHECK-NEXT: [[TMP1:%.*]] = add [[X:%.*]], shufflevector ( insertelement ( poison, i32 317426, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer ; CHECK-NEXT: [[R:%.*]] = add [[TMP2]], [[Y:%.*]] ; CHECK-NEXT: ret [[R]] ; %splatx = shufflevector %x, poison, zeroinitializer - %a = add %y, shufflevector ( insertelement ( undef, i32 317426, i32 0), undef, zeroinitializer) + %a = add %y, splat (i32 317426) %r = add %splatx, %a ret %r } diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle.ll b/llvm/test/Transforms/InstCombine/vec_shuffle.ll index 250a175ad0ebe..919e30f672e44 100644 --- a/llvm/test/Transforms/InstCombine/vec_shuffle.ll +++ b/llvm/test/Transforms/InstCombine/vec_shuffle.ll @@ -1478,14 +1478,14 @@ define <4 x i32> @splat_assoc_add(<4 x i32> %x, <4 x i32> %y) { define @vsplat_assoc_add( %x, %y) { ; CHECK-LABEL: @vsplat_assoc_add( -; CHECK-NEXT: [[TMP1:%.*]] = add [[X:%.*]], shufflevector ( insertelement ( undef, i32 317426, i32 0), undef, zeroinitializer) +; CHECK-NEXT: [[TMP1:%.*]] = add [[X:%.*]], shufflevector ( insertelement ( poison, i32 317426, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer ; CHECK-NEXT: [[R:%.*]] = add [[TMP2]], [[Y:%.*]] ; CHECK-NEXT: ret [[R]] ; %splatx = shufflevector %x, undef, zeroinitializer - %a = add %y, shufflevector ( insertelement ( undef, i32 317426, i32 0), undef, zeroinitializer) + %a = add %y, splat (i32 317426) %r = add %splatx, %a ret %r } From 6a17929e9fe5c5558feadfa2dcc95d27b6a720af Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Thu, 15 Feb 2024 11:16:46 +0000 Subject: [PATCH 435/546] [LLVM][tests/Transforms/InstSimplify] Convert instances of ConstantExpr based splats to use 
splat(). This is mostly NFC but some output does change due to consistently inserting into poison rather than undef and using i64 as the index type for inserts. --- .../ConstProp/vscale-inseltpoison.ll | 2 +- .../InstSimplify/ConstProp/vscale.ll | 2 +- .../InstSimplify/cmp-vec-fast-path.ll | 52 +++++++++---------- llvm/test/Transforms/InstSimplify/fp-nan.ll | 20 +++---- llvm/test/Transforms/InstSimplify/shift.ll | 2 +- .../InstSimplify/vscale-inseltpoison.ll | 2 +- llvm/test/Transforms/InstSimplify/vscale.ll | 2 +- 7 files changed, 41 insertions(+), 41 deletions(-) diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll index 63b9e12abfde5..e1c46a653f8df 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll @@ -45,7 +45,7 @@ define @sub_splat() { ; CHECK-LABEL: @sub_splat( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i32 -16, i64 0), poison, zeroinitializer) ; - %r = sub zeroinitializer, shufflevector ( insertelement ( undef, i32 16, i32 0), undef, zeroinitializer) + %r = sub zeroinitializer, splat (i32 16) ret %r } diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll index 585e736650b78..8ace257a8caec 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll @@ -45,7 +45,7 @@ define @sub_splat() { ; CHECK-LABEL: @sub_splat( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i32 -16, i64 0), poison, zeroinitializer) ; - %r = sub zeroinitializer, shufflevector ( insertelement ( undef, i32 16, i32 0), undef, zeroinitializer) + %r = sub zeroinitializer, splat (i32 16) ret %r } diff --git a/llvm/test/Transforms/InstSimplify/cmp-vec-fast-path.ll b/llvm/test/Transforms/InstSimplify/cmp-vec-fast-path.ll index 
a07e6fae5d4f4..169ecc3c2d37e 100644 --- a/llvm/test/Transforms/InstSimplify/cmp-vec-fast-path.ll +++ b/llvm/test/Transforms/InstSimplify/cmp-vec-fast-path.ll @@ -31,7 +31,7 @@ define @i32cmp_eq_scalable_one() { ; CHECK-LABEL: @i32cmp_eq_scalable_one( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; - %res = icmp eq shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) + %res = icmp eq splat (i32 1), splat (i32 1) ret %res } @@ -63,7 +63,7 @@ define @i32cmp_ne_scalable_one() { ; CHECK-LABEL: @i32cmp_ne_scalable_one( ; CHECK-NEXT: ret zeroinitializer ; - %res = icmp ne shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) + %res = icmp ne splat (i32 1), splat (i32 1) ret %res } @@ -95,7 +95,7 @@ define @i32cmp_ugt_scalable_one() { ; CHECK-LABEL: @i32cmp_ugt_scalable_one( ; CHECK-NEXT: ret zeroinitializer ; - %res = icmp ugt shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) + %res = icmp ugt splat (i32 1), splat (i32 1) ret %res } @@ -127,7 +127,7 @@ define @i32cmp_uge_scalable_one() { ; CHECK-LABEL: @i32cmp_uge_scalable_one( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; - %res = icmp uge shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) + %res = icmp uge splat (i32 1), splat (i32 1) ret %res } @@ -159,7 +159,7 @@ define @i32cmp_ult_scalable_one() { ; CHECK-LABEL: @i32cmp_ult_scalable_one( ; CHECK-NEXT: ret zeroinitializer ; - %res = icmp ult shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer), shufflevector ( insertelement 
( undef, i32 1, i32 0), undef, zeroinitializer) + %res = icmp ult splat (i32 1), splat (i32 1) ret %res } @@ -191,7 +191,7 @@ define @i32cmp_ule_scalable_one() { ; CHECK-LABEL: @i32cmp_ule_scalable_one( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; - %res = icmp ule shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) + %res = icmp ule splat (i32 1), splat (i32 1) ret %res } @@ -223,7 +223,7 @@ define @i32cmp_sgt_scalable_one() { ; CHECK-LABEL: @i32cmp_sgt_scalable_one( ; CHECK-NEXT: ret zeroinitializer ; - %res = icmp sgt shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) + %res = icmp sgt splat (i32 1), splat (i32 1) ret %res } @@ -255,7 +255,7 @@ define @i32cmp_sge_scalable_one() { ; CHECK-LABEL: @i32cmp_sge_scalable_one( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; - %res = icmp sge shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) + %res = icmp sge splat (i32 1), splat (i32 1) ret %res } @@ -287,7 +287,7 @@ define @i32cmp_slt_scalable_one() { ; CHECK-LABEL: @i32cmp_slt_scalable_one( ; CHECK-NEXT: ret zeroinitializer ; - %res = icmp slt shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) + %res = icmp slt splat (i32 1), splat (i32 1) ret %res } @@ -319,7 +319,7 @@ define @i32cmp_sle_scalable_one() { ; CHECK-LABEL: @i32cmp_sle_scalable_one( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; - %res = icmp sle shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer), 
shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) + %res = icmp sle splat (i32 1), splat (i32 1) ret %res } @@ -351,7 +351,7 @@ define @floatcmp_false_scalable_one() { ; CHECK-LABEL: @floatcmp_false_scalable_one( ; CHECK-NEXT: ret zeroinitializer ; - %res = fcmp false shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer) + %res = fcmp false splat (float 1.0), splat (float 1.0) ret %res } @@ -383,7 +383,7 @@ define @floatcmp_oeq_scalable_one() { ; CHECK-LABEL: @floatcmp_oeq_scalable_one( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; - %res = fcmp oeq shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer) + %res = fcmp oeq splat (float 1.0), splat (float 1.0) ret %res } @@ -415,7 +415,7 @@ define @floatcmp_ogt_scalable_one() { ; CHECK-LABEL: @floatcmp_ogt_scalable_one( ; CHECK-NEXT: ret zeroinitializer ; - %res = fcmp ogt shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer) + %res = fcmp ogt splat (float 1.0), splat (float 1.0) ret %res } @@ -447,7 +447,7 @@ define @floatcmp_oge_scalable_one() { ; CHECK-LABEL: @floatcmp_oge_scalable_one( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; - %res = fcmp oge shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer) + %res = fcmp oge splat (float 1.0), splat (float 1.0) ret %res } @@ -479,7 +479,7 @@ define @floatcmp_olt_scalable_one() { ; CHECK-LABEL: @floatcmp_olt_scalable_one( ; CHECK-NEXT: ret zeroinitializer ; - %res = fcmp olt shufflevector ( 
insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer) + %res = fcmp olt splat (float 1.0), splat (float 1.0) ret %res } @@ -511,7 +511,7 @@ define @floatcmp_ole_scalable_one() { ; CHECK-LABEL: @floatcmp_ole_scalable_one( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; - %res = fcmp ole shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer) + %res = fcmp ole splat (float 1.0), splat (float 1.0) ret %res } @@ -543,7 +543,7 @@ define @floatcmp_one_scalable_one() { ; CHECK-LABEL: @floatcmp_one_scalable_one( ; CHECK-NEXT: ret zeroinitializer ; - %res = fcmp one shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer) + %res = fcmp one splat (float 1.0), splat (float 1.0) ret %res } @@ -575,7 +575,7 @@ define @floatcmp_ord_scalable_one() { ; CHECK-LABEL: @floatcmp_ord_scalable_one( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; - %res = fcmp ord shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer) + %res = fcmp ord splat (float 1.0), splat (float 1.0) ret %res } @@ -607,7 +607,7 @@ define @floatcmp_ueq_scalable_one() { ; CHECK-LABEL: @floatcmp_ueq_scalable_one( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; - %res = fcmp ueq shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer) + %res = fcmp ueq splat (float 1.0), splat (float 1.0) ret %res } @@ -639,7 +639,7 @@ define 
@floatcmp_ugt_scalable_one() { ; CHECK-LABEL: @floatcmp_ugt_scalable_one( ; CHECK-NEXT: ret zeroinitializer ; - %res = fcmp ugt shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer) + %res = fcmp ugt splat (float 1.0), splat (float 1.0) ret %res } @@ -671,7 +671,7 @@ define @floatcmp_uge_scalable_one() { ; CHECK-LABEL: @floatcmp_uge_scalable_one( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; - %res = fcmp uge shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer) + %res = fcmp uge splat (float 1.0), splat (float 1.0) ret %res } @@ -703,7 +703,7 @@ define @floatcmp_ult_scalable_one() { ; CHECK-LABEL: @floatcmp_ult_scalable_one( ; CHECK-NEXT: ret zeroinitializer ; - %res = fcmp ult shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer) + %res = fcmp ult splat (float 1.0), splat (float 1.0) ret %res } @@ -735,7 +735,7 @@ define @floatcmp_ule_scalable_one() { ; CHECK-LABEL: @floatcmp_ule_scalable_one( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; - %res = fcmp ule shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer) + %res = fcmp ule splat (float 1.0), splat (float 1.0) ret %res } @@ -767,7 +767,7 @@ define @floatcmp_une_scalable_one() { ; CHECK-LABEL: @floatcmp_une_scalable_one( ; CHECK-NEXT: ret zeroinitializer ; - %res = fcmp une shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer) + %res = fcmp une splat (float 
1.0), splat (float 1.0) ret %res } @@ -799,7 +799,7 @@ define @floatcmp_uno_scalable_one() { ; CHECK-LABEL: @floatcmp_uno_scalable_one( ; CHECK-NEXT: ret zeroinitializer ; - %res = fcmp uno shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer) + %res = fcmp uno splat (float 1.0), splat (float 1.0) ret %res } @@ -831,7 +831,7 @@ define @floatcmp_true_scalable_one() { ; CHECK-LABEL: @floatcmp_true_scalable_one( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; - %res = fcmp true shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer), shufflevector ( insertelement ( undef, float 1.0, i32 0), undef, zeroinitializer) + %res = fcmp true splat (float 1.0), splat (float 1.0) ret %res } diff --git a/llvm/test/Transforms/InstSimplify/fp-nan.ll b/llvm/test/Transforms/InstSimplify/fp-nan.ll index fad7a0593e372..cb0bed3790782 100644 --- a/llvm/test/Transforms/InstSimplify/fp-nan.ll +++ b/llvm/test/Transforms/InstSimplify/fp-nan.ll @@ -53,7 +53,7 @@ define @fsub_nan_op1_scalable_vec_0( %x ; CHECK-LABEL: @fsub_nan_op1_scalable_vec_0( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, float 0x7FF9000000000000, i64 0), poison, zeroinitializer) ; - %r = fsub %x, shufflevector ( insertelement ( poison, float 0x7FF1000000000000, i64 0), poison, zeroinitializer) + %r = fsub %x, splat (float 0x7FF1000000000000) ret %r } @@ -61,7 +61,7 @@ define @fsub_nan_op1_scalable_vec_1( %x ; CHECK-LABEL: @fsub_nan_op1_scalable_vec_1( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, float 0xFFF9000000000000, i64 0), poison, zeroinitializer) ; - %r = fsub %x, shufflevector ( insertelement ( poison, float 0xFFF1000000000000, i64 0), poison, zeroinitializer) + %r = fsub %x, splat (float 0xFFF1000000000000) ret %r } @@ -87,7 +87,7 @@ define @fmul_nan_op0_scalable_vec_0( ; CHECK-LABEL: 
@fmul_nan_op0_scalable_vec_0( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, double 0xFFF8000000000001, i64 0), poison, zeroinitializer) ; - %r = fmul shufflevector ( insertelement ( poison, double 0xFFF0000000000001, i64 0), poison, zeroinitializer), %x + %r = fmul splat (double 0xFFF0000000000001), %x ret %r } @@ -95,7 +95,7 @@ define @fmul_nan_op0_scalable_vec_1( ; CHECK-LABEL: @fmul_nan_op0_scalable_vec_1( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, double 0xFFF8DEADDEADDEAD, i64 0), poison, zeroinitializer) ; - %r = fmul shufflevector ( insertelement ( poison, double 0xFFF0DEADDEADDEAD, i64 0), poison, zeroinitializer), %x + %r = fmul splat (double 0xFFF0DEADDEADDEAD), %x ret %r } @@ -113,7 +113,7 @@ define @fmul_nan_op1_scalable_vec( %x ; CHECK-LABEL: @fmul_nan_op1_scalable_vec( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, double 0x7FF8000000000000, i64 0), poison, zeroinitializer) ; - %r = fmul %x, shufflevector ( insertelement ( poison, double 0x7FF8000000000000, i64 0), poison, zeroinitializer) + %r = fmul %x, splat (double 0x7FF8000000000000) ret %r } @@ -131,7 +131,7 @@ define @fdivl_nan_op0_scalable_vec( % ; CHECK-LABEL: @fdivl_nan_op0_scalable_vec( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, double 0xFFF800000000000F, i64 0), poison, zeroinitializer) ; - %r = fdiv shufflevector ( insertelement ( poison, double 0xFFF800000000000F, i64 0), poison, zeroinitializer), %x + %r = fdiv splat (double 0xFFF800000000000F), %x ret %r } @@ -149,7 +149,7 @@ define @fdiv_nan_op1_scalable_vec( %x) { ; CHECK-LABEL: @fdiv_nan_op1_scalable_vec( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, half 0xH7FFF, i64 0), poison, zeroinitializer) ; - %r = fdiv %x, shufflevector ( insertelement ( poison, half 0xH7FFF, i64 0), poison, zeroinitializer) + %r = fdiv %x, splat (half 0xH7FFF) ret %r } @@ -221,7 +221,7 @@ define @fneg_nan_2_scalable_vec() { ; CHECK-LABEL: @fneg_nan_2_scalable_vec( ; CHECK-NEXT: ret 
shufflevector ( insertelement ( poison, double 0xFFF9234567890ABC, i64 0), poison, zeroinitializer) ; - %r = fsub shufflevector ( insertelement ( poison, double -0.0, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, double 0xFFF1234567890ABC, i64 0), poison, zeroinitializer) + %r = fsub splat (double -0.0), splat (double 0xFFF1234567890ABC) ret %r } @@ -239,7 +239,7 @@ define @unary_fneg_nan_2_scalable_vec_0() { ; CHECK-NEXT: [[R:%.*]] = fneg shufflevector ( insertelement ( poison, double 0xFFF1234567890ABC, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[R]] ; - %r = fneg shufflevector ( insertelement ( poison, double 0xFFF1234567890ABC, i64 0), poison, zeroinitializer) + %r = fneg splat (double 0xFFF1234567890ABC) ret %r } @@ -249,7 +249,7 @@ define @unary_fneg_nan_2_scalable_vec_1() { ; CHECK-NEXT: [[R:%.*]] = fneg shufflevector ( insertelement ( poison, double 0x7FF0000000000001, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[R]] ; - %r = fneg shufflevector ( insertelement ( poison, double 0x7FF0000000000001, i64 0), poison, zeroinitializer) + %r = fneg splat (double 0x7FF0000000000001) ret %r } diff --git a/llvm/test/Transforms/InstSimplify/shift.ll b/llvm/test/Transforms/InstSimplify/shift.ll index 83751765df839..b562c3c164d52 100644 --- a/llvm/test/Transforms/InstSimplify/shift.ll +++ b/llvm/test/Transforms/InstSimplify/shift.ll @@ -347,7 +347,7 @@ define @lshr_scalable_overshift( %va) { ; CHECK-LABEL: @lshr_scalable_overshift( ; CHECK-NEXT: ret poison ; - %vc = lshr %va, shufflevector ( insertelement ( poison, i16 16, i32 0), poison, zeroinitializer) + %vc = lshr %va, splat (i16 16) ret %vc } diff --git a/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll index 5b34f417e4093..bd9650bd2d081 100644 --- a/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll +++ b/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll @@ -134,7 +134,7 @@ define 
@cmp_le_smax_always_true( %x) { ; CHECK-LABEL: @cmp_le_smax_always_true( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; - %cmp = icmp sle %x, shufflevector ( insertelement ( poison, i64 9223372036854775807, i32 0), poison, zeroinitializer) + %cmp = icmp sle %x, splat (i64 9223372036854775807) ret %cmp } diff --git a/llvm/test/Transforms/InstSimplify/vscale.ll b/llvm/test/Transforms/InstSimplify/vscale.ll index 66f629aac3a52..768c5f4ba9ea7 100644 --- a/llvm/test/Transforms/InstSimplify/vscale.ll +++ b/llvm/test/Transforms/InstSimplify/vscale.ll @@ -146,7 +146,7 @@ define @cmp_le_smax_always_true( %x) { ; CHECK-LABEL: @cmp_le_smax_always_true( ; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; - %cmp = icmp sle %x, shufflevector ( insertelement ( poison, i64 9223372036854775807, i32 0), poison, zeroinitializer) + %cmp = icmp sle %x, splat (i64 9223372036854775807) ret %cmp } From 900bea9b1ce095123c03e5bb8834d8fb168378a8 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Thu, 15 Feb 2024 11:49:19 +0000 Subject: [PATCH 436/546] [LLVM][test] Convert remaining instances of ConstantExpr based splats to use splat(). This is mostly NFC but some output does change due to consistently inserting into poison rather than undef and using i64 as the index type for inserts. 
--- .../Analysis/CostModel/AArch64/ext-rhadd.ll | 28 +-- .../CodeGen/Generic/expand-vp-load-store.ll | 8 +- .../AggressiveInstCombine/vector-or-load.ll | 4 +- .../Transforms/MemCpyOpt/vscale-crashes.ll | 4 +- .../RISCV/vpintrin-scalarization.ll | 220 +++++++++--------- 5 files changed, 132 insertions(+), 132 deletions(-) diff --git a/llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll b/llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll index 94a37d944fc22..34791e876f113 100644 --- a/llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll +++ b/llvm/test/Analysis/CostModel/AArch64/ext-rhadd.ll @@ -37,9 +37,9 @@ define void @srhadd_i8_sext_i16_scalable(ptr %a, ptr %b, ptr %dst) { %ld2 = load , ptr %b %ext1 = sext %ld1 to %ext2 = sext %ld2 to - %add1 = add nuw nsw %ext1, shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) + %add1 = add nuw nsw %ext1, splat (i16 1) %add2 = add nuw nsw %add1, %ext2 - %shr = lshr %add2, shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) + %shr = lshr %add2, splat (i16 1) %trunc = trunc %shr to store %trunc, ptr %a ret void @@ -58,9 +58,9 @@ define void @srhadd_i16_sext_i64_scalable(ptr %a, ptr %b, ptr %dst) { %ld2 = load , ptr %b %ext1 = sext %ld1 to %ext2 = sext %ld2 to - %add1 = add nuw nsw %ext1, shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) + %add1 = add nuw nsw %ext1, splat (i64 1) %add2 = add nuw nsw %add1, %ext2 - %shr = lshr %add2, shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) + %shr = lshr %add2, splat (i64 1) %trunc = trunc %shr to store %trunc, ptr %a ret void @@ -102,9 +102,9 @@ define void @urhadd_i8_zext_i64(ptr %a, ptr %b, ptr %dst) { %ld2 = load , ptr %b %ext1 = zext %ld1 to %ext2 = zext %ld2 to - %add1 = add nuw nsw %ext1, shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) + %add1 = add nuw nsw %ext1, splat (i64 1) %add2 = add nuw nsw %add1, %ext2 - %shr = lshr %add2, 
shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) + %shr = lshr %add2, splat (i64 1) %trunc = trunc %shr to store %trunc, ptr %a ret void @@ -123,9 +123,9 @@ define void @urhadd_i16_zext_i32(ptr %a, ptr %b, ptr %dst) { %ld2 = load , ptr %b %ext1 = zext %ld1 to %ext2 = zext %ld2 to - %add1 = add nuw nsw %ext1, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + %add1 = add nuw nsw %ext1, splat (i32 1) %add2 = add nuw nsw %add1, %ext2 - %shr = lshr %add2, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + %shr = lshr %add2, splat (i32 1) %trunc = trunc %shr to store %trunc, ptr %a ret void @@ -146,9 +146,9 @@ define void @ext_operand_mismatch(ptr %a, ptr %b, ptr %dst) { %ld2 = load , ptr %b %ext1 = sext %ld1 to %ext2 = zext %ld2 to - %add1 = add nuw nsw %ext1, shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) + %add1 = add nuw nsw %ext1, splat (i16 1) %add2 = add nuw nsw %add1, %ext2 - %shr = lshr %add2, shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) + %shr = lshr %add2, splat (i16 1) %trunc = trunc %shr to store %trunc, ptr %a ret void @@ -167,9 +167,9 @@ define void @add_multiple_uses(ptr %a, ptr %b, ptr %dst) { %ld2 = load , ptr %b %ext1 = sext %ld1 to %ext2 = sext %ld2 to - %add1 = add nuw nsw %ext1, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + %add1 = add nuw nsw %ext1, splat (i32 1) %add2 = add nuw nsw %add1, %ext2 - %shr = lshr %add2, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + %shr = lshr %add2, splat (i32 1) %trunc = trunc %shr to %add.res = add nuw nsw %add1, %add2 %res = trunc %add.res to @@ -190,9 +190,9 @@ define void @shift_multiple_uses(ptr %a, ptr %b, ptr %dst) { %ld2 = load , ptr %b %ext1 = zext %ld1 to %ext2 = zext %ld2 to - %add1 = add nuw nsw %ext1, shufflevector ( insertelement ( poison, i16 1, i64 0), poison, 
zeroinitializer) + %add1 = add nuw nsw %ext1, splat (i16 1) %add2 = add nuw nsw %add1, %ext2 - %shr = lshr %add2, shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) + %shr = lshr %add2, splat (i16 1) %trunc = trunc %shr to %add3 = add nuw nsw %shr, %add2 %res = trunc %add3 to diff --git a/llvm/test/CodeGen/Generic/expand-vp-load-store.ll b/llvm/test/CodeGen/Generic/expand-vp-load-store.ll index 8984d020f99ed..5c6f1e858ce7c 100644 --- a/llvm/test/CodeGen/Generic/expand-vp-load-store.ll +++ b/llvm/test/CodeGen/Generic/expand-vp-load-store.ll @@ -127,7 +127,7 @@ define @vpload_nxv1i64_allones_mask(ptr %ptr, i32 zeroext %ev ; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.masked.load.nxv1i64.p0(ptr [[PTR:%.*]], i32 1, [[TMP2]], poison) ; CHECK-NEXT: ret [[TMP3]] ; - %load = call @llvm.vp.load.nxv1i64.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %evl) + %load = call @llvm.vp.load.nxv1i64.p0(ptr %ptr, splat (i1 true), i32 %evl) ret %load } @@ -140,7 +140,7 @@ define @vpload_nxv1i64_allones_mask_vscale(ptr %ptr) { ; %vscale = call i32 @llvm.vscale.i32() %vlmax = mul nuw i32 %vscale, 1 - %load = call @llvm.vp.load.nxv1i64.p0(ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %vlmax) + %load = call @llvm.vp.load.nxv1i64.p0(ptr %ptr, splat (i1 true), i32 %vlmax) ret %load } @@ -179,7 +179,7 @@ define void @vpstore_nxv1i64_allones_mask( %val, ptr %ptr, i32 ; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0( [[VAL:%.*]], ptr [[PTR:%.*]], i32 1, [[TMP2]]) ; CHECK-NEXT: ret void ; - call void @llvm.vp.store.nxv1i64.p0( %val, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %evl) + call void @llvm.vp.store.nxv1i64.p0( %val, ptr %ptr, splat (i1 true), i32 %evl) ret void } @@ -192,7 +192,7 @@ define void @vpstore_nxv1i64_allones_mask_vscale( %val, ptr %p ; %vscale = call i32 @llvm.vscale.i32() %vlmax = mul nuw 
i32 %vscale, 1 - call void @llvm.vp.store.nxv1i64.p0( %val, ptr %ptr, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %vlmax) + call void @llvm.vp.store.nxv1i64.p0( %val, ptr %ptr, splat (i1 true), i32 %vlmax) ret void } diff --git a/llvm/test/Transforms/AggressiveInstCombine/vector-or-load.ll b/llvm/test/Transforms/AggressiveInstCombine/vector-or-load.ll index f7d48ce1e099c..170fde4ef7002 100644 --- a/llvm/test/Transforms/AggressiveInstCombine/vector-or-load.ll +++ b/llvm/test/Transforms/AggressiveInstCombine/vector-or-load.ll @@ -35,7 +35,7 @@ define @or-load-scalable-vector(ptr %p1) { ; CHECK-NEXT: [[L2:%.*]] = load , ptr [[P2]], align 1 ; CHECK-NEXT: [[E1:%.*]] = zext [[L1]] to ; CHECK-NEXT: [[E2:%.*]] = zext [[L2]] to -; CHECK-NEXT: [[S2:%.*]] = shl [[E2]], shufflevector ( insertelement ( poison, i16 8, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[S2:%.*]] = shl [[E2]], shufflevector ( insertelement ( poison, i16 8, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[OR:%.*]] = or [[E1]], [[S2]] ; CHECK-NEXT: ret [[OR]] ; @@ -44,7 +44,7 @@ define @or-load-scalable-vector(ptr %p1) { %l2 = load , ptr %p2, align 1 %e1 = zext %l1 to %e2 = zext %l2 to - %s2 = shl %e2, shufflevector ( insertelement ( poison, i16 8, i32 0), poison, zeroinitializer) + %s2 = shl %e2, splat (i16 8) %or = or %e1, %s2 ret %or } diff --git a/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll b/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll index adab2238ab5b2..dee0af0cc09e0 100644 --- a/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll +++ b/llvm/test/Transforms/MemCpyOpt/vscale-crashes.ll @@ -71,7 +71,7 @@ define void @callslotoptzn( %val, ptr %out) { ; CHECK-NEXT: [[ALLOC:%.*]] = alloca , align 16 ; CHECK-NEXT: [[IDX:%.*]] = tail call @llvm.experimental.stepvector.nxv4i32() ; CHECK-NEXT: [[STRIDE:%.*]] = getelementptr inbounds float, ptr [[ALLOC]], [[IDX]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0( [[VAL:%.*]], [[STRIDE]], i32 4, 
shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0( [[VAL:%.*]], [[STRIDE]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[LI:%.*]] = load , ptr [[ALLOC]], align 4 ; CHECK-NEXT: store [[LI]], ptr [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void @@ -79,7 +79,7 @@ define void @callslotoptzn( %val, ptr %out) { %alloc = alloca , align 16 %idx = tail call @llvm.experimental.stepvector.nxv4i32() %stride = getelementptr inbounds float, ptr %alloc, %idx - call void @llvm.masked.scatter.nxv4f32.nxv4p0( %val, %stride, i32 4, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) + call void @llvm.masked.scatter.nxv4f32.nxv4p0( %val, %stride, i32 4, splat (i1 true)) %li = load , ptr %alloc, align 4 store %li, ptr %out, align 4 ret void diff --git a/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll b/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll index e95aea4eb487b..7dc0ba50c1f8c 100644 --- a/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll +++ b/llvm/test/Transforms/VectorCombine/RISCV/vpintrin-scalarization.ll @@ -51,7 +51,7 @@ define @add_nxv1i64_allonesmask( %x, i64 %y ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.add.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.add.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( 
[[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -59,7 +59,7 @@ define @add_nxv1i64_allonesmask( %x, i64 %y %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.add.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.add.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -68,13 +68,13 @@ define @add_nxv1i64_anymask( %x, i64 %y, poison, i64 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.add.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.add.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.add.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.add.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -94,7 +94,7 @@ define @sub_nxv1i64_allonesmask( %x, i64 %y ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.sub.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 
42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.sub.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -102,7 +102,7 @@ define @sub_nxv1i64_allonesmask( %x, i64 %y %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.sub.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.sub.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -111,13 +111,13 @@ define @sub_nxv1i64_anymask( %x, i64 %y, poison, i64 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.sub.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.sub.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.sub.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.sub.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -137,7 +137,7 @@ define @mul_nxv1i64_allonesmask( %x, i64 %y ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector 
[[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.mul.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.mul.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -145,7 +145,7 @@ define @mul_nxv1i64_allonesmask( %x, i64 %y %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.mul.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.mul.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -154,13 +154,13 @@ define @mul_nxv1i64_anymask( %x, i64 %y, poison, i64 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.mul.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.mul.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.mul.nxv1i64( %2, shufflevector( 
insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.mul.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -180,7 +180,7 @@ define @sdiv_nxv1i64_allonesmask( %x, i64 % ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.sdiv.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.sdiv.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -188,7 +188,7 @@ define @sdiv_nxv1i64_allonesmask( %x, i64 % %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.sdiv.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.sdiv.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -197,13 +197,13 @@ define @sdiv_nxv1i64_anymask( %x, i64 %y, < ; ALL-LABEL: @sdiv_nxv1i64_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.sdiv.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call 
@llvm.vp.sdiv.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.sdiv.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.sdiv.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -244,7 +244,7 @@ define @udiv_nxv1i64_allonesmask( %x, i64 % ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.udiv.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.udiv.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -252,7 +252,7 @@ define @udiv_nxv1i64_allonesmask( %x, i64 % %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.udiv.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.udiv.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -261,13 +261,13 @@ define 
@udiv_nxv1i64_anymask( %x, i64 %y, < ; ALL-LABEL: @udiv_nxv1i64_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.udiv.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.udiv.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.udiv.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.udiv.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -308,7 +308,7 @@ define @srem_nxv1i64_allonesmask( %x, i64 % ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.srem.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.srem.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -316,7 +316,7 @@ define @srem_nxv1i64_allonesmask( %x, i64 % %mask = shufflevector %splat, 
poison, zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.srem.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.srem.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -325,13 +325,13 @@ define @srem_nxv1i64_anymask( %x, i64 %y, < ; ALL-LABEL: @srem_nxv1i64_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.srem.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.srem.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.srem.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.srem.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -372,7 +372,7 @@ define @urem_nxv1i64_allonesmask( %x, i64 % ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.urem.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; 
NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.urem.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -380,7 +380,7 @@ define @urem_nxv1i64_allonesmask( %x, i64 % %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.urem.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.urem.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -389,13 +389,13 @@ define @urem_nxv1i64_anymask( %x, i64 %y, < ; ALL-LABEL: @urem_nxv1i64_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.urem.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.urem.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.urem.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.urem.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -612,7 +612,7 @@ define @ashr_nxv1i64_allonesmask( %x, i64 % ; NO-VEC-COMBINE-NEXT: 
[[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.ashr.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.ashr.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -620,7 +620,7 @@ define @ashr_nxv1i64_allonesmask( %x, i64 % %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.ashr.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.ashr.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -629,13 +629,13 @@ define @ashr_nxv1i64_anymask( %x, i64 %y, < ; ALL-LABEL: @ashr_nxv1i64_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.ashr.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.ashr.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i64 %y, i64 
0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.ashr.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.ashr.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -655,7 +655,7 @@ define @lshr_nxv1i64_allonesmask( %x, i64 % ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.lshr.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.lshr.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -663,7 +663,7 @@ define @lshr_nxv1i64_allonesmask( %x, i64 % %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.lshr.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.lshr.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -672,13 +672,13 @@ define @lshr_nxv1i64_anymask( %x, i64 %y, < ; ALL-LABEL: @lshr_nxv1i64_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.lshr.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 
42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.lshr.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.lshr.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.lshr.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -698,7 +698,7 @@ define @shl_nxv1i64_allonesmask( %x, i64 %y ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.shl.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.shl.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -706,7 +706,7 @@ define @shl_nxv1i64_allonesmask( %x, i64 %y %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.shl.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.shl.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call 
@llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -715,13 +715,13 @@ define @shl_nxv1i64_anymask( %x, i64 %y, poison, i64 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.shl.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.shl.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.shl.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.shl.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -741,7 +741,7 @@ define @or_nxv1i64_allonesmask( %x, i64 %y, ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.or.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.or.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -749,7 +749,7 @@ define @or_nxv1i64_allonesmask( %x, i64 %y, %mask = shufflevector %splat, poison, 
zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.or.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.or.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -758,13 +758,13 @@ define @or_nxv1i64_anymask( %x, i64 %y, poison, i64 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.or.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.or.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.or.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.or.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -784,7 +784,7 @@ define @and_nxv1i64_allonesmask( %x, i64 %y ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.and.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.and.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 
42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -792,7 +792,7 @@ define @and_nxv1i64_allonesmask( %x, i64 %y %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.and.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.and.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -801,13 +801,13 @@ define @and_nxv1i64_anymask( %x, i64 %y, poison, i64 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.and.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.and.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.and.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.and.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -827,7 +827,7 @@ define @xor_nxv1i64_allonesmask( %x, i64 %y ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, 
zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.xor.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.xor.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -835,7 +835,7 @@ define @xor_nxv1i64_allonesmask( %x, i64 %y %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.xor.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.xor.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -844,13 +844,13 @@ define @xor_nxv1i64_anymask( %x, i64 %y, poison, i64 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.xor.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.xor.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.xor.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.xor.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, 
i32 %evl) ret %4 } @@ -870,7 +870,7 @@ define @smin_nxv1i64_allonesmask( %x, i64 % ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.smin.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.smin.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -878,7 +878,7 @@ define @smin_nxv1i64_allonesmask( %x, i64 % %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.smin.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.smin.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -887,13 +887,13 @@ define @smin_nxv1i64_anymask( %x, i64 %y, < ; ALL-LABEL: @smin_nxv1i64_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.smin.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.smin.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( 
[[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.smin.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.smin.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -913,7 +913,7 @@ define @smax_nxv1i64_allonesmask( %x, i64 % ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.smax.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.smax.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -921,7 +921,7 @@ define @smax_nxv1i64_allonesmask( %x, i64 % %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.smax.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.smax.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -930,13 +930,13 @@ define @smax_nxv1i64_anymask( %x, i64 %y, < ; ALL-LABEL: @smax_nxv1i64_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; 
ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.smax.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.smax.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.smax.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.smax.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -956,7 +956,7 @@ define @umin_nxv1i64_allonesmask( %x, i64 % ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.umin.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.umin.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -964,7 +964,7 @@ define @umin_nxv1i64_allonesmask( %x, i64 % %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.umin.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, 
zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.umin.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -984,7 +984,7 @@ define @umax_nxv1i64_allonesmask( %x, i64 % ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.umax.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.umax.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -992,7 +992,7 @@ define @umax_nxv1i64_allonesmask( %x, i64 % %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.umax.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.umax.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -1001,13 +1001,13 @@ define @umax_nxv1i64_anymask( %x, i64 %y, < ; ALL-LABEL: @umax_nxv1i64_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, i64 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.umax.nxv1i64( [[TMP2]], shufflevector ( insertelement ( poison, i64 42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.umax.nxv1i64( [[TMP2]], shufflevector ( 
insertelement ( poison, i64 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv1i64( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i64 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.umax.nxv1i64( %2, shufflevector( insertelement( poison, i64 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.umax.nxv1i64( %2, splat (i64 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv1i64( %x, %3, %mask, i32 %evl) ret %4 } @@ -1027,7 +1027,7 @@ define @fadd_nxv1f32_allonesmask( %x, f ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fadd.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fadd.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -1035,7 +1035,7 @@ define @fadd_nxv1f32_allonesmask( %x, f %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.fadd.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.fadd.nxv1f32( %2, splat (float 42.0), %mask, i32 %evl) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 %evl) ret %4 } @@ -1044,13 +1044,13 @@ define @fadd_nxv1f32_anymask( %x, float 
; ALL-LABEL: @fadd_nxv1f32_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.fadd.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.fadd.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.fadd.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.fadd.nxv1f32( %2, splat (float 42.0), %mask, i32 %evl) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 %evl) ret %4 } @@ -1070,7 +1070,7 @@ define @fsub_nxv1f32_allonesmask( %x, f ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fsub.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fsub.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -1078,7 +1078,7 @@ define @fsub_nxv1f32_allonesmask( %x, f %mask = 
shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.fsub.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.fsub.nxv1f32( %2, splat (float 42.0), %mask, i32 %evl) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 %evl) ret %4 } @@ -1087,13 +1087,13 @@ define @fsub_nxv1f32_anymask( %x, float ; ALL-LABEL: @fsub_nxv1f32_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.fsub.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.fsub.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.fsub.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.fsub.nxv1f32( %2, splat (float 42.0), %mask, i32 %evl) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 %evl) ret %4 } @@ -1113,7 +1113,7 @@ define @fdiv_nxv1f32_allonesmask( %x, f ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fdiv.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 
4.200000e+01, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fdiv.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -1121,7 +1121,7 @@ define @fdiv_nxv1f32_allonesmask( %x, f %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.fdiv.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.fdiv.nxv1f32( %2, splat (float 42.0), %mask, i32 %evl) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 %evl) ret %4 } @@ -1130,13 +1130,13 @@ define @fdiv_nxv1f32_anymask( %x, float ; ALL-LABEL: @fdiv_nxv1f32_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.fdiv.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.fdiv.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.fdiv.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.fdiv.nxv1f32( %2, splat (float 42.0), %mask, i32 %evl) %4 = call 
@llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 %evl) ret %4 } @@ -1156,7 +1156,7 @@ define @frem_nxv1f32_allonesmask( %x, f ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.frem.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.frem.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -1164,7 +1164,7 @@ define @frem_nxv1f32_allonesmask( %x, f %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.frem.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.frem.nxv1f32( %2, splat (float 42.0), %mask, i32 %evl) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 %evl) ret %4 } @@ -1173,13 +1173,13 @@ define @frem_nxv1f32_allonesmask( %x, f ; ALL-LABEL: @frem_nxv1f32_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.frem.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.frem.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, 
zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.frem.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.frem.nxv1f32( %2, splat (float 42.0), %mask, i32 %evl) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 %evl) ret %4 } @@ -1199,7 +1199,7 @@ define @fdiv_nxv1f32_allonesmask_knownvl( [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fdiv.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK]], i32 4) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fdiv.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 4) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 4) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -1207,7 +1207,7 @@ define @fdiv_nxv1f32_allonesmask_knownvl( %splat, poison, zeroinitializer %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.fdiv.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, i32 4) + %3 = call @llvm.vp.fdiv.nxv1f32( %2, splat (float 42.0), %mask, i32 4) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 4) ret %4 } @@ -1216,13 +1216,13 @@ define @fdiv_nxv1f32_anymask_knownvl( % ; ALL-LABEL: @fdiv_nxv1f32_anymask_knownvl( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] 
= shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.fdiv.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 4) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.fdiv.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 4) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 4) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.fdiv.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, i32 4) + %3 = call @llvm.vp.fdiv.nxv1f32( %2, splat (float 42.0), %mask, i32 4) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 4) ret %4 } @@ -1242,7 +1242,7 @@ define @frem_nxv1f32_allonesmask_knownvl( [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.frem.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK]], i32 4) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.frem.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 4) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 4) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -1250,7 +1250,7 @@ define @frem_nxv1f32_allonesmask_knownvl( %splat, poison, zeroinitializer %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.frem.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, 
i32 4) + %3 = call @llvm.vp.frem.nxv1f32( %2, splat (float 42.0), %mask, i32 4) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 4) ret %4 } @@ -1259,13 +1259,13 @@ define @frem_nxv1f32_allonesmask_knownvl( poison, float [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.frem.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 4) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.frem.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 4) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 4) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.frem.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, i32 4) + %3 = call @llvm.vp.frem.nxv1f32( %2, splat (float 42.0), %mask, i32 4) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 4) ret %4 } @@ -1285,7 +1285,7 @@ define @copysign_nxv1f32_allonesmask( % ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.copysign.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.copysign.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; 
NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -1293,7 +1293,7 @@ define @copysign_nxv1f32_allonesmask( % %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.copysign.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.copysign.nxv1f32( %2, splat (float 42.0), %mask, i32 %evl) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 %evl) ret %4 } @@ -1302,13 +1302,13 @@ define @copysign_nxv1f32_anymask( %x, f ; ALL-LABEL: @copysign_nxv1f32_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.copysign.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.copysign.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.copysign.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.copysign.nxv1f32( %2, splat (float 42.0), %mask, i32 %evl) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 %evl) ret %4 } @@ -1328,7 +1328,7 @@ define @minnum_nxv1f32_allonesmask( %x, ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer 
-; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.minnum.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.minnum.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -1336,7 +1336,7 @@ define @minnum_nxv1f32_allonesmask( %x, %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.minnum.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.minnum.nxv1f32( %2, splat (float 42.0), %mask, i32 %evl) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 %evl) ret %4 } @@ -1345,13 +1345,13 @@ define @minnum_nxv1f32_anymask( %x, flo ; ALL-LABEL: @minnum_nxv1f32_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.minnum.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.minnum.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.minnum.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, 
zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.minnum.nxv1f32( %2, splat (float 42.0), %mask, i32 %evl) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 %evl) ret %4 } @@ -1371,7 +1371,7 @@ define @maxnum_nxv1f32_allonesmask( %x, ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.maxnum.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.maxnum.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -1379,7 +1379,7 @@ define @maxnum_nxv1f32_allonesmask( %x, %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.maxnum.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.maxnum.nxv1f32( %2, splat (float 42.0), %mask, i32 %evl) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 %evl) ret %4 } @@ -1388,13 +1388,13 @@ define @maxnum_nxv1f32_anymask( %x, flo ; ALL-LABEL: @maxnum_nxv1f32_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, float [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.maxnum.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: 
[[TMP3:%.*]] = call @llvm.vp.maxnum.nxv1f32( [[TMP2]], shufflevector ( insertelement ( poison, float 4.200000e+01, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv1f32( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, float %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.maxnum.nxv1f32( %2, shufflevector( insertelement( poison, float 42.0, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.maxnum.nxv1f32( %2, splat (float 42.0), %mask, i32 %evl) %4 = call @llvm.vp.fadd.nxv1f32( %x, %3, %mask, i32 %evl) ret %4 } @@ -1414,7 +1414,7 @@ define @add_nxv8i8_allonesmask( %x, i8 %y, i3 ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, i8 [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.add.nxv8i8( [[TMP2]], shufflevector ( insertelement ( poison, i8 42, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.add.nxv8i8( [[TMP2]], shufflevector ( insertelement ( poison, i8 42, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv8i8( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -1422,7 +1422,7 @@ define @add_nxv8i8_allonesmask( %x, i8 %y, i3 %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, i8 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.add.nxv8i8( %2, shufflevector( insertelement( poison, i8 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.add.nxv8i8( %2, splat (i8 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv8i8( %x, %3, %mask, i32 %evl) ret %4 } @@ 
-1431,13 +1431,13 @@ define @add_nxv8i8_anymask( %x, i8 %y, poison, i8 [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.add.nxv8i8( [[TMP2]], shufflevector ( insertelement ( poison, i8 42, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.add.nxv8i8( [[TMP2]], shufflevector ( insertelement ( poison, i8 42, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.mul.nxv8i8( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, i8 %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.add.nxv8i8( %2, shufflevector( insertelement( poison, i8 42, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.add.nxv8i8( %2, splat (i8 42), %mask, i32 %evl) %4 = call @llvm.vp.mul.nxv8i8( %x, %3, %mask, i32 %evl) ret %4 } @@ -1457,7 +1457,7 @@ define @fadd_nxv1f16_allonesmask( %x, hal ; NO-VEC-COMBINE-NEXT: [[MASK:%.*]] = shufflevector [[SPLAT]], poison, zeroinitializer ; NO-VEC-COMBINE-NEXT: [[TMP1:%.*]] = insertelement poison, half [[Y:%.*]], i64 0 ; NO-VEC-COMBINE-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fadd.nxv8f16( [[TMP2]], shufflevector ( insertelement ( poison, half 0xH5140, i32 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) +; NO-VEC-COMBINE-NEXT: [[TMP3:%.*]] = call @llvm.vp.fadd.nxv8f16( [[TMP2]], shufflevector ( insertelement ( poison, half 0xH5140, i64 0), poison, zeroinitializer), [[MASK]], i32 [[EVL:%.*]]) ; NO-VEC-COMBINE-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv8f16( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; NO-VEC-COMBINE-NEXT: ret [[TMP4]] ; @@ -1465,7 +1465,7 @@ define @fadd_nxv1f16_allonesmask( %x, hal %mask = shufflevector %splat, poison, zeroinitializer %1 = insertelement poison, half %y, 
i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.fadd.nxv8f16( %2, shufflevector( insertelement( poison, half 42.0, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.fadd.nxv8f16( %2, splat (half 42.0), %mask, i32 %evl) %4 = call @llvm.vp.fadd.nxv8f16( %x, %3, %mask, i32 %evl) ret %4 } @@ -1474,13 +1474,13 @@ define @fadd_nxv8f16_anymask( %x, half %y ; ALL-LABEL: @fadd_nxv8f16_anymask( ; ALL-NEXT: [[TMP1:%.*]] = insertelement poison, half [[Y:%.*]], i64 0 ; ALL-NEXT: [[TMP2:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer -; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.fadd.nxv8f16( [[TMP2]], shufflevector ( insertelement ( poison, half 0xH5140, i32 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) +; ALL-NEXT: [[TMP3:%.*]] = call @llvm.vp.fadd.nxv8f16( [[TMP2]], shufflevector ( insertelement ( poison, half 0xH5140, i64 0), poison, zeroinitializer), [[MASK:%.*]], i32 [[EVL:%.*]]) ; ALL-NEXT: [[TMP4:%.*]] = call @llvm.vp.fadd.nxv8f16( [[X:%.*]], [[TMP3]], [[MASK]], i32 [[EVL]]) ; ALL-NEXT: ret [[TMP4]] ; %1 = insertelement poison, half %y, i64 0 %2 = shufflevector %1, poison, zeroinitializer - %3 = call @llvm.vp.fadd.nxv8f16( %2, shufflevector( insertelement( poison, half 42.0, i32 0), poison, zeroinitializer), %mask, i32 %evl) + %3 = call @llvm.vp.fadd.nxv8f16( %2, splat (half 42.0), %mask, i32 %evl) %4 = call @llvm.vp.fadd.nxv8f16( %x, %3, %mask, i32 %evl) ret %4 } From 5ec535b1bda4c87aac951e65cc5b65c910e92579 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 27 Feb 2024 14:07:44 +0000 Subject: [PATCH 437/546] [AMDGPU] Regenerate mnemonic alias checks (#83130) Regenerate checks for the full output from the assembler, not just the encoding bytes, to make it obvious that the alias has been mapped to a different mnemonic. 
--- llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_alias.s | 16 +-- llvm/test/MC/AMDGPU/gfx11_asm_mubuf_alias.s | 124 ++++++++--------- llvm/test/MC/AMDGPU/gfx11_asm_smem_alias.s | 4 - llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s | 4 +- llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s | 16 +-- llvm/test/MC/AMDGPU/gfx12_asm_sop2_alias.s | 24 ++-- llvm/test/MC/AMDGPU/gfx12_asm_sopk_alias.s | 2 +- .../MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_alias.s | 16 +-- .../MC/AMDGPU/gfx12_asm_vbuffer_mubuf_alias.s | 128 +++++++++--------- llvm/test/MC/AMDGPU/gfx12_asm_vflat_alias.s | 22 +-- llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s | 4 +- 11 files changed, 178 insertions(+), 182 deletions(-) diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_alias.s b/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_alias.s index 4a55a33928211..eba48c602172f 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_alias.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_mtbuf_alias.s @@ -1,25 +1,25 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefix=GFX11 %s tbuffer_load_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_8_UNORM] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x0c,0xe8,0x00,0x04,0x02,0x03] +// GFX11: tbuffer_load_d16_format_x v4, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe8,0x00,0x04,0x02,0x03] tbuffer_load_format_d16_xy v4, off, s[8:11], s3, format:[BUF_FMT_8_SINT] offset:4095 -// GFX11: encoding: [0xff,0x8f,0x34,0xe8,0x00,0x04,0x02,0x03] +// GFX11: tbuffer_load_d16_format_xy v4, off, s[8:11], s3 format:[BUF_FMT_8_SINT] offset:4095 ; encoding: [0xff,0x8f,0x34,0xe8,0x00,0x04,0x02,0x03] tbuffer_load_format_d16_xyz v[4:5], off, s[8:11], s3, format:[BUF_FMT_16_UINT] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5d,0xe8,0x00,0x04,0x02,0x03] +// GFX11: tbuffer_load_d16_format_xyz v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_UINT] offset:4095 ; encoding: [0xff,0x0f,0x5d,0xe8,0x00,0x04,0x02,0x03] tbuffer_load_format_d16_xyzw v[4:5], off, s[8:11], s3, format:[BUF_FMT_8_8_USCALED] offset:4095 
-// GFX11: encoding: [0xff,0x8f,0x85,0xe8,0x00,0x04,0x02,0x03] +// GFX11: tbuffer_load_d16_format_xyzw v[4:5], off, s[8:11], s3 format:[BUF_FMT_8_8_USCALED] offset:4095 ; encoding: [0xff,0x8f,0x85,0xe8,0x00,0x04,0x02,0x03] tbuffer_store_format_d16_x v4, off, s[8:11], s3, format:[BUF_FMT_2_10_10_10_SINT] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x4e,0xe9,0x00,0x04,0x02,0x03] +// GFX11: tbuffer_store_d16_format_x v4, off, s[8:11], s3 format:[BUF_FMT_2_10_10_10_SINT] offset:4095 ; encoding: [0xff,0x0f,0x4e,0xe9,0x00,0x04,0x02,0x03] tbuffer_store_format_d16_xy v4, off, s[8:11], s3, format:[BUF_FMT_8_8_8_8_UINT] offset:4095 -// GFX11: encoding: [0xff,0x8f,0x76,0xe9,0x00,0x04,0x02,0x03] +// GFX11: tbuffer_store_d16_format_xy v4, off, s[8:11], s3 format:[BUF_FMT_8_8_8_8_UINT] offset:4095 ; encoding: [0xff,0x8f,0x76,0xe9,0x00,0x04,0x02,0x03] tbuffer_store_format_d16_xyz v[4:5], off, s[8:11], s3, format:[BUF_FMT_16_16_16_16_UNORM] offset:4095 -// GFX11: encoding: [0xff,0x0f,0x9f,0xe9,0x00,0x04,0x02,0x03] +// GFX11: tbuffer_store_d16_format_xyz v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_16_16_16_UNORM] offset:4095 ; encoding: [0xff,0x0f,0x9f,0xe9,0x00,0x04,0x02,0x03] tbuffer_store_format_d16_xyzw v[4:5], off, s[8:11], s3, format:[BUF_FMT_16_16_16_16_SINT] offset:4095 -// GFX11: encoding: [0xff,0x8f,0xc7,0xe9,0x00,0x04,0x02,0x03] +// GFX11: tbuffer_store_d16_format_xyzw v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_16_16_16_SINT] offset:4095 ; encoding: [0xff,0x8f,0xc7,0xe9,0x00,0x04,0x02,0x03] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_alias.s b/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_alias.s index fd58215f89ad5..c65dc4dbe032a 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_alias.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_mubuf_alias.s @@ -1,187 +1,187 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefix=GFX11 %s buffer_load_dword v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0x03] +// GFX11: 
buffer_load_b32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0x03] buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_b64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0x03] buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_b96 v[5:7], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0x03] buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_b128 v[5:8], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0x03] buffer_load_short_d16 v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_d16_b16 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x80,0xe0,0x00,0x05,0x02,0x03] buffer_load_format_d16_x v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_d16_format_x v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0x03] buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_d16_format_xy v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0x03] buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_d16_format_xyz v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0x03] buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_d16_format_xyzw v[5:6], off, s[8:11], s3 
offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0x03] buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_d16_hi_b16 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x8c,0xe0,0x00,0x05,0x02,0x03] buffer_load_format_d16_hi_x v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_d16_hi_format_x v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x98,0xe0,0x00,0x05,0x02,0x03] buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_d16_hi_i8 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x88,0xe0,0x00,0x05,0x02,0x03] buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_d16_hi_u8 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x84,0xe0,0x00,0x05,0x02,0x03] buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_d16_i8 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x7c,0xe0,0x00,0x05,0x02,0x03] buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_d16_u8 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x78,0xe0,0x00,0x05,0x02,0x03] buffer_load_sbyte v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_i8 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0x03] buffer_load_sshort v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_i16 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0x03] buffer_load_ubyte v5, off, 
s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_u8 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0x03] buffer_load_ushort v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_load_u16 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0x03] buffer_store_byte v1, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x04] +// GFX11: buffer_store_b8 v1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x04] buffer_store_short v1, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0x04] +// GFX11: buffer_store_b16 v1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x64,0xe0,0x00,0x01,0x03,0x04] buffer_store_dword v1, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x04] +// GFX11: buffer_store_b32 v1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x04] buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0x04] +// GFX11: buffer_store_b64 v[1:2], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x6c,0xe0,0x00,0x01,0x03,0x04] buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x04] +// GFX11: buffer_store_b96 v[1:3], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x04] buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x04] +// GFX11: buffer_store_b128 v[1:4], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x04] buffer_store_format_d16_x v1, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0x04] +// GFX11: 
buffer_store_d16_format_x v1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe0,0x00,0x01,0x03,0x04] buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0x04] +// GFX11: buffer_store_d16_format_xy v1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe0,0x00,0x01,0x03,0x04] buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0x04] +// GFX11: buffer_store_d16_format_xyz v[1:2], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x38,0xe0,0x00,0x01,0x03,0x04] buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0x04] +// GFX11: buffer_store_d16_format_xyzw v[1:2], off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x3c,0xe0,0x00,0x01,0x03,0x04] buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0x04] +// GFX11: buffer_store_d16_hi_b8 v1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x90,0xe0,0x00,0x01,0x03,0x04] buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0x04] +// GFX11: buffer_store_d16_hi_b16 v1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x94,0xe0,0x00,0x01,0x03,0x04] buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0x04] +// GFX11: buffer_store_d16_hi_format_x v1, off, s[12:15], s4 offset:4095 ; encoding: [0xff,0x0f,0x9c,0xe0,0x00,0x01,0x03,0x04] buffer_atomic_add v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_add_u32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x05,0x02,0x03] buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0x03] +// GFX11: 
buffer_atomic_add_u64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_and v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_and_b32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xf0,0xe0,0x00,0x05,0x02,0x03] buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_and_b64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_cmpswap_b32 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xd0,0xe0,0x00,0x05,0x02,0x03] buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_cmpswap_b64 v[5:8], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_fcmpswap v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_cmpswap_f32 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x40,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_csub v255, off, s[8:11], s3 offset:4095 glc -// GFX11: encoding: [0xff,0x4f,0xdc,0xe0,0x00,0xff,0x02,0x03] +// GFX11: buffer_atomic_csub_u32 v255, off, s[8:11], s3 offset:4095 glc ; encoding: [0xff,0x4f,0xdc,0xe0,0x00,0xff,0x02,0x03] buffer_atomic_dec v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_dec_u32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_dec_u64 v[5:6], off, 
s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x34,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_inc v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_inc_u32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xfc,0xe0,0x00,0x05,0x02,0x03] buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_inc_u64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_fmax v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_max_f32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x48,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_smax v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_max_i32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xe8,0xe0,0x00,0x05,0x02,0x03] buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_max_i64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_umax v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_max_u32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xec,0xe0,0x00,0x05,0x02,0x03] buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_max_u64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_fmin v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x44,0xe1,0x00,0x05,0x02,0x03] 
buffer_atomic_smin v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_min_i32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xe0,0xe0,0x00,0x05,0x02,0x03] buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_min_i64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_umin v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_min_u32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xe4,0xe0,0x00,0x05,0x02,0x03] buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_min_u64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_or v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_or_b32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xf4,0xe0,0x00,0x05,0x02,0x03] buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_or_b64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_sub v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_sub_u32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xd8,0xe0,0x00,0x05,0x02,0x03] buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_sub_u64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_swap v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: 
[0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_swap_b32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xcc,0xe0,0x00,0x05,0x02,0x03] buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_swap_b64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0x03] buffer_atomic_xor v5, off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_xor_b32 v5, off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0xf8,0xe0,0x00,0x05,0x02,0x03] buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:4095 -// GFX11: encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0x03] +// GFX11: buffer_atomic_xor_b64 v[5:6], off, s[8:11], s3 offset:4095 ; encoding: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0x03] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_smem_alias.s b/llvm/test/MC/AMDGPU/gfx11_asm_smem_alias.s index 97a55c6df75f5..1d0facd3d3ff9 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_smem_alias.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_smem_alias.s @@ -1,9 +1,5 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s -//===----------------------------------------------------------------------===// -// ENC_SMEM. 
-//===----------------------------------------------------------------------===// - s_load_dword s5, s[2:3], s0 // GFX11: s_load_b32 s5, s[2:3], s0 ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s index 857f2fdfc41bf..a8cc90ef6f8b5 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_alias.s @@ -2,7 +2,7 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck -check-prefix=GFX11 %s v_cvt_pknorm_i16_f16 v5, v1, v2 -// GFX11: v_cvt_pk_norm_i16_f16 {{.*}} encoding: [0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x00] +// GFX11: v_cvt_pk_norm_i16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x00] v_cvt_pknorm_u16_f16 v5, v1, v2 -// GFX11: v_cvt_pk_norm_u16_f16 {{.*}} encoding: [0x05,0x00,0x13,0xd7,0x01,0x05,0x02,0x00] +// GFX11: v_cvt_pk_norm_u16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x13,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s index 7076772484043..aa063c8800aa4 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_ds_alias.s @@ -1,28 +1,28 @@ // RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s ds_max_f32 v1, v2 -// GFX12: [0x00,0x00,0x4c,0xd8,0x01,0x02,0x00,0x00] +// GFX12: ds_max_num_f32 v1, v2 ; encoding: [0x00,0x00,0x4c,0xd8,0x01,0x02,0x00,0x00] ds_max_f64 v1, v[2:3] -// GFX12: [0x00,0x00,0x4c,0xd9,0x01,0x02,0x00,0x00] +// GFX12: ds_max_num_f64 v1, v[2:3] ; encoding: [0x00,0x00,0x4c,0xd9,0x01,0x02,0x00,0x00] ds_max_rtn_f32 v5, v1, v2 -// GFX12: [0x00,0x00,0xcc,0xd8,0x01,0x02,0x00,0x05] +// GFX12: ds_max_num_rtn_f32 v5, v1, v2 ; encoding: [0x00,0x00,0xcc,0xd8,0x01,0x02,0x00,0x05] ds_max_rtn_f64 v[5:6], v1, v[2:3] -// GFX12: [0x00,0x00,0xcc,0xd9,0x01,0x02,0x00,0x05] +// GFX12: 
ds_max_num_rtn_f64 v[5:6], v1, v[2:3] ; encoding: [0x00,0x00,0xcc,0xd9,0x01,0x02,0x00,0x05] ds_min_f32 v1, v2 -// GFX12: [0x00,0x00,0x48,0xd8,0x01,0x02,0x00,0x00] +// GFX12: ds_min_num_f32 v1, v2 ; encoding: [0x00,0x00,0x48,0xd8,0x01,0x02,0x00,0x00] ds_min_f64 v1, v[2:3] -// GFX12: [0x00,0x00,0x48,0xd9,0x01,0x02,0x00,0x00] +// GFX12: ds_min_num_f64 v1, v[2:3] ; encoding: [0x00,0x00,0x48,0xd9,0x01,0x02,0x00,0x00] ds_min_rtn_f32 v5, v1, v2 -// GFX12: [0x00,0x00,0xc8,0xd8,0x01,0x02,0x00,0x05] +// GFX12: ds_min_num_rtn_f32 v5, v1, v2 ; encoding: [0x00,0x00,0xc8,0xd8,0x01,0x02,0x00,0x05] ds_min_rtn_f64 v[5:6], v1, v[2:3] -// GFX12: [0x00,0x00,0xc8,0xd9,0x01,0x02,0x00,0x05] +// GFX12: ds_min_num_rtn_f64 v[5:6], v1, v[2:3] ; encoding: [0x00,0x00,0xc8,0xd9,0x01,0x02,0x00,0x05] ds_subrev_u32 v1, v2 // GFX12: ds_rsub_u32 v1, v2 ; encoding: [0x00,0x00,0x08,0xd8,0x01,0x02,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop2_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop2_alias.s index 6efc561991c73..e3ad0198cae54 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sop2_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop2_alias.s @@ -1,37 +1,37 @@ // RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefix=GFX12 %s s_add_i32 s0, s1, s2 -// GFX12: encoding: [0x01,0x02,0x00,0x81] +// GFX12: s_add_co_i32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x81] s_add_u32 s0, s1, s2 -// GFX12: encoding: [0x01,0x02,0x00,0x80] +// GFX12: s_add_co_u32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x80] s_add_u64 s[0:1], s[2:3], s[4:5] -// GFX12: encoding: [0x02,0x04,0x80,0xa9] +// GFX12: s_add_nc_u64 s[0:1], s[2:3], s[4:5] ; encoding: [0x02,0x04,0x80,0xa9] s_addc_u32 s0, s1, s2 -// GFX12: encoding: [0x01,0x02,0x00,0x82] +// GFX12: s_add_co_ci_u32 s0, s1, s2 ; encoding: [0x01,0x02,0x00,0x82] s_sub_i32 s0, s1, s2 -// GFX12: encoding: [0x01,0x02,0x80,0x81] +// GFX12: s_sub_co_i32 s0, s1, s2 ; encoding: [0x01,0x02,0x80,0x81] s_sub_u32 s0, s1, s2 -// GFX12: encoding: [0x01,0x02,0x80,0x80] 
+// GFX12: s_sub_co_u32 s0, s1, s2 ; encoding: [0x01,0x02,0x80,0x80] s_sub_u64 s[0:1], s[2:3], s[4:5] -// GFX12: encoding: [0x02,0x04,0x00,0xaa] +// GFX12: s_sub_nc_u64 s[0:1], s[2:3], s[4:5] ; encoding: [0x02,0x04,0x00,0xaa] s_subb_u32 s0, s1, s2 -// GFX12: encoding: [0x01,0x02,0x80,0x82] +// GFX12: s_sub_co_ci_u32 s0, s1, s2 ; encoding: [0x01,0x02,0x80,0x82] s_min_f32 s5, s1, s2 -// GFX12: encoding: [0x01,0x02,0x05,0xa1] +// GFX12: s_min_num_f32 s5, s1, s2 ; encoding: [0x01,0x02,0x05,0xa1] s_max_f32 s5, s1, s2 -// GFX12: encoding: [0x01,0x02,0x85,0xa1] +// GFX12: s_max_num_f32 s5, s1, s2 ; encoding: [0x01,0x02,0x85,0xa1] s_max_f16 s5, s1, s2 -// GFX12: encoding: [0x01,0x02,0x05,0xa6] +// GFX12: s_max_num_f16 s5, s1, s2 ; encoding: [0x01,0x02,0x05,0xa6] s_min_f16 s5, s1, s2 -// GFX12: encoding: [0x01,0x02,0x85,0xa5] +// GFX12: s_min_num_f16 s5, s1, s2 ; encoding: [0x01,0x02,0x85,0xa5] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopk_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopk_alias.s index 283890be82acf..aae08fda8d6ea 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sopk_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopk_alias.s @@ -1,4 +1,4 @@ // RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefix=GFX12 %s s_addk_i32 s0, 0x1234 -// GFX12: encoding: [0x34,0x12,0x80,0xb7] +// GFX12: s_addk_co_i32 s0, 0x1234 ; encoding: [0x34,0x12,0x80,0xb7] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_alias.s index ebe3af66bb0c3..ef00edcf846e7 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mtbuf_alias.s @@ -1,25 +1,25 @@ // RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s tbuffer_load_format_d16_x v4, off, s[8:11], s3 format:[BUF_FMT_8_UNORM] offset:8388607 -// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: tbuffer_load_d16_format_x v4, off, 
s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x22,0xc4,0x04,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] tbuffer_load_format_d16_xy v4, off, s[8:11], s3 format:[BUF_FMT_8_SINT] offset:8388607 -// GFX12: encoding: [0x03,0x40,0x22,0xc4,0x04,0x10,0x00,0x03,0x00,0xff,0xff,0x7f] +// GFX12: tbuffer_load_d16_format_xy v4, off, s[8:11], s3 format:[BUF_FMT_8_SINT] offset:8388607 ; encoding: [0x03,0x40,0x22,0xc4,0x04,0x10,0x00,0x03,0x00,0xff,0xff,0x7f] tbuffer_load_format_d16_xyz v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_UINT] offset:8388607 -// GFX12: encoding: [0x03,0x80,0x22,0xc4,0x04,0x10,0x80,0x05,0x00,0xff,0xff,0x7f] +// GFX12: tbuffer_load_d16_format_xyz v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_UINT] offset:8388607 ; encoding: [0x03,0x80,0x22,0xc4,0x04,0x10,0x80,0x05,0x00,0xff,0xff,0x7f] tbuffer_load_format_d16_xyzw v[4:5], off, s[8:11], s3 format:[BUF_FMT_8_8_USCALED] offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x22,0xc4,0x04,0x10,0x00,0x08,0x00,0xff,0xff,0x7f] +// GFX12: tbuffer_load_d16_format_xyzw v[4:5], off, s[8:11], s3 format:[BUF_FMT_8_8_USCALED] offset:8388607 ; encoding: [0x03,0xc0,0x22,0xc4,0x04,0x10,0x00,0x08,0x00,0xff,0xff,0x7f] tbuffer_store_format_d16_x v4, off, s[8:11], s3 format:[BUF_FMT_2_10_10_10_SINT] offset:8388607 -// GFX12: encoding: [0x03,0x00,0x23,0xc4,0x04,0x10,0x80,0x14,0x00,0xff,0xff,0x7f] +// GFX12: tbuffer_store_d16_format_x v4, off, s[8:11], s3 format:[BUF_FMT_2_10_10_10_SINT] offset:8388607 ; encoding: [0x03,0x00,0x23,0xc4,0x04,0x10,0x80,0x14,0x00,0xff,0xff,0x7f] tbuffer_store_format_d16_xy v4, off, s[8:11], s3 format:[BUF_FMT_8_8_8_8_UINT] offset:8388607 -// GFX12: encoding: [0x03,0x40,0x23,0xc4,0x04,0x10,0x00,0x17,0x00,0xff,0xff,0x7f] +// GFX12: tbuffer_store_d16_format_xy v4, off, s[8:11], s3 format:[BUF_FMT_8_8_8_8_UINT] offset:8388607 ; encoding: [0x03,0x40,0x23,0xc4,0x04,0x10,0x00,0x17,0x00,0xff,0xff,0x7f] tbuffer_store_format_d16_xyz v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_16_16_16_UNORM] offset:8388607 -// GFX12: 
encoding: [0x03,0x80,0x23,0xc4,0x04,0x10,0x80,0x19,0x00,0xff,0xff,0x7f] +// GFX12: tbuffer_store_d16_format_xyz v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_16_16_16_UNORM] offset:8388607 ; encoding: [0x03,0x80,0x23,0xc4,0x04,0x10,0x80,0x19,0x00,0xff,0xff,0x7f] tbuffer_store_format_d16_xyzw v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_16_16_16_SINT] offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x23,0xc4,0x04,0x10,0x00,0x1c,0x00,0xff,0xff,0x7f] +// GFX12: tbuffer_store_d16_format_xyzw v[4:5], off, s[8:11], s3 format:[BUF_FMT_16_16_16_16_SINT] offset:8388607 ; encoding: [0x03,0xc0,0x23,0xc4,0x04,0x10,0x00,0x1c,0x00,0xff,0xff,0x7f] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_alias.s index 42073707a55aa..7363750900c04 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf_alias.s @@ -1,193 +1,193 @@ // RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s buffer_load_dword v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_b32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_b64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_b96 v[5:7], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: 
[0x03,0xc0,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_b128 v[5:8], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x05,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_short_d16 v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_d16_b16 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_format_d16_x v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_d16_format_x v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_format_d16_xy v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_d16_format_xy v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_format_d16_xyz v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_d16_format_xyz v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_format_d16_xyzw v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_d16_format_xyzw v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x02,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_short_d16_hi v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_d16_hi_b16 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_format_d16_hi_x v5, off, 
s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x09,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_d16_hi_format_x v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x09,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_sbyte_d16_hi v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_d16_hi_i8 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_ubyte_d16_hi v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_d16_hi_u8 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x08,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_sbyte_d16 v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_d16_i8 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_ubyte_d16 v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_d16_u8 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x07,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_sbyte v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_i8 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_sshort v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_i16 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_ubyte v5, off, s[8:11], s3 offset:8388607 
-// GFX12: encoding: [0x03,0x00,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_u8 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_load_ushort v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_load_u16 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x04,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_store_byte v1, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_store_b8 v1, off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x00,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_store_short v1, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_store_b16 v1, off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x40,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_store_dword v1, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x80,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_store_b32 v1, off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x80,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_store_b64 v[1:2], off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0xc0,0x06,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_store_b96 v[1:3], off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x00,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:8388607 -// GFX12: encoding: 
[0x04,0x40,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_store_b128 v[1:4], off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x40,0x07,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_store_format_d16_x v1, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_store_d16_format_x v1, off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x00,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_store_format_d16_xy v1, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_store_d16_format_xy v1, off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x40,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_store_format_d16_xyz v[1:2], off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x80,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_store_d16_format_xyz v[1:2], off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x80,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_store_format_d16_xyzw v[1:2], off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_store_d16_format_xyzw v[1:2], off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0xc0,0x03,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_store_byte_d16_hi v1, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x00,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_store_d16_hi_b8 v1, off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x00,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_store_short_d16_hi v1, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0x40,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_store_d16_hi_b16 v1, off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0x40,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] 
buffer_store_format_d16_hi_x v1, off, s[12:15], s4 offset:8388607 -// GFX12: encoding: [0x04,0xc0,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_store_d16_hi_format_x v1, off, s[12:15], s4 offset:8388607 ; encoding: [0x04,0xc0,0x09,0xc4,0x01,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_add v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_add_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_add_u64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_and v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_and_b32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_and_b64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_cmpswap_b32 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_cmpswap_b64 v[5:8], off, s[8:11], s3 offset:8388607 ; encoding: 
[0x03,0x80,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_csub v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_sub_clamp_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_csub_u32 v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_sub_clamp_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_dec v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_dec_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_dec_u64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_inc v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_inc_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_inc_u64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x13,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_fmax v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_max_num_f32 v5, off, s[8:11], s3 offset:8388607 ; 
encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_max_f32 v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_max_num_f32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_smax v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_max_i32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_max_i64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_umax v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_max_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_max_u64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_fmin v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_min_num_f32 v5, off, s[8:11], s3 
offset:8388607 ; encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_smin v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_min_i32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_min_i64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_umin v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_min_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x0e,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_min_u64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_or v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_or_b32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_or_b64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_sub v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_sub_u32 v5, off, s[8:11], s3 
offset:8388607 ; encoding: [0x03,0x80,0x0d,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_sub_u64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x11,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_swap v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_swap_b32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x0c,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_swap_b64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_xor v5, off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_xor_b32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x0f,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:8388607 -// GFX12: encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +// GFX12: buffer_atomic_xor_b64 v[5:6], off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0xc0,0x12,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vflat_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_vflat_alias.s index 0728b43dddd92..df453d96117a8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vflat_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vflat_alias.s @@ -1,34 +1,34 @@ ; RUN: llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck -check-prefix=GFX12 %s global_atomic_csub v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN -// GFX12: global_atomic_sub_clamp_u32 {{.*}} 
encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +// GFX12: global_atomic_sub_clamp_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] global_atomic_csub_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN -// GFX12: global_atomic_sub_clamp_u32 {{.*}} encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +// GFX12: global_atomic_sub_clamp_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] flat_atomic_csub_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN -// GFX12: flat_atomic_sub_clamp_u32 {{.*}} encoding: [0x7c,0xc0,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +// GFX12: flat_atomic_sub_clamp_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] flat_atomic_fmax v[0:1], v2 offset:64 -// GFX12: flat_atomic_max_num_f32 {{.*}} encoding: [0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +// GFX12: flat_atomic_max_num_f32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] flat_atomic_max_f32 v[0:1], v2 offset:64 -// GFX12: flat_atomic_max_num_f32 {{.*}} encoding: [0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +// GFX12: flat_atomic_max_num_f32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] flat_atomic_fmin v[0:1], v2 offset:64 -// GFX12: flat_atomic_min_num_f32 {{.*}} encoding: [0x7c,0x40,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +// GFX12: flat_atomic_min_num_f32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] flat_atomic_min_f32 v[0:1], v2 offset:64 -// GFX12: flat_atomic_min_num_f32 {{.*}} encoding: [0x7c,0x40,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +// GFX12: flat_atomic_min_num_f32 v[0:1], v2 offset:64 ; encoding: 
[0x7c,0x40,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] global_atomic_fmax v0, v2, s[0:1] offset:64 -// GFX12: global_atomic_max_num_f32 {{.*}} encoding: [0x00,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +// GFX12: global_atomic_max_num_f32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] global_atomic_max_f32 v0, v2, s[0:1] offset:64 -// GFX12: global_atomic_max_num_f32 {{.*}} encoding: [0x00,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +// GFX12: global_atomic_max_num_f32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] global_atomic_fmin v0, v2, s[0:1] offset:64 -// GFX12: global_atomic_min_num_f32 {{.*}} encoding: [0x00,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +// GFX12: global_atomic_min_num_f32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] global_atomic_min_f32 v0, v2, s[0:1] offset:64 -// GFX12: global_atomic_min_num_f32 {{.*}} encoding: [0x00,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +// GFX12: global_atomic_min_num_f32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s index 9211877774aaf..f2ae7dd8c35a8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_aliases.s @@ -43,7 +43,7 @@ v_min_f64 v[5:6], s[2:3], s[4:5] // GFX12: v_min_num_f64_e64 v[5:6], s[2:3], s[4:5] ; encoding: [0x05,0x00,0x0d,0xd5,0x02,0x08,0x00,0x00] v_cvt_pknorm_i16_f16 v5, v1, v2 -// GFX11: v_cvt_pk_norm_i16_f16 {{.*}} encoding: [0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x00] +// GFX12: v_cvt_pk_norm_i16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x12,0xd7,0x01,0x05,0x02,0x00] v_cvt_pknorm_u16_f16 v5, v1, v2 -// GFX11: v_cvt_pk_norm_u16_f16 {{.*}} encoding: 
[0x05,0x00,0x13,0xd7,0x01,0x05,0x02,0x00] +// GFX12: v_cvt_pk_norm_u16_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x13,0xd7,0x01,0x05,0x02,0x00] From f2bb6c4415954535b32780cd5fb48411ebe0042c Mon Sep 17 00:00:00 2001 From: Natalie Chouinard Date: Tue, 27 Feb 2024 09:19:24 -0500 Subject: [PATCH 438/546] [NFC][HLSL] Fix broken test (#83062) Noticed while implementing #82536 that this test was also missing the call the FileCheck. --- clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl b/clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl index b8514b0d13f11..7e7ebe930bd96 100644 --- a/clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl +++ b/clang/test/CodeGenHLSL/semantics/GroupIndex-codegen.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - -hlsl-entry main %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - -hlsl-entry main %s | FileCheck %s [numthreads(1,1,1)] void main(unsigned GI : SV_GroupIndex) { @@ -10,7 +10,7 @@ void main(unsigned GI : SV_GroupIndex) { // semantic parameters and provides the expected void(void) signature that // drivers expect for entry points. 
-//CHECK: define void @main() #[[ENTRY_ATTR:#]]{ +//CHECK: define void @main() #[[#ENTRY_ATTR:]] { //CHECK-NEXT: entry: //CHECK-NEXT: %0 = call i32 @llvm.dx.flattened.thread.id.in.group() //CHECK-NEXT: call void @"?main@@YAXI@Z"(i32 %0) @@ -19,4 +19,4 @@ void main(unsigned GI : SV_GroupIndex) { // Verify that the entry had the expected dx.shader attribute -//CHECK: attributes #[[ENTRY_ATTR]] = { {{.*}}"dx.shader"="compute"{{.*}} } +//CHECK: attributes #[[#ENTRY_ATTR]] = { {{.*}}"hlsl.shader"="compute"{{.*}} } From 2e39b57837aa1790b3ee078fa532bb1748a609c7 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 27 Feb 2024 11:13:46 +0000 Subject: [PATCH 439/546] Reapply "[RemoveDIs] Print non-intrinsic debug info in textual IR output (#79281)" This reapplication changes debug intrinsic declaration removal to only take place when printing final IR, so that the processing format of the Module does not affect the output. This reverts commit d128448efdd4e2bf3c9bc9a5b43ae642aa78026f. --- llvm/include/llvm/IR/Module.h | 12 +++ llvm/include/llvm/IR/PrintPasses.h | 19 ++++ llvm/lib/IR/AsmWriter.cpp | 75 +++++++--------- llvm/lib/IR/BasicBlock.cpp | 4 - llvm/lib/IR/IRPrintingPasses.cpp | 32 +++---- llvm/lib/IR/Module.cpp | 22 +++++ llvm/lib/IRPrinter/IRPrintingPasses.cpp | 29 +++---- .../Generic/inline-alloca-ordering.ll | 7 +- .../DebugInfo/Generic/inline-dbg-values.ll | 8 +- llvm/test/DebugInfo/dpvalue-print-nocrash.ll | 3 +- .../print-non-instruction-debug-info.ll | 86 +++++++++++++++++++ .../pr33641_remove_arg_dbgvalue.ll | 18 ++-- .../DeadArgElim/2010-04-30-DbgInfo.ll | 32 ++++++- .../IROutliner/outlining-debug-statements.ll | 7 -- 14 files changed, 245 insertions(+), 109 deletions(-) create mode 100644 llvm/test/DebugInfo/print-non-instruction-debug-info.ll diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h index 68a89dc45c283..e41a5940540b4 100644 --- a/llvm/include/llvm/IR/Module.h +++ b/llvm/include/llvm/IR/Module.h @@ -218,6 +218,11 @@ class 
LLVM_EXTERNAL_VISIBILITY Module { /// \ref BasicBlock. bool IsNewDbgInfoFormat; + /// Used when printing this module in the new debug info format; removes all + /// declarations of debug intrinsics that are replaced by non-intrinsic + /// records in the new format. + void removeDebugIntrinsicDeclarations(); + /// \see BasicBlock::convertToNewDbgValues. void convertToNewDbgValues() { for (auto &F : *this) { @@ -234,6 +239,13 @@ class LLVM_EXTERNAL_VISIBILITY Module { IsNewDbgInfoFormat = false; } + void setIsNewDbgInfoFormat(bool UseNewFormat) { + if (UseNewFormat && !IsNewDbgInfoFormat) + convertToNewDbgValues(); + else if (!UseNewFormat && IsNewDbgInfoFormat) + convertFromNewDbgValues(); + } + /// The Module constructor. Note that there is no default constructor. You /// must provide a name for the module upon construction. explicit Module(StringRef ModuleID, LLVMContext& C); diff --git a/llvm/include/llvm/IR/PrintPasses.h b/llvm/include/llvm/IR/PrintPasses.h index 95b97e76c867c..3803bd05cbe57 100644 --- a/llvm/include/llvm/IR/PrintPasses.h +++ b/llvm/include/llvm/IR/PrintPasses.h @@ -78,6 +78,25 @@ std::string doSystemDiff(StringRef Before, StringRef After, StringRef OldLineFormat, StringRef NewLineFormat, StringRef UnchangedLineFormat); +/// Used to temporarily set the debug info format of a function, module, or +/// basic block for the duration of this object's lifetime, after which the +/// prior state will be restored. 
+template class ScopedDbgInfoFormatSetter { + T &Obj; + bool OldState; + +public: + ScopedDbgInfoFormatSetter(T &Obj, bool NewState) + : Obj(Obj), OldState(Obj.IsNewDbgInfoFormat) { + Obj.setIsNewDbgInfoFormat(NewState); + } + ~ScopedDbgInfoFormatSetter() { Obj.setIsNewDbgInfoFormat(OldState); } +}; + +template +ScopedDbgInfoFormatSetter(T &Obj, bool NewState) + -> ScopedDbgInfoFormatSetter; + } // namespace llvm #endif // LLVM_IR_PRINTPASSES_H diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index fba404c9b027c..a3da6ca811489 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -861,7 +861,7 @@ class SlotTracker : public AbstractSlotTrackerStorage { /// Add all of the metadata from an instruction. void processInstructionMetadata(const Instruction &I); - /// Add all of the metadata from an instruction. + /// Add all of the metadata from a DbgRecord. void processDbgRecordMetadata(const DbgRecord &DPV); }; @@ -1140,6 +1140,9 @@ void SlotTracker::processFunctionMetadata(const Function &F) { void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) { if (const DPValue *DPV = dyn_cast(&DR)) { + // Process metadata used by DbgRecords; we only specifically care about the + // DILocalVariable, DILocation, and DIAssignID fields, as the Value and + // Expression fields should only be printed inline and so do not use a slot. CreateMetadataSlot(DPV->getVariable()); if (DPV->isDbgAssign()) CreateMetadataSlot(DPV->getAssignID()); @@ -2703,6 +2706,7 @@ class AssemblyWriter { void printDPValue(const DPValue &DPI); void printDPLabel(const DPLabel &DPL); void printDbgRecord(const DbgRecord &DPI); + void printDbgRecordLine(const DbgRecord &DPI); void printUseListOrder(const Value *V, const std::vector &Shuffle); void printUseLists(const Function *F); @@ -3885,9 +3889,6 @@ void AssemblyWriter::printTypeIdentities() { /// printFunction - Print all aspects of a function. 
void AssemblyWriter::printFunction(const Function *F) { - bool ConvertBack = F->IsNewDbgInfoFormat; - if (ConvertBack) - const_cast(F)->convertFromNewDbgValues(); if (AnnotationWriter) AnnotationWriter->emitFunctionAnnot(F, Out); if (F->isMaterializable()) @@ -4030,8 +4031,6 @@ void AssemblyWriter::printFunction(const Function *F) { Out << "}\n"; } - if (ConvertBack) - const_cast(F)->convertToNewDbgValues(); Machine.purgeFunction(); } @@ -4098,6 +4097,8 @@ void AssemblyWriter::printBasicBlock(const BasicBlock *BB) { // Output all of the instructions in the basic block... for (const Instruction &I : *BB) { + for (const DbgRecord &DR : I.getDbgValueRange()) + printDbgRecordLine(DR); printInstructionLine(I); } @@ -4611,12 +4612,10 @@ void AssemblyWriter::printDbgRecord(const DbgRecord &DR) { llvm_unreachable("Unexpected DbgRecord kind"); } -void AssemblyWriter::printDPValue(const DPValue &Value) { - // There's no formal representation of a DPValue -- print purely as a - // debugging aid. - Out << " DPValue "; - - switch (Value.getType()) { +void AssemblyWriter::printDPValue(const DPValue &DPV) { + auto WriterCtx = getContext(); + Out << "#dbg_"; + switch (DPV.getType()) { case DPValue::LocationType::Value: Out << "value"; break; @@ -4629,35 +4628,39 @@ void AssemblyWriter::printDPValue(const DPValue &Value) { default: llvm_unreachable("Tried to print a DPValue with an invalid LocationType!"); } - Out << " { "; - auto WriterCtx = getContext(); - WriteAsOperandInternal(Out, Value.getRawLocation(), WriterCtx, true); + Out << "("; + WriteAsOperandInternal(Out, DPV.getRawLocation(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, Value.getVariable(), WriterCtx, true); + WriteAsOperandInternal(Out, DPV.getVariable(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, Value.getExpression(), WriterCtx, true); + WriteAsOperandInternal(Out, DPV.getExpression(), WriterCtx, true); Out << ", "; - if (Value.isDbgAssign()) { - WriteAsOperandInternal(Out, 
Value.getAssignID(), WriterCtx, true); + if (DPV.isDbgAssign()) { + WriteAsOperandInternal(Out, DPV.getAssignID(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, Value.getRawAddress(), WriterCtx, true); + WriteAsOperandInternal(Out, DPV.getRawAddress(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, Value.getAddressExpression(), WriterCtx, true); + WriteAsOperandInternal(Out, DPV.getAddressExpression(), WriterCtx, true); Out << ", "; } - WriteAsOperandInternal(Out, Value.getDebugLoc().get(), WriterCtx, true); - Out << " marker @" << Value.getMarker(); - Out << " }"; + WriteAsOperandInternal(Out, DPV.getDebugLoc().getAsMDNode(), WriterCtx, true); + Out << ")"; +} + +/// printDbgRecordLine - Print a DbgRecord with indentation and a newline +/// character. +void AssemblyWriter::printDbgRecordLine(const DbgRecord &DR) { + // Print lengthier indentation to bring out-of-line with instructions. + Out << " "; + printDbgRecord(DR); + Out << '\n'; } void AssemblyWriter::printDPLabel(const DPLabel &Label) { - // There's no formal representation of a DPLabel -- print purely as - // a debugging aid. - Out << " DPLabel { "; auto WriterCtx = getContext(); + Out << "#dbg_label("; WriteAsOperandInternal(Out, Label.getLabel(), WriterCtx, true); - Out << " marker @" << Label.getMarker(); - Out << " }"; + Out << ")"; } void AssemblyWriter::printMetadataAttachments( @@ -4805,19 +4808,11 @@ void BasicBlock::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, void Module::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, bool ShouldPreserveUseListOrder, bool IsForDebug) const { - // RemoveDIs: always print with debug-info in intrinsic format. 
- bool ConvertAfter = IsNewDbgInfoFormat; - if (IsNewDbgInfoFormat) - const_cast(this)->convertFromNewDbgValues(); - SlotTracker SlotTable(this); formatted_raw_ostream OS(ROS); AssemblyWriter W(OS, SlotTable, this, AAW, IsForDebug, ShouldPreserveUseListOrder); W.printModule(this); - - if (ConvertAfter) - const_cast(this)->convertToNewDbgValues(); } void NamedMDNode::print(raw_ostream &ROS, bool IsForDebug) const { @@ -4908,8 +4903,6 @@ void DPValue::print(raw_ostream &ROS, bool IsForDebug) const { void DPMarker::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { - // There's no formal representation of a DPMarker -- print purely as a - // debugging aid. formatted_raw_ostream OS(ROS); SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = @@ -4931,8 +4924,6 @@ void DPLabel::print(raw_ostream &ROS, bool IsForDebug) const { void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { - // There's no formal representation of a DPValue -- print purely as a - // debugging aid. formatted_raw_ostream OS(ROS); SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = @@ -4950,8 +4941,6 @@ void DPValue::print(raw_ostream &ROS, ModuleSlotTracker &MST, void DPLabel::print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const { - // There's no formal representation of a DbgLabelRecord -- print purely as - // a debugging aid. formatted_raw_ostream OS(ROS); SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 6ea876fde5ec6..c110c4c1437c3 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -61,10 +61,6 @@ DPMarker *BasicBlock::createMarker(InstListType::iterator It) { } void BasicBlock::convertToNewDbgValues() { - // Is the command line option set? 
- if (!UseNewDbgInfoFormat) - return; - IsNewDbgInfoFormat = true; // Iterate over all instructions in the instruction list, collecting dbg.value diff --git a/llvm/lib/IR/IRPrintingPasses.cpp b/llvm/lib/IR/IRPrintingPasses.cpp index b19210e776ed5..84fb8e6c66b8f 100644 --- a/llvm/lib/IR/IRPrintingPasses.cpp +++ b/llvm/lib/IR/IRPrintingPasses.cpp @@ -23,6 +23,11 @@ using namespace llvm; +cl::opt WriteNewDbgInfoFormat( + "write-experimental-debuginfo", + cl::desc("Write debug info in the new non-intrinsic format"), + cl::init(false)); + namespace { class PrintModulePassWrapper : public ModulePass { @@ -39,11 +44,14 @@ class PrintModulePassWrapper : public ModulePass { ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {} bool runOnModule(Module &M) override { - // RemoveDIs: there's no textual representation of the DPValue debug-info, - // convert to dbg.values before writing out. - bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat; - if (IsNewDbgInfoFormat) - M.convertFromNewDbgValues(); + // RemoveDIs: Regardless of the format we've processed this module in, use + // `WriteNewDbgInfoFormat` to determine which format we use to write it. + ScopedDbgInfoFormatSetter FormatSetter(M, WriteNewDbgInfoFormat); + // Remove intrinsic declarations when printing in the new format. + // TODO: Move this into Module::setIsNewDbgInfoFormat when we're ready to + // update test output. + if (WriteNewDbgInfoFormat) + M.removeDebugIntrinsicDeclarations(); if (llvm::isFunctionInPrintList("*")) { if (!Banner.empty()) @@ -62,9 +70,6 @@ class PrintModulePassWrapper : public ModulePass { } } - if (IsNewDbgInfoFormat) - M.convertToNewDbgValues(); - return false; } @@ -87,11 +92,9 @@ class PrintFunctionPassWrapper : public FunctionPass { // This pass just prints a banner followed by the function as it's processed. bool runOnFunction(Function &F) override { - // RemoveDIs: there's no textual representation of the DPValue debug-info, - // convert to dbg.values before writing out. 
- bool IsNewDbgInfoFormat = F.IsNewDbgInfoFormat; - if (IsNewDbgInfoFormat) - F.convertFromNewDbgValues(); + // RemoveDIs: Regardless of the format we've processed this function in, use + // `WriteNewDbgInfoFormat` to determine which format we use to write it. + ScopedDbgInfoFormatSetter FormatSetter(F, WriteNewDbgInfoFormat); if (isFunctionInPrintList(F.getName())) { if (forcePrintModuleIR()) @@ -101,9 +104,6 @@ class PrintFunctionPassWrapper : public FunctionPass { OS << Banner << '\n' << static_cast(F); } - if (IsNewDbgInfoFormat) - F.convertToNewDbgValues(); - return false; } diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 1946db2ee0be7..a8696ed9e3ce5 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -85,6 +85,28 @@ Module::~Module() { IFuncList.clear(); } +void Module::removeDebugIntrinsicDeclarations() { + auto *DeclareIntrinsicFn = + Intrinsic::getDeclaration(this, Intrinsic::dbg_declare); + assert((!isMaterialized() || DeclareIntrinsicFn->hasZeroLiveUses()) && + "Debug declare intrinsic should have had uses removed."); + DeclareIntrinsicFn->eraseFromParent(); + auto *ValueIntrinsicFn = + Intrinsic::getDeclaration(this, Intrinsic::dbg_value); + assert((!isMaterialized() || ValueIntrinsicFn->hasZeroLiveUses()) && + "Debug value intrinsic should have had uses removed."); + ValueIntrinsicFn->eraseFromParent(); + auto *AssignIntrinsicFn = + Intrinsic::getDeclaration(this, Intrinsic::dbg_assign); + assert((!isMaterialized() || AssignIntrinsicFn->hasZeroLiveUses()) && + "Debug assign intrinsic should have had uses removed."); + AssignIntrinsicFn->eraseFromParent(); + auto *LabelntrinsicFn = Intrinsic::getDeclaration(this, Intrinsic::dbg_label); + assert((!isMaterialized() || LabelntrinsicFn->hasZeroLiveUses()) && + "Debug label intrinsic should have had uses removed."); + LabelntrinsicFn->eraseFromParent(); +} + std::unique_ptr Module::createRNG(const StringRef Name) const { SmallString<32> Salt(Name); diff --git 
a/llvm/lib/IRPrinter/IRPrintingPasses.cpp b/llvm/lib/IRPrinter/IRPrintingPasses.cpp index 52b242b4dcd52..026fa4d746d8b 100644 --- a/llvm/lib/IRPrinter/IRPrintingPasses.cpp +++ b/llvm/lib/IRPrinter/IRPrintingPasses.cpp @@ -22,6 +22,8 @@ using namespace llvm; +extern cl::opt WriteNewDbgInfoFormat; + PrintModulePass::PrintModulePass() : OS(dbgs()) {} PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, bool ShouldPreserveUseListOrder, @@ -31,11 +33,14 @@ PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, EmitSummaryIndex(EmitSummaryIndex) {} PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &AM) { - // RemoveDIs: there's no textual representation of the DPValue debug-info, - // convert to dbg.values before writing out. - bool ShouldConvert = M.IsNewDbgInfoFormat; - if (ShouldConvert) - M.convertFromNewDbgValues(); + // RemoveDIs: Regardless of the format we've processed this module in, use + // `WriteNewDbgInfoFormat` to determine which format we use to write it. + ScopedDbgInfoFormatSetter FormatSetter(M, WriteNewDbgInfoFormat); + // Remove intrinsic declarations when printing in the new format. + // TODO: Move this into Module::setIsNewDbgInfoFormat when we're ready to + // update test output. + if (WriteNewDbgInfoFormat) + M.removeDebugIntrinsicDeclarations(); if (llvm::isFunctionInPrintList("*")) { if (!Banner.empty()) @@ -63,9 +68,6 @@ PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &AM) { Index->print(OS); } - if (ShouldConvert) - M.convertToNewDbgValues(); - return PreservedAnalyses::all(); } @@ -75,11 +77,9 @@ PrintFunctionPass::PrintFunctionPass(raw_ostream &OS, const std::string &Banner) PreservedAnalyses PrintFunctionPass::run(Function &F, FunctionAnalysisManager &) { - // RemoveDIs: there's no textual representation of the DPValue debug-info, - // convert to dbg.values before writing out. 
- bool ShouldConvert = F.IsNewDbgInfoFormat; - if (ShouldConvert) - F.convertFromNewDbgValues(); + // RemoveDIs: Regardless of the format we've processed this function in, use + // `WriteNewDbgInfoFormat` to determine which format we use to write it. + ScopedDbgInfoFormatSetter FormatSetter(F, WriteNewDbgInfoFormat); if (isFunctionInPrintList(F.getName())) { if (forcePrintModuleIR()) @@ -88,8 +88,5 @@ PreservedAnalyses PrintFunctionPass::run(Function &F, OS << Banner << '\n' << static_cast(F); } - if (ShouldConvert) - F.convertToNewDbgValues(); - return PreservedAnalyses::all(); } diff --git a/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll b/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll index 9ff3d80af80d1..9f401ceb5b6f4 100644 --- a/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll +++ b/llvm/test/DebugInfo/Generic/inline-alloca-ordering.ll @@ -15,8 +15,6 @@ ;; doesn't transfer the dbg.value to the entry block. This needs Special ;; Handling once we get rid of debug-intrinsics. 
-; CHECK: declare void @llvm.dbg.value(metadata, - ; CHECK: define i32 @bar() ; CHECK-NEXT: %1 = alloca [65 x i32], align 16 ; CHECK-NEXT: call void @ext() @@ -24,9 +22,10 @@ ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 0, metadata !10, metadata !DIExpression()), !dbg !12 ; CHECK-NEXT: call void @init(ptr %1) +; CHECK: declare void @llvm.dbg.value(metadata, + declare void @ext() declare void @init(ptr) -declare void @llvm.dbg.value(metadata, metadata, metadata) define internal i32 @foo() !dbg !4 { %1 = alloca [65 x i32], align 16 @@ -42,6 +41,8 @@ define i32 @bar() !dbg !16 { ret i32 %1 } +declare void @llvm.dbg.value(metadata, metadata, metadata) + !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!8, !9} !llvm.ident = !{!10} diff --git a/llvm/test/DebugInfo/Generic/inline-dbg-values.ll b/llvm/test/DebugInfo/Generic/inline-dbg-values.ll index 7d580d6fba37a..b0390b9f78f41 100644 --- a/llvm/test/DebugInfo/Generic/inline-dbg-values.ll +++ b/llvm/test/DebugInfo/Generic/inline-dbg-values.ll @@ -7,8 +7,6 @@ ;; ;; test should be inlined into test2 -; CHECK: declare void @llvm.dbg.value(metadata, - ; CHECK: define i32 @test2 ; CHECK-NEXT: entry: ; CHECK: %k.addr.i = alloca i32, align 4 @@ -47,6 +45,8 @@ ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 1, metadata ![[KVAR]], metadata !DIExpression()), !dbg ![[KLINE]] ; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 1, metadata ![[K2VAR]], metadata !DIExpression()), !dbg ![[GLINE]] ; +; CHECK: declare void @llvm.dbg.value(metadata, +; ;; Test that the metadata maps onto the correct things, and that the DILocations ;; attached to the intrinsics have been inlined. 
; @@ -100,8 +100,6 @@ return: ; preds = %if.end, %if.then ret i32 %3, !dbg !20 } -declare void @llvm.dbg.value(metadata, metadata, metadata) #1 - declare i32 @_Z8test_exti(i32) define i32 @test2(i32 %foo, i32 %bar) !dbg !10 { @@ -118,6 +116,8 @@ try.cont: ; preds = %catch, %invoke.cont ret i32 0, !dbg !30 } +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!31} diff --git a/llvm/test/DebugInfo/dpvalue-print-nocrash.ll b/llvm/test/DebugInfo/dpvalue-print-nocrash.ll index 9f120af13ac9c..0a618c6780d1d 100755 --- a/llvm/test/DebugInfo/dpvalue-print-nocrash.ll +++ b/llvm/test/DebugInfo/dpvalue-print-nocrash.ll @@ -2,8 +2,7 @@ ; RUN: opt -passes="instcombine" -debug %s -o /dev/null 2>&1 | FileCheck %s ; REQUIRES: asserts -; CHECK: CLONE: DPValue value { -; CHECK-SAME: marker @0x0 +; CHECK: CLONE: #dbg_value( define ptr @func_10(i32 %p_11) { entry: diff --git a/llvm/test/DebugInfo/print-non-instruction-debug-info.ll b/llvm/test/DebugInfo/print-non-instruction-debug-info.ll new file mode 100644 index 0000000000000..f8271df146fe9 --- /dev/null +++ b/llvm/test/DebugInfo/print-non-instruction-debug-info.ll @@ -0,0 +1,86 @@ +;; Test that we can write in the new debug info format. +; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=false --write-experimental-debuginfo=false < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,OLDDBG --implicit-check-not=llvm.dbg +; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=false --write-experimental-debuginfo=true < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,NEWDBG --implicit-check-not=llvm.dbg + +;; Test also that the new flag is independent of the flag that enables use of +;; these non-instruction debug info during LLVM passes. 
+; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=true --write-experimental-debuginfo=false < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,OLDDBG --implicit-check-not=llvm.dbg +; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=true --write-experimental-debuginfo=true < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,NEWDBG --implicit-check-not=llvm.dbg + +; CHECK: @f(i32 %[[VAL_A:[0-9a-zA-Z]+]]) +; CHECK-NEXT: entry: +; OLDDBG-NEXT: call void @llvm.dbg.value(metadata i32 %[[VAL_A]], metadata ![[VAR_A:[0-9]+]], metadata !DIExpression()), !dbg ![[LOC_1:[0-9]+]] +; NEWDBG-NEXT: {{^}} #dbg_value(i32 %[[VAL_A]], ![[VAR_A:[0-9]+]], !DIExpression(), ![[LOC_1:[0-9]+]]) +; CHECK-NEXT: {{^}} %[[VAL_B:[0-9a-zA-Z]+]] = alloca +; OLDDBG-NEXT: call void @llvm.dbg.declare(metadata ptr %[[VAL_B]], metadata ![[VAR_B:[0-9]+]], metadata !DIExpression()), !dbg ![[LOC_2:[0-9]+]] +; NEWDBG-NEXT: {{^}} #dbg_declare(ptr %[[VAL_B]], ![[VAR_B:[0-9]+]], !DIExpression(), ![[LOC_2:[0-9]+]]) +; CHECK-NEXT: {{^}} %[[VAL_ADD:[0-9a-zA-Z]+]] = add i32 %[[VAL_A]], 5 +; OLDDBG-NEXT: call void @llvm.dbg.value(metadata !DIArgList(i32 %[[VAL_A]], i32 %[[VAL_ADD]]), metadata ![[VAR_A]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus)), !dbg ![[LOC_3:[0-9]+]] +; NEWDBG-NEXT: {{^}} #dbg_value(!DIArgList(i32 %[[VAL_A]], i32 %[[VAL_ADD]]), ![[VAR_A]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), ![[LOC_3:[0-9]+]]) +; OLDDBG-NEXT: call void @llvm.dbg.label(metadata ![[LABEL_ID:[0-9]+]]) +; NEWDBG-NEXT: {{^}} #dbg_label(![[LABEL_ID:[0-9]+]]) +; CHECK-NEXT: {{^}} store i32 %[[VAL_ADD]]{{.+}}, !DIAssignID ![[ASSIGNID:[0-9]+]] +; OLDDBG-NEXT: call void @llvm.dbg.assign(metadata i32 %[[VAL_ADD]], metadata ![[VAR_B]], metadata !DIExpression(), metadata ![[ASSIGNID]], metadata ptr %[[VAL_B]], metadata !DIExpression()), !dbg ![[LOC_4:[0-9]+]] +; NEWDBG-NEXT: {{^}} #dbg_assign(i32 %[[VAL_ADD]], ![[VAR_B]], !DIExpression(), 
![[ASSIGNID]], ptr %[[VAL_B]], !DIExpression(), ![[LOC_4:[0-9]+]]) +; CHECK-NEXT: {{^}} ret i32 + +; OLDDBG-DAG: declare void @llvm.dbg.value +; OLDDBG-DAG: declare void @llvm.dbg.declare +; OLDDBG-DAG: declare void @llvm.dbg.assign + +; CHECK-DAG: llvm.dbg.cu +; CHECK-DAG: ![[VAR_A]] = !DILocalVariable(name: "a" +; CHECK-DAG: ![[VAR_B]] = !DILocalVariable(name: "b" +; CHECK-DAG: ![[LOC_1]] = !DILocation(line: 3, column: 15 +; CHECK-DAG: ![[LOC_2]] = !DILocation(line: 3, column: 20 +; CHECK-DAG: ![[LOC_3]] = !DILocation(line: 3, column: 25 +; CHECK-DAG: ![[LOC_4]] = !DILocation(line: 3, column: 30 +; CHECK-DAG: ![[LABEL_ID]] = !DILabel( + +define dso_local i32 @f(i32 %a) !dbg !7 { +entry: + call void @llvm.dbg.value(metadata i32 %a, metadata !20, metadata !DIExpression()), !dbg !30 + %b = alloca i32, !dbg !30, !DIAssignID !40 + call void @llvm.dbg.declare(metadata ptr %b, metadata !21, metadata !DIExpression()), !dbg !31 + %add = add i32 %a, 5, !dbg !31 + call void @llvm.dbg.value(metadata !DIArgList(i32 %a, i32 %add), metadata !20, metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus)), !dbg !32 + call void @llvm.dbg.label(metadata !50), !dbg !32 + store i32 %add, ptr %b, !dbg !32, !DIAssignID !40 + call void @llvm.dbg.assign(metadata i32 %add, metadata !21, metadata !DIExpression(), metadata !40, metadata ptr %b, metadata !DIExpression()), !dbg !33 + ret i32 %add, !dbg !33 + +} + +declare void @llvm.dbg.value(metadata, metadata, metadata) +declare void @llvm.dbg.declare(metadata, metadata, metadata) +declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) +declare void @llvm.dbg.label(metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 18.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "print.c", directory: 
"/tmp") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 5} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 18.0.0"} +!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !13) +!8 = !DISubroutineType(types: !9) +!9 = !{!12, !12} +!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!13 = !{!20, !21} +!20 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 3, type: !12) +!21 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 3, type: !12) +!30 = !DILocation(line: 3, column: 15, scope: !7) +!31 = !DILocation(line: 3, column: 20, scope: !7) +!32 = !DILocation(line: 3, column: 25, scope: !7) +!33 = !DILocation(line: 3, column: 30, scope: !7) +!40 = distinct !DIAssignID() +!50 = !DILabel(scope: !7, name: "label", file: !1, line: 3) diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll index 83226e56b5acd..e0777f9ecee8b 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/pr33641_remove_arg_dbgvalue.ll @@ -30,8 +30,7 @@ define internal void @bar(%p_t %p) { ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@bar ; CGSCC-SAME: (ptr nocapture nofree readnone [[P:%.*]]) #[[ATTR0]] { -; CGSCC-NEXT: call void @llvm.dbg.value(metadata ptr [[P]], metadata [[META3:![0-9]+]], metadata !DIExpression()) -; CGSCC-SAME: !dbg [[DBG5:![0-9]+]] +; CGSCC-NEXT: tail call void @llvm.dbg.value(metadata ptr [[P]], metadata [[META3:![0-9]+]], metadata !DIExpression()), !dbg [[DBG5:![0-9]+]] ; CGSCC-NEXT: ret void ; call void @llvm.dbg.value(metadata %p_t %p, metadata !4, 
metadata !5), !dbg !6 @@ -52,21 +51,20 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !6 = !DILocation(line: 1, column: 1, scope: !3) ;. ; TUNIT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } -; TUNIT: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. ; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; CGSCC: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. -; TUNIT: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) -; TUNIT: [[META1:![0-9]+]] = !DIFile(filename: "test.c", directory: "") +; TUNIT: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +; TUNIT: [[META1]] = !DIFile(filename: "test.c", directory: "") ; TUNIT: [[META2:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} ;. 
-; CGSCC: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) -; CGSCC: [[META1:![0-9]+]] = !DIFile(filename: "test.c", directory: "") +; CGSCC: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +; CGSCC: [[META1]] = !DIFile(filename: "test.c", directory: "") ; CGSCC: [[META2:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} -; CGSCC: [[META3]] = !DILocalVariable(name: "p", scope: !4) -; CGSCC: [[META4:![0-9]+]] = distinct !DISubprogram(name: "bar", scope: null, spFlags: DISPFlagDefinition, unit: !0) -; CGSCC: [[DBG5]] = !DILocation(line: 1, column: 1, scope: !4) +; CGSCC: [[META3]] = !DILocalVariable(name: "p", scope: [[META4:![0-9]+]]) +; CGSCC: [[META4]] = distinct !DISubprogram(name: "bar", scope: null, spFlags: DISPFlagDefinition, unit: [[META0]]) +; CGSCC: [[DBG5]] = !DILocation(line: 1, column: 1, scope: [[META4]]) ;. ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; TUNIT: {{.*}} diff --git a/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll b/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll index 0dee72f4f6b6f..5e49437db0a74 100644 --- a/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll +++ b/llvm/test/Transforms/DeadArgElim/2010-04-30-DbgInfo.ll @@ -8,12 +8,21 @@ define ptr @vfs_addname(ptr %name, i32 %len, i32 %hash, i32 %flags) nounwind ssp !dbg !1 { ; +; CHECK-LABEL: define {{[^@]+}}@vfs_addname +; CHECK-SAME: (ptr [[NAME:%.*]], i32 [[LEN:%.*]], i32 [[HASH:%.*]], i32 [[FLAGS:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG4:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata ptr [[NAME]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12:![0-9]+]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[LEN]], metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[HASH]], metadata [[META14:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[FLAGS]], metadata [[META15:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]] +; CHECK-NEXT: [[TMP0:%.*]] = call fastcc ptr @add_name_internal(ptr [[NAME]], i32 [[HASH]]) #[[ATTR3:[0-9]+]], !dbg [[DBG16:![0-9]+]] +; CHECK-NEXT: ret ptr [[TMP0]], !dbg [[DBG16]] +; entry: call void @llvm.dbg.value(metadata ptr %name, metadata !0, metadata !DIExpression()), !dbg !DILocation(scope: !1) call void @llvm.dbg.value(metadata i32 %len, metadata !10, metadata !DIExpression()), !dbg !DILocation(scope: !1) call void @llvm.dbg.value(metadata i32 %hash, metadata !11, metadata !DIExpression()), !dbg !DILocation(scope: !1) call void @llvm.dbg.value(metadata i32 %flags, metadata !12, metadata !DIExpression()), !dbg !DILocation(scope: !1) -; CHECK: call fastcc ptr @add_name_internal(ptr %name, i32 %hash) [[NUW:#[0-9]+]], !dbg !{{[0-9]+}} %0 = call fastcc ptr 
@add_name_internal(ptr %name, i32 %len, i32 %hash, i8 zeroext 0, i32 %flags) nounwind, !dbg !13 ; [#uses=1] ret ptr %0, !dbg !13 } @@ -22,6 +31,24 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone define internal fastcc ptr @add_name_internal(ptr %name, i32 %len, i32 %hash, i8 zeroext %extra, i32 %flags) noinline nounwind ssp !dbg !16 { ; +; CHECK-LABEL: define {{[^@]+}}@add_name_internal +; CHECK-SAME: (ptr [[NAME:%.*]], i32 [[HASH:%.*]]) #[[ATTR1:[0-9]+]] !dbg [[DBG18:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata ptr [[NAME]], metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23:![0-9]+]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 poison, metadata [[META24:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[HASH]], metadata [[META25:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i8 poison, metadata [[META26:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] +; CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 poison, metadata [[META27:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[HASH]], 0, !dbg [[DBG28:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP0]], label [[BB:%.*]], label [[BB1:%.*]], !dbg [[DBG28]] +; CHECK: bb: +; CHECK-NEXT: br label [[BB2:%.*]], !dbg [[DBG30:![0-9]+]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB2]], !dbg [[DBG31:![0-9]+]] +; CHECK: bb2: +; CHECK-NEXT: [[DOT0:%.*]] = phi ptr [ @.str, [[BB]] ], [ [[NAME]], [[BB1]] ] +; CHECK-NEXT: ret ptr [[DOT0]], !dbg [[DBG31]] +; entry: call void @llvm.dbg.value(metadata ptr %name, metadata !15, metadata !DIExpression()), !dbg !DILocation(scope: !16) call void @llvm.dbg.value(metadata i32 %len, metadata !20, metadata !DIExpression()), !dbg !DILocation(scope: !16) @@ -47,7 +74,6 @@ declare void @llvm.dbg.value(metadata, metadata, 
metadata) nounwind readnone ; CHECK: attributes #0 = { nounwind ssp } ; CHECK: attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #2 = { noinline nounwind ssp } -; CHECK: attributes [[NUW]] = { nounwind } !llvm.dbg.cu = !{!3} !llvm.module.flags = !{!30} @@ -68,9 +94,7 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone !14 = distinct !DILexicalBlock(line: 12, column: 0, file: !28, scope: !1) !15 = !DILocalVariable(name: "name", line: 17, arg: 1, scope: !16, file: !2, type: !6) ; CHECK: !DISubprogram(name: "add_name_internal" -; CHECK-SAME: type: ![[MD:[0-9]+]] !16 = distinct !DISubprogram(name: "add_name_internal", linkageName: "add_name_internal", line: 22, isLocal: true, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, file: !28, scope: !2, type: !17) -; CHECK: ![[MD]] = !DISubroutineType(cc: DW_CC_nocall, types: !{{[0-9]+}}) !17 = !DISubroutineType(types: !18) !18 = !{!6, !6, !9, !9, !19, !9} !19 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned char", size: 8, align: 8, encoding: DW_ATE_unsigned_char) diff --git a/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll b/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll index f932788c73e2a..bf846c310a525 100644 --- a/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll +++ b/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll @@ -51,14 +51,7 @@ entry: ret void } -; CHECK: define internal void @outlined_ir_func_0(ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]]) #1 { ; CHECK: entry_to_outline: -; CHECK-NEXT: store i32 2, ptr [[ARG0]], align 4 -; CHECK-NEXT: store i32 3, ptr [[ARG1]], align 4 -; CHECK-NEXT: store i32 4, ptr [[ARG2]], align 4 -; CHECK-NEXT: [[AL:%.*]] = load i32, ptr [[ARG0]], align 4 -; CHECK-NEXT: [[BL:%.*]] = load i32, ptr [[ARG1]], align 4 -; CHECK-NEXT: [[CL:%.*]] = load i32, ptr [[ARG2]], align 4 !0 = !DIFile(filename: "foo.c", directory: 
"/tmp") !1 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) From 8c2ae42b3e1c6aa7c18f873edcebff7c0b45a37e Mon Sep 17 00:00:00 2001 From: Kupa-Martin <84517188+Kupa-Martin@users.noreply.github.com> Date: Tue, 27 Feb 2024 11:58:59 -0300 Subject: [PATCH 440/546] =?UTF-8?q?[Clang][Sema]=20Fix=20missing=20warning?= =?UTF-8?q?=20when=20comparing=20mismatched=20enums=20in=20=E2=80=A6=20(#8?= =?UTF-8?q?1418)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …C mode Factored logic from `CheckImplicitConversion` into new methods `Expr::getEnumConstantDecl` and `Expr::getEnumCoercedType` for use in `checkEnumArithmeticConversions`. Fix #29217 --- clang/docs/ReleaseNotes.rst | 3 ++ clang/include/clang/AST/Expr.h | 13 ++++++ clang/lib/AST/Expr.cpp | 15 +++++++ clang/lib/Sema/SemaChecking.cpp | 11 +---- clang/lib/Sema/SemaExpr.cpp | 3 +- clang/test/Sema/builtins-elementwise-math.c | 4 ++ .../Sema/warn-compare-enum-types-mismatch.c | 42 +++++++++++++++++++ ...=> warn-conditional-enum-types-mismatch.c} | 2 +- clang/test/Sema/warn-overlap.c | 4 +- 9 files changed, 84 insertions(+), 13 deletions(-) create mode 100644 clang/test/Sema/warn-compare-enum-types-mismatch.c rename clang/test/Sema/{warn-conditional-emum-types-mismatch.c => warn-conditional-enum-types-mismatch.c} (88%) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 515dffa28df18..c60c5682dbd82 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -197,6 +197,9 @@ Improvements to Clang's time-trace Bug Fixes in This Version ------------------------- +- Fixed missing warnings when comparing mismatched enumeration constants + in C (`#29217 `). + - Clang now accepts elaborated-type-specifiers that explicitly specialize a member class template for an implicit instantiation of a class template. 
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index 3fc481a62a78a..bf0622bdeca30 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -153,6 +153,12 @@ class Expr : public ValueStmt { TR = t; } + /// If this expression is an enumeration constant, return the + /// enumeration type under which said constant was declared. + /// Otherwise return the expression's type. + /// Note this effectively circumvents the weak typing of C's enum constants + QualType getEnumCoercedType(const ASTContext &Ctx) const; + ExprDependence getDependence() const { return static_cast(ExprBits.Dependent); } @@ -471,6 +477,13 @@ class Expr : public ValueStmt { /// bit-fields, but it will return null for a conditional bit-field. FieldDecl *getSourceBitField(); + /// If this expression refers to an enum constant, retrieve its declaration + EnumConstantDecl *getEnumConstantDecl(); + + const EnumConstantDecl *getEnumConstantDecl() const { + return const_cast(this)->getEnumConstantDecl(); + } + const FieldDecl *getSourceBitField() const { return const_cast(this)->getSourceBitField(); } diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index cc0407131d793..b4de2155adceb 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -263,6 +263,14 @@ namespace { } } +QualType Expr::getEnumCoercedType(const ASTContext &Ctx) const { + if (isa(this->getType())) + return this->getType(); + else if (const auto *ECD = this->getEnumConstantDecl()) + return Ctx.getTypeDeclType(cast(ECD->getDeclContext())); + return this->getType(); +} + SourceLocation Expr::getExprLoc() const { switch (getStmtClass()) { case Stmt::NoStmtClass: llvm_unreachable("statement without class"); @@ -4098,6 +4106,13 @@ FieldDecl *Expr::getSourceBitField() { return nullptr; } +EnumConstantDecl *Expr::getEnumConstantDecl() { + Expr *E = this->IgnoreParenImpCasts(); + if (auto *DRE = dyn_cast(E)) + return dyn_cast(DRE->getDecl()); + return nullptr; +} + bool 
Expr::refersToVectorElement() const { // FIXME: Why do we not just look at the ObjectKind here? const Expr *E = this->IgnoreParens(); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 7d8a2cf280c33..0de76ee119cf8 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -16148,15 +16148,8 @@ static void CheckImplicitConversion(Sema &S, Expr *E, QualType T, // Diagnose conversions between different enumeration types. // In C, we pretend that the type of an EnumConstantDecl is its enumeration // type, to give us better diagnostics. - QualType SourceType = E->getType(); - if (!S.getLangOpts().CPlusPlus) { - if (DeclRefExpr *DRE = dyn_cast(E)) - if (EnumConstantDecl *ECD = dyn_cast(DRE->getDecl())) { - EnumDecl *Enum = cast(ECD->getDeclContext()); - SourceType = S.Context.getTypeDeclType(Enum); - Source = S.Context.getCanonicalType(SourceType).getTypePtr(); - } - } + QualType SourceType = E->getEnumCoercedType(S.Context); + Source = S.Context.getCanonicalType(SourceType).getTypePtr(); if (const EnumType *SourceEnum = Source->getAs()) if (const EnumType *TargetEnum = Target->getAs()) diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 403839f77a2b7..2a0e86c37f1bf 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -1497,7 +1497,8 @@ static void checkEnumArithmeticConversions(Sema &S, Expr *LHS, Expr *RHS, // // Warn on this in all language modes. Produce a deprecation warning in C++20. // Eventually we will presumably reject these cases (in C++23 onwards?). 
- QualType L = LHS->getType(), R = RHS->getType(); + QualType L = LHS->getEnumCoercedType(S.Context), + R = RHS->getEnumCoercedType(S.Context); bool LEnum = L->isUnscopedEnumerationType(), REnum = R->isUnscopedEnumerationType(); bool IsCompAssign = ACK == Sema::ACK_CompAssign; diff --git a/clang/test/Sema/builtins-elementwise-math.c b/clang/test/Sema/builtins-elementwise-math.c index e7b36285fa7dc..2e05337273ee4 100644 --- a/clang/test/Sema/builtins-elementwise-math.c +++ b/clang/test/Sema/builtins-elementwise-math.c @@ -76,6 +76,7 @@ void test_builtin_elementwise_add_sat(int i, short s, double d, float4 v, int3 i enum f { three }; enum f x = __builtin_elementwise_add_sat(one, three); + // expected-warning@-1 {{comparison of different enumeration types ('enum e' and 'enum f')}} _BitInt(32) ext; // expected-warning {{'_BitInt' in C17 and earlier is a Clang extension}} ext = __builtin_elementwise_add_sat(ext, ext); @@ -134,6 +135,7 @@ void test_builtin_elementwise_sub_sat(int i, short s, double d, float4 v, int3 i enum f { three }; enum f x = __builtin_elementwise_sub_sat(one, three); + // expected-warning@-1 {{comparison of different enumeration types ('enum e' and 'enum f')}} _BitInt(32) ext; // expected-warning {{'_BitInt' in C17 and earlier is a Clang extension}} ext = __builtin_elementwise_sub_sat(ext, ext); @@ -189,6 +191,7 @@ void test_builtin_elementwise_max(int i, short s, double d, float4 v, int3 iv, u enum f { three }; enum f x = __builtin_elementwise_max(one, three); + // expected-warning@-1 {{comparison of different enumeration types ('enum e' and 'enum f')}} _BitInt(32) ext; // expected-warning {{'_BitInt' in C17 and earlier is a Clang extension}} ext = __builtin_elementwise_max(ext, ext); @@ -244,6 +247,7 @@ void test_builtin_elementwise_min(int i, short s, double d, float4 v, int3 iv, u enum f { three }; enum f x = __builtin_elementwise_min(one, three); + // expected-warning@-1 {{comparison of different enumeration types ('enum e' and 'enum f')}} 
_BitInt(32) ext; // expected-warning {{'_BitInt' in C17 and earlier is a Clang extension}} ext = __builtin_elementwise_min(ext, ext); diff --git a/clang/test/Sema/warn-compare-enum-types-mismatch.c b/clang/test/Sema/warn-compare-enum-types-mismatch.c new file mode 100644 index 0000000000000..2b72aae16b977 --- /dev/null +++ b/clang/test/Sema/warn-compare-enum-types-mismatch.c @@ -0,0 +1,42 @@ +// RUN: %clang_cc1 -x c -fsyntax-only -verify -Wenum-compare -Wno-unused-comparison %s +// RUN: %clang_cc1 -x c++ -fsyntax-only -verify -Wenum-compare -Wno-unused-comparison %s + +typedef enum EnumA { + A +} EnumA; + +enum EnumB { + B +}; + +enum { + C +}; + +void foo(void) { + enum EnumA a = A; + enum EnumB b = B; + A == B; + // expected-warning@-1 {{comparison of different enumeration types}} + a == (B); + // expected-warning@-1 {{comparison of different enumeration types}} + a == b; + // expected-warning@-1 {{comparison of different enumeration types}} + A > B; + // expected-warning@-1 {{comparison of different enumeration types}} + A >= b; + // expected-warning@-1 {{comparison of different enumeration types}} + a > b; + // expected-warning@-1 {{comparison of different enumeration types}} + (A) <= ((B)); + // expected-warning@-1 {{comparison of different enumeration types}} + a < B; + // expected-warning@-1 {{comparison of different enumeration types}} + a < b; + // expected-warning@-1 {{comparison of different enumeration types}} + + // In the following cases we purposefully differ from GCC and dont warn + a == C; + A < C; + b >= C; +} diff --git a/clang/test/Sema/warn-conditional-emum-types-mismatch.c b/clang/test/Sema/warn-conditional-enum-types-mismatch.c similarity index 88% rename from clang/test/Sema/warn-conditional-emum-types-mismatch.c rename to clang/test/Sema/warn-conditional-enum-types-mismatch.c index c9e2eddc7764b..f039245b6fabe 100644 --- a/clang/test/Sema/warn-conditional-emum-types-mismatch.c +++ b/clang/test/Sema/warn-conditional-enum-types-mismatch.c @@ 
-21,7 +21,7 @@ int get_flag(int cond) { #ifdef __cplusplus // expected-warning@-2 {{conditional expression between different enumeration types ('ro' and 'rw')}} #else - // expected-no-diagnostics + // expected-warning@-4 {{conditional expression between different enumeration types ('enum ro' and 'enum rw')}} #endif } diff --git a/clang/test/Sema/warn-overlap.c b/clang/test/Sema/warn-overlap.c index 1eddfd1077fd9..2db07ebcd17b8 100644 --- a/clang/test/Sema/warn-overlap.c +++ b/clang/test/Sema/warn-overlap.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -verify -Wtautological-overlap-compare %s -// RUN: %clang_cc1 -fsyntax-only -verify -Wall -Wno-unused -Wno-loop-analysis %s +// RUN: %clang_cc1 -fsyntax-only -verify -Wtautological-overlap-compare -Wno-enum-compare %s +// RUN: %clang_cc1 -fsyntax-only -verify -Wall -Wno-unused -Wno-loop-analysis -Wno-enum-compare %s #define mydefine 2 From 13c359aa9b5eae0715494505d989cfc8ed1d39cf Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 27 Feb 2024 15:03:42 +0000 Subject: [PATCH 441/546] [X86] ReplaceNodeResults - truncate sub-128-bit vectors as shuffles directly (#83120) We were scalarizing these truncations, but in most cases we can widen the source vector to 128-bits and perform the truncation as a shuffle directly (which will usually lower as a PACK or PSHUFB). For the cases where the widening and shuffle isn't legal we can leave it to generic legalization to scalarize for us. 
Fixes #81883 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 28 +++++++++++++------------ llvm/test/CodeGen/X86/extract-concat.ll | 19 +++++++---------- llvm/test/CodeGen/X86/vec_anyext.ll | 23 +++----------------- llvm/test/CodeGen/X86/vec_cast.ll | 2 +- 4 files changed, 26 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a86f13135173b..a134ca32c66bd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -32341,20 +32341,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } } - if (128 % InBits == 0) { + if ((128 % InBits) == 0 && WidenVT.is128BitVector()) { // 128 bit and smaller inputs should avoid truncate all together and - // just use a build_vector that will become a shuffle. - // TODO: Widen and use a shuffle directly? - SmallVector Ops(WidenNumElts, DAG.getUNDEF(EltVT)); - // Use the original element count so we don't do more scalar opts than - // necessary. - for (unsigned i=0; i < MinElts; ++i) { - SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In, - DAG.getIntPtrConstant(i, dl)); - Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val); - } - Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops)); - return; + // use a shuffle. 
+ if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) { + int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits(); + SmallVector TruncMask(WidenNumElts, -1); + for (unsigned I = 0; I < MinElts; ++I) + TruncMask[I] = Scale * I; + SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128); + assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) && + "Illegal vector type in truncation"); + WidenIn = DAG.getBitcast(WidenVT, WidenIn); + Results.push_back( + DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask)); + return; + } } // With AVX512 there are some cases that can use a target specific diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll index 93dbe99882fe0..e7415dcf229f4 100644 --- a/llvm/test/CodeGen/X86/extract-concat.ll +++ b/llvm/test/CodeGen/X86/extract-concat.ll @@ -9,22 +9,17 @@ define void @foo(<4 x float> %in, ptr %out) { ; SSE2-LABEL: foo: ; SSE2: # %bb.0: ; SSE2-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx -; SSE2-NEXT: shll $8, %ecx -; SSE2-NEXT: orl %eax, %ecx -; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: orl %ecx, %eax -; SSE2-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; SSE2-NEXT: movl %eax, (%rdi) +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movd %xmm0, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: foo: ; SSE42: # %bb.0: ; SSE42-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE42-NEXT: movl $255, %eax ; SSE42-NEXT: pinsrb $3, %eax, %xmm0 ; SSE42-NEXT: movd %xmm0, (%rdi) @@ -33,7 +28,7 @@ define void @foo(<4 x 
float> %in, ptr %out) { ; AVX-LABEL: foo: ; AVX: # %bb.0: ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: movl $255, %eax ; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll index 09e4a4b3a773d..e229165be967a 100644 --- a/llvm/test/CodeGen/X86/vec_anyext.ll +++ b/llvm/test/CodeGen/X86/vec_anyext.ll @@ -112,27 +112,10 @@ define <4 x i8> @func_8_16(ptr %a, ptr %b) nounwind { ; ; X64-LABEL: func_8_16: ; X64: # %bb.0: -; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: vmovd %eax, %xmm0 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: shrl $16, %ecx -; X64-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: shrq $32, %rcx -; X64-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 -; X64-NEXT: shrq $48, %rax -; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 -; X64-NEXT: movq (%rsi), %rax -; X64-NEXT: vmovd %eax, %xmm1 -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: shrl $16, %ecx -; X64-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: shrq $32, %rcx -; X64-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 -; X64-NEXT: shrq $48, %rax -; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; X64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] ; X64-NEXT: retq %F = load <4 x i16>, ptr %a %G = trunc <4 x i16> %F to <4 x i8> diff --git a/llvm/test/CodeGen/X86/vec_cast.ll b/llvm/test/CodeGen/X86/vec_cast.ll index e0089354cc953..0a6bc2f59b685 100644 --- a/llvm/test/CodeGen/X86/vec_cast.ll +++ b/llvm/test/CodeGen/X86/vec_cast.ll @@ -156,7 +156,7 @@ define <3 x i16> @h(<3 x i32> %a) nounwind { ; CHECK-WIN-LABEL: h: ; CHECK-WIN: # %bb.0: ; CHECK-WIN-NEXT: movdqa (%rcx), 
%xmm0 -; CHECK-WIN-NEXT: movl (%rcx), %eax +; CHECK-WIN-NEXT: movd %xmm0, %eax ; CHECK-WIN-NEXT: pextrw $2, %xmm0, %edx ; CHECK-WIN-NEXT: pextrw $4, %xmm0, %ecx ; CHECK-WIN-NEXT: # kill: def $ax killed $ax killed $eax From b3ae6c205e4afcf1a9f7d7204a659a27ca700237 Mon Sep 17 00:00:00 2001 From: Alexander Yermolovich <43973793+ayermolo@users.noreply.github.com> Date: Tue, 27 Feb 2024 07:05:21 -0800 Subject: [PATCH 442/546] [CLANG][DWARF] Do not emit -ggnu-pubnames for split dwarf version 5. (#82840) When -gsplit-dwarf is passed in clang emmmits -ggnu-pubnames which results in .debug_gnu_pubnames/..debug_gnu_pubtypes being generated. This is used by GDB, but not by LLDB. Changed so that these sections are not emitted for LLDB tuning. --- clang/lib/Driver/ToolChains/Clang.cpp | 7 ++++--- clang/test/Driver/split-debug.c | 5 +++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 6e1b7e8657d0d..dbfc729bba24c 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4479,9 +4479,10 @@ renderDebugOptions(const ToolChain &TC, const Driver &D, const llvm::Triple &T, options::OPT_gpubnames, options::OPT_gno_pubnames); if (DwarfFission != DwarfFissionKind::None || (PubnamesArg && checkDebugInfoOption(PubnamesArg, Args, D, TC))) - if (!PubnamesArg || - (!PubnamesArg->getOption().matches(options::OPT_gno_gnu_pubnames) && - !PubnamesArg->getOption().matches(options::OPT_gno_pubnames))) + if (DebuggerTuning != llvm::DebuggerKind::LLDB && + (!PubnamesArg || + (!PubnamesArg->getOption().matches(options::OPT_gno_gnu_pubnames) && + !PubnamesArg->getOption().matches(options::OPT_gno_pubnames)))) CmdArgs.push_back(PubnamesArg && PubnamesArg->getOption().matches( options::OPT_gpubnames) ? 
"-gpubnames" diff --git a/clang/test/Driver/split-debug.c b/clang/test/Driver/split-debug.c index 968f33b4cc035..a2a3dc0235450 100644 --- a/clang/test/Driver/split-debug.c +++ b/clang/test/Driver/split-debug.c @@ -124,3 +124,8 @@ // G1_NOSPLIT: "-debug-info-kind=line-tables-only" // G1_NOSPLIT-NOT: "-split-dwarf-file" // G1_NOSPLIT-NOT: "-split-dwarf-output" + +/// Do not generate -ggnu-pubnames for -glldb +// RUN: %clang -### -c -target x86_64 -gsplit-dwarf -g -glldb %s 2>&1 | FileCheck %s --check-prefixes=GLLDBSPLIT + +// GLLDBSPLIT-NOT: "-ggnu-pubnames" From 04db60d15069494f4effad7a1001965904b36e6f Mon Sep 17 00:00:00 2001 From: choikwa <5455710+choikwa@users.noreply.github.com> Date: Tue, 27 Feb 2024 10:13:59 -0500 Subject: [PATCH 443/546] [AMDGPU] Prevent hang in SIFoldOperands by caching uses (#82099) foldOperands() for REG_SEQUENCE has recursion that can trigger an infinite loop as the method can modify the operand order, which messes up the range-based for loop. This patch fixes the issue by caching the uses for processing beforehand, and then iterating over the cache rather using the instruction iterator. 
--- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 15 +++++++++------ .../CodeGen/AMDGPU/si-fold-reg-sequence.mir | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/si-fold-reg-sequence.mir diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index d16d8ebd41a54..634b4aeb30a73 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -775,20 +775,23 @@ void SIFoldOperands::foldOperand( Register RegSeqDstReg = UseMI->getOperand(0).getReg(); unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); - for (auto &RSUse : make_early_inc_range(MRI->use_nodbg_operands(RegSeqDstReg))) { - MachineInstr *RSUseMI = RSUse.getParent(); + // Grab the use operands first + SmallVector UsesToProcess; + for (auto &Use : MRI->use_nodbg_operands(RegSeqDstReg)) + UsesToProcess.push_back(&Use); + for (auto *RSUse : UsesToProcess) { + MachineInstr *RSUseMI = RSUse->getParent(); if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI, - RSUseMI->getOperandNo(&RSUse), FoldList)) + RSUseMI->getOperandNo(RSUse), FoldList)) continue; - if (RSUse.getSubReg() != RegSeqDstSubReg) + if (RSUse->getSubReg() != RegSeqDstSubReg) continue; - foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(&RSUse), FoldList, + foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList, CopiesToReplace); } - return; } diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-reg-sequence.mir b/llvm/test/CodeGen/AMDGPU/si-fold-reg-sequence.mir new file mode 100644 index 0000000000000..7852f5d0c96f5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/si-fold-reg-sequence.mir @@ -0,0 +1,18 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=si-fold-operands -verify-machineinstrs -o - %s + +--- +name: fold_reg_sequence +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + %0:sreg_32 = S_MOV_B32 0 + %1:sreg_32 = S_MOV_B32 429 + %2:sreg_64 = REG_SEQUENCE 
killed %1, %subreg.sub0, %0, %subreg.sub1 + %3:vgpr_32 = V_MUL_HI_U32_e64 $vgpr2, %2.sub0, implicit $exec + %4:vgpr_32 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s32), addrspace 1) + %5:vgpr_32 = V_MUL_HI_U32_e64 %4, %2.sub0, implicit $exec + S_ENDPGM 0 + +... + From 70e61f5bbbb0958bebedffb1be285fdefb0e2f0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 27 Feb 2024 07:36:01 +0100 Subject: [PATCH 444/546] [clang][Interp][NFC] Rename InitPtr{,Pop} to FinishInit{,Pop} The old name clashes with the Init opcode. --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 10 +++++----- clang/lib/AST/Interp/ByteCodeExprGen.h | 6 +++--- clang/lib/AST/Interp/ByteCodeStmtGen.cpp | 2 +- clang/lib/AST/Interp/Interp.h | 4 ++-- clang/lib/AST/Interp/Opcodes.td | 4 ++-- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index a71b6e82817e4..b3a7bc587b647 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -897,7 +897,7 @@ bool ByteCodeExprGen::visitInitList(ArrayRef Inits, if (!this->visitInitializer(Init)) return false; - if (!this->emitInitPtrPop(E)) + if (!this->emitFinishInitPop(E)) return false; // Base initializers don't increase InitIndex, since they don't count // into the Record's fields. 
@@ -940,7 +940,7 @@ bool ByteCodeExprGen::visitArrayElemInit(unsigned ElemIndex, return false; if (!this->visitInitializer(Init)) return false; - return this->emitInitPtrPop(Init); + return this->emitFinishInitPop(Init); } template @@ -2151,7 +2151,7 @@ bool ByteCodeExprGen::VisitCXXUuidofExpr(const CXXUuidofExpr *E) { } } - return this->emitInitPtr(E); + return this->emitFinishInit(E); } template @@ -2364,7 +2364,7 @@ bool ByteCodeExprGen::visitZeroRecordInitializer(const Record *R, return false; if (!this->visitZeroRecordInitializer(B.R, E)) return false; - if (!this->emitInitPtrPop(E)) + if (!this->emitFinishInitPop(E)) return false; } @@ -2544,7 +2544,7 @@ bool ByteCodeExprGen::visitExpr(const Expr *E) { if (!visitInitializer(E)) return false; - if (!this->emitInitPtr(E)) + if (!this->emitFinishInit(E)) return false; return this->emitRetValue(E); } diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h index 8f7a0c2fc3c10..8769d1d7b8502 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.h +++ b/clang/lib/AST/Interp/ByteCodeExprGen.h @@ -182,7 +182,7 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, if (!visitInitializer(Init)) return false; - if (!this->emitInitPtr(Init)) + if (!this->emitFinishInit(Init)) return false; return this->emitPopPtr(Init); @@ -196,7 +196,7 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, if (!visitInitializer(Init)) return false; - if (!this->emitInitPtr(Init)) + if (!this->emitFinishInit(Init)) return false; return this->emitPopPtr(Init); @@ -210,7 +210,7 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, if (!visitInitializer(I)) return false; - return this->emitInitPtrPop(I); + return this->emitFinishInitPop(I); } bool visitInitList(ArrayRef Inits, const Expr *E); diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp index 7e2043f8de90b..d9213b12cbd08 100644 --- a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp +++ 
b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp @@ -200,7 +200,7 @@ bool ByteCodeStmtGen::visitFunc(const FunctionDecl *F) { return false; if (!this->visitInitializer(InitExpr)) return false; - if (!this->emitInitPtrPop(InitExpr)) + if (!this->emitFinishInitPop(InitExpr)) return false; } else if (const IndirectFieldDecl *IFD = Init->getIndirectMember()) { assert(IFD->getChainingSize() >= 2); diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 2b36a05e1af96..caddcb5f97803 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -1280,14 +1280,14 @@ inline bool GetPtrThisBase(InterpState &S, CodePtr OpPC, uint32_t Off) { return true; } -inline bool InitPtrPop(InterpState &S, CodePtr OpPC) { +inline bool FinishInitPop(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); if (Ptr.canBeInitialized()) Ptr.initialize(); return true; } -inline bool InitPtr(InterpState &S, CodePtr OpPC) { +inline bool FinishInit(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.peek(); if (Ptr.canBeInitialized()) diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td index e36c42d450fc9..3f71087ebee56 100644 --- a/clang/lib/AST/Interp/Opcodes.td +++ b/clang/lib/AST/Interp/Opcodes.td @@ -326,8 +326,8 @@ def GetPtrBasePop : Opcode { let Args = [ArgUint32]; } -def InitPtrPop : Opcode; -def InitPtr : Opcode; +def FinishInitPop : Opcode; +def FinishInit : Opcode; def GetPtrDerivedPop : Opcode { let Args = [ArgUint32]; From 48bc9022b49cc452bd871c4942e5a059d226c9eb Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Tue, 27 Feb 2024 07:23:51 -0800 Subject: [PATCH 445/546] [MemProf] Fix the stack updating handling of pruned contexts (#81322) Fix a bug in the handling of cases where a callsite's stack ids partially overlap with the pruned context during matching of calls to the graph contructed from the profiled contexts. This fix makes the code match the comments. 
--- .../IPO/MemProfContextDisambiguation.cpp | 2 +- .../MemProfContextDisambiguation/inlined3.ll | 193 ++++++++++++++++++ 2 files changed, 194 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/inlined3.ll diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 323f33fcc862d..271d3ed40030b 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -1381,7 +1381,7 @@ void CallsiteContextGraph::updateStackNodes() { // not fully matching stack contexts. To do this, subtract any context ids // found in caller nodes of the last node found above. if (Ids.back() != getLastStackId(Call)) { - for (const auto &PE : CurNode->CallerEdges) { + for (const auto &PE : LastNode->CallerEdges) { set_subtract(StackSequenceContextIds, PE->getContextIds()); if (StackSequenceContextIds.empty()) break; diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/inlined3.ll b/llvm/test/Transforms/MemProfContextDisambiguation/inlined3.ll new file mode 100644 index 0000000000000..39a595897c370 --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/inlined3.ll @@ -0,0 +1,193 @@ +;; This test ensures that the logic which assigns calls to stack nodes +;; correctly handles an inlined callsite with stack ids that partially +;; overlap with a trimmed context. In particular when it also partially +;; overlaps with a longer non-trimmed context that doesn't match all of +;; the inlined callsite stack ids. 
+ +;; The profile data and call stacks were all manually added, but the code +;; would be structured something like the following (fairly contrived to +;; result in the type of control flow needed to test): + +;; void A(bool b) { +;; if (b) +;; // cold: stack ids 6, 2, 8 (trimmed ids 10) +;; // not cold: stack ids 6, 7 (trimmed ids 9, 11) +;; new char[10]; // stack id 6 +;; else +;; // not cold: stack ids 1, 2, 8, 3, 4 +;; // cold: stack ids 1, 2, 8, 3, 5 +;; new char[10]; // stack id 1 +;; } +;; +;; void XZ() { +;; A(false); // stack ids 2, 8 (e.g. X inlined into Z) +;; } +;; +;; void XZN() { +;; // This is the tricky one to get right. We want to ensure it gets +;; // correctly correlated with a stack node for the trimmed 6, 2, 8 +;; // context shown in A. It should *not* be correlated with the longer +;; // untrimmed 1, 2, 8, 3, 4|5 contexts. +;; A(true); // stack ids 2, 8, 9 (e.g. X inlined into Z inlined into N) +;; } +;; +;; void Y() { +;; A(true); // stack id 7 +;; } +;; +;; void M() { +;; XZ(); // stack id 3 +;; } +;; +;; int main() { +;; M(); // stack id 4 (leads to not cold allocation) +;; M(); // stack id 5 (leads to cold allocation) +;; XZN(); // stack id 11 (leads to cold allocation) +;; Y(); // stack id 10 (leads to not cold allocation) +;; } + +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=STATS --check-prefix=REMARKS + +; REMARKS: created clone _Z1Ab.memprof.1 +; REMARKS: created clone _Z2XZv.memprof.1 +; REMARKS: created clone _Z1Mv.memprof.1 +;; Make sure the inlined context in _Z3XZNv, which partially overlaps +;; trimmed cold context, and also partially overlaps completely +;; unrelated contexts, correctly calls a cloned version of Z1Ab, +;; which will call the cold annotated 
allocation. +; REMARKS: call in clone _Z3XZNv assigned to call function clone _Z1Ab.memprof.1 +; REMARKS: call in clone main assigned to call function clone _Z1Mv.memprof.1 +; REMARKS: call in clone _Z1Mv.memprof.1 assigned to call function clone _Z2XZv.memprof.1 +; REMARKS: call in clone _Z2XZv.memprof.1 assigned to call function clone _Z1Ab +; REMARKS: call in clone main assigned to call function clone _Z1Mv +; REMARKS: call in clone _Z1Mv assigned to call function clone _Z2XZv +; REMARKS: call in clone _Z2XZv assigned to call function clone _Z1Ab.memprof.1 +; REMARKS: call in clone _Z1Ab.memprof.1 marked with memprof allocation attribute cold +; REMARKS: call in clone _Z1Yv assigned to call function clone _Z1Ab +; REMARKS: call in clone _Z1Ab marked with memprof allocation attribute notcold +; REMARKS: call in clone _Z1Ab marked with memprof allocation attribute cold +; REMARKS: call in clone _Z1Ab.memprof.1 marked with memprof allocation attribute notcold + + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local void @_Z1Ab(i1 noundef zeroext %b) { +entry: + br i1 %b, label %if.then, label %if.else + +if.then: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !5, !callsite !11 + br label %if.end + +if.else: + %call2 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !memprof !0, !callsite !10 + br label %if.end + +if.end: + ret void +} + +; Function Attrs: nobuiltin +declare ptr @_Znam(i64) #0 + +define dso_local void @_Z2XZv() local_unnamed_addr #0 { +entry: + tail call void @_Z1Ab(i1 noundef zeroext false), !callsite !12 + ret void +} + +define dso_local void @_Z1Mv() local_unnamed_addr #0 { +entry: + tail call void @_Z2XZv(), !callsite !19 + ret void +} + +define dso_local void @_Z3XZNv() local_unnamed_addr { +entry: + tail call void @_Z1Ab(i1 noundef zeroext true), !callsite !15 + ret void +} + +define dso_local 
void @_Z1Yv() local_unnamed_addr { +entry: + tail call void @_Z1Ab(i1 noundef zeroext true), !callsite !17 + ret void +} + +define dso_local noundef i32 @main() local_unnamed_addr { +entry: + tail call void @_Z1Mv(), !callsite !13 ;; Not cold context + tail call void @_Z1Mv(), !callsite !14 ;; Cold context + tail call void @_Z3XZNv(), !callsite !16 ;; Cold context + tail call void @_Z1Yv(), !callsite !18 ;; Not cold context + ret i32 0 +} + +attributes #0 = { nobuiltin } +attributes #7 = { builtin } + +!0 = !{!1, !3} +;; Not cold context via first call to _Z1Mv in main +!1 = !{!2, !"notcold"} +!2 = !{i64 1, i64 2, i64 8, i64 3, i64 4} +;; Cold context via second call to _Z1Mv in main +!3 = !{!4, !"cold"} +!4 = !{i64 1, i64 2, i64 8, i64 3, i64 5} +!5 = !{!6, !8} +;; Cold (trimmed) context via call to _Z3XZNv in main +!6 = !{!7, !"cold"} +!7 = !{i64 6, i64 2, i64 8} +;; Not cold (trimmed) context via call to _Z1Yv in main +!8 = !{!9, !"notcold"} +!9 = !{i64 6, i64 7} +!10 = !{i64 1} +!11 = !{i64 6} +!12 = !{i64 2, i64 8} +!13 = !{i64 4} +!14 = !{i64 5} +;; Inlined context in _Z3XZNv, which includes part of trimmed cold context +!15 = !{i64 2, i64 8, i64 9} +!16 = !{i64 11} +!17 = !{i64 7} +!18 = !{i64 10} +!19 = !{i64 3} + +; IR: define {{.*}} @_Z1Ab(i1 noundef zeroext %b) +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]] +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]] +; IR: define {{.*}} @_Z2XZv() +; IR: call {{.*}} @_Z1Ab.memprof.1(i1 noundef zeroext false) +; IR: define {{.*}} @_Z1Mv() +; IR: call {{.*}} @_Z2XZv() +;; Make sure the inlined context in _Z3XZNv, which partially overlaps +;; trimmed cold context, and also partially overlaps completely +;; unrelated contexts, correctly calls the cloned version of Z1Ab +;; that will call the cold annotated allocation. 
+; IR: define {{.*}} @_Z3XZNv() +; IR: call {{.*}} @_Z1Ab.memprof.1(i1 noundef zeroext true) +; IR: define {{.*}} @_Z1Yv() +; IR: call {{.*}} @_Z1Ab(i1 noundef zeroext true) +; IR: define {{.*}} @main() +; IR: call {{.*}} @_Z1Mv() +; IR: call {{.*}} @_Z1Mv.memprof.1() +; IR: call {{.*}} @_Z3XZNv() +; IR: call {{.*}} @_Z1Yv() +; IR: define {{.*}} @_Z1Ab.memprof.1(i1 noundef zeroext %b) +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[COLD]] +; IR: call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]] +; IR: define {{.*}} @_Z2XZv.memprof.1() +; IR: call {{.*}} @_Z1Ab(i1 noundef zeroext false) +; IR: define {{.*}} @_Z1Mv.memprof.1() +; IR: call {{.*}} @_Z2XZv.memprof.1() + +; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) +; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) +; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis From 822142ffdfbe93f213c2c6b3f2aec7fe5f0af072 Mon Sep 17 00:00:00 2001 From: Joachim Date: Tue, 27 Feb 2024 16:24:55 +0100 Subject: [PATCH 446/546] [OpenMP][OMPD] libompd must not link libomp (#83119) Fixes a regression introduced in 91ccd8248. The code for libompd includes kmp.h for enum kmp_sched. The dependency to hwloc is not necessary. Avoid the dependency by skipping the definitions in kmp.h using types from hwloc.h. 
Fixes #80750 --- openmp/libompd/src/CMakeLists.txt | 3 ++- openmp/libompd/src/omp-icv.cpp | 2 ++ openmp/runtime/src/kmp.h | 5 +++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/openmp/libompd/src/CMakeLists.txt b/openmp/libompd/src/CMakeLists.txt index 0402a01772010..ba228d5181048 100644 --- a/openmp/libompd/src/CMakeLists.txt +++ b/openmp/libompd/src/CMakeLists.txt @@ -13,7 +13,8 @@ cmake_minimum_required(VERSION 3.20.0) add_library (ompd SHARED TargetValue.cpp omp-debug.cpp omp-state.cpp omp-icv.cpp) -target_link_libraries(ompd omp) # ensure generated import library is created first +# libompd must not link against libomp, there is no code dependency. +add_dependencies(ompd omp) # ensure generated import library is created first set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") diff --git a/openmp/libompd/src/omp-icv.cpp b/openmp/libompd/src/omp-icv.cpp index 4a2c2b6e8bc62..0288e963a6974 100644 --- a/openmp/libompd/src/omp-icv.cpp +++ b/openmp/libompd/src/omp-icv.cpp @@ -18,7 +18,9 @@ #include "omp.h" #include "ompd-private.h" #include "TargetValue.h" +#define OMPD_SKIP_HWLOC 1 #include "kmp.h" +#undef OMPD_SKIP_HWLOC #include /* The ICVs ompd-final-var and ompd-implicit-var below are for backward diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index 259c57b5afbca..121e7e959129e 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -103,7 +103,8 @@ class kmp_stats_list; #define KMP_USE_HIER_SCHED KMP_AFFINITY_SUPPORTED #endif -#if KMP_USE_HWLOC && KMP_AFFINITY_SUPPORTED +// OMPD_SKIP_HWLOC used in libompd/omp-icv.cpp to avoid OMPD depending on hwloc +#if KMP_USE_HWLOC && KMP_AFFINITY_SUPPORTED && !defined(OMPD_SKIP_HWLOC) #include "hwloc.h" #ifndef HWLOC_OBJ_NUMANODE #define HWLOC_OBJ_NUMANODE HWLOC_OBJ_NODE @@ -689,7 +690,7 @@ typedef BOOL (*kmp_SetThreadGroupAffinity_t)(HANDLE, const GROUP_AFFINITY *, extern kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity; #endif /* KMP_OS_WINDOWS */ -#if 
KMP_USE_HWLOC +#if KMP_USE_HWLOC && !defined(OMPD_SKIP_HWLOC) extern hwloc_topology_t __kmp_hwloc_topology; extern int __kmp_hwloc_error; #endif From 55783bd0f9cfc30aa93c718919dab5419d86a2c6 Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Tue, 27 Feb 2024 10:52:55 -0500 Subject: [PATCH 447/546] [HIP] fix host min/max in header (#82956) CUDA defines min/max functions for host in global namespace. HIP header needs to define them too to be compatible. Currently only min/max(int, int) is defined. This causes wrong result for arguments that are out of range for int. This patch defines host min/max functions to be compatible with CUDA. Also allows users to define `__HIP_NO_HOST_MIN_MAX_IN_GLOBAL_NAMESPACE__` to disable host max/min in global namespace. min/max functions with mixed signed/unsigned integer parameters are not defined unless `__HIP_DEFINE_MIXED_HOST_MIN_MAX__` is defined. Fixes: SWDEV-446564 --- clang/lib/Headers/__clang_hip_math.h | 72 +++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 6 deletions(-) diff --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h index 11e1e7d032586..34d1de0a06005 100644 --- a/clang/lib/Headers/__clang_hip_math.h +++ b/clang/lib/Headers/__clang_hip_math.h @@ -1306,15 +1306,75 @@ float min(float __x, float __y) { return __builtin_fminf(__x, __y); } __DEVICE__ double min(double __x, double __y) { return __builtin_fmin(__x, __y); } -#if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) -__host__ inline static int min(int __arg1, int __arg2) { - return __arg1 < __arg2 ? __arg1 : __arg2; +// Define host min/max functions. +#if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) && \ + !defined(__HIP_NO_HOST_MIN_MAX_IN_GLOBAL_NAMESPACE__) + +#pragma push_macro("DEFINE_MIN_MAX_FUNCTIONS") +#pragma push_macro("DEFINE_MIN_MAX_FUNCTIONS") +#define DEFINE_MIN_MAX_FUNCTIONS(ret_type, type1, type2) \ + inline ret_type min(const type1 __a, const type2 __b) { \ + return (__a < __b) ? 
__a : __b; \ + } \ + inline ret_type max(const type1 __a, const type2 __b) { \ + return (__a > __b) ? __a : __b; \ + } + +// Define min and max functions for same type comparisons +DEFINE_MIN_MAX_FUNCTIONS(int, int, int) +DEFINE_MIN_MAX_FUNCTIONS(unsigned int, unsigned int, unsigned int) +DEFINE_MIN_MAX_FUNCTIONS(long, long, long) +DEFINE_MIN_MAX_FUNCTIONS(unsigned long, unsigned long, unsigned long) +DEFINE_MIN_MAX_FUNCTIONS(long long, long long, long long) +DEFINE_MIN_MAX_FUNCTIONS(unsigned long long, unsigned long long, + unsigned long long) + +// The host min/max functions below accept mixed signed/unsigned integer +// parameters and perform unsigned comparisons, which may produce unexpected +// results if a signed integer was passed unintentionally. To avoid this +// happening silently, these overloaded functions are not defined by default. +// However, for compatibility with CUDA, they will be defined if users define +// __HIP_DEFINE_MIXED_HOST_MIN_MAX__. +#ifdef __HIP_DEFINE_MIXED_HOST_MIN_MAX__ +DEFINE_MIN_MAX_FUNCTIONS(unsigned int, int, unsigned int) +DEFINE_MIN_MAX_FUNCTIONS(unsigned int, unsigned int, int) +DEFINE_MIN_MAX_FUNCTIONS(unsigned long, long, unsigned long) +DEFINE_MIN_MAX_FUNCTIONS(unsigned long, unsigned long, long) +DEFINE_MIN_MAX_FUNCTIONS(unsigned long long, long long, unsigned long long) +DEFINE_MIN_MAX_FUNCTIONS(unsigned long long, unsigned long long, long long) +#endif // ifdef __HIP_DEFINE_MIXED_HOST_MIN_MAX__ + +// Floating-point comparisons using built-in functions +inline float min(float const __a, float const __b) { + return __builtin_fminf(__a, __b); +} +inline double min(double const __a, double const __b) { + return __builtin_fmin(__a, __b); +} +inline double min(float const __a, double const __b) { + return __builtin_fmin(__a, __b); +} +inline double min(double const __a, float const __b) { + return __builtin_fmin(__a, __b); } -__host__ inline static int max(int __arg1, int __arg2) { - return __arg1 > __arg2 ? 
__arg1 : __arg2; +inline float max(float const __a, float const __b) { + return __builtin_fmaxf(__a, __b); +} +inline double max(double const __a, double const __b) { + return __builtin_fmax(__a, __b); +} +inline double max(float const __a, double const __b) { + return __builtin_fmax(__a, __b); } -#endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) +inline double max(double const __a, float const __b) { + return __builtin_fmax(__a, __b); +} + +#pragma pop_macro("DEFINE_MIN_MAX_FUNCTIONS") + +#endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) && + // !defined(__HIP_NO_HOST_MIN_MAX_IN_GLOBAL_NAMESPACE__) #endif #pragma pop_macro("__DEVICE__") From d612d593eff4af7976250023bbff34d2c10f7526 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 27 Feb 2024 07:36:46 +0100 Subject: [PATCH 448/546] [clang][Interp] Fix local lvalue compound literals Same fix we had for global ones: leave a pointer on the stack. --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 28 +++++++++++++++++++----- clang/lib/AST/Interp/Interp.h | 13 +++++++++++ clang/lib/AST/Interp/Opcodes.td | 1 + clang/test/AST/Interp/c.c | 16 ++++++++++++++ 4 files changed, 52 insertions(+), 6 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index b3a7bc587b647..2a6216a67e381 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -1700,19 +1700,35 @@ bool ByteCodeExprGen::VisitCompoundLiteralExpr( } // Otherwise, use a local variable. - if (T) { + if (T && !E->isLValue()) { // For primitive types, we just visit the initializer. 
return this->delegate(Init); } else { - if (std::optional LocalIndex = allocateLocal(Init)) { - if (!this->emitGetPtrLocal(*LocalIndex, E)) + unsigned LocalIndex; + + if (T) + LocalIndex = this->allocateLocalPrimitive(Init, *T, false, false); + else if (std::optional MaybeIndex = this->allocateLocal(Init)) + LocalIndex = *MaybeIndex; + else + return false; + + if (!this->emitGetPtrLocal(LocalIndex, E)) + return false; + + if (T) { + if (!this->visit(Init)) { return false; + } + return this->emitInit(*T, E); + } else { if (!this->visitInitializer(Init)) return false; - if (DiscardResult) - return this->emitPopPtr(E); - return true; } + + if (DiscardResult) + return this->emitPopPtr(E); + return true; } return false; diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index caddcb5f97803..db52f6649c18b 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -1399,6 +1399,19 @@ bool StoreBitFieldPop(InterpState &S, CodePtr OpPC) { return true; } +template ::T> +bool Init(InterpState &S, CodePtr OpPC) { + const T &Value = S.Stk.pop(); + const Pointer &Ptr = S.Stk.peek(); + if (!CheckInit(S, OpPC, Ptr)) { + assert(false); + return false; + } + Ptr.initialize(); + new (&Ptr.deref()) T(Value); + return true; +} + template ::T> bool InitPop(InterpState &S, CodePtr OpPC) { const T &Value = S.Stk.pop(); diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td index 3f71087ebee56..3e3ba1b163e33 100644 --- a/clang/lib/AST/Interp/Opcodes.td +++ b/clang/lib/AST/Interp/Opcodes.td @@ -476,6 +476,7 @@ def StoreBitField : StoreBitFieldOpcode {} def StoreBitFieldPop : StoreBitFieldOpcode {} // [Pointer, Value] -> [] +def Init : StoreOpcode {} def InitPop : StoreOpcode {} // [Pointer, Value] -> [Pointer] def InitElem : Opcode { diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c index a6244c3af202a..2a72c24b43d1c 100644 --- a/clang/test/AST/Interp/c.c +++ b/clang/test/AST/Interp/c.c @@ -180,3 +180,19 
@@ void test4(void) { t1 = sizeof(int); } +void localCompoundLiteral(void) { + struct S { int x, y; } s = {}; // pedantic-expected-warning {{use of an empty initializer}} \ + // pedantic-ref-warning {{use of an empty initializer}} + struct T { + int i; + struct S s; + } t1 = { 1, {} }; // pedantic-expected-warning {{use of an empty initializer}} \ + // pedantic-ref-warning {{use of an empty initializer}} + + struct T t3 = { + (int){}, // pedantic-expected-warning {{use of an empty initializer}} \ + // pedantic-ref-warning {{use of an empty initializer}} + {} // pedantic-expected-warning {{use of an empty initializer}} \ + // pedantic-ref-warning {{use of an empty initializer}} + }; +} From 4cba5957e6ffdbeb44174d32da892ad09b3eed88 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Tue, 27 Feb 2024 09:59:50 -0600 Subject: [PATCH 449/546] [mlir][ROCDL] Set the LLVM data layout when lowering to ROCDL LLVM (#74501) In order to ensure operations lower correctly (especially memref.addrspacecast, which relies on the data layout benig set correctly then dealing with dynamic memrefs) and to prevent compilation issues later down the line, set the `llvm.data_layout` attribute on GPU modules when lowering their contents to a ROCDL / AMDGPU target. If there's a good way to test the embedded string to prevent it from going out of sync with the LLVM TargetMachine, I'd appreciate hearing about it. (Or, alternatively, if there's a place I could farctor the string out to). 
--- .../Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 12 ++++++++++++ mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 10 ++++++++++ 2 files changed, 22 insertions(+) diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index 4fa3cdcbf85ce..f425b1f59d994 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -75,6 +75,11 @@ Value getLaneId(ConversionPatternRewriter &rewriter, Location loc, ValueRange{minus1, mbcntLo}); return laneId; } +static constexpr StringLiteral amdgcnDataLayout = + "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" + "-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:" + "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-" + "G1-ni:7:8"; namespace { struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern { @@ -212,6 +217,12 @@ struct LowerGpuOpsToROCDLOpsPass gpu::GPUModuleOp m = getOperation(); MLIRContext *ctx = m.getContext(); + auto llvmDataLayout = m->getAttrOfType( + LLVM::LLVMDialect::getDataLayoutAttrName()); + if (!llvmDataLayout) { + llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout); + m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout); + } // Request C wrapper emission. for (auto func : m.getOps()) { func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(), @@ -227,6 +238,7 @@ struct LowerGpuOpsToROCDLOpsPass /// Customize the bitwidth used for the device side index computations. 
LowerToLLVMOptions options( ctx, DataLayout(cast(m.getOperation()))); + options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue()); if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout) options.overrideIndexBitwidth(indexBitwidth); diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index 2652b86657099..8a2d8bd7967ca 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -1,6 +1,8 @@ // RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s // RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s +// CHECK-LABEL: @test_module +// CHECK-SAME: llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" gpu.module @test_module { // CHECK-LABEL: func @gpu_index_ops() // CHECK32-LABEL: func @gpu_index_ops() @@ -628,3 +630,11 @@ gpu.module @test_module { func.return %shfl, %shfli : f32, f32 } } + +// ----- + +// CHECK-LABEL: @test_custom_data_layout +// CHECK-SAME: llvm.data_layout = "e" +gpu.module @test_custom_data_layout attributes {llvm.data_layout = "e"} { + +} From b70f42a430723e00d76cc99d348e4f2fec221cf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 27 Feb 2024 14:00:09 +0100 Subject: [PATCH 450/546] [clang][Interp] Handle PseudoObjectExprs Evaluate all the semantic expressions. 
--- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 25 +++ clang/lib/AST/Interp/ByteCodeExprGen.h | 1 + clang/test/AST/Interp/spaceship.cpp | 232 +++++++++++++++++++++++ 3 files changed, 258 insertions(+) create mode 100644 clang/test/AST/Interp/spaceship.cpp diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 2a6216a67e381..f193f959d3a6c 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -2189,6 +2189,31 @@ bool ByteCodeExprGen::VisitCXXRewrittenBinaryOperator( return this->delegate(E->getSemanticForm()); } +template +bool ByteCodeExprGen::VisitPseudoObjectExpr( + const PseudoObjectExpr *E) { + + for (const Expr *SemE : E->semantics()) { + if (auto *OVE = dyn_cast(SemE)) { + if (SemE == E->getResultExpr()) + return false; + + if (OVE->isUnique()) + continue; + + if (!this->discard(OVE)) + return false; + } else if (SemE == E->getResultExpr()) { + if (!this->delegate(SemE)) + return false; + } else { + if (!this->discard(SemE)) + return false; + } + } + return true; +} + template bool ByteCodeExprGen::discard(const Expr *E) { if (E->containsErrors()) return false; diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h index 8769d1d7b8502..5b3b533dba387 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.h +++ b/clang/lib/AST/Interp/ByteCodeExprGen.h @@ -117,6 +117,7 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, bool VisitRequiresExpr(const RequiresExpr *E); bool VisitConceptSpecializationExpr(const ConceptSpecializationExpr *E); bool VisitCXXRewrittenBinaryOperator(const CXXRewrittenBinaryOperator *E); + bool VisitPseudoObjectExpr(const PseudoObjectExpr *E); protected: bool visitExpr(const Expr *E) override; diff --git a/clang/test/AST/Interp/spaceship.cpp b/clang/test/AST/Interp/spaceship.cpp new file mode 100644 index 0000000000000..7495b893772e8 --- /dev/null +++ b/clang/test/AST/Interp/spaceship.cpp @@ -0,0 +1,232 
@@ +// RUN: %clang_cc1 -std=c++2a -verify=both,ref %s -fcxx-exceptions +// RUN: %clang_cc1 -std=c++2a -verify=both,expected %s -fcxx-exceptions -fexperimental-new-constant-interpreter + +namespace std { + struct strong_ordering { // both-note 6{{candidate}} + int n; + constexpr operator int() const { return n; } + static const strong_ordering less, equal, greater; + }; + constexpr strong_ordering strong_ordering::less{-1}, + strong_ordering::equal{0}, strong_ordering::greater{1}; + + struct weak_ordering { + int n; + constexpr weak_ordering(int n) : n(n) {} + constexpr weak_ordering(strong_ordering o) : n(o.n) {} + constexpr operator int() const { return n; } + static const weak_ordering less, equivalent, greater; + }; + constexpr weak_ordering weak_ordering::less{-1}, + weak_ordering::equivalent{0}, weak_ordering::greater{1}; + + struct partial_ordering { + double d; + constexpr partial_ordering(double d) : d(d) {} + constexpr partial_ordering(strong_ordering o) : d(o.n) {} + constexpr partial_ordering(weak_ordering o) : d(o.n) {} + constexpr operator double() const { return d; } + static const partial_ordering less, equivalent, greater, unordered; + }; + constexpr partial_ordering partial_ordering::less{-1}, + partial_ordering::equivalent{0}, partial_ordering::greater{1}, + partial_ordering::unordered{__builtin_nan("")}; + + static_assert(!(partial_ordering::unordered < 0)); + static_assert(!(partial_ordering::unordered == 0)); + static_assert(!(partial_ordering::unordered > 0)); +} + +namespace Deletedness { + struct A { + std::strong_ordering operator<=>(const A&) const; + }; + struct B { + bool operator==(const B&) const; + bool operator<(const B&) const; + }; + struct C { + std::strong_ordering operator<=>(const C&) const = delete; // both-note 2{{deleted}} + }; + struct D1 { + bool operator==(const D1&) const; + std::strong_ordering operator<=>(int) const; // both-note 2{{function not viable}} both-note 2{{function (with reversed parameter order) not 
viable}} + bool operator<(int) const; // both-note 2{{function not viable}} + }; + struct D2 { + bool operator<(const D2&) const; + std::strong_ordering operator<=>(int) const; // both-note 2{{function not viable}} both-note 2{{function (with reversed parameter order) not viable}} + bool operator==(int) const; // both-note 2{{function not viable}} + }; + struct E { + bool operator==(const E&) const; + bool operator<(const E&) const = delete; // both-note 2{{deleted}} + }; + struct F { + std::strong_ordering operator<=>(const F&) const; // both-note 2{{candidate}} + std::strong_ordering operator<=>(F) const; // both-note 2{{candidate}} + }; + struct G1 { + bool operator==(const G1&) const; + void operator<(const G1&) const; + }; + struct G2 { + void operator==(const G2&) const; + bool operator<(const G2&) const; + }; + struct H { + void operator<=>(const H&) const; + }; + + // both-note@#base {{deleted comparison function for base class 'C'}} + // both-note@#base {{no viable three-way comparison function for base class 'D1'}} + // both-note@#base {{three-way comparison cannot be synthesized because there is no viable function for '<' comparison}} + // both-note@#base {{no viable 'operator==' for base class 'D2'}} + // both-note@#base {{three-way comparison cannot be synthesized because there is no viable function for '==' comparison}} + // both-note@#base {{deleted comparison function for base class 'E'}} + // both-note@#base {{implied comparison for base class 'F' is ambiguous}} + template struct Cmp : T { // #base + std::strong_ordering operator<=>(const Cmp&) const = default; // #cmp both-note 5{{here}} + }; + + void use(...); + void f() { + use( + Cmp() <=> Cmp(), + Cmp() <=> Cmp(), + Cmp() <=> Cmp(), // both-error {{deleted}} + Cmp() <=> Cmp(), // both-error {{deleted}} + Cmp() <=> Cmp(), // both-error {{deleted}} + Cmp() <=> Cmp(), // both-error {{deleted}} + Cmp() <=> Cmp(), // both-error {{deleted}} + // FIXME: The following three errors are not very good. 
+ // both-error@#cmp {{value of type 'void' is not contextually convertible to 'bool'}} + Cmp() <=> Cmp(), // both-note-re {{in defaulted three-way comparison operator for '{{.*}}Cmp<{{.*}}G1>' first required here}}j + // both-error@#cmp {{value of type 'void' is not contextually convertible to 'bool'}} + Cmp() <=> Cmp(), // both-note-re {{in defaulted three-way comparison operator for '{{.*}}Cmp<{{.*}}G2>' first required here}}j + // both-error@#cmp {{no matching conversion for static_cast from 'void' to 'std::strong_ordering'}} + Cmp() <=> Cmp(), // both-note-re {{in defaulted three-way comparison operator for '{{.*}}Cmp<{{.*}}H>' first required here}}j + 0 + ); + } + + // both-note@#arr {{deleted comparison function for member 'arr'}} + // both-note@#arr {{no viable three-way comparison function for member 'arr'}} + // both-note@#arr {{three-way comparison cannot be synthesized because there is no viable function for '<' comparison}} + // both-note@#arr {{no viable 'operator==' for member 'arr'}} + // both-note@#arr {{three-way comparison cannot be synthesized because there is no viable function for '==' comparison}} + // both-note@#arr {{deleted comparison function for member 'arr'}} + // both-note@#arr {{implied comparison for member 'arr' is ambiguous}} + template struct CmpArray { + T arr[3]; // #arr + std::strong_ordering operator<=>(const CmpArray&) const = default; // #cmparray both-note 5{{here}} + }; + void g() { + use( + CmpArray() <=> CmpArray(), + CmpArray() <=> CmpArray(), + CmpArray() <=> CmpArray(), // both-error {{deleted}} + CmpArray() <=> CmpArray(), // both-error {{deleted}} + CmpArray() <=> CmpArray(), // both-error {{deleted}} + CmpArray() <=> CmpArray(), // both-error {{deleted}} + CmpArray() <=> CmpArray(), // both-error {{deleted}} + // FIXME: The following three errors are not very good. 
+ // both-error@#cmparray {{value of type 'void' is not contextually convertible to 'bool'}} + CmpArray() <=> CmpArray(), // both-note-re {{in defaulted three-way comparison operator for '{{.*}}CmpArray<{{.*}}G1>' first required here}}j + // both-error@#cmparray {{value of type 'void' is not contextually convertible to 'bool'}} + CmpArray() <=> CmpArray(), // both-note-re {{in defaulted three-way comparison operator for '{{.*}}CmpArray<{{.*}}G2>' first required here}}j + // both-error@#cmparray {{no matching conversion for static_cast from 'void' to 'std::strong_ordering'}} + CmpArray() <=> CmpArray(), // both-note-re {{in defaulted three-way comparison operator for '{{.*}}CmpArray<{{.*}}H>' first required here}}j + 0 + ); + } +} + +namespace Access { + class A { + std::strong_ordering operator<=>(const A &) const; // both-note {{here}} + public: + bool operator==(const A &) const; + bool operator<(const A &) const; + }; + struct B { + A a; // both-note {{would invoke a private 'operator<=>'}} + friend std::strong_ordering operator<=>(const B &, const B &) = default; // both-warning {{deleted}} both-note{{replace 'default'}} + }; + + class C { + std::strong_ordering operator<=>(const C &); // not viable (not const) + bool operator==(const C &) const; // both-note {{here}} + bool operator<(const C &) const; + }; + struct D { + C c; // both-note {{would invoke a private 'operator=='}} + friend std::strong_ordering operator<=>(const D &, const D &) = default; // both-warning {{deleted}} both-note{{replace 'default'}} + }; +} + +namespace Synthesis { + enum Result { False, True, Mu }; + + constexpr bool toBool(Result R) { + if (R == Mu) throw "should not ask this question"; + return R == True; + } + + struct Val { + Result equal, less; + constexpr bool operator==(const Val&) const { return toBool(equal); } + constexpr bool operator<(const Val&) const { return toBool(less); } + }; + + template struct Cmp { + Val val; + friend T operator<=>(const Cmp&, const Cmp&) = 
default; // both-note {{deleted}} + }; + + template constexpr auto cmp(Result equal, Result less = Mu, Result reverse_less = Mu) { + return Cmp{equal, less} <=> Cmp{Mu, reverse_less}; + } + + static_assert(cmp(True) == 0); + static_assert(cmp(False, True) < 0); + static_assert(cmp(False, False) > 0); + + static_assert(cmp(True) == 0); + static_assert(cmp(False, True) < 0); + static_assert(cmp(False, False) > 0); + + static_assert(cmp(True) == 0); + static_assert(cmp(False, True) < 0); + static_assert(cmp(False, False, True) > 0); + static_assert(!(cmp(False, False, False) > 0)); + static_assert(!(cmp(False, False, False) == 0)); + static_assert(!(cmp(False, False, False) < 0)); + + // No synthesis is performed for a custom return type, even if it can be + // converted from a standard ordering. + struct custom_ordering { + custom_ordering(std::strong_ordering o); + }; + void f(Cmp c) { + c <=> c; // both-error {{deleted}} + } +} + +namespace Preference { + struct A { + A(const A&) = delete; // both-note {{deleted}} + // "usable" candidate that can't actually be called + friend void operator<=>(A, A); // both-note {{passing}} + // Callable candidates for synthesis not considered. + friend bool operator==(A, A); + friend bool operator<(A, A); + }; + + struct B { + B(); + A a; + std::strong_ordering operator<=>(const B&) const = default; // both-error {{call to deleted constructor of 'A'}} + }; + bool x = B() < B(); // both-note {{in defaulted three-way comparison operator for 'B' first required here}} +} From 8b56d9ef4d946f772e6b7a8d508f34b586f684a0 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 27 Feb 2024 08:13:23 -0800 Subject: [PATCH 451/546] [llvm-objcopy] Improve help messages (#82830) https://reviews.llvm.org/D63820 added llvm/docs/CommandGuide/llvm-objcopy.rst with clearer semantics, e.g. 
``` Read a list of names from the file and mark defined symbols with those names as global in the output instead of the help message Read a list of symbols from and marks them global" (omits "defined") Rename sections called to in the output instead of the help message Rename a section from old to new (multiple sections may be named ``` Sync the help messages to incorporate the CommandGuide improvement. While here, switch to the conventional imperative sentences for a few options. Additionally, mark some options as grp_coff or grp_macho. --- llvm/docs/CommandGuide/llvm-objcopy.rst | 4 +- llvm/tools/llvm-objcopy/CommonOpts.td | 27 +++++---- llvm/tools/llvm-objcopy/ObjcopyOpts.td | 75 ++++++++++++------------- 3 files changed, 54 insertions(+), 52 deletions(-) diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst index 0fb3c4bed643a..755291676abf6 100644 --- a/llvm/docs/CommandGuide/llvm-objcopy.rst +++ b/llvm/docs/CommandGuide/llvm-objcopy.rst @@ -450,7 +450,7 @@ them. Set the type of section ``